# Influenza and Pneumonia Hospitalisations - Processing

## Dependencies

Ensure that the required libraries have been installed locally as per the README.md file included in this project.

Run the following cell the import the required dependencies for this notebook.

In [1]:
# Import libraries
import pandas as pd

## Load Dataset

In [2]:
# Load the raw data.
df = pd.read_csv('raw.csv')

## Exploratory Analysis of Raw Data

In [3]:
# Generate summary statistics for object columns.
object_summary_stats = df.describe(include=['O']).transpose()                                                       # Generate summary statistics for object columns.
object_summary_stats['missing_values'] = df.isnull().sum()                                                          # Add missing values to the summary table.
object_summary_stats['present_values'] = df.notnull().sum()                                                         # Add present values to the summary table.
object_summary_stats['datatype'] = df.dtypes                                                                        # Add data types to the summary table.
object_summary_stats = object_summary_stats[['datatype', 'present_values', 'missing_values', 'unique']]             # Select features and reorder table.

# Display the summary tables with titles.
print("Dataset Head:")                                                                                              # Display the dataset head title.
display(df.head().style.set_table_styles([{'selector': 'th', 'props': [('min-width', '100px')]}]))                  # Display the dataset head. For better visualization, set the minimum width of the table headers to 100px.

print("\nObject Summary Statistics:")                                                                               # Display the object summary statistics.
display(object_summary_stats.style.set_table_styles([{'selector': 'th', 'props': [('min-width', '100px')]}]))       # Display the object summary statistics. For better visualization, set the minimum width of the table headers to 100px.

Dataset Head:


Unnamed: 0,Age (years),Sex,LHD,Period,"Rate per 100,000 population",LL 95% CI,UL 95% CI
0,0-4 years,Males,Sydney LHD,01/02,817.3,678.7,975.8
1,0-4 years,Males,Sydney LHD,02/03,720.5,591.6,869.1
2,0-4 years,Males,Sydney LHD,03/04,786.0,652.2,939.2
3,0-4 years,Males,Sydney LHD,04/05,596.6,481.6,730.9
4,0-4 years,Males,Sydney LHD,05/06,418.3,323.5,532.2



Object Summary Statistics:


Unnamed: 0,datatype,present_values,missing_values,unique
Age (years),object,2700,0,3
Sex,object,2700,0,3
LHD,object,2700,0,15
Period,object,2700,0,20
"Rate per 100,000 population",object,2700,0,2411
LL 95% CI,object,2700,0,2347
UL 95% CI,object,2700,0,2424


## Data Manipulation

In [4]:
# Rename columns to match Air Quality data set.
df = df.rename(columns={'LHD': 'lhd', 'Period': 'financial year'})                                                  # Rename columns
df.columns = df.columns.str.lower()                                                                                 # Lowercase column names

# Remove ' LHD' for Local Health District values.
df['lhd'] = df['lhd'].str.replace(' LHD', '')                                                                       # Remove ' LHD' from lhd column

# Remove rows representing state-wide aggregated data.
df = df.dropna(subset=['lhd'])                                                                                      # Remove rows with missing values in 'lhd' column
df = df[~df['lhd'].str.contains('All')]                                                                             # Remove rows with 'All' in 'lhd' column

# Remove columns holding Confidence Interval data.
df = df.loc[:, ~df.columns.str.contains('% ci')]                                                                    # Remove columns with '% ci' in the name

# Reformat 'financial year' values from XX/YY to XXXX/YYYY.
df['financial year'] = df['financial year'].apply(                                                                  # Add '20' to the start of each year value
    lambda x: f'20{x[:2]}/20{x[3:]}' if isinstance(x, str) else x
)

# Remove rows where 'risk group' is not 'All ages'.
df = df[df['age (years)'] == 'All ages']                                                                            # Keep rows where 'risk group' is 'All ages'
df = df.drop(columns=['age (years)'])                                                                               # Drop 'risk group' column

# View the cleaned data.
display(df.head())                                                                                                  # Display the first few rows of the cleaned data

Unnamed: 0,sex,lhd,financial year,"rate per 100,000 population"
1800,Males,Sydney,2001/2002,345.0
1801,Males,Sydney,2002/2003,349.1
1802,Males,Sydney,2003/2004,362.4
1803,Males,Sydney,2004/2005,364.8
1804,Males,Sydney,2005/2006,285.2


In [5]:
# Convert 'rate per 100,000 population' to numeric, coercing errors to NaN
df['rate per 100,000 population'] = pd.to_numeric(df['rate per 100,000 population'], errors='coerce')                      # Convert 'rate per 100,000 population' to numeric

# Pivot the dataframe to have 'sex' as columns
df = df.pivot_table(index=['financial year', 'lhd'], columns='sex', values='rate per 100,000 population').reset_index()    # Pivot the dataframe

# Rename the columns to match the desired format
df.columns.name = None                                                                                                     # Remove the columns name
df = df.rename(columns={                                                                                                   # Rename the columns
    'Persons': 'Persons rate per 100,000 population',
    'Males': 'Male rate per 100,000 population', 
    'Females': 'Female rate per 100,000 population'
})

# View the DataFrame
display(df.head()) 

Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2001/2002,Central Coast,198.1,294.6,239.5
1,2001/2002,Hunter New England,249.6,354.7,294.8
2,2001/2002,Illawarra Shoalhaven,198.4,286.8,236.2
3,2001/2002,Mid North Coast,301.8,381.6,337.1
4,2001/2002,Murrumbidgee,333.3,464.6,390.0


## Set Date Range

Set the range of financial years from 2014/2015 to 2023/2024.

In [6]:
# Drop pre 2014/2015 data.
df = df[~df['financial year'].isin(['2011/2012', '2012/2013', '2013/2014'])]                                        # Drop pre 2014/2015 data.

# Add rows for each LHD for the missing years until 2023/2024.
missing_rows = []                                                                                                   # Create a list to store the missing rows.
lhds = df['lhd'].unique()                                                                                           # Get unique LHDs.
years = [f"{year}/{year + 1}" for year in range(2014, 2024)]                                                        # Create a list of years from 2014/2015 to 2023/2024.

for lhd in lhds:
    for year in years:
        if not ((df['lhd'] == lhd) & (df['financial year'] == year)).any():                                         # Check if the row is missing.
            missing_rows.append({                                                                                   # Append the missing row to the list.
                'lhd': lhd, 
                'financial year': year, 
                'Female rate per 100,000 population': None,
                'Male rate per 100,000 population': None,
                'Persons rate per 100,000 population': None,
            })   

# Create a DataFrame from the missing rows and concatenate it to the original DataFrame
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)                                                                         # Create a DataFrame from the missing rows.
    df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.

# Sort the DataFrame by 'lhd' and 'financial year'.
df = df.sort_values(by=['lhd', 'financial year']).reset_index(drop=True)                                            # Sort the DataFrame by 'lhd' and 'financial year'.

# View the DataFrame.
display(df.tail())                                                                                                  # View the last 5 rows of the DataFrame.

  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
275,2019/2020,Western Sydney,311.1,365.6,334.8
276,2020/2021,Western Sydney,161.9,228.0,192.9
277,2021/2022,Western Sydney,,,
278,2022/2023,Western Sydney,,,
279,2023/2024,Western Sydney,,,


Fill missing values using linear interpolation.

In [7]:
df['Female rate per 100,000 population'] = df['Female rate per 100,000 population'].interpolate()                       # Fill missing values using linear interpolation.
df['Male rate per 100,000 population'] = df['Male rate per 100,000 population'].interpolate()                           # Fill missing values using linear interpolation.
df['Persons rate per 100,000 population'] = df['Persons rate per 100,000 population'].interpolate()                     # Fill missing values using linear interpolation.

## Output Processed Dataset

In [8]:
# Save the processed data to a new CSV file.
df.to_csv('processed.csv', index=False)                                                                                # Save the processed data to a new CSV file.

## View Processed Dataset

In [9]:
# Display the processed data.
display(df.head())

Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2001/2002,Central Coast,198.1,294.6,239.5
1,2002/2003,Central Coast,213.2,317.7,258.9
2,2003/2004,Central Coast,217.8,308.5,258.3
3,2004/2005,Central Coast,190.7,278.8,229.9
4,2005/2006,Central Coast,208.5,321.6,258.0
