# Influenza and Pneumonia Deaths | Processing

## Set Up

Ensure that the required libraries are available by running the below code in the terminal before execution:
- pip install pandas


Execute the following in the jupyter notebook before execution to ensure that the required libraries are imported:

In [1]:
import pandas as pd

## Load Dataset

In [2]:
df = pd.read_csv('raw.csv')

## Data Manipulation

In [4]:
# Rename columns to match Air Quality data set.
df = df.rename(columns={'LHD': 'lhd', 'Period': 'financial year'})                                                            # Rename columns
df.columns = df.columns.str.lower()                                                                                 # Lowercase column names

# Remove ' LHD' for Local Health District values.
df['lhd'] = df['lhd'].str.replace(' LHD', '')                                                                       # Remove ' LHD' from lhd column

# Remove rows representing state-wide aggregated data.
df = df.dropna(subset=['lhd'])                                                                                      # Remove rows with missing values in 'lhd' column
df = df[~df['lhd'].str.contains('All')]                                                                             # Remove rows with 'All' in 'lhd' column

# Remove columns holding Confidence Interval data.
df = df.loc[:, ~df.columns.str.contains('% ci')]                                                                    # Remove columns with '% ci' in the name

# Reformat 'financial year' values from XX/YY to XXXX/YYYY.
df['financial year'] = df['financial year'].str.replace('-', '/')                                                   # Replace '-' with '/'

# View the cleaned data.
df.head()                                                                                                           # Display the first few rows of the cleaned data

Unnamed: 0,sex,lhd,financial year,"rate per 100,000 population"
0,Males,Sydney,2011/2012,9.2
1,Males,Sydney,2012/2013,8.7
2,Males,Sydney,2013/2014,9.0
3,Males,Sydney,2014/2015,7.9
4,Males,Sydney,2015/2016,8.4


In [5]:
# Pivot the dataframe to have 'sex' as columns
df = df.pivot_table(index=['financial year', 'lhd'], columns='sex', values='rate per 100,000 population').reset_index()

# Rename the columns to match the desired format
df.columns.name = None
df = df.rename(columns={
    'Persons': 'Persons rate per 100,000 population',
    'Males': 'Male rate per 100,000 population', 
    'Females': 'Female rate per 100,000 population'
})

# View the DataFrame
df.head() 

Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2011/2012,Central Coast,7.6,10.2,8.7
1,2011/2012,Hunter New England,7.8,11.3,9.4
2,2011/2012,Illawarra Shoalhaven,8.6,7.9,8.3
3,2011/2012,Mid North Coast,10.6,9.8,10.2
4,2011/2012,Murrumbidgee,9.3,10.6,9.8


## Set Date Range

Set the range of financial years from 2014/2015 to 2023/2024.

In [6]:
# Drop pre 2014/2015 data.
df = df[~df['financial year'].isin(['2011/2012', '2012/2013', '2013/2014'])]                                        # Drop pre 2014/2015 data.

# Add rows for each LHD for the missing years until 2023/2024.
missing_rows = []                                                                                                   # Create a list to store the missing rows.
lhds = df['lhd'].unique()                                                                                           # Get unique LHDs.
years = [f"{year}/{year + 1}" for year in range(2014, 2024)]                                                        # Create a list of years from 2014/2015 to 2023/2024.

for lhd in lhds:
    for year in years:
        if not ((df['lhd'] == lhd) & (df['financial year'] == year)).any():                                         # Check if the row is missing.
            missing_rows.append({                                                                                   # Append the missing row to the list.
                'lhd': lhd, 
                'financial year': year, 
                'Female rate per 100,000 population': None,
                'Male rate per 100,000 population': None,
                'Persons rate per 100,000 population': None,
            })   

# Create a DataFrame from the missing rows and concatenate it to the original DataFrame
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)                                                                         # Create a DataFrame from the missing rows.
    df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.

# Sort the DataFrame by 'lhd' and 'financial year'.
df = df.sort_values(by=['lhd', 'financial year']).reset_index(drop=True)                                           # Sort the DataFrame by 'lhd' and 'financial year'.

# View the DataFrame.
df.tail()                                                                                                          # View the last 5 rows of the DataFrame.

  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
135,2019/2020,Western Sydney,7.2,9.2,8.1
136,2020/2021,Western Sydney,4.0,5.7,4.8
137,2021/2022,Western Sydney,,,
138,2022/2023,Western Sydney,,,
139,2023/2024,Western Sydney,,,


Fill missing values using linear interpolation.

In [7]:
df['Female rate per 100,000 population'] = df['Female rate per 100,000 population'].interpolate()                                       # Fill missing values using linear interpolation.
df['Male rate per 100,000 population'] = df['Male rate per 100,000 population'].interpolate()                                        # Fill missing values using linear interpolation.
df['Persons rate per 100,000 population'] = df['Persons rate per 100,000 population'].interpolate()                                     # Fill missing values using linear interpolation.

## Output Processed Dataset

In [8]:
# Save the processed data to a new CSV file.
df.to_csv('processed.csv', index=False)                                                                            # Save the processed data to a new CSV file.

# View the DataFrame.
df.head()                                                                                                          # View the first 5 rows of the DataFrame.

Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2014/2015,Central Coast,9.4,11.5,10.2
1,2015/2016,Central Coast,9.9,10.2,10.1
2,2016/2017,Central Coast,11.1,11.4,11.3
3,2017/2018,Central Coast,9.7,13.3,11.3
4,2018/2019,Central Coast,7.8,13.3,10.1
