# Asthma Deaths | Processing

## Set Up

Ensure that the required libraries are available by running the below code in the terminal before execution:
- pip install pandas


Execute the following in the jupyter notebook before execution to ensure that the required libraries are imported:

In [76]:
import pandas as pd

## Load Dataset

In [77]:
df = pd.read_csv('raw.csv')

## Data Manipulation

In [78]:
# Rename columns to match Air Quality data set.
df = df.rename(columns={'LHD': 'lhd', 'Period': 'financial year'})                                                            # Rename columns
df.columns = df.columns.str.lower()                                                                                 # Lowercase column names

# Remove ' LHD' for Local Health District values.
df['lhd'] = df['lhd'].str.replace(' LHD', '')                                                                       # Remove ' LHD' from lhd column

# Remove rows representing state-wide aggregated data.
df = df.dropna(subset=['lhd'])                                                                                      # Remove rows with missing values in 'lhd' column
df = df[~df['lhd'].str.contains('All')]                                                                             # Remove rows with 'All' in 'lhd' column

# Remove columns holding Confidence Interval data.
df = df.loc[:, ~df.columns.str.contains('% ci')]                                                                    # Remove columns with '% ci' in the name

# Reformat 'financial year' values from XX/YY to XXXX/YYYY.
df['financial year'] = df['financial year'].str.replace('-', '/')                                                   # Replace '-' with '/'

# Remove rows where 'risk group' is not 'All ages'.
df = df[df['risk group'] == 'All ages']                                                                             # Keep rows where 'risk group' is 'All ages'
df = df.drop(columns=['risk group'])                                                                                # Drop 'risk group' column

# View the cleaned data.
df.head()                                                                                                           # Display the first few rows of the cleaned data

Unnamed: 0,sex,lhd,financial year,"rate per 100,000 population"
480,Males,Sydney,2011/2012,26.5
481,Males,Sydney,2012/2013,27.8
482,Males,Sydney,2013/2014,31.1
483,Males,Sydney,2014/2015,29.5
484,Males,Sydney,2015/2016,23.3


In [79]:
# Pivot the dataframe to have 'sex' as columns
df = df.pivot_table(index=['financial year', 'lhd'], columns='sex', values='rate per 100,000 population').reset_index()

# Rename the columns to match the desired format
df.columns.name = None
df = df.rename(columns={
    'Persons': 'Persons rate per 100,000 population',
    'Males': 'Male rate per 100,000 population', 
    'Females': 'Female rate per 100,000 population'
})

# View the DataFrame
df.head() 

Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2011/2012,Central Coast,24.9,33.1,28.2
1,2011/2012,Far West,30.9,38.5,32.6
2,2011/2012,Hunter New England,19.1,34.3,25.4
3,2011/2012,Illawarra Shoalhaven,20.3,34.0,26.1
4,2011/2012,Mid North Coast,18.8,38.8,27.7


## Set Date Range

Set the range of financial years from 2014/2015 to 2023/2024.

In [80]:
# Drop pre 2014/2015 data.
df = df[~df['financial year'].isin(['2011/2012', '2012/2013', '2013/2014'])]                                        # Drop pre 2014/2015 data.

# Add rows for each LHD for the missing years until 2023/2024.
missing_rows = []                                                                                                   # Create a list to store the missing rows.
lhds = df['lhd'].unique()                                                                                           # Get unique LHDs.
years = [f"{year}/{year + 1}" for year in range(2014, 2024)]                                                        # Create a list of years from 2014/2015 to 2023/2024.

for lhd in lhds:
    for year in years:
        if not ((df['lhd'] == lhd) & (df['financial year'] == year)).any():                                         # Check if the row is missing.
            missing_rows.append({                                                                                   # Append the missing row to the list.
                'lhd': lhd, 
                'financial year': year, 
                'Female rate per 100,000 population': None,
                'Male rate per 100,000 population': None,
                'Persons rate per 100,000 population': None,
            })   

# Create a DataFrame from the missing rows and concatenate it to the original DataFrame
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)                                                                         # Create a DataFrame from the missing rows.
    df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.

# Sort the DataFrame by 'lhd' and 'financial year'.
df = df.sort_values(by=['lhd', 'financial year']).reset_index(drop=True)                                           # Sort the DataFrame by 'lhd' and 'financial year'.

# View the DataFrame.
df.tail()                                                                                                          # View the last 5 rows of the DataFrame.

  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
145,2019/2020,Western Sydney,13.5,24.5,18.3
146,2020/2021,Western Sydney,13.8,20.7,16.8
147,2021/2022,Western Sydney,,,
148,2022/2023,Western Sydney,,,
149,2023/2024,Western Sydney,,,


Fill missing values using linear interpolation.

In [None]:
df['Female rate per 100,000 population'] = df['Female rate per 100,000 population'].interpolate()                                       # Fill missing values using linear interpolation.
df['Male rate per 100,000 population'] = df['Male rate per 100,000 population'].interpolate()                                        # Fill missing values using linear interpolation.
df['Persons rate per 100,000 population'] = df['Persons rate per 100,000 population'].interpolate()                                     # Fill missing values using linear interpolation.

## Output Processed Dataset

In [48]:
# Save the processed data to a new CSV file.
df.to_csv('processed.csv', index=False)                                                                            # Save the processed data to a new CSV file.

# View the DataFrame.
df.head()                                                                                                          # View the first 5 rows of the DataFrame.

Unnamed: 0,lhd,financial year,"rate per 100,000 population"
0,Central Coast,2012/2012,57.9
1,Central Coast,2013/2013,64.2
2,Central Coast,2014/2014,66.791667
3,Central Coast,2014/2015,68.725
4,Central Coast,2015/2015,68.725
