# Asthma Deaths | Processing

## Set Up

Ensure that the required libraries are available by running the below code in the terminal before execution:
- pip install pandas


Execute the following in the jupyter notebook before execution to ensure that the required libraries are imported:

In [20]:
import pandas as pd

## Load Dataset

In [21]:
df = pd.read_csv('raw.csv')

## Data Manipulation

In [22]:
# Rename columns to match Air Quality data set.
df = df.rename(columns={'LHD': 'lhd', 'Period': 'date'})                                                            # Rename columns
df.columns = df.columns.str.lower()                                                                                 # Lowercase column names

# Remove ' LHD' for Local Health District values.
df['lhd'] = df['lhd'].str.replace(' LHD', '')                                                                       # Remove ' LHD' from lhd column

# Remove rows representing state-wide aggregated data.
df = df.dropna(subset=['lhd'])                                                                                      # Remove rows with missing values in 'lhd' column
df = df[~df['lhd'].str.contains('All')]                                                                             # Remove rows with 'All' in 'lhd' column

# Remove columns holding Confidence Interval data.
df = df.loc[:, ~df.columns.str.contains('% ci')]                                                                    # Remove columns with '% ci' in the name

# Convert 'per cent' to 'rate per 100,000 population'.
df = df.rename(columns={'per cent': 'rate per 100,000 population'})                                                 # Rename 'per cent' column
df['rate per 100,000 population'] = df['rate per 100,000 population'] * 1000                                        # Remove ' per cent' from 'rate per 100,000 population' column

# View the cleaned data.
df.head()                                                                                                           # Display the first few rows of the cleaned data

Unnamed: 0,asthma type,lhd,date,"rate per 100,000 population"
0,Current Asthma,Sydney,2002-2004,11700.0
1,Current Asthma,Sydney,2003-2005,10700.0
2,Current Asthma,Sydney,2004-2006,13200.0
3,Current Asthma,Sydney,2005-2007,12500.0
4,Current Asthma,Sydney,2006-2008,14600.0


## Data Normalization

Convert rolling 2-Year totals to annual totals.

In [23]:
yearly_data = []                                                                                                   # Create a list to hold the new data.

# Ensure column names are stripped of leading/trailing spaces
df.columns = df.columns.str.strip()

# Iterate over the rows.
for index, row in df.iterrows():                                                                                   # Iterate over the rows.
    start_year, end_year = map(int, row['date'].split('-'))                                                        # Get the start and end years.
    mid_year = start_year + 1                                                                                      # Calculate the mid year.
    split_value = row['rate per 100,000 population'] / 2                                                           # Calculate the split value.

    yearly_data.extend([
        {'lhd': row['lhd'], 'financial year': f"{start_year}/{mid_year}", 'rate per 100,000 population': split_value},         # Append the first financial year of the period.
        {'lhd': row['lhd'], 'financial year': f"{mid_year}/{end_year}", 'rate per 100,000 population': split_value}            # Append the second financial year of the period.
    ])
    
# Create a new DataFrame.
df_yearly = pd.DataFrame(yearly_data)                                                                              # Create a new DataFrame.

# Group by 'lhd' and 'financial year'. Get an average of the 'value' column.
df_yearly = df_yearly.groupby(['financial year', 'lhd']).mean().reset_index()                                      # Group by 'lhd' and 'financial year'. Get an average of the 'value' column.

# Assign to original DataFrame.
df = df_yearly                                                                                                     # Assign to original DataFrame.

# View the cleaned data.
df.head()                                                                                                          # Display the first few rows of the cleaned data

Unnamed: 0,financial year,lhd,"rate per 100,000 population"
0,2002/2003,Central Coast,12025.0
1,2002/2003,Far West,12450.0
2,2002/2003,Hunter New England,11725.0
3,2002/2003,Illawarra Shoalhaven,10200.0
4,2002/2003,Mid North Coast,10775.0


## Set Date Range

Set the range of financial years from 2014/2015 to 2023/2024.

In [24]:
# Drop pre 2014/2015 data.
df = df[~df['financial year'].isin(['2011/2012', '2012/2013', '2013/2014'])]                                        # Drop pre 2014/2015 data.

# Add rows for each LHD for the missing years until 2023/2024.
missing_rows = []                                                                                                   # Create a list to store the missing rows.
lhds = df['lhd'].unique()                                                                                           # Get unique LHDs.
years = [f"{year}/{year + 1}" for year in range(2014, 2024)]                                                        # Create a list of years from 2014/2015 to 2023/2024.

for lhd in lhds:
    for year in years:
        if not ((df['lhd'] == lhd) & (df['financial year'] == year)).any():                                         # Check if the row is missing.
            missing_rows.append({'lhd': lhd, 'financial year': year, 'rate per 100,000 population': None})          # Append the missing row to the list.

# Create a DataFrame from the missing rows and concatenate it to the original DataFrame
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)                                                                         # Create a DataFrame from the missing rows.
    df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.

# Sort the DataFrame by 'lhd' and 'financial year'.
df = df.sort_values(by=['lhd', 'financial year']).reset_index(drop=True)                                           # Sort the DataFrame by 'lhd' and 'financial year'.

# View the DataFrame.
df.head()                                                                                                          # View the last 5 rows of the DataFrame.

  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,financial year,lhd,"rate per 100,000 population"
0,2002/2003,Central Coast,12025.0
1,2003/2004,Central Coast,12200.0
2,2004/2005,Central Coast,12900.0
3,2005/2006,Central Coast,12662.5
4,2006/2007,Central Coast,11737.5


Fill missing values using linear interpolation.

In [25]:
df['rate per 100,000 population'] = df['rate per 100,000 population'].interpolate()                                # Fill missing values using linear interpolation.

## Output Processed Dataset

In [26]:
# Save the processed data to a new CSV file.
df.to_csv('processed.csv', index=False)                                                                            # Save the processed data to a new CSV file.

# View the DataFrame.
df.head()                                                                                                          # View the first 5 rows of the DataFrame.

Unnamed: 0,financial year,lhd,"rate per 100,000 population"
0,2002/2003,Central Coast,12025.0
1,2003/2004,Central Coast,12200.0
2,2004/2005,Central Coast,12900.0
3,2005/2006,Central Coast,12662.5
4,2006/2007,Central Coast,11737.5
