# Asthma Deaths - Processing

## Dependencies

Ensure that the required libraries have been installed locally as per the README.md file included in this project.

Run the following cell to import the required dependencies for this notebook.

In [8]:
# Import libraries
import pandas as pd

## Load Dataset

In [9]:
# Read the raw CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='raw.csv')

## Exploratory Analysis of Raw Data

In [10]:
# Shared Styler rule: give every header cell a 100px minimum width so the rendered tables are easier to read.
header_style = [{'selector': 'th', 'props': [('min-width', '100px')]}]

# Summarise the object (string) columns, one row per column, then attach
# the null / non-null counts and the dtype of each column. The added Series
# are aligned to the summary's index (the column names).
object_summary_stats = (
    df.describe(include=['O'])
    .transpose()
    .assign(
        missing_values=df.isnull().sum(),
        present_values=df.notnull().sum(),
        datatype=df.dtypes,
    )
    .loc[:, ['datatype', 'present_values', 'missing_values', 'unique']]
)

# Show the first rows of the raw data under a printed title.
print("Dataset Head:")
display(df.head().style.set_table_styles(header_style))

# Show the object-column summary under a printed title.
print("\nObject Summary Statistics:")
display(object_summary_stats.style.set_table_styles(header_style))

Dataset Head:


Unnamed: 0,LHD,Period,"Rate per 100,000 population",LL 95% CI,UL 95% CI
0,Sydney LHD,2011-2013,1.6,1.1,2.3
1,Sydney LHD,2012-2014,2.4,1.7,3.2
2,Sydney LHD,2013-2015,1.9,1.3,2.7
3,Sydney LHD,2014-2016,1.5,1.0,2.2
4,Sydney LHD,2015-2017,0.8,0.4,1.3



Object Summary Statistics:


Unnamed: 0,datatype,present_values,missing_values,unique
LHD,object,135,0,15
Period,object,135,0,9


## Data Manipulation

In [11]:
# Align column names with the Air Quality data set: rename the two key
# columns, then lowercase every column label.
column_names = {'LHD': 'lhd', 'Period': 'date'}
df = df.rename(columns=column_names).rename(columns=str.lower)

# Strip the ' LHD' text from the Local Health District names.
df = df.assign(lhd=df['lhd'].str.replace(' LHD', ''))

# Discard rows without an LHD, then the state-wide aggregate rows
# (those whose LHD name contains 'All').
df = df.dropna(subset=['lhd'])
statewide_mask = df['lhd'].str.contains('All')
df = df.loc[~statewide_mask]

# Discard the confidence-interval columns (names containing '% ci').
ci_columns = df.columns.str.contains('% ci')
df = df.loc[:, ~ci_columns]

# Preview the cleaned data.
display(df.head())

Unnamed: 0,lhd,date,"rate per 100,000 population"
0,Sydney,2011-2013,1.6
1,Sydney,2012-2014,2.4
2,Sydney,2013-2015,1.9
3,Sydney,2014-2016,1.5
4,Sydney,2015-2017,0.8


## Data Normalization

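Convert the rolling two-year period rates to financial-year values: each period's rate is split evenly across its two financial years, and values that land on the same financial year and LHD are averaged.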

In [12]:
rate_col = 'rate per 100,000 population'
yearly_data = []  # One record per (period, financial year) pair.

# NOTE(review): each period's rate is halved and assigned to both of its
# financial years. Whether halving is appropriate depends on how the source
# defines the period rate (pooled vs. average annual) - TODO confirm.
# NOTE(review): the middle year is taken as start + 1, which presumes every
# period spans exactly two financial years - verify against the raw data.
for lhd, period, rate in zip(df['lhd'], df['date'], df[rate_col]):
    start_year, end_year = (int(part) for part in period.split('-'))
    mid_year = start_year + 1
    split_value = rate / 2

    # Emit the first and then the second financial year of the period.
    for financial_year in (f"{start_year}/{mid_year}", f"{mid_year}/{end_year}"):
        yearly_data.append({'lhd': lhd, 'financial year': financial_year, rate_col: split_value})

# Overlapping periods contribute several values to the same financial year;
# average them per (financial year, lhd).
df_yearly = (
    pd.DataFrame(yearly_data)
    .groupby(['financial year', 'lhd'])
    .mean()
    .reset_index()
)

# Continue the notebook with the per-financial-year frame.
df = df_yearly

# Preview the normalized data.
display(df.head())

Unnamed: 0,financial year,lhd,"rate per 100,000 population"
0,2011/2012,Central Coast,0.6
1,2011/2012,Hunter New England,0.55
2,2011/2012,Illawarra Shoalhaven,0.7
3,2011/2012,Mid North Coast,0.95
4,2011/2012,Murrumbidgee,0.9


## Set Date Range

Set the range of financial years from 2014/2015 to 2023/2024.

In [13]:
rate_col = 'rate per 100,000 population'

# Financial years in scope: 2014/2015 through 2023/2024.
years = [f"{year}/{year + 1}" for year in range(2014, 2024)]

# Keep only rows inside the target range. Filtering against `years` (rather
# than a hard-coded list of early years) keeps this cell correct if the raw
# extract ever gains earlier or later periods.
df = df[df['financial year'].isin(years)]

# Find every (lhd, financial year) pair that has no row yet. A set lookup
# avoids re-scanning the whole frame for each pair.
lhds = df['lhd'].unique()
existing_pairs = set(zip(df['lhd'], df['financial year']))
missing_rows = [
    # Use a float NaN (not None) so the placeholder column is float64; an
    # all-None object column makes pd.concat emit a FutureWarning about
    # all-NA entries and may change the result dtype in future pandas.
    {'lhd': lhd, 'financial year': year, rate_col: float('nan')}
    for lhd in lhds
    for year in years
    if (lhd, year) not in existing_pairs
]

# Append the placeholder rows, if any are needed.
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)
    df = pd.concat([df, df_missing], ignore_index=True)

# Sort by financial year first, then by LHD within each year.
df = df.sort_values(by=['financial year', 'lhd']).reset_index(drop=True)

# View the last 5 rows of the DataFrame.
df.tail()

  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,financial year,lhd,"rate per 100,000 population"
135,2023/2024,South Western Sydney,
136,2023/2024,Southern NSW,
137,2023/2024,Sydney,
138,2023/2024,Western NSW,
139,2023/2024,Western Sydney,


Fill missing values using linear interpolation.

In [14]:
rate_col = 'rate per 100,000 population'

# Interpolate within each LHD, in chronological order. Interpolating the
# whole column at once would blend values across neighbouring rows that
# belong to different LHDs, and would fill every trailing gap with the
# value of whichever row happens to be the last non-missing one.
# 'YYYY/YYYY' strings sort chronologically for four-digit years.
ordered = df.sort_values('financial year', kind='stable')

# pd.to_numeric guards against an object-dtype column (e.g. None placeholders).
# With the default linear method, gaps after an LHD's last observation are
# filled with that LHD's last observed rate (a carry-forward, since there is
# no later value to interpolate towards).
df[rate_col] = (
    pd.to_numeric(ordered[rate_col])
    .groupby(ordered['lhd'])
    .transform(lambda rates: rates.interpolate())
)

## Output Processed Dataset

In [15]:
# Write the processed frame to disk as CSV, omitting the row index.
df.to_csv(path_or_buf='processed.csv', index=False)

## View Processed Dataset

In [16]:
# Preview the first five rows of the processed data.
display(df.head(n=5))

Unnamed: 0,financial year,lhd,"rate per 100,000 population"
0,2014/2015,Central Coast,0.675
1,2014/2015,Hunter New England,0.825
2,2014/2015,Illawarra Shoalhaven,0.65
3,2014/2015,Mid North Coast,0.75
4,2014/2015,Murrumbidgee,0.975
