# Integrate Annual Datasets

## Dependencies

Ensure that the required libraries have been installed locally as per the README.md file included in this project.

Run the following cell to import the required dependencies for this notebook.

In [14]:
# Import libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

## Load Datasets

In [15]:
# Read the air-quality readings that every other dataset is later joined onto.
df_air_quality = pd.read_csv('../air-quality/processed-financial-year.csv')

# Read the eight health-statistics extracts in a single pass.
# NOTE: the names on the left are paired positionally with the paths on the right,
# so the two sequences must stay in the same order.
(
    df_asthma_deaths,
    df_asthma_edp,
    df_asthma_hospitalisations,
    df_asthma_pic,
    df_copd_deaths,
    df_copd_hospitalisations,
    df_iap_deaths,
    df_iap_hospitalisations,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../health-stats/asthma-deaths/processed.csv',
        '../health-stats/asthma-edp/processed.csv',
        '../health-stats/asthma-hospitalisations/processed.csv',
        '../health-stats/asthma-pic/processed.csv',
        '../health-stats/copd-deaths/processed.csv',
        '../health-stats/copd-hospitalisations/processed.csv',
        '../health-stats/iap-deaths/processed.csv',
        '../health-stats/iap-hospitalisations/processed.csv',
    )
)

In [16]:
# Pair each dataframe with a readable label, then collect every frame's column names
# so the schemas can be compared side by side.
labelled_frames = {
    'Air Quality': df_air_quality,
    'Asthma Deaths': df_asthma_deaths,
    'Asthma Emergency Department Presentations': df_asthma_edp,
    'Asthma Hospitalisations': df_asthma_hospitalisations,
    'Asthma Prevalence in Children': df_asthma_pic,
    'Chronic Obstructive Pulmonary Disease Deaths': df_copd_deaths,
    'Chronic Obstructive Pulmonary Disease Hospitalisations': df_copd_hospitalisations,
    'Influenza and Pneumonia Deaths': df_iap_deaths,
    'Influenza and Pneumonia Hospitalisations': df_iap_hospitalisations,
}
headers = {label: frame.columns.tolist() for label, frame in labelled_frames.items()}

## Exploratory Analysis of Raw Data

In [17]:
# Tabulate the column names: one row per dataset, one column per header position.
# orient='index' already puts the dataset names on the rows, so no transpose is needed
# (the previous transpose().transpose() pair cancelled itself out). Datasets with fewer
# columns than the widest one are padded with empty cells.
df_headers = pd.DataFrame.from_dict(headers, orient='index')

# Show the header table for all dataframes.
display(df_headers)

Unnamed: 0,0,1,2,3,4,5,6,7
Air Quality,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm
Asthma Deaths,financial year,lhd,"rate per 100,000 population",,,,,
Asthma Emergency Department Presentations,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,
Asthma Hospitalisations,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,
Asthma Prevalence in Children,financial year,lhd,"rate per 100,000 population",,,,,
Chronic Obstructive Pulmonary Disease Deaths,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,
Chronic Obstructive Pulmonary Disease Hospitalisations,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,
Influenza and Pneumonia Deaths,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,
Influenza and Pneumonia Hospitalisations,financial year,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,


## Data Manipulation

Rename columns for clarity.

In [18]:
def _label_rate_columns(df, label):
    """Return a copy of ``df`` with its generic rate columns named after ``label``.

    The health extracts all use the same generic column names, which would collide
    once the datasets are merged. This gives them dataset-specific names:

    * ``'rate per 100,000 population'``         -> ``label``
    * ``'Persons rate per 100,000 population'`` -> ``label``
    * ``'Female rate per 100,000 population'``  -> ``label + ' [f]'``
    * ``'Male rate per 100,000 population'``    -> ``label + ' [m]'``

    Columns that are not present in ``df`` are ignored by ``DataFrame.rename``, so the
    same mapping serves both the single-rate and the female/male/persons extracts.

    Parameters
    ----------
    df : pandas.DataFrame
        Health-statistics extract to relabel. It is not modified.
    label : str
        Short dataset name used as the new column name / prefix.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the renamed columns, in the original column order.
    """
    return df.rename(columns={
        'rate per 100,000 population': label,
        'Persons rate per 100,000 population': label,
        'Female rate per 100,000 population': label + ' [f]',
        'Male rate per 100,000 population': label + ' [m]',
    })


# Give every health dataset's rate columns a dataset-specific name.
df_asthma_deaths = _label_rate_columns(df_asthma_deaths, 'asthma deaths')
df_asthma_edp = _label_rate_columns(df_asthma_edp, 'asthma edp')
df_asthma_hospitalisations = _label_rate_columns(df_asthma_hospitalisations, 'asthma hospitalisations')
df_asthma_pic = _label_rate_columns(df_asthma_pic, 'asthma pic')
df_copd_deaths = _label_rate_columns(df_copd_deaths, 'copd deaths')
df_copd_hospitalisations = _label_rate_columns(df_copd_hospitalisations, 'copd hospitalisations')
df_iap_deaths = _label_rate_columns(df_iap_deaths, 'iap deaths')
df_iap_hospitalisations = _label_rate_columns(df_iap_hospitalisations, 'iap hospitalisations')

# Re-collect the column names now that the rate columns have been renamed. The mapping is
# rebuilt here because the rename step rebinds each variable to a new dataframe.
labelled_frames = {
    'Air Quality': df_air_quality,
    'Asthma Deaths': df_asthma_deaths,
    'Asthma Emergency Department Presentations': df_asthma_edp,
    'Asthma Hospitalisations': df_asthma_hospitalisations,
    'Asthma Prevalence in Children': df_asthma_pic,
    'Chronic Obstructive Pulmonary Disease Deaths': df_copd_deaths,
    'Chronic Obstructive Pulmonary Disease Hospitalisations': df_copd_hospitalisations,
    'Influenza and Pneumonia Deaths': df_iap_deaths,
    'Influenza and Pneumonia Hospitalisations': df_iap_hospitalisations,
}
headers = {label: frame.columns.tolist() for label, frame in labelled_frames.items()}

# Tabulate the column names: one row per dataset, one column per header position.
# orient='index' already puts the dataset names on the rows, so no transpose is needed
# (the previous transpose().transpose() pair cancelled itself out).
df_headers = pd.DataFrame.from_dict(headers, orient='index')

# Show the header table for all dataframes.
display(df_headers)

Unnamed: 0,0,1,2,3,4,5,6,7
Air Quality,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm
Asthma Deaths,financial year,lhd,asthma deaths,,,,,
Asthma Emergency Department Presentations,financial year,lhd,asthma edp [f],asthma edp [m],asthma edp,,,
Asthma Hospitalisations,financial year,lhd,asthma hospitalisations [f],asthma hospitalisations [m],asthma hospitalisations,,,
Asthma Prevalence in Children,financial year,lhd,asthma pic,,,,,
Chronic Obstructive Pulmonary Disease Deaths,financial year,lhd,copd deaths [f],copd deaths [m],copd deaths,,,
Chronic Obstructive Pulmonary Disease Hospitalisations,financial year,lhd,copd hospitalisations [f],copd hospitalisations [m],copd hospitalisations,,,
Influenza and Pneumonia Deaths,financial year,lhd,iap deaths [f],iap deaths [m],iap deaths,,,
Influenza and Pneumonia Hospitalisations,financial year,lhd,iap hospitalisations [f],iap hospitalisations [m],iap hospitalisations,,,


## Merge Datasets

Merge the dataframes on 'financial year' and 'lhd' using an inner join, so that only the year/district combinations present in every dataset are kept.

In [19]:
# Key columns shared by every dataset.
merge_keys = ['financial year', 'lhd']

# Health datasets to join onto the air-quality readings; their columns are appended
# to the merged frame in this order.
dataframes = [
    df_asthma_deaths,
    df_asthma_edp,
    df_asthma_hospitalisations,
    df_asthma_pic,
    df_copd_deaths,
    df_copd_hospitalisations,
    df_iap_deaths,
    df_iap_hospitalisations,
]

# Start from the air-quality data and inner-join each health dataset in turn, so a
# (financial year, lhd) pair survives only if it appears in every dataset.
df_merged = df_air_quality
for df in dataframes:
    df_merged = df_merged.merge(df, on=merge_keys, how='inner')

# Order the rows by district, then by financial year within each district.
df_merged = df_merged.sort_values(by=['lhd', 'financial year'])

# Preview the first rows of the merged dataframe.
display(df_merged.head())

Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma deaths,asthma edp [f],...,copd deaths,copd hospitalisations [f],copd hospitalisations [m],copd hospitalisations,iap deaths [f],iap deaths [m],iap deaths,iap hospitalisations [f],iap hospitalisations [m],iap hospitalisations
0,2014/2015,Central Coast,0.1,0.233333,0.466667,1.79119,15.158333,0.05,0.675,373.6,...,33.0,277.0,315.9,292.1,9.4,11.5,10.2,269.0,402.9,329.0
1,2015/2016,Central Coast,0.108333,0.216667,0.458333,1.691667,15.375,0.058333,0.725,405.8,...,33.1,301.9,288.7,293.2,9.9,10.2,10.1,298.5,404.2,345.3
2,2016/2017,Central Coast,0.1,0.216667,0.491667,1.775,15.358333,0.058333,0.7,412.0,...,35.5,312.5,303.7,306.9,11.1,11.4,11.3,260.3,324.7,287.0
3,2017/2018,Central Coast,0.108333,0.175,0.441667,1.861183,16.816667,0.075,0.675,386.7,...,36.4,298.9,327.5,309.7,9.7,13.3,11.3,343.6,421.9,376.8
4,2018/2019,Central Coast,0.108333,0.216667,0.416667,1.733333,17.75,0.091667,0.675,403.8,...,35.4,307.8,286.2,295.8,7.8,13.3,10.1,339.6,420.6,376.0


## Divide into Gendered and Genderless Datasets.

In [20]:
# The first eight columns (financial year, lhd and the six pollutant readings) are shared
# by both outputs; every later column is a health-outcome rate.
shared_columns = df_merged.columns[:8].tolist()
outcome_columns = df_merged.columns[8:].tolist()

# Sex-specific rates carry an '[m]' or '[f]' marker in their name; all other rates do not.
gendered_columns = [column for column in outcome_columns if '[m]' in column or '[f]' in column]
genderless_columns = [column for column in outcome_columns if column not in gendered_columns]

# Select the columns in one step and take an explicit copy. Previously both frames started
# as iloc slices of df_merged and then had columns assigned one by one, which pandas flags
# with SettingWithCopyWarning and which leaves the frames tied to df_merged.
df_gendered = df_merged[shared_columns + gendered_columns].copy()        # Shared columns + [f]/[m] rates.
df_genderless = df_merged[shared_columns + genderless_columns].copy()    # Shared columns + all-persons rates.

Display the gendered data.

In [21]:
# Preview the first rows of the gendered dataframe (shared columns plus the [f]/[m] rates).
display(df_gendered.head())

Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma edp [f],asthma edp [m],asthma hospitalisations [f],asthma hospitalisations [m],copd deaths [f],copd deaths [m],copd hospitalisations [f],copd hospitalisations [m],iap deaths [f],iap deaths [m],iap hospitalisations [f],iap hospitalisations [m]
0,2014/2015,Central Coast,0.1,0.233333,0.466667,1.79119,15.158333,0.05,373.6,354.3,145.65,86.75,25.3,43.1,277.0,315.9,9.4,11.5,269.0,402.9
1,2015/2016,Central Coast,0.108333,0.216667,0.458333,1.691667,15.375,0.058333,405.8,350.7,147.75,91.275,28.2,40.0,301.9,288.7,9.9,10.2,298.5,404.2
2,2016/2017,Central Coast,0.1,0.216667,0.491667,1.775,15.358333,0.058333,412.0,339.7,145.7,87.35,33.1,39.6,312.5,303.7,11.1,11.4,260.3,324.7
3,2017/2018,Central Coast,0.108333,0.175,0.441667,1.861183,16.816667,0.075,386.7,344.0,133.575,89.9,33.9,40.6,298.9,327.5,9.7,13.3,343.6,421.9
4,2018/2019,Central Coast,0.108333,0.216667,0.416667,1.733333,17.75,0.091667,403.8,349.0,127.975,84.975,31.6,41.1,307.8,286.2,7.8,13.3,339.6,420.6


Display the genderless data.

In [22]:
# Preview the first rows of the genderless dataframe (shared columns plus the all-persons rates).
# Uses display() like every other preview cell in this notebook.
display(df_genderless.head())

Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma deaths,asthma edp,asthma hospitalisations,asthma pic,copd deaths,copd hospitalisations,iap deaths,iap hospitalisations
0,2014/2015,Central Coast,0.1,0.233333,0.466667,1.79119,15.158333,0.05,0.675,366.6,117.375,9562.5,33.0,292.1,10.2,329.0
1,2015/2016,Central Coast,0.108333,0.216667,0.458333,1.691667,15.375,0.058333,0.725,380.9,121.225,9200.0,33.1,293.2,10.1,345.3
2,2016/2017,Central Coast,0.1,0.216667,0.491667,1.775,15.358333,0.058333,0.7,378.4,118.225,8762.5,35.5,306.9,11.3,287.0
3,2017/2018,Central Coast,0.108333,0.175,0.441667,1.861183,16.816667,0.075,0.675,368.2,113.575,8100.0,36.4,309.7,11.3,376.8
4,2018/2019,Central Coast,0.108333,0.216667,0.416667,1.733333,17.75,0.091667,0.675,378.8,107.725,7700.0,35.4,295.8,10.1,376.0


## Interpolate Missing Pollutant Values

Some Local Health Districts did not record any raw data for certain pollutants. This section of the notebook fills in these missing values using linear regression based on all available data.

### Define Function to Fill Missing Values

In [23]:
# Function to fill missing values for a specific pollutant using a regression model
def fill_missing_values(data, target_column, features):
    """Fill the gaps in ``target_column`` with linear-regression predictions.

    A ``LinearRegression`` model is trained on the rows where ``target_column`` is
    present and used to predict the rows where it is missing. Missing values in the
    predictor columns are replaced by the training-set column mean (``SimpleImputer``)
    before fitting and before predicting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target_column`` and the feature columns. It is never
        modified; a filled copy is returned instead.
    target_column : str
        Name of the column whose missing values should be filled.
    features : sequence of str
        Candidate predictor columns. ``target_column`` may be included (for example
        when passing ``data.columns[2:]``); it is removed before fitting, because a
        model that sees the target as an input learns the identity on the training
        rows and then predicts from the mean-imputed target on the rows to fill.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when there is nothing to fill or no model can be fitted,
        otherwise a copy of ``data`` with the gaps in ``target_column`` replaced by
        the model's predictions.

    Warns
    -----
    UserWarning
        If values are missing but there are no rows with a known target, or no
        predictor columns remain, so the column is left unfilled.
    """
    import warnings  # Local import so the function does not depend on another cell.

    # Rows whose target value needs to be predicted.
    missing_mask = data[target_column].isna()

    # If there are no missing values, return the original data.
    if not missing_mask.any():
        return data

    # Never let the target predict itself (see the docstring).
    predictors = [column for column in features if column != target_column]

    # Rows where the target is known form the training set.
    data_with_values = data[~missing_mask]

    # Without training rows or predictors no model can be fitted; leave the gaps in place
    # and tell the user rather than failing inside scikit-learn.
    if data_with_values.empty or not predictors:
        warnings.warn(
            "Cannot fill '{}': no rows with known values or no predictor columns.".format(target_column)
        )
        return data

    # Prepare the training data.
    X_train = data_with_values[predictors]
    y_train = data_with_values[target_column]

    # Impute missing predictor values with the training-set mean of each column.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Train the regression model.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict the missing target values, reusing the means learned on the training rows.
    X_test = imputer.transform(data.loc[missing_mask, predictors])
    predicted_values = model.predict(X_test)

    # Write the predictions into a copy so the caller's frame is left untouched.
    filled = data.copy()
    filled.loc[missing_mask, target_column] = predicted_values

    return filled

### Call Function to Fill Missing Values

In [24]:
# Pollutant columns to fill, in this order. Because every column from the third onwards
# is passed as a feature, a pollutant filled earlier is available as a predictor for the
# pollutants filled after it.
pollutants_to_fill = ['CO ppm', 'PM10 µg/m³', 'SO2 pphm']

# Fill each pollutant in the genderless frame and then in the gendered frame.
for pollutant in pollutants_to_fill:
    df_genderless = fill_missing_values(df_genderless, pollutant, df_genderless.columns[2:])
    df_gendered = fill_missing_values(df_gendered, pollutant, df_gendered.columns[2:])

## Output Processed Datasets

In [25]:
# Write both processed dataframes to CSV files next to this notebook's working directory,
# omitting the row index.
for frame, csv_name in (
    (df_genderless, 'processed-annual-genderless.csv'),
    (df_gendered, 'processed-annual-gendered.csv'),
):
    frame.to_csv(csv_name, index=False)

## View Processed Datasets

In [26]:
# Print a heading and preview the first rows of each processed dataframe.
for heading, frame in (
    ('Genderless Data Head:', df_genderless),
    ('Gendered Data Head:', df_gendered),
):
    print(heading)
    display(frame.head())

Genderless Data Head:


Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma deaths,asthma edp,asthma hospitalisations,asthma pic,copd deaths,copd hospitalisations,iap deaths,iap hospitalisations
0,2014/2015,Central Coast,0.1,0.233333,0.466667,1.79119,15.158333,0.05,0.675,366.6,117.375,9562.5,33.0,292.1,10.2,329.0
1,2015/2016,Central Coast,0.108333,0.216667,0.458333,1.691667,15.375,0.058333,0.725,380.9,121.225,9200.0,33.1,293.2,10.1,345.3
2,2016/2017,Central Coast,0.1,0.216667,0.491667,1.775,15.358333,0.058333,0.7,378.4,118.225,8762.5,35.5,306.9,11.3,287.0
3,2017/2018,Central Coast,0.108333,0.175,0.441667,1.861183,16.816667,0.075,0.675,368.2,113.575,8100.0,36.4,309.7,11.3,376.8
4,2018/2019,Central Coast,0.108333,0.216667,0.416667,1.733333,17.75,0.091667,0.675,378.8,107.725,7700.0,35.4,295.8,10.1,376.0


Gendered Data Head:


Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma edp [f],asthma edp [m],asthma hospitalisations [f],asthma hospitalisations [m],copd deaths [f],copd deaths [m],copd hospitalisations [f],copd hospitalisations [m],iap deaths [f],iap deaths [m],iap hospitalisations [f],iap hospitalisations [m]
0,2014/2015,Central Coast,0.1,0.233333,0.466667,1.79119,15.158333,0.05,373.6,354.3,145.65,86.75,25.3,43.1,277.0,315.9,9.4,11.5,269.0,402.9
1,2015/2016,Central Coast,0.108333,0.216667,0.458333,1.691667,15.375,0.058333,405.8,350.7,147.75,91.275,28.2,40.0,301.9,288.7,9.9,10.2,298.5,404.2
2,2016/2017,Central Coast,0.1,0.216667,0.491667,1.775,15.358333,0.058333,412.0,339.7,145.7,87.35,33.1,39.6,312.5,303.7,11.1,11.4,260.3,324.7
3,2017/2018,Central Coast,0.108333,0.175,0.441667,1.861183,16.816667,0.075,386.7,344.0,133.575,89.9,33.9,40.6,298.9,327.5,9.7,13.3,343.6,421.9
4,2018/2019,Central Coast,0.108333,0.216667,0.416667,1.733333,17.75,0.091667,403.8,349.0,127.975,84.975,31.6,41.1,307.8,286.2,7.8,13.3,339.6,420.6
