# Integrate Financial Year Datasets

## Set Up

Ensure that the required libraries are installed by running the following command in the terminal before executing this notebook:
- pip install pandas


Run the following cell first so that the required libraries are imported into the Jupyter notebook:

In [6]:
import pandas as pd

## Load Datasets

In [7]:
# Read each processed source file into its own dataframe: one air-quality
# table plus eight respiratory-health indicator tables.
df_air_quality = pd.read_csv(
    '../../2-nsw-air-quality/data-processed-financial-year.csv'
)
df_asthma_deaths = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/asthma/deaths/data-processed.csv'
)
df_asthma_edp = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/asthma/emergency-department-presentations/yearly/data-processed-alt.csv'
)
df_asthma_hospitalisations = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/asthma/hospitalisations/data-processed-alt.csv'
)
df_asthma_children = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/asthma/prevelance-in-children/data-processed-alt.csv'
)
df_copd_deaths = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/chronic-obstructive-pulmonary-disease/deaths/data-processed-alt.csv'
)
df_copd_hospitalisations = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/chronic-obstructive-pulmonary-disease/hospitalisations/data-processed-alt.csv'
)
df_iap_deaths = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/influenza-and-pneumonia/deaths/data-processed-alt.csv'
)
df_iap_hospitalisations = pd.read_csv(
    '../../3-nsw-health-stats/respiratory-health/influenza-and-pneumonia/hospitalisations/data-processed-alt.csv'
)

# Print the column names of every dataframe so the shared key columns and the
# value columns can be checked by eye before renaming and merging.
print("Air Quality Headers:", df_air_quality.columns.tolist())

for _heading, _frame in (
    ("\nAsthma Deaths Headers:", df_asthma_deaths),
    ("\nAsthma Emergency Department Presentations Headers:", df_asthma_edp),
    ("\nAsthma Hospitalisations Headers:", df_asthma_hospitalisations),
    ("\nAsthma Prevelance in Children Headers:", df_asthma_children),
    ("\nChronic Obstructive Pulmonary Disease Deaths Headers:", df_copd_deaths),
    ("\nChronic Obstructive Pulmonary Disease Hospitalisations Headers:", df_copd_hospitalisations),
    ("\nInfluenza and Pneumonia Deaths Headers:", df_iap_deaths),
    ("\nInfluenza and Pneumonia Hospitalisations Headers:", df_iap_hospitalisations),
):
    print(_heading)
    print(_frame.columns.tolist())

Air Quality Headers: ['financial year', 'lhd', 'CO ppm', 'NO pphm', 'NO2 pphm', 'OZONE pphm', 'PM10 µg/m³', 'SO2 pphm']

Asthma Deaths Headers:
['lhd', 'financial year', 'rate per 100,000 population']

Asthma Emergency Department Presentations Headers:
['financial year', 'lhd', 'Female rate per 100,000 population', 'Male rate per 100,000 population']

Asthma Hospitalisations Headers:
['financial year', 'lhd', 'Female rate per 100,000 population', 'Male rate per 100,000 population']

Asthma Prevelance in Children Headers:
['lhd', 'financial year', 'per cent']

Chronic Obstructive Pulmonary Disease Deaths Headers:
['financial year', 'lhd', 'Female rate per 100,000 population', 'Male rate per 100,000 population']

Chronic Obstructive Pulmonary Disease Hospitalisations Headers:
['financial year', 'lhd', 'Female rate per 100,000 population', 'Male rate per 100,000 population']

Influenza and Pneumonia Deaths Headers:
['financial year', 'lhd', 'Female rate per 100,000 population', 'Male rate

## Data Manipulation

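Rename the value columns so that each one names its source dataset. Several datasets share identical column names (e.g. 'Female rate per 100,000 population'), which would otherwise collide when the dataframes are merged.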

In [8]:
# Give every value column a dataset-specific name. Most health tables use the
# same two generic column names, so they are captured once here and mapped to
# a unique label per dataset below.
_FEMALE_RATE = 'Female rate per 100,000 population'
_MALE_RATE = 'Male rate per 100,000 population'

# Asthma: deaths (single combined rate).
df_asthma_deaths = df_asthma_deaths.rename(
    columns={'rate per 100,000 population': 'asthma deaths [rate per 100,000]'}
)

# Asthma: emergency department presentations.
df_asthma_edp = df_asthma_edp.rename(
    columns={
        _FEMALE_RATE: 'asthma edp [f] [rate per 100,000]',
        _MALE_RATE: 'asthma edp [m] [rate per 100,000]',
    }
)

# Asthma: hospitalisations.
df_asthma_hospitalisations = df_asthma_hospitalisations.rename(
    columns={
        _FEMALE_RATE: 'asthma hospitalisations [f] [rate per 100,000]',
        _MALE_RATE: 'asthma hospitalisations [m] [rate per 100,000]',
    }
)

# Asthma: prevalence in children (a percentage rather than a rate).
df_asthma_children = df_asthma_children.rename(
    columns={'per cent': 'asthma prevelance in children [% of children]'}
)

# Chronic obstructive pulmonary disease: deaths.
df_copd_deaths = df_copd_deaths.rename(
    columns={
        _FEMALE_RATE: 'copd deaths [f] [rate per 100,000]',
        _MALE_RATE: 'copd deaths [m] [rate per 100,000]',
    }
)

# Chronic obstructive pulmonary disease: hospitalisations.
df_copd_hospitalisations = df_copd_hospitalisations.rename(
    columns={
        _FEMALE_RATE: 'copd hospitalisations [f] [rate per 100,000]',
        _MALE_RATE: 'copd hospitalisations [m] [rate per 100,000]',
    }
)

# Influenza and pneumonia: deaths.
df_iap_deaths = df_iap_deaths.rename(
    columns={
        _FEMALE_RATE: 'iap deaths [f] [rate per 100,000]',
        _MALE_RATE: 'iap deaths [m] [rate per 100,000]',
    }
)

# Influenza and pneumonia: hospitalisations.
df_iap_hospitalisations = df_iap_hospitalisations.rename(
    columns={
        _FEMALE_RATE: 'iap hospitalisations [f] [rate per 100,000]',
        _MALE_RATE: 'iap hospitalisations [m] [rate per 100,000]',
    }
)

## Merge Datasets

Merge dataframes on the 'financial year' and 'lhd' columns using an inner join, so only year/LHD combinations present in every dataset are kept.

In [9]:
# Inner-join every health dataframe onto the air-quality dataframe, matching
# rows on the 'financial year' and 'lhd' columns. The tuple order is the merge
# order, which determines the column order of the result.
_inner_keys = ['financial year', 'lhd']
df_merged_inner = df_air_quality
for _health_frame in (
    df_asthma_deaths,
    df_asthma_edp,
    df_asthma_hospitalisations,
    df_asthma_children,
    df_copd_deaths,
    df_copd_hospitalisations,
    df_iap_deaths,
    df_iap_hospitalisations,
):
    df_merged_inner = df_merged_inner.merge(_health_frame, on=_inner_keys, how='inner')

# Order the rows by 'lhd', then by 'financial year' within each 'lhd'.
df_merged_inner = df_merged_inner.sort_values(by=['lhd', 'financial year'])

# Preview the first rows of the merged dataframe.
df_merged_inner.head()

Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,"asthma deaths [rate per 100,000]","asthma edp [f] [rate per 100,000]",...,"asthma hospitalisations [m] [rate per 100,000]",asthma prevelance in children [% of children],"copd deaths [f] [rate per 100,000]","copd deaths [m] [rate per 100,000]","copd hospitalisations [f] [rate per 100,000]","copd hospitalisations [m] [rate per 100,000]","iap deaths [f] [rate per 100,000]","iap deaths [m] [rate per 100,000]","iap hospitalisations [f] [rate per 100,000]","iap hospitalisations [m] [rate per 100,000]"
0,2014/2015,Central Coast,0.1,0.166667,0.444444,1.922222,16.233333,0.044444,0.675,373.6,...,148.5,15.7,25.3,43.1,277.0,315.9,9.4,11.5,269.0,402.9
1,2015/2016,Central Coast,0.1,0.188889,0.444444,1.755556,15.833333,0.066667,0.725,405.8,...,157.4,14.4,28.2,40.0,301.9,288.7,9.9,10.2,298.5,404.2
2,2016/2017,Central Coast,0.1,0.18,0.45,1.87,16.1,0.07,0.7,412.0,...,149.1,12.55,33.1,39.6,312.5,303.7,11.1,11.4,260.3,324.7
3,2017/2018,Central Coast,0.1,0.128571,0.4,2.028571,17.985714,0.085714,0.675,386.7,...,141.6,11.15,33.9,40.6,298.9,327.5,9.7,13.3,343.6,421.9
4,2014/2015,Hunter New England,0.246667,0.465115,0.707188,1.991198,22.876567,0.155568,0.825,455.6,...,136.0,16.3,25.4,33.9,239.3,279.4,8.3,11.3,317.4,391.0


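Merge dataframes on the 'financial year' and 'lhd' columns using an outer join, so every year/LHD combination found in any dataset is kept and missing values are left empty (NaN).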

In [10]:
# Outer-join every health dataframe onto the air-quality dataframe, matching
# rows on the 'financial year' and 'lhd' columns. Key combinations missing
# from a dataset are kept, with NaN in that dataset's columns.
_outer_keys = ['financial year', 'lhd']
df_merged_outer = df_air_quality
for _health_frame in (
    df_asthma_deaths,
    df_asthma_edp,
    df_asthma_hospitalisations,
    df_asthma_children,
    df_copd_deaths,
    df_copd_hospitalisations,
    df_iap_deaths,
    df_iap_hospitalisations,
):
    df_merged_outer = df_merged_outer.merge(_health_frame, on=_outer_keys, how='outer')

# Order the rows by 'lhd', then by 'financial year' within each 'lhd'.
df_merged_outer = df_merged_outer.sort_values(by=['lhd', 'financial year'])

# Preview the first rows of the merged dataframe.
df_merged_outer.head()

Unnamed: 0,financial year,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,"asthma deaths [rate per 100,000]","asthma edp [f] [rate per 100,000]",...,"asthma hospitalisations [m] [rate per 100,000]",asthma prevelance in children [% of children],"copd deaths [f] [rate per 100,000]","copd deaths [m] [rate per 100,000]","copd hospitalisations [f] [rate per 100,000]","copd hospitalisations [m] [rate per 100,000]","iap deaths [f] [rate per 100,000]","iap deaths [m] [rate per 100,000]","iap hospitalisations [f] [rate per 100,000]","iap hospitalisations [m] [rate per 100,000]"
0,2000/2001,Central Coast,0.065493,0.214184,0.39078,1.762143,15.59507,0.07,,,...,,,,,,,,,,
12,2001/2002,Central Coast,0.065493,0.214184,0.39078,1.762143,15.59507,0.07,,,...,204.6,,,,235.4,348.2,,,198.1,294.6
27,2002/2003,Central Coast,0.065493,0.214184,0.39078,1.762143,15.59507,0.07,,,...,167.2,,,,228.1,334.0,,,213.2,317.7
42,2003/2004,Central Coast,0.065493,0.214184,0.39078,1.762143,15.59507,0.07,,,...,154.1,16.7,,,219.0,377.1,,,217.8,308.5
57,2004/2005,Central Coast,0.065493,0.214184,0.39078,1.762143,15.59507,0.07,,,...,184.7,18.15,,,251.2,390.6,,,190.7,278.8


## Output Dataset

In [11]:
# Write both merged dataframes to CSV in the current directory, without the
# dataframe index: the inner-join result first, then the outer-join result.
for _merged_frame, _csv_name in (
    (df_merged_inner, 'data-merged.csv'),
    (df_merged_outer, 'data-merged-alt.csv'),
):
    _merged_frame.to_csv(_csv_name, index=False)