In [11]:
# Import necessary libraries
import pandas as pd
import os

# Clean Datasets and Merge
#
# Pipeline: reshape the PPH table to long format, reduce the health
# expenditure datacube to its 2022 hospital rows, join the two on
# state/jurisdiction, derive a cost-per-PPH figure, then save and preview.
# The transformations live in small pure helpers so they can be exercised
# on in-memory frames without the raw CSV files.


def tidy_pph_columns(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the wide PPH table with a named label column and no extras.

    The first column is renamed to ``Condition``; every column whose name
    contains ``Total`` or ``Unnamed`` is dropped. ``raw`` is not modified.
    """
    wide = raw.rename(columns={raw.columns[0]: 'Condition'})
    unwanted = [col for col in wide.columns if 'Total' in col or 'Unnamed' in col]
    return wide.drop(columns=unwanted, errors='ignore')


def melt_pph(wide: pd.DataFrame) -> pd.DataFrame:
    """Reshape the wide PPH table into (Condition, State, PPH_per_1000) rows.

    Values are coerced to numbers *before* missing rows are dropped, so
    entries that are not parseable as numbers are removed together with
    the genuinely empty cells instead of surviving as NaN.
    """
    tidy = wide.melt(id_vars=['Condition'], var_name='State', value_name='PPH_per_1000')
    tidy['PPH_per_1000'] = pd.to_numeric(tidy['PPH_per_1000'], errors='coerce')
    return tidy.dropna()


def clean_expenditure(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the 2022 hospital rows of the expenditure table.

    Column names are stripped of surrounding whitespace, only Year,
    Jurisdiction, Sector and the per-person amount are kept (the latter
    renamed to ``Cost_per_person`` and parsed as float after removing
    thousands separators). ``raw`` is not modified.

    Raises:
        KeyError: if one of the expected columns is missing.
        ValueError: if a per-person amount cannot be parsed as a float.
    """
    frame = raw.rename(columns=str.strip)
    frame = frame[['Year', 'Jurisdiction', 'Sector', 'Current per person ($)']]
    frame = frame.rename(columns={'Current per person ($)': 'Cost_per_person'})
    frame = frame.assign(
        Cost_per_person=(
            frame['Cost_per_person']
            .astype(str)
            .str.replace(',', '', regex=False)
            .astype(float)
        )
    )
    is_2022 = frame['Year'].astype(str).str.contains('2022')
    is_hospital = frame['Sector'].str.contains('Hospital', na=False)
    return frame[is_2022 & is_hospital]


def merge_cost_outcome(pph_long: pd.DataFrame, expenditure_df: pd.DataFrame) -> pd.DataFrame:
    """Inner-join PPH rates with expenditure rows and add ``Cost_per_PPH``.

    The join uses a temporary upper-cased, whitespace-stripped key, so
    state labels that differ only in case or padding still match; the
    original ``State`` and ``Jurisdiction`` columns are kept unchanged.
    A rate of zero yields NaN for ``Cost_per_PPH`` rather than infinity.

    NOTE(review): the expenditure table can hold several rows per
    jurisdiction (the preview shows repeated ACT rows), in which case every
    PPH row is repeated once per matching expenditure row. Confirm whether
    the expenditure rows should be aggregated before merging.
    """
    left = pph_long.assign(
        _join_key=pph_long['State'].astype(str).str.strip().str.upper()
    )
    right = expenditure_df.assign(
        _join_key=expenditure_df['Jurisdiction'].astype(str).str.strip().str.upper()
    )
    merged = left.merge(right, on='_join_key', how='inner').drop(columns='_join_key')

    # Mask zero rates: dividing by 0.0 would silently produce +/-inf.
    rate = merged['PPH_per_1000'].where(merged['PPH_per_1000'] != 0)
    merged['Cost_per_PPH'] = (merged['Cost_per_person'] * 1000) / rate
    return merged


# __name__ is "__main__" both in a notebook kernel and when run as a script,
# so the pipeline still executes there; the guard only keeps file I/O from
# running when the helpers above are imported.
if __name__ == "__main__":
    # Clean PPH dataset
    pph_df = tidy_pph_columns(
        pd.read_csv("8-admitted-patient-care-2022-23-tables-safety-and-quality_1.csv", skiprows=1)
    )
    pph_long = melt_pph(pph_df)

    # Clean expenditure dataset
    expenditure_df = clean_expenditure(
        pd.read_csv("HWE-101-Health-Expenditure-Australia-datacube-2022-23.csv", skiprows=7)
    )

    # Merge datasets
    merged_df = merge_cost_outcome(pph_long, expenditure_df)

    # Save cleaned datasets
    os.makedirs("data/processed", exist_ok=True)

    pph_long.to_csv('data/processed/cleaned_pph.csv', index=False)
    expenditure_df.to_csv('data/processed/cleaned_expenditure.csv', index=False)
    merged_df.to_csv('data/processed/master_cost_outcome.csv', index=False)

    # Preview outputs
    print("PPH Long Format:")
    print(pph_long.head())

    print("\nCleaned Expenditure Data:")
    print(expenditure_df.head())

    print("\nMerged Cost vs Outcome Data:")
    print(merged_df.head())


PPH Long Format:
                                     Condition State  PPH_per_1000
1  Pneumonia and vaccine-preventable influenza   NSW           0.9
2         Other vaccine-preventable conditions   NSW           0.8
3      Total vaccine-preventable conditions(c)   NSW           1.7
5          Pneumonia (not vaccine-preventable)   NSW           0.0
6                                   Cellulitis   NSW           1.9

Cleaned Expenditure Data:
         Year Jurisdiction     Sector  Cost_per_person
6520  2022-23          ACT  Hospitals             84.0
6521  2022-23          ACT  Hospitals            129.0
6522  2022-23          ACT  Hospitals            101.0
6523  2022-23          ACT  Hospitals            302.0
6524  2022-23          ACT  Hospitals            241.0

Merged Cost vs Outcome Data:
                                     Condition State  PPH_per_1000     Year  \
0  Pneumonia and vaccine-preventable influenza   NSW           0.9  2022-23   
1  Pneumonia and vaccine-preventable