# Merging Cost and PPH Datasets


In [3]:
# importing the required libraries
import pandas as pd

# Read the two preprocessed CSV extracts that the rest of the notebook joins:
# a cost summary and a PPH table. Both are expected to carry a 'State' column,
# which later cells normalise and merge on.
# NOTE(review): the cost filename is spelled 'headlthcare' — presumably this
# matches the file on disk; confirm before "correcting" it.
cost_df = pd.read_csv(
    filepath_or_buffer='../Data/preprocessed-data/headlthcare_costs_summary_by_state_2022_23.csv'
)
pph_df = pd.read_csv(
    filepath_or_buffer='../Data/preprocessed-data/pph_ppn_level_2022_23.csv'
)

In [4]:
# Normalise the 'State' labels so both data frames use the same upper-case codes.
#
# The lookup has two parts:
#   * every canonical code maps to itself, so values that are already
#     normalised pass through unchanged (re-running this cell is harmless);
#   * the mixed-case spellings 'Qld', 'Vic' and 'Tas' are folded onto their
#     upper-case codes.
# Any label that is not a key of the lookup becomes NaN after .map().

state_map = {
    **{code: code for code in ('NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT')},
    'Qld': 'QLD',
    'Vic': 'VIC',
    'Tas': 'TAS',
}

# Apply the same lookup to both frames, overwriting the original column.
cost_df['State'] = cost_df['State'].map(state_map)
pph_df['State'] = pph_df['State'].map(state_map)

In [5]:
# Checking for missing state values
# A non-zero count here means the 'State' column contains NaN in that frame
# (for example, a label that the state mapping did not recognise).

print("Missing states in cost:", cost_df['State'].isna().sum())
print("Missing states in PPH:", pph_df['State'].isna().sum())

# Unique states
# NaN is dropped before sorting: sorted() cannot compare a float NaN with the
# string codes and would raise TypeError, crashing this diagnostic cell in
# exactly the situation it is meant to report. Missing values are already
# counted by the two prints above, so nothing is hidden.
print("Cost DF States:", sorted(cost_df['State'].dropna().unique()))
print("PPH DF States:", sorted(pph_df['State'].dropna().unique()))

Missing states in cost: 0
Missing states in PPH: 0
Cost DF States: ['ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA']
PPH DF States: ['ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA']


In [None]:
# Merging two datasets
# Left join on 'State': every row of pph_df is kept and the columns of cost_df
# are attached to it; PPH rows whose state has no match in cost_df get NaN.
# validate='many_to_one' makes pandas check that 'State' is unique in cost_df
# and raise MergeError otherwise, instead of silently duplicating PPH rows
# when the cost table contains more than one row for a state.
merged_cost_pph_df = pph_df.merge(
    cost_df,
    on='State',
    how='left',
    validate='many_to_one',
)


In [10]:
# Post-merge sanity checks.

# 1) Count rows left without a cost after the left join (NaN in
#    'Cost_per_person').
print("Missing cost values:", merged_cost_pph_df['Cost_per_person'].isnull().sum())

# 2) Count the distinct cost values per state; a value of 1 means every row
#    of that state carries the same cost.
state_cost_check = (
    merged_cost_pph_df
    .groupby('State')['Cost_per_person']
    .nunique()
)
print(state_cost_check)

# 3) Preview the first 30 rows of the merged frame (last expression, so the
#    notebook renders it as a table).
merged_cost_pph_df.head(n=30)

Missing cost values: 0
State
ACT    1
NSW    1
NT     1
QLD    1
SA     1
TAS    1
VIC    1
WA     1
Name: Cost_per_person, dtype: int64


Unnamed: 0,PHN_Code,PHN_Name,State,PPH_rate_per_100k,Cost_per_person
0,PHN101,Central and Eastern Sydney,NSW,1688.577,111.426
1,PHN102,Northern Sydney,NSW,1795.76,111.426
2,PHN103,Western Sydney,NSW,1616.84,111.426
3,PHN104,Nepean Blue Mountains,NSW,1695.08,111.426
4,PHN105,South Western Sydney,NSW,1639.889,111.426
5,PHN106,South Eastern NSW,NSW,1651.692,111.426
6,PHN107,Western NSW,NSW,1833.08,111.426
7,PHN108,Hunter New England and Central Coast,NSW,1701.037,111.426
8,PHN109,North Coast,NSW,1775.222,111.426
9,PHN110,Murrumbidgee,NSW,2161.56,111.426


In [None]:
# Persist the merged frame next to the other preprocessed files.
# index=False leaves the row index out of the CSV, so the file contains only
# the data columns.
merged_cost_pph_df.to_csv(
    path_or_buf='../Data/preprocessed-data/merged_cost_pph.csv',
    index=False,
)

: 