# Merging Cost_PPH and SEIFA Datasets


In [18]:
# importing the required libraries
import pandas as pd

# Read the two preprocessed inputs for the merge:
#   - the combined cost / PPH table
#   - the SEIFA table at PHN level
cost_pph_df = pd.read_csv(
    '../Data/preprocessed-data/merged_cost_pph.csv'
)
seifa_phn_df = pd.read_csv(
    '../Data/preprocessed-data/seifa_phn.csv'
)

In [19]:
# Prechecks on the datasets before merging

# Clean and check the PHN_Code join key in both tables (the frames are updated in place)
for df_name, df in [('Cost_PPH', cost_pph_df), ('SEIFA', seifa_phn_df)]:
    assert 'PHN_Code' in df.columns, f"PHN_Code missing in {df_name}"
    # Flag missing keys BEFORE the string cast: astype(str) turns NaN into the
    # literal text 'nan', after which isna() could never report a missing PHN.
    phn_missing = df['PHN_Code'].isna()
    # Normalise keys to stripped text, restoring NaN where the key was missing
    # so that re-running this cell reports the same counts.
    df['PHN_Code'] = df['PHN_Code'].astype(str).str.strip().mask(phn_missing)
    print(f"NaN PHN in {df_name}:", phn_missing.sum())
    print(f"Duplicate PHN in {df_name}:", df['PHN_Code'].duplicated().sum())

# Check PHN overlap (missing keys are reported above, so they are left out of the sets)
cost_phns  = set(cost_pph_df['PHN_Code'].dropna())
seifa_phns = set(seifa_phn_df['PHN_Code'].dropna())

print("PHNs in baseline not in SEIFA:", sorted(cost_phns - seifa_phns))
print("PHNs in SEIFA not in baseline:", sorted(seifa_phns - cost_phns))


NaN PHN in Cost_PPH: 0
Duplicate PHN in Cost_PPH: 0
NaN PHN in SEIFA: 0
Duplicate PHN in SEIFA: 0
PHNs in baseline not in SEIFA: []
PHNs in SEIFA not in baseline: ['PHN205']


In [20]:
# Drop the PHN205 row from the SEIFA table so it stays consistent with the
# state-based cost measure, and report how many rows were removed.
n_before = seifa_phn_df.shape[0]
is_phn205 = seifa_phn_df['PHN_Code'].eq('PHN205')
seifa_phn_agg = seifa_phn_df.loc[~is_phn205].copy()
print(f"Dropped PHN205 from SEIFA table: {n_before - seifa_phn_agg.shape[0]} row removed")


Dropped PHN205 from SEIFA table: 1 row removed


### Cross‑border PHN handling
The SEIFA dataset includes PHN205 (VIC/NSW cross‑border). Our cost measure is state‑level (one value per state), so a single cost per person cannot be assigned to a cross‑border PHN without additional population‑share weighting. To maintain internal consistency and avoid introducing a partially imputed value, PHN205 was excluded from the analysis. This decision affects only one PHN and does not change conclusions at the national level.

In [21]:
# Merging two datasets
# Join the SEIFA score columns onto the cost/PPH table by PHN_Code. The inner
# join keeps only PHNs that are present in both tables.
final_cost_pph_seifa_df = cost_pph_df.merge(
    seifa_phn_agg[['PHN_Code', 'SEIFA_IRSD_Score', 'IRSD_Decile_Mean']],
    on='PHN_Code',
    how='inner',
    # Raise a MergeError if PHN_Code is repeated on either side, instead of
    # silently multiplying rows in the merged table.
    validate='one_to_one',
)

final_cost_pph_seifa_df.head()

Unnamed: 0,PHN_Code,PHN_Name,State,PPH_rate_per_100k,Cost_per_person,SEIFA_IRSD_Score,IRSD_Decile_Mean
0,PHN101,Central and Eastern Sydney,NSW,1688.577,111.426,1033.035,6.871
1,PHN102,Northern Sydney,NSW,1795.76,111.426,1085.809,9.188
2,PHN103,Western Sydney,NSW,1616.84,111.426,997.145,5.857
3,PHN104,Nepean Blue Mountains,NSW,1695.08,111.426,1005.928,5.719
4,PHN105,South Western Sydney,NSW,1639.889,111.426,931.202,4.225


In [None]:
# Write the merged data frame to CSV, leaving out the row index
final_output_csv = '../Data/preprocessed-data/final_cost_pph_seifa_dataset.csv'
final_cost_pph_seifa_df.to_csv(final_output_csv, index=False)

: 