In [1]:
import pandas as pd
import os

# define file path
DATA_PATH = "../data/interim/"


def _read_interim(filename):
    """Read a single CSV file located under DATA_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_PATH, filename))


# load cleaned datasets (one frame per health indicator)
water = _read_interim("water_access_full_clean.csv")
child = _read_interim("child_mortality_clean.csv")
infant = _read_interim("infant_mortality_clean.csv")
life = _read_interim("life_expectancy_clean.csv")
ncd = _read_interim("mortality_ncds_clean.csv")
obesity = _read_interim("obesity_prevalence_clean.csv")

# metadata (not merged)
schema = _read_interim("health_schema_summary.csv")

# quick overview: print the (rows, columns) shape of each indicator frame
loaded = {
    "Water": water,
    "Child Mortality": child,
    "Infant Mortality": infant,
    "Life Expectancy": life,
    "NCD Mortality": ncd,
    "Obesity": obesity,
}
for name, df in loaded.items():
    print(f"{name:<20}: {df.shape}")

print("\nAll datasets loaded successfully!")


Water               : (5116, 10)
Child Mortality     : (6654, 4)
Infant Mortality    : (6633, 4)
Life Expectancy     : (19609, 4)
NCD Mortality       : (4092, 4)
Obesity             : (6600, 4)

All datasets loaded successfully!


In [2]:
# standardize column names for consistent merging
#
# Each indicator frame must end up with the key columns country / iso_code /
# year plus one value column. The value-column names below match the names the
# final column-ordering cell selects (e.g. `child_mortality_rate`), so the
# pipeline stays consistent whether the CSVs carry the raw headers or names
# that are already standardized.

KEY_RENAMES = {"Entity": "country", "Code": "iso_code", "Year": "year"}


def _standardize(frame, raw_value_col, value_col):
    """Return a copy of `frame` with standardized key and value column names.

    Parameters
    ----------
    frame : pandas.DataFrame
        Indicator table to standardize.
    raw_value_col : str
        Original header of the indicator's value column.
    value_col : str
        Standardized name the value column should have afterwards.

    Raises
    ------
    KeyError
        If any key column or `value_col` is absent after renaming.
    """
    renamed = frame.rename(columns={**KEY_RENAMES, raw_value_col: value_col})
    # rename() skips labels that are not present by default, so confirm the
    # expected columns really exist instead of failing later in the merge.
    expected = [*KEY_RENAMES.values(), value_col]
    missing = [col for col in expected if col not in renamed.columns]
    if missing:
        raise KeyError(
            f"Columns {missing} not found after renaming; "
            f"available columns: {list(renamed.columns)}"
        )
    return renamed


child = _standardize(child, "Child mortality rate", "child_mortality_rate")

infant = _standardize(
    infant,
    "Infant mortality rate of babies aged under one year, per 100 live births",
    "infant_mortality_rate",
)

life = _standardize(life, "Period life expectancy at birth", "life_expectancy")

ncd = _standardize(
    ncd,
    "Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%)",
    "ncd_mortality",
)

obesity = _standardize(
    obesity,
    "Prevalence of obesity among adults, BMI >= 30 (crude estimate) (%) - Sex: both sexes - Age group: 18+ years of age",
    "obesity_rate",
)

print("All column names standardized successfully!")


All column names standardized successfully!


In [3]:
# start merging datasets
# Outer-join each indicator onto the life-expectancy table in turn, so a
# country-year that appears in any source is kept in the result.
join_keys = ["country", "iso_code", "year"]

merged = life
for indicator in (child, infant, ncd, obesity, water):
    merged = merged.merge(indicator, on=join_keys, how="outer")

# check structure
print("Merging complete!")
print(f"Final merged dataset shape: {merged.shape}")

# quick data check
print("\nPreview of merged data:")
display(merged.head())


Merging complete!
Final merged dataset shape: (19618, 15)

Preview of merged data:


Unnamed: 0,country,iso_code,year,life_expectancy,child_mortality_rate,infant_mortality_rate,ncd_mortality,obesity_rate,water_safely_managed,water_basic,water_limited,water_unimproved,water_surface,water_access_total,water_inadequate_total
0,Afghanistan,AFG,1950,28.16,,,,,,,,,,,
1,Afghanistan,AFG,1951,28.58,,,,,,,,,,,
2,Afghanistan,AFG,1952,29.01,,,,,,,,,,,
3,Afghanistan,AFG,1953,29.45,,,,,,,,,,,
4,Afghanistan,AFG,1954,29.7,,,,,,,,,,,


In [4]:
# check year range per indicator
indicator_frames = {
    "life_expectancy": life,
    "child_mortality": child,
    "infant_mortality": infant,
    "ncd_mortality": ncd,
    "obesity_rate": obesity,
    "water_access": water,
}

# (earliest year, latest year) covered by each indicator
year_summary = {
    label: (frame['year'].min(), frame['year'].max())
    for label, frame in indicator_frames.items()
}

# one row per indicator, columns min_year / max_year
summary_df = pd.DataFrame(year_summary, index=["min_year", "max_year"]).T
summary_df


Unnamed: 0,min_year,max_year
life_expectancy,1543,2023
child_mortality,1990,2022
infant_mortality,1990,2022
ncd_mortality,2000,2021
obesity_rate,1990,2022
water_access,2000,2022


In [5]:
# focus on the shared period across datasets (2000-2021)
first_year, last_year = 2000, 2021

# between() includes both endpoints, i.e. first_year <= year <= last_year
in_shared_period = merged['year'].between(first_year, last_year)
merged_filtered = merged.loc[in_shared_period].reset_index(drop=True)

print("Filtered merged dataset shape:", merged_filtered.shape)
print("Year range:", merged_filtered['year'].min(), "-", merged_filtered['year'].max())

# quick check
merged_filtered.head()


Filtered merged dataset shape: (5245, 15)
Year range: 2000 - 2021


Unnamed: 0,country,iso_code,year,life_expectancy,child_mortality_rate,infant_mortality_rate,ncd_mortality,obesity_rate,water_safely_managed,water_basic,water_limited,water_unimproved,water_surface,water_access_total,water_inadequate_total
0,Afghanistan,AFG,2000,55.0,13.17,110.1,43.2,3.69,11.093327,16.34853,3.299203,43.856777,25.402164,27.441857,72.558144
1,Afghanistan,AFG,2001,55.51,12.74,107.0,43.5,4.01,11.105221,16.368359,3.299883,43.843445,25.383093,27.47358,72.526421
2,Afghanistan,AFG,2002,56.23,12.31,103.8,43.1,4.34,12.007733,17.66713,3.607177,42.260395,24.457567,29.674863,70.325139
3,Afghanistan,AFG,2003,57.17,11.87,100.6,42.5,4.7,12.909922,18.965668,3.914072,40.67728,23.533058,31.87559,68.12441
4,Afghanistan,AFG,2004,57.81,11.42,97.2,42.3,5.08,13.818684,20.275747,4.220617,39.086002,22.59895,34.094431,65.905569


In [9]:
# drop duplicate rows (if any): keep the first row per country / iso_code / year
key_cols = ['country', 'iso_code', 'year']
merged_filtered = merged_filtered.drop_duplicates(subset=key_cols)

# reorder columns for clarity: keys first, then health indicators, then water
indicator_cols = [
    'life_expectancy', 'child_mortality_rate', 'infant_mortality_rate',
    'ncd_mortality', 'obesity_rate',
]
water_cols = [
    'water_safely_managed', 'water_basic', 'water_limited',
    'water_unimproved', 'water_surface',
    'water_access_total', 'water_inadequate_total',
]
cols = key_cols + indicator_cols + water_cols
# keep extras if any, appended after the known columns in their existing order
extras = [c for c in merged_filtered.columns if c not in cols]
cols = cols + extras
merged_filtered = merged_filtered[cols]

# save to processed folder (created on first run)
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)
merged_filtered.to_csv("../data/processed/health_merged.csv", index=False)

print("Final merged dataset saved as '../data/processed/health_merged.csv'")
print(f"Final shape: {merged_filtered.shape}")


Final merged dataset saved as '../data/processed/health_merged.csv'
Final shape: (5245, 15)
