In [6]:
# 1. Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

# 2. Load the raw indicator CSVs (one file per indicator)
input_path = "../data/raw/"
# Directory listing; not used further in this cell, kept in case other cells read it.
files = os.listdir(input_path)

# Load all CSVs
df_life = pd.read_csv(input_path + "life_expectancy.csv")
df_san = pd.read_csv(input_path + "sanitation.csv")
df_poll = pd.read_csv(input_path + "pollution.csv")
df_mort = pd.read_csv(input_path + "mortality_u5.csv")
df_exp = pd.read_csv(input_path + "health_expenditure.csv")
df_gdp = pd.read_csv(input_path + "gdp_per_capita.csv")

# Obesity is optional. Only a missing file means "skip it": any other failure
# (malformed CSV, permission error, interrupt) must surface instead of being
# silently reported as "not found".
try:
    df_obesity = pd.read_csv(input_path + "obesity.csv")
except FileNotFoundError:
    df_obesity = None
    print("Obesity data not found. Skipping for now.")

# 3. Merge everything on country_code + year
def merge_on(df_base, df_new, col_name):
    """Left-join the ``value`` column of ``df_new`` onto ``df_base``.

    Args:
        df_base: Frame to extend; must contain ``country_code`` and ``year``.
        df_new: Indicator frame with ``country_code``, ``year`` and ``value``
            columns; any other columns are ignored.
        col_name: Name given to the joined ``value`` column in the result.

    Returns:
        A new frame with every row of ``df_base`` plus ``col_name``; rows
        without a matching (country_code, year) in ``df_new`` get NaN.
    """
    join_keys = ['country_code', 'year']
    indicator = df_new[join_keys + ['value']]
    indicator = indicator.rename(columns={'value': col_name})
    return pd.merge(df_base, indicator, how='left', on=join_keys)

# Life expectancy is the base table; every other indicator is left-joined onto it,
# so only (country_code, year) pairs present in df_life survive.
df = df_life[['country_code', 'year', 'value']].rename(columns={'value': 'life_expectancy'})

# (frame, output column) pairs, in the order the columns should be appended.
indicator_frames = [
    (df_exp, 'health_expenditure'),
    (df_gdp, 'gdp_per_capita'),
    (df_san, 'sanitation'),
    (df_poll, 'pollution'),
    (df_mort, 'mortality_u5'),
]
# Obesity is optional and, when available, is appended last.
if df_obesity is not None:
    indicator_frames.append((df_obesity, 'obesity'))

for frame, label in indicator_frames:
    df = merge_on(df, frame, label)

# Add country name
# df_life holds one row per (country_code, year), so its country column repeats
# for every year. Joining on country_code alone against that raw frame would be
# a many-to-many merge and multiply each row of df by the number of years the
# country has. Reduce the lookup to one row per country_code first (the first
# name seen for a code wins).
country_names = df_life[['country_code', 'country']].drop_duplicates(subset='country_code')
df = df.merge(country_names, on='country_code', how='left')

# Reorder columns
columns = ['country', 'country_code', 'year', 'life_expectancy', 'health_expenditure',
           'gdp_per_capita', 'sanitation', 'pollution', 'mortality_u5']
# The obesity column only exists when the optional file was loaded.
if df_obesity is not None:
    columns.append('obesity')
df = df[columns]

# 4. Normalize selected features
# Columns that get min-max scaled. gdp_per_capita is not in this list, so it
# stays in its original units.
features = ['life_expectancy', 'sanitation', 'health_expenditure', 'mortality_u5', 'pollution']
features += ['obesity'] if df_obesity is not None else []

# Score only rows where every feature is present; work on a copy so df is untouched.
df_score = df.dropna(subset=features).copy()

# Fit the scaler on the complete rows, then overwrite the feature columns with
# their scaled values.
scaler = MinMaxScaler()
scaler.fit(df_score[features])
df_score[features] = scaler.transform(df_score[features])

# 5. Create Longevity Score (weighted formula)
# Signed weight per normalized feature: positive weights raise the score,
# negative weights lower it. Dict order is the order in which terms are summed.
weights = {
    'life_expectancy': 0.35,
    'sanitation': 0.20,
    'health_expenditure': 0.15,
    'mortality_u5': -0.15,
    'pollution': -0.10,
}
# Obesity is a penalty term that only applies when its data was loaded.
if df_obesity is not None:
    weights['obesity'] = -0.05

# Accumulate the weighted columns left to right.
score = None
for column, weight in weights.items():
    term = weight * df_score[column]
    score = term if score is None else score + term
df_score['longevity_score'] = score

# 6. Save final scored data
output_path = "../data/cleaned/longevity_index.csv"
# Make sure the target folder exists; exist_ok keeps reruns from failing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the row index is not written to the file.
df_score.to_csv(output_path, index=False)
print(f"✅ Saved cleaned dataset with Longevity Score to: {output_path}")

# 7. Preview top 10 countries
# Highest scores first; must stay the last expression so the notebook renders it.
ranked = df_score.sort_values(by='longevity_score', ascending=False)
ranked.head(10)

Obesity data not found. Skipping for now.
✅ Saved cleaned dataset with Longevity Score to: ../data/cleaned/longevity_index.csv


Unnamed: 0,country,country_code,year,life_expectancy,health_expenditure,gdp_per_capita,sanitation,pollution,mortality_u5,longevity_score
236,Switzerland,CH,2020,0.912969,0.886416,85897.784334,0.9989,0.051926,0.019835,0.644114
178,Monaco,MC,2020,1.0,0.64897,176891.886538,1.0,0.058408,0.01157,0.639769
195,Norway,NO,2020,0.918879,0.669976,68340.018103,0.978594,0.014468,0.006612,0.615384
254,United States,US,2020,0.743372,1.0,64401.507435,0.996335,0.036393,0.040496,0.599733
58,Australia,AU,2020,0.918604,0.509459,51791.54018,1.0,0.041838,0.018182,0.591019
136,Iceland,IS,2020,0.914755,0.487161,59023.566347,0.986606,0.002622,0.009091,0.588934
164,Luxembourg,LU,2020,0.888849,0.580895,116860.028172,0.973545,0.047111,0.007438,0.587113
235,Sweden,SE,2020,0.894827,0.513764,52653.756593,0.987694,0.009243,0.008264,0.585629
146,Japan,JP,2020,0.956921,0.383712,40028.734173,0.99938,0.098969,0.006612,0.581467
101,Denmark,DK,2020,0.873593,0.558993,60985.48856,0.995576,0.052035,0.017355,0.580915
