In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 1. Read the raw CSV, then trim leading/trailing whitespace from every
#    header so later cells can refer to columns by their clean names.
df = pd.read_csv('Life Expectancy Data.csv')
df.columns = df.columns.map(str.strip)

In [18]:
# Old header -> identifier-style name. Columns that are not listed here keep
# their current name.
rename_map = dict([
    ('Life expectancy', 'LifeExpectancy'),
    ('Adult Mortality', 'AdultMortality'),
    ('infant deaths', 'InfantMortality'),
    ('Total expenditure', 'HealthExpenditure'),
    ('Income composition of resources', 'IncomeLevel'),
    ('HIV/AIDS', 'HIV_AIDS'),
])
# Rename on the existing frame (same object, no rebinding of `df`).
df.rename(mapper=rename_map, axis='columns', inplace=True)

In [19]:
# STEP 1: MISSING VALUE HANDLING (Fix #1)
# Explicit check: report the columns that contain nulls before anything is
# modified (the counts are computed once and reused for the filter).
null_counts = df.isnull().sum()
print("Nulls found per column:\n", null_counts[null_counts > 0])

# Rows without a recorded LifeExpectancy cannot be used for supervised
# training or evaluation: filling the *target* with a median would invent
# labels. Drop those rows instead of imputing them. reset_index gives the
# remaining rows a clean 0..n-1 index.
df = df.dropna(subset=['LifeExpectancy']).reset_index(drop=True)

# Robust Imputation: Using Median
# Justification: Median is less sensitive to extreme outliers (GDP/Mortality) than Mean.
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the imputed values. Fitting the imputation on the training rows only would
# remove that leakage, but requires reordering the notebook.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Handle Status just in case: fill any gap with the most frequent category.
df['Status'] = df['Status'].fillna(df['Status'].mode()[0])

print("\n✅ Missing values handled with Median Imputation.")

Nulls found per column:
 LifeExpectancy           10
AdultMortality           10
Alcohol                 194
Hepatitis B             553
BMI                      34
Polio                    19
HealthExpenditure       226
Diphtheria               19
GDP                     448
Population              652
thinness  1-19 years     34
thinness 5-9 years       34
IncomeLevel             167
Schooling               163
dtype: int64

✅ Missing values handled with Median Imputation.


In [20]:
# STEP 2: FEATURE ENGINEERING
# 2.1 Status Encoding (Preserve info instead of dropping)
# Series.map yields NaN for any value that is not a key of the mapping, and
# that NaN would otherwise flow silently into the exported feature files.
# Trim whitespace first so padded labels still match, then fail loudly if
# anything is left unmapped.
status_codes = {'Developed': 1, 'Developing': 0}
df['Status_Encoded'] = df['Status'].str.strip().map(status_codes)
unmapped_status = df.loc[df['Status_Encoded'].isnull(), 'Status'].unique().tolist()
if unmapped_status:
    raise ValueError(f"Unrecognised Status values: {unmapped_status}")

# 2.2 Mathematically Sound Ratios
# The +1 in each denominator avoids division by zero when the divisor is 0.
df['health_efficiency_ratio'] = df['HealthExpenditure'] / (df['AdultMortality'] + 1)
df['health_gdp_ratio'] = df['HealthExpenditure'] / (df['GDP'] + 1)

# 2.3 Log Transformation for skewed GDP (log1p so that GDP == 0 maps to 0)
df['log_GDP'] = np.log1p(df['GDP'])

# 2.4 Composite Indices (Normalizing components FIRST to avoid bias)
# Each index is the row-wise mean of its min-max scaled components.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed later in this notebook, so the min/max of test rows feed
# into both indices. Fitting on training rows only would remove that leakage,
# but requires reordering the notebook.
# NOTE(review): the same scaler object is refitted by the second call, so
# after this cell `scaler_minmax` holds only the disease_index fit.
scaler_minmax = MinMaxScaler()
df['development_index'] = scaler_minmax.fit_transform(df[['GDP', 'IncomeLevel', 'Schooling']]).mean(axis=1)
df['disease_index'] = scaler_minmax.fit_transform(df[['AdultMortality', 'InfantMortality', 'HIV_AIDS']]).mean(axis=1)

# 2.5 Interaction Features (plain element-wise products)
df['mortality_health_interaction'] = df['AdultMortality'] * df['HealthExpenditure']
df['education_income_interaction'] = df['Schooling'] * df['IncomeLevel']

In [21]:
#  STEP 3: PREVENTING DATA LEAKAGE
# The rows are partitioned here; standardisation is fitted in the next step.

# Target column, and the feature matrix with the target and the two
# text columns removed.
y = df['LifeExpectancy']
X = df.drop(columns=['LifeExpectancy', 'Country', 'Status'])

# 80/20 split with a fixed seed. train_test_split returns
# [X_train, X_test, y_train, y_test] in that order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
y_train, y_test = split_parts[2], split_parts[3]

# Cast the feature frames to float to avoid pandas FutureWarnings when the
# scaled values are written back during scaling.
X_train, X_test = (features.astype(float) for features in split_parts[:2])

In [22]:
#  STEP 4: FINAL SCALING (Standardization)
# ==========================================

# Columns left unstandardised: the two composite indices, Year and the
# binary status flag.
cols_to_exclude = ['development_index', 'disease_index', 'Year', 'Status_Encoded']

# Every numeric training column is a candidate; keep the ones not excluded,
# preserving their original order.
cols_to_scale = X_train.select_dtypes(include=[np.number]).columns.tolist()
final_scale_cols = []
for col_name in cols_to_scale:
    if col_name not in cols_to_exclude:
        final_scale_cols.append(col_name)

# Learn mean/std from the training rows only, then apply that same fitted
# transform to the training frame first and the test frame second.
scaler_std = StandardScaler()
scaler_std.fit(X_train[final_scale_cols])
for split_frame in (X_train, X_test):
    split_frame.loc[:, final_scale_cols] = scaler_std.transform(split_frame[final_scale_cols])

In [23]:
#  STEP 5: EXPORT MASTER FILES
# Re-attach each target Series to its feature frame as the last column.
train_final, test_final = (
    pd.concat(list(parts), axis=1)
    for parts in ((X_train, y_train), (X_test, y_test))
)

# Write the training file first, then the test file, both without the row index.
for split_df, out_path in (
    (train_final, 'life_expectancy_train_master.csv'),
    (test_final, 'life_expectancy_test_master.csv'),
):
    split_df.to_csv(out_path, index=False)

print("\n--- Final Check ---")
print(f"Missing values in Train set: {train_final.isnull().sum().sum()}")
print("✅ Files 'life_expectancy_train_master.csv' and 'life_expectancy_test_master.csv' saved.")


--- Final Check ---
Missing values in Train set: 0
✅ Files 'life_expectancy_train_master.csv' and 'life_expectancy_test_master.csv' saved.
