In [19]:
# -----------------------------------------
# FEATURE ENGINEERING NOTEBOOK
# Life Expectancy Project
# -----------------------------------------

# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# -----------------------------------------
# 2. LOAD CLEANED DATA
# -----------------------------------------
# Read the cleaned CSV (path is relative to the notebook's working
# directory) and show its dimensions plus the first rows as a sanity check.
cleaned_csv_path = "../data/life_expectancy_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

original_shape = df.shape
print("Original data shape:", original_shape)

first_rows = df.head()
print(first_rows)

# -----------------------------------------
# 3. IDENTIFY COLUMNS
# -----------------------------------------
# Split the frame into numeric feature columns and categorical columns.
# The target must NOT be part of the numeric features, otherwise it would be
# imputed and scaled together with the predictors.
target_col = 'Life_expectancy'


def _canonical_name(name):
    """Return *name* lower-cased, trimmed, with spaces/underscores unified.

    Lets 'Life_expectancy' match headers spelled with spaces and possible
    padding (the loaded CSV prints e.g. 'Life expectancy', 'Hepatitis B').
    """
    return "_".join(str(name).replace("_", " ").split()).casefold()


# Actual column(s) in df that correspond to the target, whatever their
# exact spelling. df's own column names are left untouched.
target_key = _canonical_name(target_col)
target_matches = [col for col in df.columns if _canonical_name(col) == target_key]
if not target_matches:
    print(f"Warning: target column '{target_col}' not found in the data; "
          "no column was excluded from the numeric features.")

numeric_cols = [
    col
    for col in df.select_dtypes(include=np.number).columns
    if col not in target_matches
]

# Keep only the categorical columns that are really present in the data.
categorical_cols = ['Country', 'Status']  # specify categorical columns
categorical_cols = [col for col in categorical_cols if col in df.columns]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# -----------------------------------------
# 4. HANDLE MISSING VALUES (NUMERIC)
# -----------------------------------------
# KNN Imputer for numeric columns.
# KNNImputer picks neighbours by (NaN-aware) Euclidean distance, so on raw
# values the columns with the largest magnitudes dominate the neighbour
# search. The neighbour search is therefore run on standardized values and
# the result is mapped back to the original units; only cells that were
# missing are overwritten, observed values stay exactly as loaded.
imputer = KNNImputer(n_neighbors=3)  # NOTE: ends up fitted on standardized values

# Columns without a single observed value cannot be imputed from neighbours
# (KNNImputer would drop them and break the assignment), so they are skipped.
has_observed_value = df[numeric_cols].notna().any(axis=0)
imputable_cols = [col for col, ok in zip(numeric_cols, has_observed_value) if ok]

if imputable_cols:
    raw_values = df[imputable_cols].to_numpy(dtype=float, na_value=np.nan)

    # Per-column statistics computed from observed values only.
    center = np.nanmean(raw_values, axis=0)
    spread = np.nanstd(raw_values, axis=0)
    spread[spread == 0] = 1.0  # constant columns: avoid division by zero

    standardized = (raw_values - center) / spread
    neighbour_fill = imputer.fit_transform(standardized) * spread + center

    df[imputable_cols] = np.where(np.isnan(raw_values), neighbour_fill, raw_values)

print("Numeric missing values handled.")

# -----------------------------------------
# 5. SCALE NUMERIC FEATURES
# -----------------------------------------
# Standardize the numeric feature columns in place: learn the per-column
# statistics first, then apply them to the same columns.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])
print("Numeric features scaled.")

# -----------------------------------------
# 6. ENCODE CATEGORICAL FEATURES
# -----------------------------------------
# One-hot encode the categorical columns (first level of each dropped) and
# swap the original columns for the encoded ones.
if categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, drop='first')  # modern syntax
    encoded_array = encoder.fit_transform(df[categorical_cols])
    encoded_data = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols),
        # Reuse df's index: concat(axis=1) aligns rows on index labels, so a
        # fresh RangeIndex would misalign (and NaN-pad) rows whenever df's
        # index is not 0..n-1, e.g. after filtering or dropping rows.
        index=df.index,
    )
    # Merge encoded columns and drop original categorical columns
    df = pd.concat([df.drop(columns=categorical_cols), encoded_data], axis=1)
    print("Categorical encoding completed.")

# -----------------------------------------
# 7. OPTIONAL FEATURE CREATION
# Example: Create Health_Index by combining some health-related metrics
# -----------------------------------------
def _canonical_name(name):
    """Return *name* lower-cased, trimmed, with spaces/underscores unified."""
    return "_".join(str(name).replace("_", " ").split()).casefold()


def _resolve_column(frame, wanted):
    """Return the column of *frame* matching *wanted*, or None if absent.

    An exact match wins; otherwise the first column whose canonical name
    equals that of *wanted* is used, so 'Hepatitis_B' also finds a header
    spelled 'Hepatitis B'.
    """
    if wanted in frame.columns:
        return wanted
    wanted_key = _canonical_name(wanted)
    for col in frame.columns:
        if _canonical_name(col) == wanted_key:
            return col
    return None


alcohol_col = _resolve_column(df, 'Alcohol')
hepatitis_col = _resolve_column(df, 'Hepatitis_B')

# Only built when both inputs exist; the index is the plain sum of the two
# columns as they are at this point in the pipeline.
if alcohol_col is not None and hepatitis_col is not None:
    df['Health_Index'] = df[alcohol_col] + df[hepatitis_col]
    print("Optional feature 'Health_Index' created.")

# -----------------------------------------
# 8. FINAL CHECK
# -----------------------------------------
# Report the resulting dimensions and preview the first rows.
final_shape = df.shape
print("Data shape after feature engineering:", final_shape)

preview = df.head()
print(preview)

# -----------------------------------------
# 9. SAVE FEATURE ENGINEERED DATA
# -----------------------------------------
# Write the engineered frame to CSV without the index column.
features_csv_path = "../data/life_expectancy_features.csv"
df.to_csv(features_csv_path, index=False)
print("Feature-engineered dataset saved as 'life_expectancy_features.csv'")



Original data shape: (2938, 22)
       Country    Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015.0  Developing              65.0            263.0   
1  Afghanistan  2014.0  Developing              59.9            271.0   
2  Afghanistan  2013.0  Developing              59.9            268.0   
3  Afghanistan  2012.0  Developing              59.5            272.0   
4  Afghanistan  2011.0  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B    Measles   \
0      30.303948     0.01               71.279624         65.0  2419.59224   
1      30.303948     0.01               73.523582         62.0   492.00000   
2      30.303948     0.01               73.219243         64.0   430.00000   
3      30.303948     0.01               78.184215         67.0  2419.59224   
4      30.303948     0.01                7.097109         68.0  2419.59224   

   ...      Polio  Total expenditure  Diphtheria    HIV/AIDS