---
## 0. Setup Environment

In [68]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os 
from sklearn.dummy import DummyRegressor
#

---
## A. Project Description


The objective of this project is to develop a predictive model that estimates life expectancy based on a variety of health, demographic, and socio-economic indicators. By leveraging features such as immunization rates, mortality rates, GDP, population, schooling, and other relevant factors, the model aims to provide accurate life expectancy predictions. This will assist policymakers, researchers, and organizations in understanding the key drivers of life expectancy and in making informed decisions to improve public health outcomes. The project involves comprehensive data exploration, feature engineering, and model evaluation to ensure robust and reliable predictions that reflect real-world health dynamics.

---
## C. Data Understanding

### C.1   Load Datasets



In [70]:
# # Load training data
training_df = pd.read_csv("../data/raw/Life_Expectancy_Data.csv")

### C.2 Explore Training Set


In [71]:
training_df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [72]:
training_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [73]:
training_df.isnull().sum().sort_values(ascending=False)

Population                         652
Hepatitis B                        553
GDP                                448
Total expenditure                  226
Alcohol                            194
Income composition of resources    167
Schooling                          163
 thinness  1-19 years               34
 thinness 5-9 years                 34
 BMI                                34
Diphtheria                          19
Polio                               19
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Status                               0
Country                              0
Year                                 0
under-five deaths                    0
Measles                              0
percentage expenditure               0
 HIV/AIDS                            0
dtype: int64

In [74]:
training_df.duplicated().sum()

np.int64(0)

### C.3 Explore Numerical Features


In [75]:
training_df.describe(include= 'number')

Unnamed: 0,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
count,2938.0,2928.0,2928.0,2938.0,2744.0,2938.0,2385.0,2938.0,2904.0,2938.0,2919.0,2712.0,2919.0,2938.0,2490.0,2286.0,2904.0,2904.0,2771.0,2775.0
mean,2007.51872,69.224932,164.796448,30.303948,4.602861,738.251295,80.940461,2419.59224,38.321247,42.035739,82.550188,5.93819,82.324084,1.742103,7483.158469,12753380.0,4.839704,4.870317,0.627551,11.992793
std,4.613841,9.523867,124.292079,117.926501,4.052413,1987.914858,25.070016,11467.272489,20.044034,160.445548,23.428046,2.49832,23.716912,5.077785,14270.169342,61012100.0,4.420195,4.508882,0.210904,3.35892
min,2000.0,36.3,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0
25%,2004.0,63.1,74.0,0.0,0.8775,4.685343,77.0,0.0,19.3,0.0,78.0,4.26,78.0,0.1,463.935626,195793.2,1.6,1.5,0.493,10.1
50%,2008.0,72.1,144.0,3.0,3.755,64.912906,92.0,17.0,43.5,4.0,93.0,5.755,93.0,0.1,1766.947595,1386542.0,3.3,3.3,0.677,12.3
75%,2012.0,75.7,228.0,22.0,7.7025,441.534144,97.0,360.25,56.2,28.0,97.0,7.4925,97.0,0.8,5910.806335,7420359.0,7.2,7.2,0.779,14.3
max,2015.0,89.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,87.3,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7


### C.4 Explore Categorical Features


In [76]:
training_df.describe(include= 'object')


Unnamed: 0,Country,Status
count,2938,2938
unique,193,2
top,Afghanistan,Developing
freq,16,2426


In [77]:
categorical_cols = training_df.select_dtypes(include='object').columns

### C.5 Explore Target Variable




In [78]:
target_name = 'Life expectancy '
training_df[target_name].head()

0    65.0
1    59.9
2    59.9
3    59.5
4    59.2
Name: Life expectancy , dtype: float64

In [79]:
numerical_cols = training_df.select_dtypes(include='number').columns

---
## D. Feature Selection


### D.1 Approach 1

### D.z Final Selection of Features


In [80]:
features_list = []

---
## E. Data Cleaning

### E.1 Copy Datasets



In [81]:
training_df_clean=training_df.copy()

### E.2 Fixing "Missing data "




### E.3 Fixing "Outliers in the final list of features"



---
## F. Feature Engineering

### F.1 Copy Datasets



In [82]:
# Create copy of datasets

training_df_eng = training_df_clean.copy()


### F.2 New Feature ""






---
## G. Data Transformation

### G.1 Copy Datasets



In [83]:
# Create copy of datasets

training_df_trans = training_df_eng.copy()


### G.2 Data Transformation Encoding the categorical features 



---
## H. Data Preparation for Modeling

### H.1 Copy Datasets



In [84]:


# Split into train (70%), temp (30%)
train_df, temp_df = train_test_split(training_df_eng, test_size=0.3, random_state=42)

# Split temp into validation (15%) and test (15%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (2056, 22)
Validation shape: (441, 22)
Test shape: (441, 22)


### H.2 Split Features and Target Variables

In [85]:

X_train = train_df.drop(columns=[target_name])
y_train = train_df[target_name]

X_val = val_df.drop(columns=[target_name])
y_val = val_df[target_name]

X_test = test_df.drop(columns=[target_name])
y_test = test_df[target_name]

---
## I. Save Datasets

> Do not change this code

In [86]:
# Ensure the processed data directory exists
os.makedirs('../data/processed', exist_ok=True)


In [87]:

X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)

---
## J. Assess Baseline Model

### J.1 Generate Predictions with Baseline Model

In [88]:


dummy_regressor = DummyRegressor(strategy="mean")  # Predicts the mean of the target values
dummy_regressor.fit(X_train, y_train)
y_pred = dummy_regressor.predict(X_val)
y_pred



ValueError: Input y contains NaN.

### J.2 Selection of Performance Metrics




In [None]:

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {rmse}")


RMSE: 71.41122641110427


In [None]:
mae = mean_absolute_error(y_val, y_pred)
print(f"MAE: {mae}")

MAE: 28.384357945254933


### J.3 Baseline Model Performance




In [None]:

y_pred = dummy_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 86.92117365503061


In [None]:
# Calculating accuracy within RMSE range
allowed_range_lower = y_test - rmse
allowed_range_upper = y_test + rmse

within_range = np.logical_and(y_pred >= allowed_range_lower, y_pred <= allowed_range_upper)
accuracy = np.sum(within_range) / len(y_test) * 100

print(f"Accuracy within RMSE range: {accuracy:.2f}%")


Accuracy within RMSE range: 87.83%


In [None]:
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MAE: 39.73232608655444


In [None]:
# Calculating accuracy within mae range
allowed_range_lower = y_test - mae
allowed_range_upper = y_test + mae

within_range = np.logical_and(y_pred >= allowed_range_lower, y_pred <= allowed_range_upper)
accuracy = np.sum(within_range) / len(y_test) * 100

print(f"Accuracy within mae range: {accuracy:.2f}%")


Accuracy within mae range: 76.61%
