In [1]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [2]:
# Import data
train_data = pd.read_csv('life_expectancy_train.csv')
test_data = pd.read_csv('life_expectancy_test.csv')
train_data.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
# Check info for preprocessing
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          1999 non-null   object 
 1   Year                             1999 non-null   int64  
 2   Status                           1999 non-null   object 
 3   Life expectancy                  1992 non-null   float64
 4   Adult Mortality                  1992 non-null   float64
 5   infant deaths                    1999 non-null   int64  
 6   Alcohol                          1876 non-null   float64
 7   percentage expenditure           1999 non-null   float64
 8   Hepatitis B                      1619 non-null   float64
 9   Measles                          1999 non-null   int64  
 10   BMI                             1998 non-null   float64
 11  under-five deaths                1999 non-null   int64  
 12  Polio               

In [4]:
# Preprocessing

# Remove leading and trailing spaces in column names
train_data.rename(columns=lambda x: x.strip(),inplace=True)
test_data.rename(columns=lambda x: x.strip(),inplace=True)

# Deal with NaN values
# print(train_data.isnull().sum())

# Drop year column
train_data.drop(['Year'],axis=1,inplace=True)
test_data.drop(['Year'],axis=1,inplace=True)

# Remove rows with null values in Life Expectancy
train_data = train_data[~train_data['Life expectancy'].isnull()]
test_data = test_data[~test_data['Life expectancy'].isnull()]

# Forward fill or backward fill null values in other columns
cols = train_data.columns
train_data[cols] = train_data[cols].fillna(train_data.groupby(['Country'])[cols].ffill().bfill())
cols = test_data.columns
test_data[cols] = test_data[cols].fillna(test_data.groupby(['Country'])[cols].ffill().bfill())

# Make object columns categorical
for col in ['Country','Status']:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')
    
# Make status numeric
status_num = {'Developing' : 0, 'Developed' : 1}
train_data['Developed'] = train_data['Status'].map(status_num)
train_data.drop(['Status'],axis=1,inplace=True)
test_data['Developed'] = test_data['Status'].map(status_num)
test_data.drop(['Status'],axis=1,inplace=True)


train_data.to_csv('life_expectancy_train_cleaned.csv',index=False)
test_data.to_csv('life_expectancy_test_cleaned.csv',index=False)

train_data.head(10)


Unnamed: 0,Country,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,...,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Developed
0,Afghanistan,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,...,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,0
1,Afghanistan,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,0
2,Afghanistan,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,0
3,Afghanistan,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,...,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,0
4,Afghanistan,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,0
5,Afghanistan,58.8,279.0,74,0.01,79.679367,66.0,1989,16.7,102,...,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2,0
6,Afghanistan,58.6,281.0,77,0.01,56.762217,63.0,2861,16.2,106,...,9.42,63.0,0.1,445.893298,284331.0,18.6,18.7,0.434,8.9,0
7,Afghanistan,58.1,287.0,80,0.03,25.873925,64.0,1599,15.7,110,...,8.33,64.0,0.1,373.361116,2729431.0,18.8,18.9,0.433,8.7,0
8,Afghanistan,57.5,295.0,82,0.02,10.910156,63.0,1141,15.2,113,...,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4,0
9,Afghanistan,57.3,295.0,84,0.03,17.171518,64.0,1990,14.7,116,...,7.43,58.0,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1,0


In [210]:
corr = train_data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Life expectancy,1.0,-0.688177,-0.196424,0.440724,0.412536,0.342452,-0.159814,0.606781,-0.22179,0.476614,0.295723,0.51964,-0.578243,0.456732,-0.023622,-0.484609,-0.476001,0.698673,0.695567
Adult Mortality,-0.688177,1.0,0.067135,-0.244358,-0.25617,-0.171146,0.008059,-0.404392,0.081218,-0.268154,-0.170749,-0.275184,0.576741,-0.2864,-0.022972,0.302675,0.301282,-0.433356,-0.41728
infant deaths,-0.196424,0.067135,1.0,-0.12258,-0.096025,-0.243143,0.505921,-0.246878,0.996688,-0.171875,-0.163668,-0.185234,0.015224,-0.119363,0.55629,0.512525,0.519012,-0.137516,-0.17433
Alcohol,0.440724,-0.244358,-0.12258,1.0,0.404554,0.090905,-0.058335,0.38998,-0.118526,0.226502,0.270061,0.229639,-0.101219,0.413662,-0.046331,-0.442101,-0.434316,0.467972,0.533059
percentage expenditure,0.412536,-0.25617,-0.096025,0.404554,1.0,0.028935,-0.063157,0.247429,-0.098202,0.155315,0.207023,0.152621,-0.120364,0.909076,-0.027204,-0.274244,-0.271045,0.377294,0.380827
Hepatitis B,0.342452,-0.171146,-0.243143,0.090905,0.028935,1.0,-0.162666,0.25355,-0.254047,0.502052,0.165849,0.60705,-0.168534,0.07196,-0.095712,-0.20098,-0.211863,0.267684,0.269842
Measles,-0.159814,0.008059,0.505921,-0.058335,-0.063157,-0.162666,1.0,-0.190156,0.512126,-0.137203,-0.12695,-0.151185,0.029689,-0.0758,0.241847,0.24127,0.233895,-0.092094,-0.078167
BMI,0.606781,-0.404392,-0.246878,0.38998,0.247429,0.25355,-0.190156,1.0,-0.25646,0.323783,0.282451,0.349565,-0.287909,0.296515,-0.08294,-0.548271,-0.547246,0.51668,0.534564
under-five deaths,-0.22179,0.081218,0.996688,-0.118526,-0.098202,-0.254047,0.512126,-0.25646,1.0,-0.189918,-0.165852,-0.206647,0.028785,-0.122796,0.544219,0.513527,0.519145,-0.155529,-0.188672
Polio,0.476614,-0.268154,-0.171875,0.226502,0.155315,0.502052,-0.137203,0.323783,-0.189918,1.0,0.173165,0.707149,-0.186786,0.208577,-0.040323,-0.230879,-0.231474,0.394517,0.418577


In [211]:
X = train_data.drop(labels=['Life expectancy'], axis=1)
y = train_data['Life expectancy']
X_test = test_data.drop(labels=['Life expectancy'], axis=1)
y_test = test_data['Life expectancy']

encoder = LabelEncoder()
X['Country_cat'] = encoder.fit_transform(X['Country'])
X_test['Country_cat'] = encoder.fit_transform(X_test['Country'])
X.drop(['Country'],axis=1,inplace=True)
X_test.drop(['Country'],axis=1,inplace=True)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X)

X, y = shuffle(X, y, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

In [212]:
# Random Forest Regression

n_estimators = [10,50,100,200,300,500]
max_depths = [1,2,3,4,5,6,7]
mses = {}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        regressor = RandomForestRegressor(n_estimators=n_estimator, max_depth=max_depth)
        regressor.fit(X_train, y_train)
        y_out = regressor.predict(X_val)
        mse = mean_squared_error(y_val,y_out)
        mses[f'estimators {n_estimator}, max_depths {max_depth}'] = mse

In [None]:
min_value = np.min(list(mses.values()))
best_hp = list(mses.keys())[list(mses.values()).index(min_value)]
print("Best HPs", best_hp)
n_estimators = best_hp.split(' ')[1].replace(',','')
max_depth = best_hp.split(' ')[3]

In [None]:
regressor = RandomForestRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth))
regressor.fit(X_train, y_train)
y_out = regressor.predict(X_val)
print(f'Validation error: {np.sqrt(mean_squared_error(y_val,y_out))}')
y_out = regressor.predict(X_test)
print(f'Test error: {np.sqrt(mean_squared_error(y_test,y_out))}')

In [None]:
grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid={'n_estimators': [10,50,99,200,300,500], 'max_depth':[1,2,3,4,5,6,7]}, cv=5)
grid.fit(X_train, y_train.values.ravel())

grid.best_estimator_

In [None]:
def evaluate(model, test_features, test_labels):
    y_out = model.predict(test_features)
    mse = mean_squared_error(test_labels,y_out)
    rmse = np.sqrt(mse)
    
    print('Model Performance')
    print(f'Root Mean Squared Error: {rmse}')

evaluate(grid.best_estimator_, X_val, y_val)
evaluate(grid.best_estimator_, X_test, y_test)