In [28]:
# importing libraries:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
import eli5
from eli5.sklearn import PermutationImportance
import category_encoders as ce
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact, pdp_interact_plot
import plotly.express as px
import pandas_profiling
from sklearn.feature_selection import f_regression, SelectKBest
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the raw life-expectancy table straight from the project's GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/VeraMendes/Project---Train-a-predictive-model/master/led.csv'
)
# Report the dimensions, then preview the first five rows.
print(df.shape)
df.head(5)

(2938, 22)


Unnamed: 0,Country,Year,Status,Lifeexpectancy,AdultMortality,infantdeaths,Alcohol,percentageexpenditure,HepatitisB,Measles,...,Polio,Totalexpenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness1-19years,thinness5-9years,Incomecompositionofresources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
# Normalise the column labels to short snake_case names (a few are kept as-is).
df = df.rename(
    {
        'Country': 'country',
        'Year': 'year',
        'Status': 'development',
        'Lifeexpectancy': 'lifespan',
        'AdultMortality': 'adult_mortality',
        'infantdeaths': 'infant_deaths',
        'Alcohol': 'alcohol_consumption',
        'percentageexpenditure': 'percentage_expenditure',
        'HepatitisB': 'hepatitisb',
        'Measles': 'measles',
        'BMI': 'BMI',
        'under-fivedeaths': 'baby_deaths',
        'Polio': 'polio',
        'Totalexpenditure': 'total_expenditure',
        'Diphtheria': 'diphtheria',
        'HIV/AIDS': 'HIV',
        'GDP': 'GDP',
        'Population': 'population',
        'thinness1-19years': 'thinness_teenager',
        'thinness5-9years': 'thinness_children',
        'Incomecompositionofresources': 'ICR',
        'Schooling': 'education',
    },
    axis='columns',
)

df.head()

Unnamed: 0,country,year,development,lifespan,adult_mortality,infant_deaths,alcohol_consumption,percentage_expenditure,hepatitisb,measles,...,polio,total_expenditure,diphtheria,HIV,GDP,population,thinness_teenager,thinness_children,ICR,education
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [4]:
# Rows without a lifespan value can supply neither a feature nor a target: drop them.
df = df.dropna(axis=0, subset=['lifespan'])
# To avoid leakage the target is the lifespan of the *following* year.
# The shift is done per country so that a value can never cross a country
# boundary (a plain df['lifespan'].shift(1) hands the first row of every country
# the last row of the previous country).
# NOTE(review): this assumes rows are ordered by year, descending, within each
# country, as in the data preview — confirm before reusing on other data.
df = df.assign(next_year_lifespan=df.groupby('country')['lifespan'].shift(1))
# 2015 is the latest year in the data, so it has no next-year lifespan;
# keep those rows aside in case they are needed for inspection.
year_2015 = df[df.year == 2015]
# Drop 2015 and any other row that ended up without a target (e.g. when a
# country's following year is missing), so no row is modelled with a wrong or
# missing target.
df = df[df.year != 2015].dropna(subset=['next_year_lifespan'])
df.head()

Unnamed: 0,country,year,development,lifespan,adult_mortality,infant_deaths,alcohol_consumption,percentage_expenditure,hepatitisb,measles,...,total_expenditure,diphtheria,HIV,GDP,population,thinness_teenager,thinness_children,ICR,education,next_year_lifespan
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,65.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.9
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.5
5,Afghanistan,2010,Developing,58.8,279.0,74,0.01,79.679367,66.0,1989,...,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2,59.2


In [5]:
# Number of missing values in each column.
df.isna().sum()

country                     0
year                        0
development                 0
lifespan                    0
adult_mortality             0
infant_deaths               0
alcohol_consumption        16
percentage_expenditure      0
hepatitisb                544
measles                     0
BMI                        30
baby_deaths                 0
polio                      19
total_expenditure          45
diphtheria                 19
HIV                         0
GDP                       414
population                603
thinness_teenager          30
thinness_children          30
ICR                       150
education                 150
next_year_lifespan          0
dtype: int64

### Time-series split used in this notebook:
#### Test set: years 2013 & 2014
#### Validation set: years 2011 & 2012
#### 2015 cannot be used because there are no values for the next year's lifespan (2016)

In [6]:
# New target: the change in lifespan from the current year to the next one.
df['delta_target'] = df['next_year_lifespan'].sub(df['lifespan'])

In [8]:
# Chronological split: years before 2011 train, 2011-2012 validate, 2013-2014 test.
train = df.loc[df['year'] < 2011]
val = df.loc[df['year'].isin([2011, 2012])]
test = df.loc[df['year'].isin([2013, 2014])]
train.shape, val.shape, test.shape

((2013, 24), (366, 24), (366, 24))

In [9]:
# Build the feature matrices and target vectors for each split.
# Only the two target-derived columns are removed from the features here.
delta_target = 'delta_target'
baseline_values = 'lifespan'
cols_to_drop = ['next_year_lifespan', 'delta_target']
X_train, y_train = train.drop(columns=cols_to_drop), train[delta_target]
X_val, y_val = val.drop(columns=cols_to_drop), val[delta_target]
X_test, y_test = test.drop(columns=cols_to_drop), test[delta_target]

In [14]:
# Naive baseline: assume lifespan does not change from one year to the next,
# i.e. predict a delta of zero for every validation row.
y_pred = [0 for _ in y_val]
baseline_mae = mean_absolute_error(y_val, y_pred)
r2_val = r2_score(y_val, y_pred)
print(f'Mean Absolut Error of the baseline prediction (predict lifespan for years 2011 & 2012): {baseline_mae:.4f}')
print('Val R\u00b2:', r2_val)

Mean Absolut Error of the baseline prediction (predict lifespan for years 2011 & 2012): 0.8281
Val R²: -0.02837317309823173


In [20]:
# Rebuild the feature matrices and target vectors with a reduced feature set:
# besides the two target-derived columns, a hand-picked list of columns is excluded.
delta_target = 'delta_target'
baseline_values = 'lifespan'
cols_to_drop = [
    'country', 'year', 'development', 'infant_deaths',
    'percentage_expenditure', 'hepatitisb', 'measles',
    'polio', 'total_expenditure', 'diphtheria', 'GDP',
    'population', 'thinness_teenager', 'next_year_lifespan',
    'delta_target',
]
X_train, y_train = train.drop(columns=cols_to_drop), train[delta_target]
X_val, y_val = val.drop(columns=cols_to_drop), val[delta_target]
X_test, y_test = test.drop(columns=cols_to_drop), test[delta_target]

In [21]:
# Preprocessing pipeline: an ordinal encoder followed by mean imputation.
transformers = make_pipeline(ce.OrdinalEncoder(), SimpleImputer(strategy='mean'))

# Fit on the training split only; validation and test reuse the fitted pipeline.
X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed, X_test_transformed = (
    transformers.transform(split) for split in (X_val, X_test)
)

In [22]:
# Linear regression on the preprocessed features, scored on the validation split.
model = LinearRegression().fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
r2_val = r2_score(y_val, y_pred)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))
print('Val R\u00b2:', r2_val)

Mean Absolut error: 0.7694277849740352
Val R²: 0.06941164836171176


In [23]:
# Ridge regression with default settings, scored on the validation split.
model = Ridge().fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 0.7694324882908721


In [26]:
# Single decision tree with default settings, scored on the validation split.
# random_state is fixed so that re-running the notebook reproduces the same
# tree and therefore the same validation error.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 1.266666666666667


In [29]:
# Random forest of 100 trees using all available cores, scored on the validation split.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same validation error.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 0.8252513661202187


In [31]:
# Gradient-boosted trees, scored on the validation split.
# 'reg:squarederror' is the current name of the squared-error objective;
# the former alias 'reg:linear' is deprecated.
model = XGBRegressor(learning_rate=0.1,
             max_depth=7, n_estimators=2000,
             n_jobs=-1, objective='reg:squarederror')

# NOTE(review): unlike the other models in this notebook, this one is fitted on
# the raw X_train / X_val frames rather than the *_transformed arrays — confirm
# this is intentional.
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_pred)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))
print('Val R\u00b2:', r2_val)

Mean Absolut error: 0.8900343867110423
Val R²: 0.13177303760923176


In [34]:
# Rebuild the feature matrices and target vectors once more; this time the
# current-year lifespan itself is excluded from the features as well.
delta_target = 'delta_target'
baseline_values = 'lifespan'
cols_to_drop = ['country', 'year', 'population', 'next_year_lifespan', 'delta_target', 'lifespan']
X_train, y_train = train.drop(columns=cols_to_drop), train[delta_target]
X_val, y_val = val.drop(columns=cols_to_drop), val[delta_target]
X_test, y_test = test.drop(columns=cols_to_drop), test[delta_target]

In [35]:
# Preprocessing pipeline: an ordinal encoder followed by mean imputation.
transformers = make_pipeline(ce.OrdinalEncoder(), SimpleImputer(strategy='mean'))

# Fit on the training split only; validation and test reuse the fitted pipeline.
X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed, X_test_transformed = (
    transformers.transform(split) for split in (X_val, X_test)
)

In [36]:
# Linear regression on the preprocessed features, scored on the validation split.
model = LinearRegression().fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
r2_val = r2_score(y_val, y_pred)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))
print('Val R\u00b2:', r2_val)

Mean Absolut error: 0.7293505873423107
Val R²: 0.0011235527048174543


In [37]:
# Ridge regression with default settings, scored on the validation split.
model = Ridge().fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 0.7293496985443878


In [38]:
# Single decision tree with default settings, scored on the validation split.
# random_state is fixed so that re-running the notebook reproduces the same
# tree and therefore the same validation error.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 1.420218579234973


In [39]:
# Random forest of 100 trees using all available cores, scored on the validation split.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same validation error.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))

Mean Absolut error: 0.9056530054644807


In [43]:
# Gradient-boosted trees on the preprocessed features, scored on the validation split.
model = XGBRegressor(
    n_estimators=2000,
    max_depth=7,
    learning_rate=0.1,
    n_jobs=-1,
)

model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_val_transformed)
r2_val = r2_score(y_val, y_pred)
print('Mean Absolut error:', mean_absolute_error(y_val, y_pred))
print('Val R\u00b2:', r2_val)

Mean Absolut error: 1.0637669849428317
Val R²: -0.33556783517344413
