In [1]:
## Import Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold, train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from the CSV file (path is relative to the notebook)

csv_path = 'Data/regression_Life Expectancy.csv'
df = pd.read_csv(csv_path)

# Render the loaded frame
display(df)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [3]:
# Count the missing values in each column

null_counts = df.isna().sum()
print(null_counts)

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [4]:
# Handle null values by dropping every row that has at least one missing field

# Work on a copy so the raw frame `df` stays untouched
df_copy = df.copy()

# Drop incomplete rows, then renumber the index from 0
df2 = df_copy.dropna().reset_index(drop=True)

In [5]:
# Split into X (independent variables / predictors) and Y (dependent variable / target)

# The column label carries a trailing space, so it is spelled out once here
target_col = 'Life expectancy '

Y = df2[target_col]
X = df2.drop(columns=[target_col])

In [6]:
# One-hot encode the object/categorical columns.
# drop_first=True drops one level per category to avoid the dummy-variable trap.
# dtype=float keeps the dummy columns numeric: newer pandas versions emit boolean
# dummies by default, and a frame mixing bool with numeric columns converts to an
# object array, which the statsmodels OLS fit further down rejects.

X = pd.get_dummies(X, drop_first=True, dtype=float)

In [7]:
# Fit an ordinary least squares model on all features with statsmodels.
# NOTE(review): the recorded summary reports a condition number of 7.87e+20,
# which points to (near-)perfect collinearity among the columns - presumably the
# Status dummy versus the Country dummies; confirm before trusting the p-values.

# Prepend the intercept column ('const') that statsmodels does not add by itself
X_new = sm.add_constant(X)

ols_model = sm.OLS(endog=Y, exog=X_new)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life expectancy,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.964
Method:,Least Squares,F-statistic:,294.9
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,14:00:49,Log-Likelihood:,-3100.3
No. Observations:,1649,AIC:,6505.0
Df Residuals:,1497,BIC:,7327.0
Df Model:,151,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-354.6865,32.214,-11.010,0.000,-417.877,-291.496
Year,0.2219,0.017,12.837,0.000,0.188,0.256
Adult Mortality,-0.0006,0.001,-1.194,0.233,-0.002,0.000
infant deaths,0.0497,0.016,3.204,0.001,0.019,0.080
Alcohol,-0.0652,0.030,-2.151,0.032,-0.125,-0.006
percentage expenditure,-7.639e-05,0.000,-0.625,0.532,-0.000,0.000
Hepatitis B,0.0032,0.002,1.315,0.189,-0.002,0.008
Measles,-6.556e-06,6.46e-06,-1.014,0.311,-1.92e-05,6.12e-06
BMI,-0.0015,0.003,-0.432,0.666,-0.008,0.005

0,1,2,3
Omnibus:,667.315,Durbin-Watson:,1.239
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3360.716
Skew:,1.861,Prob(JB):,0.0
Kurtosis:,8.921,Cond. No.,7.87e+20


In [8]:
# Collect the per-feature p-values from the fitted OLS model

# Round to 4 decimals and leave out the intercept term
p_values = results.pvalues.round(4).drop('const')

# Two-column table: feature name next to its p-value
p_value_df = p_values.rename_axis('Feature').reset_index(name='p_value')
p_value_df

Unnamed: 0,Feature,p_value
0,Year,0.0000
1,Adult Mortality,0.2326
2,infant deaths,0.0014
3,Alcohol,0.0317
4,percentage expenditure,0.5323
...,...,...
147,Country_Uzbekistan,0.0000
148,Country_Vanuatu,0.0000
149,Country_Zambia,0.1089
150,Country_Zimbabwe,0.3889


In [9]:
# Keep only the features whose (rounded) p-value is below 0.05

is_significant = p_value_df['p_value'] < 0.05
col_p_value = p_value_df.loc[is_significant, 'Feature'].tolist()
col_p_value

['Year',
 'infant deaths',
 'Alcohol',
 'under-five deaths ',
 ' HIV/AIDS',
 ' thinness 5-9 years',
 'Schooling',
 'Country_Albania',
 'Country_Algeria',
 'Country_Angola',
 'Country_Argentina',
 'Country_Armenia',
 'Country_Australia',
 'Country_Austria',
 'Country_Azerbaijan',
 'Country_Bangladesh',
 'Country_Belarus',
 'Country_Belgium',
 'Country_Belize',
 'Country_Bhutan',
 'Country_Bosnia and Herzegovina',
 'Country_Botswana',
 'Country_Brazil',
 'Country_Bulgaria',
 'Country_Cabo Verde',
 'Country_Cambodia',
 'Country_Canada',
 'Country_Central African Republic',
 'Country_Chad',
 'Country_Chile',
 'Country_China',
 'Country_Colombia',
 'Country_Comoros',
 'Country_Costa Rica',
 'Country_Croatia',
 'Country_Cyprus',
 'Country_Djibouti',
 'Country_Dominican Republic',
 'Country_Ecuador',
 'Country_El Salvador',
 'Country_Eritrea',
 'Country_Estonia',
 'Country_Ethiopia',
 'Country_Fiji',
 'Country_France',
 'Country_Gabon',
 'Country_Georgia',
 'Country_Germany',
 'Country_Ghana'

In [10]:
# Refit OLS using only the features selected by the p-value filter.

# Selecting with a list of labels builds a new frame; .copy() makes its
# independence from X explicit. (The previous extra X.copy() was overwritten
# on the very next line and has been removed.)
X_copy = X[col_p_value].copy()

# NOTE: X_new, ols_model and results are reassigned here, so the p-value cell
# reads the reduced model if it is re-run after this cell.
X_new = sm.add_constant(X_copy)
ols_model = sm.OLS(Y, X_new)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life expectancy,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.964
Method:,Least Squares,F-statistic:,351.6
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,14:00:49,Log-Likelihood:,-3117.7
No. Observations:,1649,AIC:,6489.0
Df Residuals:,1522,BIC:,7176.0
Df Model:,126,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-371.5616,27.029,-13.747,0.000,-424.580,-318.543
Year,0.2308,0.014,16.016,0.000,0.202,0.259
infant deaths,0.0414,0.012,3.510,0.000,0.018,0.065
Alcohol,-0.0677,0.029,-2.337,0.020,-0.124,-0.011
under-five deaths,-0.0309,0.008,-4.013,0.000,-0.046,-0.016
HIV/AIDS,-0.3116,0.013,-24.774,0.000,-0.336,-0.287
thinness 5-9 years,0.0805,0.022,3.582,0.000,0.036,0.125
Schooling,0.3444,0.063,5.497,0.000,0.221,0.467
Country_Albania,17.0721,0.537,31.815,0.000,16.020,18.125

0,1,2,3
Omnibus:,657.581,Durbin-Watson:,1.212
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3254.682
Skew:,1.836,Prob(JB):,0.0
Kurtosis:,8.821,Cond. No.,4.86e+19


In [11]:
# ElasticNet baseline on the selected features

# Hold out 20% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X_copy, Y, test_size=0.20, random_state=42
)

# alpha=1 and l1_ratio=0.5 are the estimator's default values, written out explicitly
model_elas = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)

Pred_elas = model_elas.predict(X_test)

elas_r2 = r2_score(y_test, Pred_elas)
print("R2_score for Elas Model : ", elas_r2)


R2_score for Elas Model :  0.7182021063375545


In [12]:
# ElasticNetCV: cross-validated search for the best alpha and l1_ratio.
# RepeatedKFold comes from the import cell at the top of the notebook.

# 10-fold cross-validation repeated 3 times, seeded for reproducibility
cross_validation = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# Search grids: alpha in 0.01, 0.03, ... (< 1) and l1_ratio in 0.1, 0.2, ... (< 1).
# NOTE(review): the recorded run picked alpha=0.01 and l1_ratio=0.9, i.e. the
# lowest alpha and the highest l1_ratio searched - consider widening both grids.
elas_model2 = ElasticNetCV(
    alphas=np.arange(0.01, 1, 0.02),
    cv=cross_validation,
    l1_ratio=np.arange(0.1, 1, 0.1),
)

elas_model2.fit(X_train, y_train)

print("Best Alpha is : ", elas_model2.alpha_)
print("Best l1_ratio is : ", elas_model2.l1_ratio_)

# Score the tuned model on the held-out test split
Pred_elasCV = elas_model2.predict(X_test)

print("R2_score for elasCV Model : ", r2_score(y_test, Pred_elasCV))

Best Alpha is :  0.01
Best l1_ratio is :  0.9
R2_score for elasCV Model :  0.8739640030880047
