In [55]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error
import scipy.stats as stats
from sklearn.model_selection import cross_val_score

In [3]:
path="D:/project/Life_exe_final.xlsx"
data=pd.read_excel(path)

In [4]:
data.head()

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Diphtheria (deaths/1000),HIV/AIDS(impact),GDP,Population,thinness 10-19 years,thinness 5-9 years,Income composition of resources,Schooling,COVID,Life expectancy
0,United States of America,2021,Developed,19.0,11.0,8.3,5241.3,91.7,324,70.8,...,94.0,0.1,65031.0,327954612,0.75,0.57,0.88,15.8,181.26,77.1
1,United States of America,2020,Developed,20.0,19.0,8.2,5210.2,91.5,410,70.6,...,95.0,0.1,64012.0,328010321,0.76,0.54,0.9,16.0,178.2,77.5
2,United States of America,2019,Developed,17.0,13.0,8.1,5123.2,91.0,310,68.3,...,90.0,0.1,65298.0,329064917,0.77,0.56,0.93,16.2,1.0,78.87
3,United States of America,2018,Developed,15.0,17.0,8.2,5323.64,89.5,264,70.1,...,96.0,0.1,62996.0,327096265,0.81,0.59,0.91,16.6,1.0,78.81
4,United States of America,2017,Developed,13.0,21.0,8.47,5200.64,90.0,215,69.1,...,91.0,0.1,60062.0,327096266,0.78,0.58,0.92,16.5,2.0,78.84


In [6]:
data.shape
data.columns
data1=data.copy()

In [7]:
data1.columns=['country','year','status','adult_mortality','infant_deaths','alcohol','percentage_exp','hepatitis_B','measles','bmi','under_five_deths','polio','total_expenditure','diphtheria','hiv_or_aids','gdp','population','thinsess_1to19_years','thinsess_5to9_years','income_composition_of_resources','schooling','COVID','life_expectancy']
data.columns=['country','year','status','adult_mortality','infant_deaths','alcohol','percentage_exp','hepatitis_B','measles','bmi','under_five_deths','polio','total_expenditure','diphtheria','hiv_or_aids','gdp','population','thinsess_1to19_years','thinsess_5to9_years','income_composition_of_resources','schooling','COVID','life_expectancy']

In [8]:
data1.drop('COVID',axis=1,inplace=True)

In [9]:
data1.dtypes

country                             object
year                                 int64
status                              object
adult_mortality                    float64
infant_deaths                      float64
alcohol                            float64
percentage_exp                     float64
hepatitis_B                        float64
measles                              int64
bmi                                float64
under_five_deths                   float64
polio                              float64
total_expenditure                  float64
diphtheria                         float64
hiv_or_aids                        float64
gdp                                float64
population                           int64
thinsess_1to19_years               float64
thinsess_5to9_years                float64
income_composition_of_resources    float64
schooling                          float64
life_expectancy                    float64
dtype: object

In [10]:
## Checking the nulls
data1.isnull().sum() # No nulls

country                            0
year                               0
status                             0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_exp                     0
hepatitis_B                        0
measles                            0
bmi                                0
under_five_deths                   0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv_or_aids                        0
gdp                                0
population                         0
thinsess_1to19_years               0
thinsess_5to9_years                0
income_composition_of_resources    0
schooling                          0
life_expectancy                    0
dtype: int64

In [31]:
#### Multi-collinearity
import seaborn as sns
cor = data1.corr()
cor=np.tril(cor)
nc=data1.select_dtypes(exclude='object').columns.values
print(nc)
cols=data1[nc].columns

#interactive plotting in separate window
%matplotlib qt 

sns.heatmap(cor,xticklabels=cols,yticklabels=cols,vmin=-1,vmax=1,annot=True,square=False) # vmin and vmax shows the limit in legends
plt.title("Correlation Matrix")

['year' 'adult_mortality' 'alcohol' 'percentage_exp' 'hepatitis_B' 'bmi'
 'polio' 'total_expenditure' 'hiv_or_aids' 'population' 'schooling'
 'life_expectancy']


Text(0.5, 1.0, 'Correlation Matrix')

In [12]:
## Removing columns which are highly correlated
data1=data1.drop('under_five_deths',axis=1)
data1=data1.drop('thinsess_5to9_years',axis=1)
data1=data1.drop('thinsess_1to19_years',axis=1)

In [13]:
data1.drop('income_composition_of_resources',axis=1,inplace=True)

In [14]:
data1.drop('infant_deaths',axis=1,inplace=True)

In [15]:
data1.drop('gdp',axis=1,inplace=True)
data1.drop('diphtheria',axis=1,inplace=True)

In [16]:
data1.drop('measles',axis=1,inplace=True)

In [17]:
data1.describe()

Unnamed: 0,year,adult_mortality,alcohol,percentage_exp,hepatitis_B,bmi,polio,total_expenditure,hiv_or_aids,population,schooling,life_expectancy
count,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0
mean,2010.493506,134.071104,7.106282,2267.476769,81.486526,52.987808,90.070292,7.626039,0.640925,116397300.0,14.486185,75.038133
std,6.340215,81.560166,3.712449,2844.73286,20.585398,15.721452,11.486064,2.98858,3.339271,228422700.0,2.150569,5.774179
min,2000.0,3.0,0.01,0.84,6.0,5.1,9.0,1.22,0.1,3452745.0,8.3,53.9
25%,2005.0,77.0,4.5275,264.4725,72.75,52.8,88.0,5.6,0.1,29860990.0,12.975,71.21
50%,2010.5,122.0,7.67,998.275,91.0,58.2,94.0,7.24,0.1,48387540.0,15.0,75.685
75%,2016.0,172.25,9.7825,3671.25,96.0,62.3,97.0,9.25,0.1,87842380.0,16.2,79.69
max,2021.0,498.0,13.89,12123.0,99.0,70.8,99.0,19.0,29.7,1366418000.0,18.8,83.57


In [18]:
## Categorical Data
fc=data1.select_dtypes(include='object').columns.values
data1.country.unique()
data1.status.unique()

array(['Developed', 'Developing'], dtype=object)

In [20]:
## Model Building
# Linear Regression
# Ridge
# Lasso
# Decision Tree Regressor
# Random Forest Regressor
# SVM
# KNN
# BOOSTING (Gradient, Adboosting)

In [None]:
##### Linear Regression

In [19]:
#### FUNCTIONS

def LR_Model(trainx,testx,trainy,testy):
    trainx=sm.add_constant(trainx)
    testx=sm.add_constant(testx)

    m=sm.OLS(trainy,trainx).fit()

    ## Summarise the model
    summary=m.summary()
    
    p=m.predict(testx)
    
    # MSE of the model
    mse=np.round(mean_squared_error(testy,p),3)
    mse

    return([m,summary,mse])
    

def CrossValidation_LR(trainx,trainy,k):
    folds=k
    cv_mse=[]

    X=trainx.values # Should be in the form of array
    Y=trainy.values

    kf=KFold(folds)
    #kf.get_n_splits(X)

    for train_index,test_index in kf.split(X):
        cv_trainx,cv_trainy=X[train_index],Y[train_index]
        cv_testx,cv_testy=X[test_index],Y[test_index]
        
        # Build the model on the cv_train and predict on cv_test
        m=sm.OLS(cv_trainy,cv_trainx).fit()
        p=m.predict(cv_testx)
    
        # store MSE in the list for each model
        cv_mse.append(np.round(mean_squared_error(cv_testy,p),3))
        
        cv_mse    

    # Calculate the mean MSE of K-Fold cross validation
    return(np.mean(cv_mse))

def Normality(data,cols):
    # Agistino-Pearson test for normality
    # H0: normal distribution
    # H1: not a normal distribution

    # create a k-v pair to store column names and its corresponding distribution type (Normal/Not Normal)
    aptest={}

    for c in cols:
        tstat,pval = normaltest(data[c])
        if pval < 0.05:
            aptest[c]="Not Normal"
        else:
            aptest[c]="Normal"

    df=pd.DataFrame(list(aptest.items()),columns=['Features','Distribution'])
    return(df)

In [20]:
## Converting categorical data into dummies.
data2=data1.copy()
data2 = pd.get_dummies(data2,columns=['status','country'])

In [21]:
trainx,testx,trainy,testy=train_test_split(data2.drop('life_expectancy',axis=1), data2['life_expectancy'],test_size=0.2) ## Single step process.
print("trainx={},trainy={},testx={},testy={}".format(trainx.shape,trainy.shape,testx.shape,testy.shape))


trainx=(492, 41),trainy=(492,),testx=(124, 41),testy=(124,)


In [22]:
# Cross Validation
m1_LR_CV=CrossValidation_LR(trainx,trainy,5)

In [23]:
trainx=sm.add_constant(trainx)
testx=sm.add_constant(testx)

In [24]:
model=LR_Model(trainx,testx,trainy,testy)

In [25]:
# Model
m1_LR=model[0]

In [26]:
# Summary
model[1]

0,1,2,3
Dep. Variable:,life_expectancy,R-squared:,0.986
Model:,OLS,Adj. R-squared:,0.985
Method:,Least Squares,F-statistic:,801.7
Date:,"Tue, 18 May 2021",Prob (F-statistic):,0.0
Time:,17:00:31,Log-Likelihood:,-518.12
No. Observations:,492,AIC:,1116.0
Df Residuals:,452,BIC:,1284.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-103.1266,12.619,-8.172,0.000,-127.926,-78.327
year,0.1123,0.010,11.274,0.000,0.093,0.132
adult_mortality,-0.0040,0.001,-3.247,0.001,-0.006,-0.002
alcohol,-0.0689,0.052,-1.329,0.184,-0.171,0.033
percentage_exp,5.028e-05,2.35e-05,2.138,0.033,4.07e-06,9.65e-05
hepatitis_B,0.0084,0.003,2.420,0.016,0.002,0.015
bmi,-0.0048,0.004,-1.287,0.199,-0.012,0.003
polio,-0.0027,0.005,-0.524,0.601,-0.013,0.007
total_expenditure,0.0153,0.031,0.490,0.624,-0.046,0.077

0,1,2,3
Omnibus:,65.005,Durbin-Watson:,2.177
Prob(Omnibus):,0.0,Jarque-Bera (JB):,132.649
Skew:,-0.743,Prob(JB):,1.57e-29
Kurtosis:,5.064,Cond. No.,1.12e+16


In [27]:
# MSE
m1_LR_mse=model[2]
print()

0.55


In [28]:
## Validation of assumptions

# 1.) mean of residual=0
print(m1_LR.resid.mean())

-3.785887375953258e-10


In [31]:
## PLot the graph (errors)
yhat=m1_LR.predict(trainx)
sns.set(style="whitegrid")
sns.residplot(m1_LR.resid,yhat,lowess=True,color='g')

## bruesch-pagan test against hetero
import statsmodels.stats.api as sms

# H0: Homoscedasticity 
# H1: Heteroscedasticity

# return value of breusch pagan test
# lagrange_multiplier,p-value,f-score,fp-value

# parameters: [residuals, x-array]
pval=sms.het_breuschpagan(m1_LR.resid,m1_LR.model.exog)[1]

if pval<0.05:
    print("Reject H0. Model is Heteroscedastic")
else:
    print("FTR H0: MOdel is Homoscedastic")

# Model is hetero

Reject H0. Model is Heteroscedastic




In [32]:
# iii.) Residuals have the normal distribution
from matplotlib import pylab

stats.probplot(m1_LR.resid,dist='norm',plot=pylab)
pylab.show()

In [29]:
# iv.) rows>columns
data2.shape

(616, 42)

In [32]:
# v.) Multicollinearity check have been done above

# vi.) Normality check

# Agistino-Pearson test for normality
# H0: normal distribution
# H1: not a normal distribution

from scipy.stats import normaltest

# create a k-v pair to store column names and its corresponding distribution type (Normal/Not Normal)
aptest={}

for c in cols:
    tstat,pval = normaltest(data2[c])
    if pval < 0.05:
        aptest[c]="Not Normal"
    else:
        aptest[c]="Normal"

print(aptest) # No feature is normal.

df=pd.DataFrame(list(aptest.items()),columns=['Features','Distribution'])
df

{'year': 'Not Normal', 'adult_mortality': 'Not Normal', 'alcohol': 'Not Normal', 'percentage_exp': 'Not Normal', 'hepatitis_B': 'Not Normal', 'bmi': 'Not Normal', 'polio': 'Not Normal', 'total_expenditure': 'Not Normal', 'hiv_or_aids': 'Not Normal', 'population': 'Not Normal', 'schooling': 'Not Normal', 'life_expectancy': 'Not Normal'}


Unnamed: 0,Features,Distribution
0,year,Not Normal
1,adult_mortality,Not Normal
2,alcohol,Not Normal
3,percentage_exp,Not Normal
4,hepatitis_B,Not Normal
5,bmi,Not Normal
6,polio,Not Normal
7,total_expenditure,Not Normal
8,hiv_or_aids,Not Normal
9,population,Not Normal


In [33]:
# Predictions
p1_LR=m1_LR.predict(testx)
p1_LR

112    70.310868
150    78.299164
542    69.421048
534    70.422847
457    79.068231
         ...    
137    80.777777
295    74.141423
589    80.433847
314    75.461584
280    75.213949
Length: 124, dtype: float64

In [35]:
# Compare the train and test errors
print("Training MSE = {} , Testing MSE = {}".format(m1_LR_CV,m1_LR_mse))

## Store the actual and predicted data for comparison
df=pd.DataFrame({'actual':testy,'predicted':round(p1_LR,2)})
df.head(30)

Training MSE = 0.6666 , Testing MSE = 0.55


Unnamed: 0,actual,predicted
112,73.08,70.31
150,78.41,78.3
542,69.33,69.42
534,70.55,70.42
457,78.97,79.07
447,81.46,81.19
32,73.72,72.73
377,71.41,70.42
569,71.12,71.15
584,81.4,81.03


In [36]:
# Plot the actual and predicted values
ax1=sns.distplot(testy,hist=False,color='blue',label='Actual')
sns.distplot(p1_LR,hist=False,color='red',label='Predicted',ax=ax1)



<AxesSubplot:xlabel='life_expectancy', ylabel='Density'>

In [37]:
### square root
data2.shape
data4=data2.copy()
data4=data4[nc]

for j in range(0,len(nc)):
    for i in range(0,len(data4)):
        data4.iloc[i,j]=pow(data4.iloc[i,j],(1/2))

In [38]:
# Normality check
Normality(data4,cols)

Unnamed: 0,Features,Distribution
0,year,Not Normal
1,adult_mortality,Not Normal
2,alcohol,Not Normal
3,percentage_exp,Not Normal
4,hepatitis_B,Not Normal
5,bmi,Not Normal
6,polio,Not Normal
7,total_expenditure,Not Normal
8,hiv_or_aids,Not Normal
9,population,Not Normal


In [39]:
new_cols=set(data2.columns)-set(data4.columns)

data4.shape
data4[list(new_cols)]=data2[list(new_cols)]
data4.shape

(616, 42)

In [40]:
trainx,testx,trainy,testy=train_test_split(data4.drop('life_expectancy',axis=1), data4['life_expectancy'],test_size=0.2) ## Single step process.
m2_LR_CV=CrossValidation_LR(trainx,trainy,k=5)
model=LR_Model(trainx, testx, trainy, testy)
m2_LR=model[0] # model
m2_LR_mse=model[2] # MSE

In [41]:
m2_LR

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1caec1555e0>

In [42]:
# Compare the train and test errors
print("Training MSE = {} , Testing MSE = {}".format(m2_LR_CV,m2_LR_mse))

Training MSE = 0.002 , Testing MSE = 0.002


In [43]:
#------------------------------------
# Log transformation can not be done since there are Zeros in the data.

In [44]:
# Now trying with boxcox transformation

#### Boxcox transformation
from scipy.stats import boxcox

# transform data into BoxCox transformaed data
data4.shape
d=data4['life_expectancy']
bc_values,lamda=boxcox(d)
bc_values
lamda # used to raise the value to make it normal

bc1=bc_values[0]
bc1

data4=data2.copy()
data4['life_expectancy']=bc_values

data4.shape

trainx,testx,trainy,testy=train_test_split(data4.drop('life_expectancy',axis=1),data4['life_expectancy'],test_size=0.2)

CrossValidation_LR(trainx, trainy, k=5)
m4_LR=LR_Model(trainx,testx,trainy,testy)
model=m4_LR[0]

## bruesch-pagan test against hetero
import statsmodels.stats.api as sms

# H0: Homoscedasticity 
# H1: Heteroscedasticity

# return value of breusch pagan test
# lagrange_multiplier,p-value,f-score,fp-value

# parameters: [residuals, x-array]
pval=sms.het_breuschpagan(model.resid,model.model.exog)[1]

if pval<0.05:
    print("Reject H0. Model is Heteroscedastic")
else:
    print("FTR H0: MOdel is Homoscedastic")
# Hetero

Reject H0. Model is Heteroscedastic


In [None]:
################ DECISION TREE REGRESSOR ####################

In [45]:
from sklearn.tree import DecisionTreeRegressor

In [46]:
trainx,testx,trainy,testy=train_test_split(data2.drop('life_expectancy',axis=1),data2['life_expectancy'],test_size=0.2)

In [47]:
m1_DT=DecisionTreeRegressor(criterion='mse').fit(trainx,trainy)

In [48]:
print(m1_DT.get_depth())

17


In [49]:
p1_DT=m1_DT.predict(testx) 
mse1_DT=round(mean_squared_error(testy,p1_DT),3)
mse1_DT

2.305

In [50]:
def CrossValidation_DT(trainx,trainy,k):
    folds=k
    cv_mse=[]

    X=trainx.values # Should be in the form of array
    Y=trainy.values

    kf=KFold(folds)
    #kf.get_n_splits(X)

    for train_index,test_index in kf.split(X):
        cv_trainx,cv_trainy=X[train_index],Y[train_index]
        cv_testx,cv_testy=X[test_index],Y[test_index]
        
        # Build the model on the cv_train and predict on cv_test
        m=DecisionTreeRegressor(criterion='mse').fit(cv_trainx,cv_trainy)
        p=m.predict(cv_testx)
    
        # store MSE in the list for each model
        cv_mse.append(np.round(mean_squared_error(cv_testy,p),3))
        
        cv_mse    

    # Calculate the mean MSE of K-Fold cross validation
    return(np.mean(cv_mse))

In [51]:
mse_DT_CV=CrossValidation_DT(trainx,trainy,5)
print(mse_DT_CV)

1.5177999999999998


In [52]:
### Decision Tree with Hypertuning
m2_DT=DecisionTreeRegressor(criterion='mse',max_depth=8,min_samples_leaf=2).fit(trainx,trainy)

In [53]:
## Predictions
p2_DT=m2_DT.predict(testx) 
mse2_DT=round(mean_squared_error(testy,p2_DT),3)
mse2_DT

1.723

In [56]:
# Cross Validation
kf = KFold(n_splits=5)
mse2_DT_CV=np.mean(-cross_val_score(m2_DT, trainx, trainy, scoring="neg_mean_squared_error", cv=kf))
mse2_DT_CV

1.7420646415886651

In [62]:
############## RANDOM FOREST ###################
from sklearn.ensemble import RandomForestRegressor

In [63]:
m1_RF=RandomForestRegressor().fit(trainx,trainy)
p1_RF=m1_RF.predict(testx)
mse1_RF=np.round(mean_squared_error(testy,p1_RF),3)
mse1_RF

0.678

In [64]:
# Cross Validation
kf = KFold(n_splits=5)
mse1_RF_CV=np.mean(-cross_val_score(m1_RF, trainx, trainy, scoring="neg_mean_squared_error", cv=kf))
mse1_RF_CV

1.0078291450933843

In [65]:
########### RANDOM FOREST HPT ###########
m2_RF=RandomForestRegressor(n_estimators=30,max_depth=6,min_samples_split=3,min_samples_leaf=2).fit(trainx,trainy)
p2_RF=m2_RF.predict(testx)
mse2_RF=np.round(mean_squared_error(testy,p2_RF),3)
mse2_RF

0.994

In [66]:
# Cross Validation
kf = KFold(n_splits=5)
mse2_RF_CV=np.mean(-cross_val_score(m2_RF, trainx, trainy, scoring="neg_mean_squared_error", cv=kf))
mse2_RF_CV

1.1347702387563618

In [68]:
################### KNN Regressor ##########################
from sklearn import neighbors

trainx_std=trainx.copy()
testx_std=testx.copy()

from sklearn import preprocessing
minmax=preprocessing.MinMaxScaler()

## Scale the train data
sc=minmax.fit_transform(trainx_std.iloc[:,:])
trainx_std.iloc[:,:]=sc

## Scale the test data
sc=minmax.fit_transform(testx_std.iloc[:,:])
testx_std.iloc[:,:]=sc

trainx_std.head(3)


### Best value for the k
nn=range(3,12)
list(nn)

from sklearn.model_selection import cross_val_score

mse_cv=[]

for k in list(nn):
    m=neighbors.KNeighborsRegressor(n_neighbors=k).fit(trainx_std,trainy)
    err=cross_val_score(m,trainx_std,trainy,cv=5,scoring='neg_mean_squared_error')
    err=np.round(np.mean(err),3)
    mse_cv.append(err)
    
# The MSE scores are all in -ve numbers. COnvert to +ve
print(mse_cv)

## COnvert all the -ve to +ve numbers using lambda function
mse_cv=list(map(lambda x:abs(x),mse_cv))
mse_cv

## Select the lowest mse scores and its corresponding K-value
bestk=nn[mse_cv.index(min(mse_cv))]
bestk

[-0.359, -0.41, -0.513, -0.663, -0.856, -1.103, -1.374, -1.675, -2.008]


3

In [104]:
def CrossValidation_KNN(trainx,trainy,k):
    folds=k
    cv_mse=[]

    X=trainx.values # Should be in the form of array
    Y=trainy.values

    kf=KFold(folds)
    #kf.get_n_splits(X)

    for train_index,test_index in kf.split(X):
        cv_trainx,cv_trainy=X[train_index],Y[train_index]
        cv_testx,cv_testy=X[test_index],Y[test_index]
        
        # Build the model on the cv_train and predict on cv_test
        m=neighbors.KNeighborsRegressor(n_neighbors=3).fit(cv_trainx,cv_trainy)
        p=m.predict(cv_testx)
    
        # store MSE in the list for each model
        cv_mse.append(np.round(mean_squared_error(cv_testy,p),3))
        
        cv_mse    

    # Calculate the mean MSE of K-Fold cross validation
    return(np.mean(cv_mse))

In [69]:
## Build the knn model with the bestk
m1_KNN=neighbors.KNeighborsRegressor(n_neighbors=bestk).fit(trainx_std,trainy)
p1_KNN=m1_KNN.predict(testx_std)
mse1_KNN=mean_squared_error(testy,p1_KNN)
mse1_KNN

0.634345788530465

In [70]:
# Cross Validation
kf = KFold(n_splits=5)
mse1_KNN_CV=np.mean(-cross_val_score(m1_KNN, trainx, trainy, scoring="neg_mean_squared_error", cv=kf))
mse1_KNN_CV

21.022329611534847

In [None]:
###################### SVM ############################

In [71]:
from sklearn import svm,preprocessing

In [72]:
kernals=['linear','rbf','poly','sigmoid']
kernals

for k in kernals:
    m=svm.SVR(kernel=k).fit(trainx_std,trainy)
    r_square=m.score(testx_std,testy)
    print('kernal={} , R-Square={}'.format(k,r_square))

kernal=linear , R-Square=0.9485485622761807
kernal=rbf , R-Square=0.9640727309780169
kernal=poly , R-Square=0.9336767129535648
kernal=sigmoid , R-Square=0.8223513385467516


In [99]:
def svmRegression(ker,trainx,trainy,testx,testy,bestc=2,bestg='scale'):
    model=svm.SVR(kernel=ker,C=bestc,gamma=bestg).fit(trainx,trainy)
    pred=model.predict(testx)
    mse=mean_squared_error(testy,pred)
    return(pred,mse)

In [100]:
m_mse=[]; p1_SVM=[]
### Run the regression model for each kernel
for k in kernals:
    pred,mseval=svmRegression(k,trainx_std,trainy,testx_std,testy) # funciton call
    p1_SVM.append(pred)
    m_mse.append(round(mseval,3))  
m_mse

[1.372, 0.79, 1.278, 5.18]

In [None]:
[1.308, 0.914, 1.687, 4.518] # c=1
[1.372, 0.79, 1.278, 5.18] # c=2
[1.404, 0.846, 1.099, 9.421] c=3

In [None]:
### Data Frame

In [115]:
result1 = pd.DataFrame([],columns=['Model','CV_mse','Prediction_mse'])
print(result1)

Empty DataFrame
Columns: [Model, CV_mse, Prediction_mse]
Index: []


In [116]:
result1.loc[0]=['Linear Regression',m1_LR_CV,m1_LR_mse]
result1.loc[1]=['Decision Tree',mse_DT_CV,mse1_DT]
result1.loc[2]=['Decision Tree HPT',mse2_DT_CV,mse2_DT]
result1.loc[3]=['Random Forest',mse1_RF_CV,mse1_RF]
result1.loc[4]=['Random Forest HPT',mse2_RF_CV,mse2_RF]
result1.loc[5]=['KNN',mse1_KNN_CV,mse1_KNN]
result1.loc[6]=['SVM_Linear','',1.372]
result1.loc[7]=['SVM_rbf','',0.79]
result1.loc[8]=['SVM_poly','',1.278]
result1.loc[9]=['SVM_sigmoid','',5.18]

In [117]:
print(result1)

               Model    CV_mse  Prediction_mse
0  Linear Regression    0.6666        0.550000
1      Decision Tree    1.5178        2.305000
2  Decision Tree HPT  1.742065        1.723000
3      Random Forest  1.007829        0.678000
4  Random Forest HPT   1.13477        0.994000
5                KNN  21.02233        0.634346
6         SVM_Linear                  1.372000
7            SVM_rbf                  0.790000
8           SVM_poly                  1.278000
9        SVM_sigmoid                  5.180000


In [75]:
y = data2['life_expectancy']
x = data2.drop(['life_expectancy'],axis=1)

In [76]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

In [123]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso

from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings

In [124]:
lr=0.1
n = 200
kf = KFold(n_splits=5, random_state=42, shuffle=True)

adaboost_model = AdaBoostRegressor(n_estimators=n,learning_rate=lr)

gradientboost_model=GradientBoostingRegressor(learning_rate=lr,n_estimators = n)

ridge_model = Ridge(alpha=0.1)

lasso_model = Lasso(alpha=0.1)

In [125]:
def metrics(ytest,ypred):
  return(mean_squared_error(ytest,ypred))

result = pd.DataFrame([],columns=['Model','CV_mse','Prediction_mse'])
def compute(model,i):
  cv_mse = -cross_val_score(model, x, y, scoring="neg_mean_squared_error", cv=kf)
  model.fit(xtrain,ytrain)
  ypred = model.predict(xtest)
  result.loc[i] = [str(model)[:str(model).index('(')] ,cv_mse.mean(),metrics(ytest,ypred) ]

In [126]:
models = [adaboost_model,gradientboost_model,ridge_model,lasso_model]
for model in range(len(models)):
  compute(models[model],model)
result = result.sort_values('CV_mse')
warnings.filterwarnings("ignore")
result

Unnamed: 0,Model,CV_mse,Prediction_mse
1,GradientBoostingRegressor,0.432631,0.518998
2,Ridge,0.705968,0.652947
0,AdaBoostRegressor,1.940014,2.452862
3,Lasso,2.952786,3.649959


In [127]:
res=pd.concat([result, result1], axis=0, join='outer')
display(res)

Unnamed: 0,Model,CV_mse,Prediction_mse
1,GradientBoostingRegressor,0.432631,0.518998
2,Ridge,0.705968,0.652947
0,AdaBoostRegressor,1.940014,2.452862
3,Lasso,2.952786,3.649959
0,Linear Regression,0.6666,0.55
1,Decision Tree,1.5178,2.305
2,Decision Tree HPT,1.742065,1.723
3,Random Forest,1.007829,0.678
4,Random Forest HPT,1.13477,0.994
5,KNN,21.02233,0.634346


In [138]:
lr=0.1
n = 200
kf = KFold(n_splits=5, random_state=42, shuffle=True)

gradientboost_model=GradientBoostingRegressor(learning_rate=lr,n_estimators = n)
def metrics(ytest,ypred):
  return(mean_squared_error(ytest,ypred))

cv_mse = -cross_val_score(gradientboost_model, x, y, scoring="neg_mean_squared_error", cv=kf)
gradientboost_model.fit(xtrain,ytrain)
ypred = gradientboost_model.predict(xtest)
metrics(ytest,ypred)

0.47518407422131725

In [154]:
## Store the actual and predicted data for comparison
df=pd.DataFrame({'actual':np.array(ytest),'predicted':np.round(ypred,2)})
df.head(30)

Unnamed: 0,actual,predicted
0,79.6,79.91
1,80.01,80.38
2,79.58,80.11
3,82.91,82.67
4,82.72,82.47
5,77.58,77.11
6,81.27,80.92
7,73.08,72.27
8,75.08,74.71
9,78.97,79.52


In [139]:
# Plot the actual and predicted values
ax1=sns.distplot(ytest,hist=False,color='blue',label='Actual')
sns.distplot(ypred,hist=False,color='red',label='Predicted',ax=ax1)

<AxesSubplot:xlabel='life_expectancy', ylabel='Density'>