## Life Expectancy Multilinear Regression

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

import seaborn as sns
sns.set()

In [None]:
raw_data = pd.read_csv('../input/life-expectancy-who/Life Expectancy Data.csv')
raw_data.head()

In [None]:
raw_data.describe(include='all')

In [None]:
raw_data.isnull().sum()

In [None]:
# Checking why and which countries has null value for alcohol
null_alcohol = raw_data[raw_data["Alcohol"].isnull()]
#alcohol_na = raw_data.query('Alcohol == 0')
#alcohol_na
null_alcohol

 #Checking why there are 34 missing data in BMI

In [None]:
null_bmi = raw_data[raw_data[" BMI "].isnull()]
null_bmi

In [None]:
## It turns out that only Sudan and South Sudan do not report BMI at all. We could impute Monaco and San Marino from
## previous years, because only one year is missing for those countries. I will drop Sudan and South Sudan from the data.

In [None]:
## Question: Does Life Expectancy have positive or negative relationship with drinking alcohol?
## Data is missing for almost every country in 2015, so I will drop the 2015 from the data
is_2015 = raw_data[raw_data["Year"]==2015].index
is_2015
data_wo_2015 = raw_data.drop(is_2015)
data_wo_2015

In [None]:
## South Sudan does not have any Alcohol data for taking mean and fill the null spaces, so I will drop South Sudan completely
is_s_sudan = data_wo_2015[data_wo_2015["Country"]=="South Sudan"].index
is_s_sudan
data_alcohol = data_wo_2015.drop(is_s_sudan)
data_alcohol

In [None]:
data_alcohol.isnull().sum()

In [None]:
data_1 = data_alcohol[data_alcohol['Life expectancy '].isnull()].index
data_1

In [None]:
data_2 = data_alcohol.drop(data_1) 
data_2

In [None]:
na_bmi = data_2[data_2[" BMI "].isnull()].index
na_bmi

In [None]:
data_3 = data_2.drop(na_bmi)
data_3

In [None]:
data_3.isnull().sum()

In [None]:
data_4 = data_3[data_3['Alcohol'].isnull()].index
data_4

In [None]:
data_clean = data_3.drop(data_4)
data_clean.isnull().sum()

In [None]:
## Dropping multiple columns at the same time.

In [None]:
to_drop = [
    "Hepatitis B",
    "Polio",
    "Total expenditure",
    "Diphtheria ",
    "GDP",
    "Population",
    "Income composition of resources",
    "Schooling",
]
data_clean = data_clean.drop(columns=to_drop)

# drop(columns=...) looks the names up among the columns and returns a new frame without them; rebinding data_clean keeps only the remaining columns.

In [None]:
#include='all' shows all the data not only numerical
data_clean.describe(include='all')

In [None]:
data_clean.isnull().sum()

In [None]:
sns.distplot(data_clean["Alcohol"])

In [None]:
sns.distplot(data_clean[" BMI "])

In [None]:
# Based on the PDF, BMI has a bimodal distribution

In [None]:
sns.distplot(data_clean[' HIV/AIDS'])

In [None]:
sns.distplot(data_clean['percentage expenditure'])

In [None]:
sns.distplot(data_clean[' thinness  1-19 years'])

In [None]:
round(data_clean[['Status','Life expectancy ']].groupby('Status').mean(),2)

In [None]:
sns.distplot(data_clean['Life expectancy '])

In [None]:
data_clean["Country"].unique()

In [None]:
# Transform to categorical data to numerical data, 1 stands for "Developed countries, and 0 for "developing countries
data_clean["Status"] = data_clean["Status"].map({'Developed':1,'Developing':0})

In [None]:
data_clean['Status'].unique()


# OLS - Scatterplots for Life Expectancy vs Alcohol, BMI, HIV etc

## if we print all of them together
# One row of three panels that share the life-expectancy (y) axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
for axis, column, title in (
    (ax1, 'Status', "Life Expectancy vs Status"),
    (ax2, 'Alcohol', "Life Expectancy vs Alcohol"),
    (ax3, ' BMI ', "Life Expectancy vs BMI"),
):
    axis.scatter(data_clean[column], data_clean["Life expectancy "])
    axis.set_title(title)

plt.show()

In [None]:
# (column, title, x-axis label) for each panel, in plotting order.
raw_scatter_panels = [
    ('Status', "Life Expectancy vs Status", 'Developed or Developing status'),
    ('Alcohol', "Life Expectancy vs Alcohol", 'Litres'),
    (' BMI ', "Life Expectancy vs BMI", 'BMI'),
    (' HIV/AIDS', "Life Expectancy vs HIV/AIDS", 'Deaths per 1000 live births'),
    ('percentage expenditure', "Life Expectancy vs Percentage Expenditure", '% of total government expenditure'),
    (' thinness  1-19 years', "Life Expectancy vs Teenage Thinness", '%'),
]

plt.figure(figsize=(15,35))

# Panels fill a 6x3 grid left-to-right starting at slot 1.
for slot, (column, title, x_label) in enumerate(raw_scatter_panels, start=1):
    plt.subplot(6, 3, slot)
    plt.scatter(data_clean[column], data_clean["Life expectancy "])
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Age (yrs)')

plt.show()

In [None]:
# Box-Cox is a transformation technique for data that is not normally distributed.
# I tried a Box-Cox transform because the data is not normally distributed, but it did not work.

#from scipy import stats

# get values from our data_clean columns
#t_alcohol = np.asarray(data_clean['Alcohol'].values)

#tranfrom values and store as "d_t_"
#d_t_alcohol = stats.boxcox(t_alcohol)[0]

#plot the transformed data
#plt.hist(d_t_alcohol,bins=10)
#plt.show()

In [None]:
# I replace 0 with 0.01 in data_clean['percentage expenditure'] for better log transformation
data_clean['percentage expenditure'] = data_clean['percentage expenditure'].mask(data_clean['percentage expenditure']==0, 0.01)
data_clean['percentage expenditure']

In [None]:
# Just because boxcox did not work, i will transform the x axis for hiv and percent expenditure, that are close to be linear relationship
log_hiv = np.log(data_clean[' HIV/AIDS'])
data_clean['log_hiv'] = log_hiv

log_expenditure = np.log(data_clean['percentage expenditure'])
data_clean['log_expenditure'] = log_expenditure

data_clean

In [None]:
sns.distplot(data_clean['log_hiv'])

In [None]:
sns.distplot(data_clean['log_expenditure'])

In [None]:
# Same six-panel grid as before the transformation, but with the log-transformed
# HIV/AIDS and expenditure columns on the x axis of panels 4 and 5.
# (column, title, x-axis label) for each panel, in plotting order.
log_scatter_panels = [
    ('Status', "Life Expectancy vs Status", 'Developed or Developing status'),
    ('Alcohol', "Life Expectancy vs Alcohol", 'Litres'),
    (' BMI ', "Life Expectancy vs BMI", 'BMI'),
    # log_hiv is np.log of the per-1000 rate, so the axis is labelled as a log
    # of that rate (the earlier 'Deaths per 10 live births' label was wrong).
    ('log_hiv', "Life Expectancy vs log HIV/AIDS", 'log(deaths per 1000 live births)'),
    ('log_expenditure', "Life Expectancy vs Log Transformed Percentage Expenditure", 'log of total government expenditure'),
    (' thinness  1-19 years', "Life Expectancy vs Teenage Thinness", '%'),
]

plt.figure(figsize=(15,35))

# Panels fill a 6x3 grid left-to-right starting at slot 1.
for slot, (column, title, x_label) in enumerate(log_scatter_panels, start=1):
    plt.subplot(6, 3, slot)
    plt.scatter(data_clean[column], data_clean["Life expectancy "])
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Age (yrs)')

plt.show()

In [None]:
# Drop the original columns that have been replaced by their log-transformed versions
drop_axis = [' HIV/AIDS', 'percentage expenditure']
data_clean = data_clean.drop(columns=drop_axis)

In [None]:
#I do not need to write (include='all'), because I already transformed categorical data to numerical
data_clean.describe()

(Based on the OLS assumptions, there is no obvious linear relationship;
however, just for practice I will continue with a linear regression model, as I do not know any other so far :) )

# Multicollinearity

In [None]:
# One of the best ways to check for multicollinearity is VIF(variance inflation factor)
data_clean.columns.values

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

var = data_clean[['Status','Alcohol',' BMI ', ' thinness  1-19 years','log_hiv', 'log_expenditure']]

# variance_inflation_factor regresses one column of the matrix on all the other
# columns and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin and the VIFs are inflated,
# so add one explicitly (has_constant='add' guarantees it sits at position 0).
var_with_const = sm.add_constant(var, has_constant='add')

vif = pd.DataFrame()
# Start at 1 to skip the constant: its own "VIF" is not a multicollinearity measure.
vif["VIF"] = [
    variance_inflation_factor(var_with_const.values, i)
    for i in range(1, var_with_const.shape[1])
]
vif['features'] = var.columns

In [None]:
# VIF = 1: no multicollinearity
# 1 < VIF < 5: perfectly okay
# VIF > 10: unacceptable (there is no upper limit and different sources quote different thresholds, but to be safe try to keep it under 5)

vif

In [None]:
# all my data is VIF<5, which is awesome because I do not need to remove any column

In [None]:
# Status can be used later for logistic regression; regardless of its VIF, I will drop "Status" here and use it later

data_clean = data_clean.drop(columns=['Status'])
data_clean.head()

# Create dummy variables

In [None]:
# if we have N categories for a feature, we have to create N-1 dummies
data_w_dummies = pd.get_dummies(data_clean, drop_first=True)

In [None]:
data_w_dummies.head()

## Rearrange a bit

In [None]:
data_w_dummies.columns.values

In [None]:
# Column order for the modelling frame: target first, then the numeric
# features, then every country dummy. 'Year' is left out of the model inputs.
numeric_cols = ['Life expectancy ', 'Adult Mortality', 'infant deaths',
                'Alcohol', 'Measles ', ' BMI ', 'under-five deaths ',
                ' thinness  1-19 years', ' thinness 5-9 years', 'log_hiv',
                'log_expenditure']

# Take the dummy columns from the frame itself (in their existing order)
# instead of a hand-pasted list, so the selection cannot go stale or raise a
# KeyError when the cleaning steps change which countries remain.
# NOTE(review): assumes every dummy created by pd.get_dummies comes from the
# 'Country' column and therefore carries the 'Country_' prefix - confirm.
country_cols = [c for c in data_w_dummies.columns if c.startswith('Country_')]

cols = numeric_cols + country_cols

In [None]:
data_preprocessed = data_w_dummies[cols]
data_preprocessed.head()

# Linear Regression Model

## Declare the inputs and the targets


In [None]:
targets = data_preprocessed['Life expectancy ']
unscaled_inputs = data_preprocessed.drop(['Life expectancy '], axis=1)

## Scale the data

In [None]:
## Custom scaler for protecting the dummies in the inputs

unscaled_inputs.columns.values

In [None]:
## We just want to scaler non-dummies, so:

columns_to_scale = ['Adult Mortality', 'infant deaths', 'Alcohol', 'Measles ',
       ' BMI ', 'under-five deaths ', ' thinness  1-19 years',
       ' thinness 5-9 years', 'log_hiv', 'log_expenditure']

In [None]:
# Create the Custom Scaler, with this we will keep the dummies untouched

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps a ``StandardScaler`` that is fitted and applied only to ``columns``;
    every other column (e.g. the 0/1 dummies) passes through unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen in ``fit`` (None before fitting).
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of ``columns`` seen in ``fit``
        (None before fitting).
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument as an attribute of the same name:
        # BaseEstimator.get_params() (used by repr() and clone()) reads them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[columns]`` and record column means/variances.

        Returns ``self`` so calls can be chained.
        """
        self.scaler.fit(X[self.columns], y)
        # Use the DataFrame reductions directly: they are always per column,
        # whereas np.mean/np.var on a DataFrame depend on the pandas version.
        self.mean_ = X[self.columns].mean()
        self.var_ = X[self.columns].var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``columns`` standardized.

        The result has the same index and the same column order as ``X``.
        ``y`` and ``copy`` are accepted for API compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled values. Without it the new frame gets
        # a fresh 0..n-1 index, and pd.concat(axis=1) then aligns on labels,
        # yielding extra rows and NaNs whenever X's index is not 0..n-1
        # (e.g. after rows were dropped).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [None]:
 life_expectancy_scaler = CustomScaler(columns_to_scale,copy=True,with_mean=True,with_std=True)

In [None]:
life_expectancy_scaler.fit(unscaled_inputs)

In [None]:
inputs_scaled = life_expectancy_scaler.transform(unscaled_inputs)

In [None]:
inputs_scaled

In [None]:
inputs_scaled.isnull().sum()

In [None]:
## After scaling, NaN values can show up in the result. They appear when the scaled columns are rebuilt with a
## fresh 0..n-1 index while the untouched (dummy) columns keep the original row labels, which have gaps because
## rows were dropped earlier without resetting the index: pd.concat(axis=1) aligns on index labels, so the
## result holds the union of both indexes with NaN wherever a label exists on only one side.
## NOTE(review): keeping the first 2714 rows (presumably the row count of unscaled_inputs - confirm) and
## zero-filling afterwards hides the misalignment rather than fixing it; the scaled values and the dummies in a
## row may belong to different observations.

scaled_inputs = inputs_scaled.iloc[0:2714]
scaled_inputs.isnull().sum()

In [None]:
no_na = scaled_inputs.fillna(0)

In [None]:
no_na

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(no_na, targets, test_size=0.2, random_state=365)

# test_size=0.2 means a 20/80 split between test and train; random_state fixes the random shuffle so the same split is reproduced in future runs

## Create the regression

In [None]:
reg = LinearRegression()
reg.fit(x_train, y_train)

In [None]:
y_hat = reg.predict(x_train)

In [None]:
plt.scatter(y_train, y_hat)
plt.xlabel('Targets (y_train)', size =18)
plt.ylabel('Predictions (y_hat)', size=18)
plt.xlim(35,90)
plt.ylim(35,90)
plt.show()

In [None]:
## Residual plot
## Residual = Differences between the targets and the predictions 
## The residuals are estimates of the errors and are expected to be normally distributed and homoscedastic

sns.distplot(y_train - y_hat)
plt.title("Residuals PDF", size =18)
  

The residuals are roughly normally distributed; however, there is a much longer tail on the negative side, so for certain observations (y_train - y_hat) is much lower than the mean (i.e., a much higher life expectancy is predicted than is observed).

### R_ Square

In [None]:
reg.score(x_train, y_train)

R-squared is around 88 percent, which means our model explains about 88% of the variability of the training data.

### Adjusted R_Square

In [None]:
def adj_r2(x, y, model=None):
    """Return the adjusted R-squared of a fitted model on the data ``(x, y)``.

    adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)

    where ``n`` is the number of observations (rows of ``x``) and ``p`` the
    number of predictors (columns of ``x``).

    Parameters
    ----------
    x : 2-D array-like with a ``shape`` attribute
        Input features.
    y : array-like
        Targets.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator whose ``score`` returns R^2. Defaults to the
        notebook-level regression ``reg``.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``, where the formula is undefined or meaningless.
    """
    if model is None:
        # Backward-compatible default: the regression fitted earlier in the notebook.
        model = reg
    n, p = x.shape[0], x.shape[1]
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R-squared needs more observations than predictors + 1 "
            "(got n=%d, p=%d)" % (n, p)
        )
    r2 = model.score(x, y)
    return 1 - (1 - r2) * (n - 1) / degrees_of_freedom

In [None]:
adj_r2(x_train,y_train)

## Finding the weights and bias

In [None]:
## bias == intercept (ML term)

reg.intercept_

In [None]:
## weight == coefficient (ML term) Bigger weight is bigger impact
reg.coef_

In [None]:
## For readablity, I will make summary table
reg_summary = pd.DataFrame(unscaled_inputs.columns.values, columns=['Features'])
reg_summary["Weights"] = reg.coef_
reg_summary.head(20)

Weights interpretation
-- Continuous variables
1. A positive weight shows that as a feature increases in value, so does life expectancy (in this case, as BMI increases (you are not starving) you live longer, or living in a developed country (status = 1) increases your life expectancy).
2. A negative weight shows that as a feature increases in value, life expectancy decreases.

In [None]:
## Finding the benchmark - based on the dummies which one is 1 - in our case benchmark is Afghanistan
data_clean['Country'].unique()

In our case (because the dummies are assigned alphabetically), our benchmark is Afghanistan.
Weights interpretation -- Dummy variables
1. A positive weight shows that the respective category (Country) has a longer life expectancy than the benchmark (Afghanistan).
2. A negative weight shows that the respective category (Country) has a shorter life expectancy than the benchmark (Afghanistan).

## Testing

In [None]:
y_hat_test = reg.predict(x_test)

In [None]:
## alpha sets the opacity, which helps to see which areas have a higher density of points:
## the more saturated the color, the higher the concentration
plt.scatter(y_test, y_hat_test, alpha=0.2)
plt.xlabel("Targets (y_test)", size=18)
plt.ylabel("Predictions (y-hat_test)", size=18)
plt.xlim(35,90)
plt.ylim(35,90)
plt.show()

In this case, the model predicts longer life expectancies more reliably, because the points are denser in that range.

In [None]:
## df_pf == DataFrame Performance for seeing how accurate is our predictions

df_pf = pd.DataFrame(y_hat_test, columns=['Prediction'])
df_pf


In [None]:
df_pf["Target"] = y_test
df_pf

In [None]:
## y_test still carries the original row labels (shuffled by train_test_split), while df_pf was built from
## the prediction array and has a fresh 0..n-1 index, so assigning y_test to a df_pf column aligns on labels
## and mismatches the rows. Reset the index so the targets line up with the predictions by position.
y_test = y_test.reset_index(drop=True)
y_test.head()

In [None]:
df_pf["Target"] = y_test
df_pf

In [None]:
df_pf['Residual'] = df_pf['Target'] - df_pf['Prediction']

In [None]:
## Whether an observation is off by +1% or -1% is mostly irrelevant
df_pf['Difference%'] = np.absolute(df_pf['Residual']/df_pf['Target']*100)

In [None]:
df_pf

In [None]:
df_pf.describe()

Based on df_pf, my model works! The minimum difference is almost 0% and the maximum difference is 27% (for developed countries, outliers).

In [None]:
## checking all of the data
pd.options.display.max_rows = 999
pd.set_option('display.float_format', lambda x: '%.2f' %x)
df_pf.sort_values(by=['Difference%'])

In the residual PDF, the negative tail was longer, which suggested that the model predicts higher values than it should; checking the data confirms this: the predictions are slightly higher than the WHO life expectancy.