# Analysing the dataset

**Importing required libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from collections import OrderedDict
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error

**Reading the dataset in IBM Watson Studio**

In [None]:

import os
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stub iterator bound onto the streaming body so pandas accepts it as file-like."""
    return 0

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that no
# secret is stored in the notebook; a KeyError is raised if it is not set.
# NOTE(review): the key that was previously committed here should be revoked/rotated.
client_d6a042f58dc44bfcac01d1a01afd0d38 = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=os.environ["IBM_COS_API_KEY"],
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

body = client_d6a042f58dc44bfcac01d1a01afd0d38.get_object(Bucket='lifeexpectancy-donotdelete-pr-6bb2hoexroj1xl',Key='Life_Expectancy_Data.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Parse the CSV object into the working DataFrame and preview the first rows.
data = pd.read_csv(body)
data.head()


In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data.size

In [None]:
data.columns

In [None]:
data.isnull().sum()

__Handling Missing Values__

In [None]:
country_list = data.Country.unique()
len(country_list)

In [None]:
# Distinct country names; the interpolation cell below iterates over these.
country_list = data.Country.unique()
# Columns handed to interpolate() in the next cell. The names are spelled exactly
# as they appear in the raw frame at this point, including leading/trailing
# (and doubled) spaces; they are only cleaned up by the later rename cell.
fill_list = ['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling']

**Filling missing values within each country using interpolate()**

In [None]:
# Interpolate only the numeric columns of fill_list: 'Country' and 'Status' are
# strings, and calling interpolate() on object-dtype columns is deprecated /
# rejected by recent pandas releases (older releases simply left them untouched,
# so the result is the same).
numeric_fill_cols = data[fill_list].select_dtypes(include='number').columns

for country in country_list:
    # Work on one country's rows at a time so values are never interpolated
    # across two different countries.
    country_rows = data['Country'] == country
    data.loc[country_rows, numeric_fill_cols] = data.loc[country_rows, numeric_fill_cols].interpolate()

# Rows that still contain gaps after interpolation are dropped.
data.dropna(inplace=True)

In [None]:
data.shape

In [None]:
data.isna().sum()

**Correlation matrix**

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# ('Country', 'Status'), and DataFrame.corr() raises on those in pandas >= 2.0.
corrMatrix = data.select_dtypes(include='number').corr()
# Colour-code the matrix and flag any undefined (NaN) correlations in red.
corrMatrix.style.background_gradient(cmap='plasma', low=.5, high=0).highlight_null('red')

**Renaming the columns because some names contain leading or trailing spaces**

In [None]:
# Raw header -> cleaned header. Keys must match the raw names exactly,
# whitespace included.
column_renames = {
    " BMI ": "BMI",
    'Life expectancy ': 'Life expectancy',
    "under-five deaths ": "under-five deaths",
    "Measles ": "Measles",
    "Diphtheria ": "Diphtheria",
    ' HIV/AIDS': "HIV/AIDS",
    " thinness  1-19 years": "thinness 10-19 years",
    " thinness 5-9 years": "thinness 5-9 years",
}
data.rename(columns=column_renames, inplace=True)

**Removing outliers**

Taking the numeric features (the Country, Year and Status columns are excluded)

In [None]:
# Numeric features to inspect for outliers, listed in subplot order.
outlier_columns = [
    'Life expectancy',
    'Adult Mortality',
    'Alcohol',
    'percentage expenditure',
    'Hepatitis B',
    'Measles',
    'BMI',
    'under-five deaths',
    'Polio',
    'Total expenditure',
    'Diphtheria',
    'HIV/AIDS',
    'GDP',
    'Population',
    'thinness 10-19 years',
    'thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
    'infant deaths',
]

# Map each feature name to its 1-based subplot position.
col_dict = {name: position for position, name in enumerate(outlier_columns, start=1)}

Showing outliers using box plot

In [None]:
import matplotlib.pyplot as plt

# One box plot per numeric feature on a 5x4 grid; col_dict supplies the
# subplot position for each feature.
fig = plt.figure(figsize=(20,30))

for variable, position in col_dict.items():
    ax = fig.add_subplot(5, 4, position)
    ax.boxplot(data[variable], whis=1.5)
    ax.set_title(variable)

plt.show()

BMI has no outliers

In [None]:
import numpy as np

# Use the actual number of rows as the denominator instead of a hardcoded count,
# so the percentages stay correct if the cleaning steps above change.
total_rows = len(data)

for variable in col_dict.keys():
    # Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
    q75, q25 = np.percentile(data[variable], [75 ,25])
    iqr = q75 - q25
    min_val = q25 - (iqr*1.5)
    max_val = q75 + (iqr*1.5)
    # Compute the mask once and reuse the count for both printed values.
    outlier_count = int(((data[variable] > max_val) | (data[variable] < min_val)).sum())
    print("Number of outliers and percentage of it in {} : {} and {}".format(variable,
                                                                             outlier_count,
                                                                             outlier_count*100/total_rows))

In [None]:
from scipy.stats.mstats import winsorize


def _winsorize_column(column, lower, upper):
    """Winsorize data[column] using the given lower/upper tail fractions."""
    return winsorize(data[column], (lower, upper))


# Tail fractions are chosen per feature; BMI is not winsorized.
winsorized_Life_Expectancy = _winsorize_column('Life expectancy', 0.01, 0)
winsorized_Adult_Mortality = _winsorize_column('Adult Mortality', 0, 0.03)
winsorized_Infant_Deaths = _winsorize_column('infant deaths', 0, 0.10)
winsorized_Alcohol = _winsorize_column('Alcohol', 0, 0.01)
winsorized_Percentage_Exp = _winsorize_column('percentage expenditure', 0, 0.12)
winsorized_HepatitisB = _winsorize_column('Hepatitis B', 0.11, 0)
winsorized_Measles = _winsorize_column('Measles', 0, 0.19)
winsorized_Under_Five_Deaths = _winsorize_column('under-five deaths', 0, 0.12)
winsorized_Polio = _winsorize_column('Polio', 0.09, 0)
winsorized_Tot_Exp = _winsorize_column('Total expenditure', 0, 0.01)
winsorized_Diphtheria = _winsorize_column('Diphtheria', 0.10, 0)
winsorized_HIV = _winsorize_column('HIV/AIDS', 0, 0.16)
winsorized_GDP = _winsorize_column('GDP', 0, 0.13)
winsorized_Population = _winsorize_column('Population', 0, 0.14)
winsorized_thinness_10_19_years = _winsorize_column('thinness 10-19 years', 0, 0.04)
winsorized_thinness_5_9_years = _winsorize_column('thinness 5-9 years', 0, 0.04)
winsorized_Income_Comp_Of_Resources = _winsorize_column('Income composition of resources', 0.05, 0)
winsorized_Schooling = _winsorize_column('Schooling', 0.02, 0.01)

In [None]:
# Label every winsorized array so the report below says which variable each
# count belongs to (the message previously left the variable name out).
winsorized_columns = {
    'Life expectancy': winsorized_Life_Expectancy,
    'Adult Mortality': winsorized_Adult_Mortality,
    'Alcohol': winsorized_Alcohol,
    'Measles': winsorized_Measles,
    'infant deaths': winsorized_Infant_Deaths,
    'percentage expenditure': winsorized_Percentage_Exp,
    'Hepatitis B': winsorized_HepatitisB,
    'under-five deaths': winsorized_Under_Five_Deaths,
    'Polio': winsorized_Polio,
    'Total expenditure': winsorized_Tot_Exp,
    'Diphtheria': winsorized_Diphtheria,
    'HIV/AIDS': winsorized_HIV,
    'GDP': winsorized_GDP,
    'Population': winsorized_Population,
    'thinness 10-19 years': winsorized_thinness_10_19_years,
    'thinness 5-9 years': winsorized_thinness_5_9_years,
    'Income composition of resources': winsorized_Income_Comp_Of_Resources,
    'Schooling': winsorized_Schooling,
}
# Kept for backward compatibility with any cell that still uses the plain list.
winsorized_list = list(winsorized_columns.values())

for name, variable in winsorized_columns.items():
    # Same 1.5 * IQR fences as in the pre-winsorization check.
    q75, q25 = np.percentile(variable, [75 ,25])
    iqr = q75 - q25

    min_val = q25 - (iqr*1.5)
    max_val = q75 + (iqr*1.5)

    remaining = len(np.where((variable > max_val) | (variable < min_val))[0])
    print("Number of outliers after winsorization in {} : {} ".format(name, remaining))

Adding 18 new columns having no outliers to the dataframe

In [None]:
# New column name -> winsorized values. The dict order is the order in which
# the columns are appended to the frame.
winsorized_by_column = {
    'winsorized_Life_Expectancy': winsorized_Life_Expectancy,
    'winsorized_Adult_Mortality': winsorized_Adult_Mortality,
    'winsorized_Infant_Deaths': winsorized_Infant_Deaths,
    'winsorized_Alcohol': winsorized_Alcohol,
    'winsorized_Percentage_Exp': winsorized_Percentage_Exp,
    'winsorized_HepatitisB': winsorized_HepatitisB,
    'winsorized_Under_Five_Deaths': winsorized_Under_Five_Deaths,
    'winsorized_Polio': winsorized_Polio,
    'winsorized_Tot_Exp': winsorized_Tot_Exp,
    'winsorized_Diphtheria': winsorized_Diphtheria,
    'winsorized_HIV': winsorized_HIV,
    'winsorized_GDP': winsorized_GDP,
    'winsorized_Population': winsorized_Population,
    'winsorized_thinness_10_19_years': winsorized_thinness_10_19_years,
    'winsorized_thinness_5_9_years': winsorized_thinness_5_9_years,
    'winsorized_Income_Comp_Of_Resources': winsorized_Income_Comp_Of_Resources,
    'winsorized_Schooling': winsorized_Schooling,
    'winsorized_Measles': winsorized_Measles,
}

for new_column, values in winsorized_by_column.items():
    data[new_column] = values

In [None]:
data.shape #More 18 columns are added

**Exploratory Data Analysis (EDA)**

In [None]:
data.columns

In [None]:
# Distribution of the target. sns.distplot is deprecated in seaborn >= 0.11, so
# prefer histplot (density-normalised, with a KDE overlay) and only fall back to
# distplot on older seaborn versions that do not provide histplot.
if hasattr(sns, "histplot"):
    sns.histplot(data['Life expectancy'], kde=True, stat="density")
else:
    sns.distplot(data['Life expectancy'], kde=True)

In [None]:
# Original (non-winsorized) numeric columns used for the correlation table and
# the pair plot in the following cells.
disease_col_names = [
    'Life expectancy', 'Alcohol', 'Hepatitis B', 'Measles', 'BMI', 'Polio',
    'Diphtheria', 'HIV/AIDS', 'Adult Mortality', 'infant deaths',
    'under-five deaths', 'thinness 10-19 years', 'thinness 5-9 years',
    'Schooling', 'percentage expenditure', 'Total expenditure', 'GDP',
    'Population', 'Income composition of resources',
]
disease_cols = data[disease_col_names]

In [None]:
disease_cols.corr()

In [None]:
sns.pairplot(disease_cols,diag_kind='kde')

Hence all the features are significant to predict the target variable

In [None]:
# (original column, winsorized column) pairs, drawn side by side so the effect of
# winsorization is visible. Each pair appears once (the GDP pair used to be
# listed twice, producing a duplicated row of plots).
hist_pairs = [
    ('Life expectancy', 'winsorized_Life_Expectancy'),
    ('Adult Mortality', 'winsorized_Adult_Mortality'),
    ('infant deaths', 'winsorized_Infant_Deaths'),
    ('Alcohol', 'winsorized_Alcohol'),
    ('percentage expenditure', 'winsorized_Percentage_Exp'),
    ('Hepatitis B', 'winsorized_HepatitisB'),
    ('under-five deaths', 'winsorized_Under_Five_Deaths'),
    ('Polio', 'winsorized_Polio'),
    ('Total expenditure', 'winsorized_Tot_Exp'),
    ('Diphtheria', 'winsorized_Diphtheria'),
    ('HIV/AIDS', 'winsorized_HIV'),
    ('GDP', 'winsorized_GDP'),
    ('Population', 'winsorized_Population'),
    ('thinness 10-19 years', 'winsorized_thinness_10_19_years'),
    ('thinness 5-9 years', 'winsorized_thinness_5_9_years'),
    ('Income composition of resources', 'winsorized_Income_Comp_Of_Resources'),
    ('Schooling', 'winsorized_Schooling'),
    ('Measles', 'winsorized_Measles'),
]
# Flattened list: original, winsorized, original, winsorized, ...
col = [name for pair in hist_pairs for name in pair]

plt.figure(figsize=(15,75))

# One grid row per pair; derived from the data instead of being hardcoded.
n_rows = len(hist_pairs)
for i, column in enumerate(col):
    plt.subplot(n_rows, 2, i+1)
    plt.hist(data[column])
    plt.title(column)

plt.show()

In [None]:
data.describe(include= 'O')

In [None]:
# Mean winsorized life expectancy per development status; the group labels of
# this series double as the bar positions.
status_means = data.groupby('Status')['winsorized_Life_Expectancy'].mean()

plt.figure(figsize=(6,6))
plt.bar(status_means.index, status_means)
plt.ylabel("Avg Life_Expectancy")
plt.title("Life_Expectancy w.r.t Status")
plt.show()

In [None]:
def plot_country_mean(frame, column, title, ylabel):
    """Draw a bar chart of the per-country mean of `column`, sorted ascending.

    Parameters
    ----------
    frame : DataFrame containing a 'Country' column and `column`.
    column : name of the numeric column to average per country.
    title : figure title.
    ylabel : y-axis label.

    Returns
    -------
    The per-country means that were plotted (a Series indexed by country).
    """
    country_means = frame.groupby('Country')[column].mean().sort_values(ascending=True)
    country_means.plot(kind='bar', figsize=(50,15), fontsize=25)
    plt.title(title, fontsize=40)
    plt.xlabel("Country", fontsize=35)
    plt.ylabel(ylabel, fontsize=35)
    plt.show()
    return country_means


# (column, title, y-axis label) for every per-country chart, in display order.
# This replaces eighteen copy-pasted cells that differed only in these values.
country_charts = [
    ('winsorized_Life_Expectancy', "Life_Expectancy w.r.t Country", "Avg Life_Expectancy"),
    ('winsorized_GDP', "Average GDP w.r.t Country", "Avg GDP"),
    ('winsorized_Adult_Mortality', "Adult_Mortality w.r.t Country", "Avg Adult Mortality"),
    ('winsorized_Alcohol', "Alcohol w.r.t Country", "Avg Alcohol Comsumption"),
    ('winsorized_Diphtheria', "Diphtheria w.r.t Country", "Avg Diphtheria"),
    ('winsorized_HepatitisB', "HepatitisB w.r.t Country", "Avg HepatitisB"),
    ('winsorized_HIV', "HIV w.r.t Country", "Avg HIV cases"),
    ('winsorized_Income_Comp_Of_Resources', "Income Composition of Resources w.r.t Country", "Avg income composition of resourses"),
    ('winsorized_Infant_Deaths', "Infant Deaths w.r.t Country", "Avg Infant Deaths"),
    ('winsorized_Measles', "Measles w.r.t Country", "Avg Measles cases"),
    ('winsorized_Percentage_Exp', "Percentage Expenditure w.r.t Country", "Avg percentage expenditure"),
    ('winsorized_Polio', "Polio w.r.t Country", "Avg Polio Cases"),
    ('winsorized_Population', "Population w.r.t Country", "Avg Population"),
    ('winsorized_Schooling', "Schooling w.r.t Country", "Avg Schooling"),
    ('winsorized_thinness_10_19_years', " Thinness 10to19 years w.r.t Country", "Avg Thinness 10 to 19 Years"),
    ('winsorized_thinness_5_9_years', "Thinness 5 to 9 years w.r.t Country", "Avg thinness 5 to 9 years "),
    ('winsorized_Tot_Exp', " Total Expenditure w.r.t Country", "Avg Total Expenditure"),
    ('winsorized_Under_Five_Deaths', " Under five Deaths w.r.t Country", "Avg under 5 deaths"),
]

for chart_column, chart_title, chart_ylabel in country_charts:
    # le_country keeps holding the most recently plotted series, as before.
    le_country = plot_country_mean(data, chart_column, chart_title, chart_ylabel)

In [None]:
# Mean winsorized life expectancy for each year; the year labels of this series
# are used as the bar positions.
yearly_means = data.groupby('Year')['winsorized_Life_Expectancy'].mean()

plt.figure(figsize=(7,5))
plt.bar(yearly_means.index, yearly_means)
plt.xlabel("Year",fontsize=12)
plt.ylabel("Avg Life_Expectancy",fontsize=12)
plt.title("Life_Expectancy w.r.t Year")
plt.show()

In [None]:
# Correlate numeric columns only: 'Country' and 'Status' are strings and make
# DataFrame.corr() raise on pandas >= 2.0.
cor_matrix = data.select_dtypes(include='number').corr()
# Features ranked by their correlation with the (winsorized) target.
print(cor_matrix['winsorized_Life_Expectancy'].sort_values(ascending=False))

In [None]:
import seaborn as sns
from pandas.plotting import scatter_matrix

# Winsorized target followed by the winsorized predictors to correlate.
attributes = [
    'winsorized_Life_Expectancy',
    'winsorized_Income_Comp_Of_Resources',
    'winsorized_Schooling',
    'winsorized_Diphtheria',
    'winsorized_Polio',
    'winsorized_Adult_Mortality',
    'winsorized_Alcohol',
    'winsorized_Measles',
    'winsorized_Infant_Deaths',
    'winsorized_Percentage_Exp',
    'winsorized_HepatitisB',
    'winsorized_Under_Five_Deaths',
    'winsorized_Tot_Exp',
    'winsorized_HIV',
    'winsorized_GDP',
    'winsorized_Population',
    'winsorized_thinness_10_19_years',
    'winsorized_thinness_5_9_years',
]
cormat = data[attributes].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cormat, square=True, annot=True, linewidths=.5, ax=ax)
plt.show()

In [None]:
# (feature column, subplot title) for each scatter panel, in grid order.
scatter_panels = [
    ("winsorized_Adult_Mortality", "LifeExpectancy vs AdultMortality"),
    ("winsorized_Infant_Deaths", "LifeExpectancy vs Infant_Deaths"),
    ("winsorized_Alcohol", "LifeExpectancy vs Alcohol"),
    ("winsorized_Percentage_Exp", "LifeExpectancy vs Percentage_Exp"),
    ("winsorized_HepatitisB", "LifeExpectancy vs HepatitisB"),
    ("winsorized_Under_Five_Deaths", "LifeExpectancy vs Under_Five_Deaths"),
    ("winsorized_Polio", "LifeExpectancy vs Polio"),
    ("winsorized_Tot_Exp", "LifeExpectancy vs Tot_Exp"),
    ("winsorized_Diphtheria", "LifeExpectancy vs Diphtheria"),
    ("winsorized_HIV", "LifeExpectancy vs HIV"),
    ("winsorized_GDP", "LifeExpectancy vs GDP"),
    ("winsorized_Population", "LifeExpectancy vs Population"),
    ("winsorized_thinness_10_19_years", "LifeExpectancy vs thinness_10to19_years"),
    ("winsorized_thinness_5_9_years", "LifeExpectancy vs thinness_5to9_years"),
    ("winsorized_Income_Comp_Of_Resources", "LifeExpectancy vs Income_Comp_Of_Resources"),
    ("winsorized_Schooling", "LifeExpectancy vs Schooling"),
]

plt.figure(figsize=(18,40))

# Life expectancy is on the x-axis of every panel; panels fill a 6x3 grid.
for panel_number, (feature, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(6, 3, panel_number)
    plt.scatter(data["winsorized_Life_Expectancy"], data[feature])
    plt.title(panel_title)


plt.show()

In [None]:
round(data[['Status','winsorized_Life_Expectancy']].groupby(['Status']).mean(),2)

Since 'Status' is a categorical feature, we use a two-sample t-test (rather than a correlation coefficient) to check its relationship with life expectancy

In [None]:
import scipy.stats as stats

# Split the winsorized target by development status and compare the two groups
# with an independent two-sample t-test.
developed_rows = data['Status'] == 'Developed'
developing_rows = data['Status'] == 'Developing'
developed_life_expectancy = data.loc[developed_rows, 'winsorized_Life_Expectancy']
developing_life_expectancy = data.loc[developing_rows, 'winsorized_Life_Expectancy']
stats.ttest_ind(developed_life_expectancy, developing_life_expectancy)

In [None]:
data.columns

**Now our data has no null values and no outliers**

# Creating a new dataframe with refined data

In [None]:
# Columns carried into the modelling frame: identifiers, BMI (used as-is) and
# the winsorized version of every other numeric feature, with the target last.
refined_columns = [
    'Country', 'Year', 'Status',
    'BMI',
    'winsorized_Adult_Mortality',
    'winsorized_Infant_Deaths',
    'winsorized_Alcohol',
    'winsorized_Percentage_Exp',
    'winsorized_HepatitisB',
    'winsorized_Under_Five_Deaths',
    'winsorized_Polio',
    'winsorized_Tot_Exp',
    'winsorized_Diphtheria',
    'winsorized_HIV',
    'winsorized_GDP',
    'winsorized_Population',
    'winsorized_thinness_10_19_years',
    'winsorized_thinness_5_9_years',
    'winsorized_Income_Comp_Of_Resources',
    'winsorized_Schooling',
    'winsorized_Measles',
    'winsorized_Life_Expectancy',
]
new_data = pd.DataFrame(data=data, columns=refined_columns)

In [None]:
new_data.shape

In [None]:
new_data.head()

In [None]:
# Strip the 'winsorized_' prefix and normalise the feature names used from here on.
refined_renames = {
    'winsorized_Adult_Mortality': 'Adult_Mortality',
    'winsorized_Infant_Deaths': 'Infant_Deaths',
    'winsorized_Alcohol': 'Alcohol',
    'winsorized_Percentage_Exp': 'Percentage_Expenditure',
    'winsorized_HepatitisB': 'Hepatitis_B',
    'winsorized_Under_Five_Deaths': 'Under_Five_Deaths',
    'winsorized_Polio': 'Polio',
    'winsorized_Tot_Exp': 'Total_Expenditure',
    'winsorized_Diphtheria': 'Diphtheria',
    'winsorized_HIV': 'HIV/AIDS',
    'winsorized_GDP': 'GDP',
    'winsorized_Population': 'Population',
    'winsorized_thinness_10_19_years': 'Thinness_10_19_years',
    'winsorized_thinness_5_9_years': 'Thinness_5_9_years',
    'winsorized_Income_Comp_Of_Resources': 'Income_Composition_of_Resources',
    'winsorized_Schooling': 'Schooling',
    'winsorized_Measles': 'Measles',
    'winsorized_Life_Expectancy': 'Life_Expectancy',
}
new_data.rename(columns=refined_renames, inplace=True)

In [None]:
new_data.head()

In [None]:
new_data.columns

**Separating the input features and label**

In [None]:
# Features: every column except the target.
X = new_data.drop(columns='Life_Expectancy')
# Label: the target kept as a single-column DataFrame.
Y = pd.DataFrame(data=new_data, columns=['Life_Expectancy'])

In [None]:
X.head()

In [None]:
Y.head()

**Splitting the data into train set and test set**

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Creating a pipeline

In [None]:
# Columns routed to the numeric branch of the preprocessing pipeline.
numeric_features = [
    'Year',
    'BMI',
    'Adult_Mortality',
    'Infant_Deaths',
    'Alcohol',
    'Percentage_Expenditure',
    'Hepatitis_B',
    'Under_Five_Deaths',
    'Polio',
    'Total_Expenditure',
    'Diphtheria',
    'HIV/AIDS',
    'GDP',
    'Population',
    'Thinness_10_19_years',
    'Thinness_5_9_years',
    'Income_Composition_of_Resources',
    'Schooling',
    'Measles',
]
# Columns routed to the categorical (one-hot) branch.
categorical_features = ['Country', 'Status']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; handle_unknown='ignore' is set so a
# category not seen during fit does not raise at transform time.
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[onehot_step])

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill any missing value with the column median (no scaling step).
median_imputer_step = ('imputer', SimpleImputer(strategy='median'))
numeric_transformer = Pipeline(steps=[median_imputer_step])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own transformer:
# categorical columns first, numeric columns second.
column_branches = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Finding best algorithm

In [None]:
def _make_model_pipeline(step_name, estimator):
    """Chain the shared preprocessor with `estimator` under the given step name."""
    return Pipeline([
        ('preprocessor', preprocessor),
        (step_name, estimator),
    ])


# Candidate regressors. The tree-based models are seeded (same seed as the
# train/test split) so the comparison below is reproducible across runs.
models = OrderedDict([
    ("Linear Regression", _make_model_pipeline('LRegressor', LinearRegression())),
    ("Decision Tree Regressor", _make_model_pipeline('DTRegressor', DecisionTreeRegressor(random_state=42))),
    ("Random Forest Regressor", _make_model_pipeline('RFRegressor', RandomForestRegressor(random_state=42))),
])

In [None]:
scores = {}
# Estimators expect a 1-D target; Y_train is a single-column frame, so flatten it
# once here instead of passing a column vector to every fit() call.
y_train_1d = Y_train.values.ravel()

for (name, model) in models.items():
    model.fit(X_train, y_train_1d)
    # r2_score takes (y_true, y_pred) and is not symmetric, so the ground truth
    # must be the first argument.
    scores[name] = r2_score(Y_test, model.predict(X_test))

# Ordered by model name for a stable display.
scores = OrderedDict(sorted(scores.items()))
scores

**Hence Random forest regression is the most suitable algorithm for this dataset**

# Random forest regression

In [None]:
# Final model: shared preprocessing followed by a random forest. The forest is
# seeded so the fitted (and later deployed) model is reproducible.
RFRegressor = Pipeline([
     ('preprocessor', preprocessor),
     ('RFRegressor', RandomForestRegressor(random_state=42))
])

In [None]:
RFRegressor.fit(X_train,Y_train)

In [None]:
predict= RFRegressor.predict(X_test)

In [None]:
# r2_score expects (y_true, y_pred); the metric is not symmetric, so the held-out
# labels go first and the model predictions second.
r2_score(Y_test, predict)

# Deploying model

In [None]:
!pip install watson-machine-learning-client

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [None]:
import os

# Watson Machine Learning credentials. The API key is read from the WML_API_KEY
# environment variable so that no secret is stored in the notebook; a KeyError is
# raised if it is not set. The instance id can be overridden via WML_INSTANCE_ID.
# NOTE(review): the key that was previously committed here should be revoked/rotated.
wml_credentials={
  "apikey": os.environ["WML_API_KEY"],
  "instance_id": os.environ.get("WML_INSTANCE_ID", "bfcef6f2-d531-42d8-9977-4d790a2a145c"),
  "url": "https://eu-gb.ml.cloud.ibm.com"
}

In [None]:
client = WatsonMachineLearningAPIClient( wml_credentials )

In [None]:
model_props = {client.repository.ModelMetaNames.AUTHOR_NAME: "ShreyanshShukla", 
               client.repository.ModelMetaNames.AUTHOR_EMAIL: "shreyanshshuklashukla@gmail.com", 
               client.repository.ModelMetaNames.NAME: "Life_Expectancy_Prediction_ML_SmartInternz"}

In [None]:
model_artifact =client.repository.store_model(RFRegressor, meta_props=model_props)

In [None]:
published_model_uid = client.repository.get_model_uid(model_artifact)
published_model_uid

In [None]:
deployment = client.deployments.create(published_model_uid, name="Life_Expectancy_Prediction_ML_SmartInternz")
scoring_endpoint = client.deployments.get_scoring_url(deployment)
scoring_endpoint