In [2]:
import pandas as pd
import numpy as np

#I installed tensorflow for CPU via Anaconda prompt.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

#Loading the raw dataset from a CSV file located next to the notebook.
expectancy = pd.read_csv("life expectancy.csv")

#Only the last expression of a cell is rendered automatically, so the preview
#must be displayed explicitly; a bare head() in the middle of a cell is silently discarded.
display(expectancy.head())
#info() prints its report (columns, non-null counts, dtypes) by itself.
expectancy.info()
#Last expression of the cell: the summary statistics are rendered as the cell output.
expectancy.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Adult Mortality                  2938 non-null   float64
 4   infant deaths                    2938 non-null   int64  
 5   Alcohol                          2938 non-null   float64
 6   percentage expenditure           2938 non-null   float64
 7   Hepatitis B                      2938 non-null   float64
 8   Measles                          2938 non-null   int64  
 9    BMI                             2938 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2938 non-null   float64
 12  Total expenditure   

Unnamed: 0,Year,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
count,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0
mean,2007.51872,164.725664,30.303948,4.546875,738.251295,83.022124,2419.59224,38.381178,42.035739,82.617767,5.924098,82.393125,1.742103,6611.523863,10230850.0,4.821886,4.852144,0.630362,12.009837,69.234717
std,4.613841,124.086215,117.926501,3.921946,1987.914858,22.996984,11467.272489,19.935375,160.445548,23.367166,2.40077,23.655562,5.077785,13296.603449,54022420.0,4.397621,4.485854,0.20514,3.265139,9.509115
min,2000.0,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0,36.3
25%,2004.0,74.0,0.0,1.0925,4.685343,82.0,0.0,19.4,0.0,78.0,4.37,78.0,0.1,580.486996,418917.2,1.6,1.6,0.50425,10.3,63.2
50%,2008.0,144.0,3.0,3.755,64.912906,92.0,17.0,43.5,4.0,93.0,5.755,93.0,0.1,1766.947595,1386542.0,3.3,3.3,0.677,12.3,72.1
75%,2012.0,227.0,22.0,7.39,441.534144,96.0,360.25,56.1,28.0,97.0,7.33,97.0,0.8,4779.40519,4584371.0,7.1,7.2,0.772,14.1,75.6
max,2015.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,87.3,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7,89.0


In [3]:
#The country a row comes from is not a feature we can generalize over.
#We want the model to learn a pattern that holds for all countries, not one tied to specific countries.

#errors="ignore" keeps this cell re-runnable: "expectancy" is rebound here, so on a second run
#the "Country" column is already gone and a strict drop would raise a KeyError.
expectancy = expectancy.drop(columns = ["Country"], errors = "ignore")
expectancy.head()

Unnamed: 0,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,...,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,...,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,65.0
1,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,...,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.5
4,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2


### Splitting data into labels and features.

In [5]:
#Position of the final column, which holds the value to predict.
target_position = expectancy.shape[1] - 1

#Labels: the final column only.
labels = expectancy.iloc[:, target_position]
print(labels.head())

#Features: every column that comes before the final one.
features = expectancy.iloc[:, :target_position]
print(features.head())

0    65.0
1    59.9
2    59.9
3    59.5
4    59.2
Name: Life expectancy, dtype: float64
   Year      Status  Adult Mortality  infant deaths  Alcohol  \
0  2015  Developing            263.0             62     0.01   
1  2014  Developing            271.0             64     0.01   
2  2013  Developing            268.0             66     0.01   
3  2012  Developing            272.0             69     0.01   
4  2011  Developing            275.0             71     0.01   

   percentage expenditure  Hepatitis B  Measles    BMI   under-five deaths   \
0               71.279624         65.0      1154   19.1                  83   
1               73.523582         62.0       492   18.6                  86   
2               73.219243         64.0       430   18.1                  89   
3               78.184215         67.0      2787   17.6                  93   
4                7.097109         68.0      3013   17.2                  97   

   Polio  Total expenditure  Diphtheria    HIV/AIDS 

### One-hot-encoding of categorical data.

In [6]:
#Replacing the categorical columns with indicator columns; numeric columns are left as they are.
features = pd.get_dummies(data = features)
print(features)

      Year  Adult Mortality  infant deaths  Alcohol  percentage expenditure  \
0     2015            263.0             62     0.01               71.279624   
1     2014            271.0             64     0.01               73.523582   
2     2013            268.0             66     0.01               73.219243   
3     2012            272.0             69     0.01               78.184215   
4     2011            275.0             71     0.01                7.097109   
...    ...              ...            ...      ...                     ...   
2933  2004            723.0             27     4.36                0.000000   
2934  2003            715.0             26     4.06                0.000000   
2935  2002             73.0             25     4.43                0.000000   
2936  2001            686.0             25     1.72                0.000000   
2937  2000            665.0             24     1.68                0.000000   

      Hepatitis B  Measles    BMI   under-five deat

### Train-test split

In [19]:
#Holding out 30% of the rows for testing; the fixed random_state makes the split repeatable.
feat_train, feat_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size = 0.3,
    random_state = 42,
)

#Quick look at the training features and at both label sets.
print(feat_train.head())
print(labels_train)
print(labels_test)

      Year  Adult Mortality  infant deaths  Alcohol  percentage expenditure  \
1641  2008             64.0              0     7.14             2655.573684   
618   2005            394.0              8     2.03                0.000000   
406   2009            283.0             44     4.55               81.143047   
1094  2011            289.0              4     3.57               40.453674   
1788  2007            217.0             58     0.26                0.530573   

      Hepatitis B  Measles    BMI   under-five deaths   Polio  ...  \
1641         86.0         1   66.6                   0   72.0  ...   
618          92.0       146   21.7                  13   62.0  ...   
406          92.0     54118   16.1                  77   91.0  ...   
1094         86.0         0   23.7                   7   85.0  ...   
1788         85.0      1088   17.6                  78   84.0  ...   

      Diphtheria    HIV/AIDS           GDP  Population   thinness  1-19 years  \
1641         72.0      

### Extracting the float and integer columns and storing their names in a variable, as required by ColumnTransformer.

In [8]:
#Only columns stored as float64 or int64 are treated as numeric here.
numeric_dtypes = ['float64', 'int64']
num_feats = features.select_dtypes(include = numeric_dtypes)

#Keeping just the column names of that numeric subset.
num_cols = num_feats.columns

print(num_cols)

Index(['Year', 'Adult Mortality', 'infant deaths', 'Alcohol',
       'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')


### Standardization

In [22]:
#Defining the ColumnTransformer: the columns listed in num_cols are standardized,
#every other column is passed through unchanged.
numeric_step = ('only_numeric', StandardScaler(), num_cols)
ct = ColumnTransformer(transformers = [numeric_step], remainder = 'passthrough')

#The scaler is fitted on the training data only; the test data is transformed with the same fitted scaler.
feat_train_scaled = ct.fit_transform(feat_train)
feat_test_scaled = ct.transform(feat_test)

print(feat_train_scaled, feat_test_scaled)

[[ 0.09628979 -0.81092833 -0.25772413 ...  0.78649403  1.
   0.        ]
 [-0.55350825  1.87482792 -0.18585883 ... -0.82511401  0.
   1.        ]
 [ 0.31288913  0.97143718  0.13753501 ... -1.9098502   0.
   1.        ]
 ...
 [-0.12030956  0.83308004 -0.10501037 ... -1.13503864  0.
   1.        ]
 [-0.98670694 -0.74581909 -0.23975781 ...  1.03443373  1.
   0.        ]
 [-0.55350825 -1.05508799 -0.19484199 ... -2.06481251  0.
   1.        ]] [[-0.3369089  -0.33074767 -0.18585883 ... -0.20526477  0.
   1.        ]
 [-0.3369089  -0.41213422 -0.25772413 ...  0.56954679  1.
   0.        ]
 [-0.12030956 -0.31447036 -0.25772413 ...  0.47656941  0.
   1.        ]
 ...
 [-0.77010759 -0.77023506 -0.24874097 ...  2.67703424  1.
   0.        ]
 [ 0.74608782 -0.85976026 -0.25772413 ...  1.2203885   1.
   0.        ]
 [-0.98670694  0.87377332 -0.23975781 ... -0.48419693  0.
   1.        ]]


### Building the model

Here, we implement a model with one hidden layer of 128 neurons, ReLU as the activation function, and the Adam optimizer.

In [9]:
#Fixing TensorFlow's global seed so that the initial weights are the same on every run.
tf.random.set_seed(42)

#Implementing the sequential model.
model = Sequential()
#Defining the input with one entry per feature column.
#The trailing comma matters: (n) is just an int, (n,) is the one-element shape tuple.
#The layer is not called "input" to avoid shadowing Python's builtin input().
input_layer = InputLayer(input_shape = (features.shape[1],))
#Adding the input layer.
model.add(input_layer)
#Adding the hidden layer.
model.add(Dense(128, activation = 'relu'))
#Adding the output layer: a single unit without activation, producing one regression value.
model.add(Dense(1))
print(model)

<keras.engine.sequential.Sequential object at 0x0000023C565BEEE0>


In [10]:
#Defining the optimizer: Adam with an explicitly chosen learning rate.
step_size = 0.01
opt = Adam(learning_rate = step_size)
print(opt)

<keras.optimizers.optimizer_v2.adam.Adam object at 0x0000023C565BE220>


In [11]:
#Model compilation: mean squared error is the loss, mean absolute error is reported as a metric.
model.compile(loss = 'mse', metrics = ['mae'], optimizer = opt)

#summary() prints the layer table itself and returns None,
#so wrapping it in print() only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               2816      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,945
Trainable params: 2,945
Non-trainable params: 0
_________________________________________________________________
None


### Fitting and evaluating the model.

In [26]:
#Keras can only build tensors from purely numeric arrays. An object-dtype array leads to
#"ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float)."
#Casting to float32 up front guarantees a numeric dtype, and if a non-numeric value is
#still present the cast fails right here with a message naming the offending value.

#Fitting the model. batch_size = 1 means the weights are updated after every single sample.
model.fit(
    np.asarray(feat_train_scaled, dtype = "float32"),
    np.asarray(labels_train, dtype = "float32"),
    epochs = 40,
    batch_size = 1,
    verbose = 1,
)

#Evaluating the loss (MSE) and the metric (MAE) on the held-out test data.
res_mse, res_mae = model.evaluate(
    np.asarray(feat_test_scaled, dtype = "float32"),
    np.asarray(labels_test, dtype = "float32"),
)

print(res_mse, res_mae)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
6.966291904449463 1.8560799360275269
