In [32]:
import pandas as pd
import numpy as np

#I installed tensorflow for CPU via Anaconda prompt.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the life-expectancy dataset. The path is relative, so the CSV must be
# in the kernel's working directory.
expectancy = pd.read_csv("life expectancy.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# has to be printed explicitly -- as a bare expression it was silently discarded.
print(expectancy.head())
# info() writes its report (columns, non-null counts, dtypes) to stdout itself.
expectancy.info()
# Summary statistics of the numeric columns; shown as the cell's output.
expectancy.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Adult Mortality                  2938 non-null   float64
 4   infant deaths                    2938 non-null   int64  
 5   Alcohol                          2938 non-null   float64
 6   percentage expenditure           2938 non-null   float64
 7   Hepatitis B                      2938 non-null   float64
 8   Measles                          2938 non-null   int64  
 9    BMI                             2938 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2938 non-null   float64
 12  Total expenditure   

Unnamed: 0,Year,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
count,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0
mean,2007.51872,164.725664,30.303948,4.546875,738.251295,83.022124,2419.59224,38.381178,42.035739,82.617767,5.924098,82.393125,1.742103,6611.523863,10230850.0,4.821886,4.852144,0.630362,12.009837,69.234717
std,4.613841,124.086215,117.926501,3.921946,1987.914858,22.996984,11467.272489,19.935375,160.445548,23.367166,2.40077,23.655562,5.077785,13296.603449,54022420.0,4.397621,4.485854,0.20514,3.265139,9.509115
min,2000.0,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0,36.3
25%,2004.0,74.0,0.0,1.0925,4.685343,82.0,0.0,19.4,0.0,78.0,4.37,78.0,0.1,580.486996,418917.2,1.6,1.6,0.50425,10.3,63.2
50%,2008.0,144.0,3.0,3.755,64.912906,92.0,17.0,43.5,4.0,93.0,5.755,93.0,0.1,1766.947595,1386542.0,3.3,3.3,0.677,12.3,72.1
75%,2012.0,227.0,22.0,7.39,441.534144,96.0,360.25,56.1,28.0,97.0,7.33,97.0,0.8,4779.40519,4584371.0,7.1,7.2,0.772,14.1,75.6
max,2015.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,87.3,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7,89.0


In [14]:
# The country name identifies a row rather than describing it, so it is not a
# column the model can generalize over. We want a pattern that holds across all
# countries, not one tied to specific ones.
# errors="ignore" keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because "Country" has already been removed.
expectancy = expectancy.drop(columns=["Country"], errors="ignore")
expectancy.head()

Unnamed: 0,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,...,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,...,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,65.0
1,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,...,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.5
4,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2


### Splitting data into labels and features.

In [3]:
# Target: the last column of the frame.
labels = expectancy.iloc[:, -1]
# Printed explicitly: a bare expression in the middle of a cell produces no output.
print(labels.head())

# Features: every column except the last one.
features = expectancy.iloc[:, :-1]
print(features.head())

       Country  Year      Status  Adult Mortality  infant deaths  Alcohol  \
0  Afghanistan  2015  Developing            263.0             62     0.01   
1  Afghanistan  2014  Developing            271.0             64     0.01   
2  Afghanistan  2013  Developing            268.0             66     0.01   
3  Afghanistan  2012  Developing            272.0             69     0.01   
4  Afghanistan  2011  Developing            275.0             71     0.01   

   percentage expenditure  Hepatitis B  Measles    BMI   ...  Polio  \
0               71.279624         65.0      1154   19.1  ...    6.0   
1               73.523582         62.0       492   18.6  ...   58.0   
2               73.219243         64.0       430   18.1  ...   62.0   
3               78.184215         67.0      2787   17.6  ...   67.0   
4                7.097109         68.0      3013   17.2  ...   68.0   

   Total expenditure  Diphtheria    HIV/AIDS         GDP  Population  \
0               8.16         65.0     

### One-hot-encoding of categorical data.

In [19]:
# Replace each categorical (non-numeric) column with one indicator column per
# category; numeric columns are carried over unchanged.
features = pd.get_dummies(data=features)

### Train-test split

In [49]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
# The fourth result is named `labels_test` (it was misspelled `lables_test`),
# which is the name the evaluation cell at the end of the notebook reads.
feat_train, feat_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

print(feat_train.head())
print(labels_train)
print(labels_test)

            Country  Year      Status  Adult Mortality  infant deaths  \
1641          Malta  2008   Developed             64.0              0   
618           Congo  2005  Developing            394.0              8   
406    Burkina Faso  2009  Developing            283.0             44   
1094  Guinea-Bissau  2011  Developing            289.0              4   
1788        Myanmar  2007  Developing            217.0             58   

      Alcohol  percentage expenditure  Hepatitis B  Measles    BMI   ...  \
1641     7.14             2655.573684         86.0         1   66.6  ...   
618      2.03                0.000000         92.0       146   21.7  ...   
406      4.55               81.143047         92.0     54118   16.1  ...   
1094     3.57               40.453674         86.0         0   23.7  ...   
1788     0.26                0.530573         85.0      1088   17.6  ...   

      Polio  Total expenditure  Diphtheria    HIV/AIDS           GDP  \
1641   72.0               8.15  

### Collecting the names of the float and integer columns. ColumnTransformer needs this list to know which columns to scale.

In [52]:
# Keep only the float64/int64 columns of the encoded feature frame ...
num_feats = features.select_dtypes(
    include=["float64", "int64"],
)
# ... and remember their names; these are the columns that get scaled.
num_cols = num_feats.columns

print(num_cols)

Index(['Adult Mortality', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       ' BMI ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS',
       'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')


### Standardization

In [18]:
# Column-wise preprocessing: standardize the numeric columns listed in
# `num_cols`; every remaining column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[("only_numeric", StandardScaler(), num_cols)],
    remainder="passthrough",
)

# Fit the transformer on the training split and transform that split in one step.
feat_train_scaled = ct.fit_transform(feat_train)

print(feat_train_scaled)

[[0.09628978649337691 -0.8109283326382842 -0.2577241301286717 ...
  0.7864940299331927 'Malta' 'Developed']
 [-0.553508247523271 1.8748279235412306 -0.18585883171283754 ...
  -0.825114014690047 'Congo' 'Developing']
 [0.31288913116559286 0.9714371828263029 0.13753501115841602 ...
  -1.9098501985710739 'Burkina Faso' 'Developing']
 ...
 [-0.12030955817883905 0.8330800423564491 -0.10501037099502415 ...
  -1.1350386386560547 'Haiti' 'Developing']
 [-0.9867069368677028 -0.7458190900642354 -0.23975780552471312 ...
  1.034433729105999 'Italy' 'Developed']
 [-0.553508247523271 -1.0550879922909675 -0.1948419940148168 ...
  -2.0648125105540776 'Eritrea' 'Developing']]


### Building the model

Here we build a model with one hidden layer of 128 neurons, ReLU as the activation function, and the Adam optimizer.

In [54]:
# Sequential model: input -> one hidden layer -> single-value regression output.
model = Sequential()
# input_shape must be a tuple. `(n)` is just the integer n, so the trailing
# comma is needed to express "one axis holding n features".
# The layer is added directly instead of being stored in a variable called
# `input`, which shadowed Python's builtin input() for the rest of the session.
model.add(InputLayer(input_shape=(features.shape[1],)))
# Hidden layer: 128 units with ReLU activation.
model.add(Dense(128, activation='relu'))
# Output layer: a single unit with no activation, i.e. a linear output.
model.add(Dense(1))
print(model)

<keras.engine.sequential.Sequential object at 0x00000222EFC800A0>


In [55]:
# Adam optimizer with an explicit learning rate of 0.01.
opt = Adam(
    learning_rate=0.01,
)
print(opt)

<keras.optimizers.optimizer_v2.adam.Adam object at 0x00000222EFC80400>


In [57]:
# Compile for regression: mean squared error is the training loss, and mean
# absolute error is tracked as an additional metric.
model.compile(loss='mse', metrics=['mae'], optimizer=opt)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 128)               2816      
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,945
Trainable params: 2,945
Non-trainable params: 0
_________________________________________________________________
None


### Fitting and evaluating the model.

In [63]:
# Keras needs purely numeric tensors. "ValueError: Failed to convert a NumPy
# array to a Tensor (Unsupported object type float)" is raised when the array
# coming out of the ColumnTransformer has dtype=object, which happens when the
# passed-through columns are not uniformly numeric (for example string columns
# that were never dropped or one-hot encoded). Casting to float32 up front
# settles mixed numeric dtypes and fails with a readable message that names the
# offending value if a string column is still present.
feat_train_scaled = np.asarray(feat_train_scaled, dtype="float32")
# The test split has to go through the transformer fitted on the training
# split: transform only, never re-fit on test data.
feat_test_scaled = np.asarray(ct.transform(feat_test), dtype="float32")

# Fitting the model. batch_size=1 means one weight update per row, so every
# epoch is slow; a larger batch size trains much faster.
model.fit(feat_train_scaled, labels_train, epochs=40, batch_size=1, verbose=1)

# Evaluate loss (MSE) and metric (MAE) on the held-out split: test features
# paired with test labels. Previously the training features were paired with
# the test labels, which mismatches in length and is not a held-out evaluation.
res_mse, res_mae = model.evaluate(feat_test_scaled, labels_test)

print(res_mse, res_mae)