In [1]:
from os.path import join
from os import getcwd
from tensorflow import keras
from IPython.display import clear_output
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [3]:
from keras import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Embedding, Flatten, Input, Concatenate
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard
from keras.optimizers.schedules import ExponentialDecay
from keras.initializers import GlorotNormal
from keras.regularizers import l1, l2
from keras.losses import MAE, MSE

In [4]:
from functions import *
checkpoints_path = getcwd()+'\\checks_MPL_L'

### Recoleccion de los datos

In [5]:
df = pd.read_csv('../../databases/insurance.csv')
x_df = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y_df = df['charges']
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
x_df_edit = x_df.copy()
replace = {
    'sex': {'female':1, 'male':0},
    'smoker':{'yes':1, 'no':0}
}
x_df_edit = x_df_edit.replace(replace)
x_df_edit.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,1,southwest
1,18,0,33.77,1,0,southeast
2,28,0,33.0,3,0,southeast
3,33,0,22.705,0,0,northwest
4,32,0,28.88,0,0,northwest


In [8]:
x_df_encoded = pd.get_dummies(x_df_edit, columns=['region'])
x_df_encoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.9,0,1,0,0,0,1
1,18,0,33.77,1,0,0,0,1,0
2,28,0,33.0,3,0,0,0,1,0
3,33,0,22.705,0,0,0,1,0,0
4,32,0,28.88,0,0,0,1,0,0


In [9]:
# Split dataset into 15% test, 85% train_validation 
x, x_test, y, y_test = train_test_split(x_df_encoded, y_df, test_size=0.15)

In [10]:
# dividiendo en validación y train
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.15)

### Definicion de la red simple con una sola capa oculta

In [11]:
name = 'first_attempt'
checkfile = join(checkpoints_path, name)
# Define callbacks
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min', 
                                           save_best_only=True)
callbacks = [checkpoint_callback]
# Defining model
mlp_model = Sequential(name=name)

mlp_model.add(Dense(50, activation='relu', input_shape=(x_train.shape[1],)))
mlp_model.add(Dense(1, activation='linear'))

mlp_model.compile(optimizer=Adam(learning_rate=0.02), loss='mae', metrics=['mae', 'mse'])
mlp_model.summary()

Model: "first_attempt"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                500       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 551
Trainable params: 551
Non-trainable params: 0
_________________________________________________________________


In [12]:
history_mlp_0 = mlp_model.fit(x_train, y_train, validation_data=(x_valid, y_valid),
                              batch_size=32, epochs=200,
                              verbose=1, callbacks=callbacks) 
# Cargo el mejor modelo entrenado
clear_output(wait=True)
mlp_model.load_weights(checkfile)
verify_model(mlp_model, x_train, y_train, x_valid, y_valid)

Unnamed: 0,Set,MAE,MSE
0,Train,3180.757122,28455900.0
1,Validacion,2339.301205,41752710.0


Vamos a probar utilizar mse como loss

In [13]:
name = 'mse_loss'
checkfile = join(checkpoints_path, name)
# Define callbacks
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min', 
                                           save_best_only=True)
callbacks = [checkpoint_callback]
# Defining model
mlp_model_2 = Sequential(name=name)

mlp_model_2.add(Dense(50, activation='relu', input_shape=(x_train.shape[1],)))
mlp_model_2.add(Dense(1, activation='linear'))

mlp_model_2.compile(optimizer=Adam(learning_rate=0.02), loss='mse', metrics=['mae', 'mse'])
print(mlp_model.summary())

history_mlp_1 = mlp_model_2.fit(x_train, y_train, validation_data=(x_valid, y_valid),
                              batch_size=32, epochs=200,
                              verbose=1, callbacks=callbacks) 
# Cargo el mejor modelo entrenado
clear_output(wait=True)
mlp_model_2.load_weights(checkfile)
verify_model(mlp_model_2, x_train, y_train, x_valid, y_valid)


Unnamed: 0,Set,MAE,MSE
0,Train,4155.192284,26871020.0
1,Validacion,3560.352327,36091980.0


No hubo mejoras

### Matriz de embedding
En lugar de separa la columna de region en 4 columnas separadas como para los modelos anteriores, se procede a codificar cada una de las regiones y utilizar una capa de embedding de dimension 2

In [14]:
def normalize(dataframe, cols=[], norm_info=None):
    replace = {}
    data = dataframe.copy()
    if norm_info:
        replace=norm_info
    else:
        if len(cols)==0:
            cols = dataframe.keys()
        for col in cols:
            replace[col] = [np.mean(dataframe[col]), np.std(dataframe[col])]
    for key in replace:
        data[key] = (data[key]-replace[key][0])/replace[key][1]
    return data, replace

In [15]:
replace = {
    'sex': {'female':1, 'male':0},
    'smoker':{'yes':1, 'no':0},
    'region':{'northeast':0,'northwest':1, 'southeast':2, 'southwest':3}
}
x_df_emb = x_df.copy()
x_df_emb = x_df_emb.replace(replace)

# Split dataset into 15% test, 85% train_validation 
x_e, x_test_e, y_e, y_test_e = train_test_split(x_df_emb, y_df, test_size=0.15)

# dividiendo en validación y train
x_train_e, x_valid_e, y_train_e, y_valid_e = train_test_split(x_e, y_e, test_size=0.15)


In [16]:
x_cat_train = x_train_e[['region']]
x_num_train = x_train_e[['age',	'sex',	'bmi',	'children',	'smoker']]
x_cat_valid = x_valid_e[['region']]
x_num_valid = x_valid_e[['age',	'sex',	'bmi',	'children',	'smoker']]

In [17]:
# Definiendo el modelo
name = 'embeded_test'
checkfile = join(checkpoints_path, name)
# Define callbacks
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min', 
                                           save_best_only=True)
callbacks = [checkpoint_callback]

# Input Layers
inp_cat = Input(shape=(1,))
inp_num = Input(shape=(x_num_train.shape[1],))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(50, activation='relu')(conc)
# Output Layer
out = Dense(1, activation='linear')(dense)
# Create model
emb_model = Model(inputs=[inp_cat, inp_num], outputs=out)

emb_model.compile(optimizer=Adam(learning_rate=0.02), loss='mae', metrics=['mae', 'mse'])
emb_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 2)         8           input_1[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 2)            0           embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 5)]          0                                            
_______________________________________________________________________________________

In [18]:
# Train model
history_mlp_1 = emb_model.fit([x_cat_train, x_num_train], y_train, validation_data=([x_cat_valid, x_num_valid], y_valid),
                                batch_size=32, epochs=200,
                                verbose=1, callbacks=callbacks)
                                 
# Cargo el mejor modelo entrenado
clear_output(wait=True)
emb_model.load_weights(checkfile)
verify_model(emb_model, [x_cat_train, x_num_train], y_train, [x_cat_valid, x_num_valid], y_valid)

Unnamed: 0,Set,MAE,MSE
0,Train,8233.295654,119468000.0
1,Validacion,7123.021188,159903600.0


### En las 3 pruebas se pudo observar un comportamiento peculiar en el cual los resultados en el set de validación resultan mejores que en el set de train, lo cual lleva a que las metricas obtenidas del modelo esten sujetas a una elevada varianza, para mejorar esto se utilizará k-folding

Ahora vamos a realizar una prueba a fuerza bruta probando utilizar 1 o 2 capaz ocultas, variando la cantidad de neuronas por capa

In [19]:
cat = ['region']
num = ['age',	'sex',	'bmi',	'children',	'smoker']

def create_model(neuronas,activation):
    # Definiendo el modelo
    name = 'n{}_act{}'.format(neuronas, activation)
    print('Testing ->', name)
    # Input Layers
    inp_cat = Input(shape=(1,))
    inp_num = Input(shape=(len(num),))
    # Embedding
    emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
    flat = Flatten()(emb)
    # Concatenate layers
    conc = Concatenate()([flat, inp_num])
    # Hidden layers
    dense = Dense(neuronas, activation=activation)(conc)
    # Output Layer
    out = Dense(1, activation='linear')(dense)
    # Create model
    model = Model(inputs=[inp_cat, inp_num], outputs=out)
    model.compile(optimizer=Adam(1), loss='mae', metrics=['mae', 'mse'])
    return model, name


In [20]:
results = {}

for neuronas in [5, 10, 30, 50]:
    for act in ['linear', 'relu', 'elu']:
        my_model, name = create_model(neuronas, act)
        
        mae, metrics = train_model_emb(x_e, y, my_model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
        results[name] = (mae, metrics)

clear_output(wait=True)
sorted_results = list(sorted(results.items(), key=lambda x: x[1][0]))
for i in sorted_results:
    print(i[0], '->', i[1][0])

n30_actelu -> 8174.971512313183
n50_actlinear -> 8178.583459945633
n50_actrelu -> 8218.495897002698
n30_actrelu -> 8219.420831426558
n5_actlinear -> 8222.85945406444
n10_actlinear -> 8289.043790206051
n10_actelu -> 8315.597631982451
n5_actelu -> 8322.71398188308
n50_actelu -> 8328.149446639165
n10_actrelu -> 8334.968537976936
n30_actlinear -> 8347.518754408626
n5_actrelu -> 8849.490342952695


Los mejores resultados se obtuvieron con 30 neuronas y utilizando adam por lo que se continuará avanzando para mejorar dicho modelo

### Optimización del Learning Rate

In [21]:
results_3 = {}
lr_s = [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
for i in lr_s:
     # Definiendo el modelo
    name = 'best_lr{}'.format(i)
    print('Testing ->', name)
    checkpoints_path
    # Input Layers
    inp_cat = Input(shape=(1,))
    inp_num = Input(shape=(len(num),))
    # Embedding
    emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
    flat = Flatten()(emb)
    # Concatenate layers
    conc = Concatenate()([flat, inp_num])
    # Hidden layers
    dense = Dense(30, activation='relu')(conc)
    # Output Layer
    out = Dense(1, activation='linear')(dense)
    # Create model
    model = Model(inputs=[inp_cat, inp_num], outputs=out)
    model.compile(optimizer=Adam(lr=1.2), loss='mae', metrics=['mae', 'mse'])

    mae, metrics = train_model_emb(x_e, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
    results_3[name] = (mae, metrics)
clear_output(wait=True)

Testing -> best_lr0.01
Testing -> best_lr0.05
Testing -> best_lr0.1
Testing -> best_lr0.5
Testing -> best_lr1.0
Testing -> best_lr1.5
Testing -> best_lr2.0


In [22]:
results_3

{'best_lr0.01': (8226.928020271871,
  [          Set          MAE           MSE
   0       Train  8205.464809  1.634767e+08
   1  Validacion  8748.498262  1.370446e+08,
             Set          MAE           MSE
   0       Train  8150.146535  1.422041e+08
   1  Validacion  7905.400292  1.521689e+08,
             Set          MAE           MSE
   0       Train  8565.360396  1.609719e+08
   1  Validacion  7781.008585  1.827913e+08,
             Set          MAE           MSE
   0       Train  7952.996296  1.918170e+08
   1  Validacion  8982.317063  1.580462e+08,
             Set         MAE           MSE
   0       Train  8243.92168  1.235369e+08
   1  Validacion  7717.41590  1.513879e+08]),
 'best_lr0.05': (8217.729447907972,
  [          Set          MAE           MSE
   0       Train  7951.571727  1.726256e+08
   1  Validacion  8610.562247  1.418181e+08,
             Set          MAE           MSE
   0       Train  8183.895612  1.524937e+08
   1  Validacion  7954.970559  1.642296e+08

### Probando varias capas

In [23]:
 # Definiendo el modelo
name = '3_hidden'.format(i)
print('Testing ->', name)
checkpoints_path
# Input Layers
inp_cat = Input(shape=(1,))
inp_num = Input(shape=(len(num),))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(30, activation='relu')(conc)
dense1 = Dense(30, activation='elu')(dense)
dense2 = Dense(30, activation='elu')(dense1)
# Output Layer
out = Dense(1, activation='linear')(dense2)
# Create model
model = Model(inputs=[inp_cat, inp_num], outputs=out)
model.compile(optimizer=Adam(lr=1.2), loss='mae', metrics=['mae', 'mse'])

mae, metrics = train_model_emb(x_e, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
metrics

Testing -> 3_hidden


[          Set          MAE           MSE
 0       Train  8128.363209  2.044878e+08
 1  Validacion  9008.769999  1.663902e+08,
           Set          MAE           MSE
 0       Train  8146.112628  1.421291e+08
 1  Validacion  7851.842020  1.530762e+08,
           Set          MAE           MSE
 0       Train  8335.581496  1.501571e+08
 1  Validacion  7537.070966  1.699922e+08,
           Set          MAE           MSE
 0       Train  7899.913483  1.771878e+08
 1  Validacion  8783.417714  1.478390e+08,
           Set          MAE           MSE
 0       Train  8207.930047  1.271221e+08
 1  Validacion  7547.938935  1.606405e+08]

In [24]:
 # Definiendo el modelo
name = '3_hidden_drop'.format(i)
print('Testing ->', name)
checkpoints_path
# Input Layers
inp_cat = Input(shape=(1,))
inp_num = Input(shape=(len(num),))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(30, activation='relu')(conc)
drop1 = Dropout(0.1)(dense)
dense1 = Dense(30, activation='elu')(drop1)
drop2 = Dropout(0.1)(dense1)
dense2 = Dense(30, activation='elu')(drop2)
drop3 = Dropout(0.1)(dense2)
# Output Layer
out = Dense(1, activation='linear')(drop3)
# Create model
model = Model(inputs=[inp_cat, inp_num], outputs=out)
model.compile(optimizer=Adam(lr=1.2), loss='mae', metrics=['mae', 'mse'])

mae, metrics = train_model_emb(x_e, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
metrics

Testing -> 3_hidden_drop


[          Set          MAE           MSE
 0       Train  7945.080088  1.872590e+08
 1  Validacion  8718.199529  1.525853e+08,
           Set          MAE           MSE
 0       Train  8138.699550  1.465794e+08
 1  Validacion  7845.054855  1.581299e+08,
           Set          MAE           MSE
 0       Train  8207.743204  1.386461e+08
 1  Validacion  7559.590138  1.559708e+08,
           Set          MAE           MSE
 0       Train  7899.886948  1.794934e+08
 1  Validacion  8797.521820  1.496151e+08,
           Set          MAE           MSE
 0       Train  8262.238672  1.338848e+08
 1  Validacion  7594.070900  1.694883e+08]

In [25]:
 # Definiendo el modelo
name = '2_hidden'.format(i)
print('Testing ->', name)
checkpoints_path
# Input Layers
inp_cat = Input(shape=(1,))
inp_num = Input(shape=(len(num),))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(5, activation='relu')(conc)
dense1 = Dense(5, activation='elu')(dense)
# Output Layer
out = Dense(1, activation='linear')(dense1)
# Create model
model = Model(inputs=[inp_cat, inp_num], outputs=out)
model.compile(optimizer=Adam(lr=0.2), loss='mae', metrics=['mae', 'mse'])

mae, metrics = train_model_emb(x_e, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
metrics

Testing -> 2_hidden


[          Set          MAE           MSE
 0       Train  7947.608949  1.746257e+08
 1  Validacion  8628.672332  1.435127e+08,
           Set          MAE           MSE
 0       Train  8144.306921  1.511806e+08
 1  Validacion  7925.881775  1.608340e+08,
           Set          MAE           MSE
 0       Train  8207.742616  1.386453e+08
 1  Validacion  7559.607230  1.559699e+08,
           Set          MAE           MSE
 0       Train  7876.343636  1.801068e+08
 1  Validacion  8826.796937  1.487945e+08,
           Set          MAE           MSE
 0       Train  8302.199214  1.388208e+08
 1  Validacion  7692.205682  1.728424e+08]

In [26]:
 # Definiendo el modelo
name = '2_hidden_drop'.format(i)
print('Testing ->', name)
checkpoints_path
# Input Layers
inp_cat = Input(shape=(1,))
inp_num = Input(shape=(len(num),))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(5, activation='relu')(conc)
drop1 = Dropout(0.6)(dense)
dense1 = Dense(5, activation='elu')(drop1)
drop2 = Dropout(0.6)(dense1)
# Output Layer
out = Dense(1, activation='linear')(drop2)
# Create model
model = Model(inputs=[inp_cat, inp_num], outputs=out)
model.compile(optimizer=Adam(lr=10), loss='mae', metrics=['mae', 'mse'])

mae, metrics = train_model_emb(x_e, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
metrics

Testing -> 2_hidden_drop


[          Set          MAE           MSE
 0       Train  7929.249738  1.818523e+08
 1  Validacion  8670.742524  1.484613e+08,
           Set          MAE           MSE
 0       Train  8134.774865  1.442414e+08
 1  Validacion  7845.169127  1.554911e+08,
           Set          MAE           MSE
 0       Train  8208.376743  1.378534e+08
 1  Validacion  7578.586281  1.549513e+08,
           Set          MAE           MSE
 0       Train  7899.762678  1.794043e+08
 1  Validacion  8796.898687  1.495459e+08,
           Set          MAE           MSE
 0       Train  8208.013180  1.266224e+08
 1  Validacion  7549.864138  1.599565e+08]

### Se probaron varias combinaciones de topologia pero no se logro mejorar lo realizado previamente con features polinomiales, por lo tanto se busca mejorar el primer modelo desarrollado que no utilizaba la capa de Embedding

### Buscando mejorar el primer modelo 

In [27]:
name = 'first_attempt_v2'
checkfile = join(checkpoints_path, name)
# Define callbacks
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min', 
                                           save_best_only=True)
callbacks = [checkpoint_callback]
# Defining model
mlp_model = Sequential(name=name)

mlp_model.add(Dense(50, activation='relu', input_shape=(x_train.shape[1],),  kernel_regularizer='l2'))
mlp_model.add(Dense(1, activation='linear'))

mlp_model.compile(optimizer=Adam(learning_rate=0.02), loss='mae', metrics=['mae', 'mse'])
print(mlp_model.summary())
history_mlp_0 = mlp_model.fit(x_train, y_train, validation_data=(x_valid, y_valid),
                              batch_size=32, epochs=200,
                              verbose=1, callbacks=callbacks) 
# Cargo el mejor modelo entrenado
clear_output(wait=True)
mlp_model.load_weights(checkfile)
verify_model(mlp_model, x_train, y_train, x_valid, y_valid)

Unnamed: 0,Set,MAE,MSE
0,Train,3206.245597,32077240.0
1,Validacion,2449.437323,48490560.0


In [28]:
verify_model(mlp_model, x_train, y_train, x_test, y_test, valid_label='Test')

Unnamed: 0,Set,MAE,MSE
0,Train,3206.245597,62467500.0
1,Test,4176.948949,48490560.0
