In [1]:
from os.path import join
from os import getcwd
from tensorflow import keras
from IPython.display import clear_output
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [3]:
from keras import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Embedding, Flatten, Input, Concatenate
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard
from keras.optimizers.schedules import ExponentialDecay
from keras.initializers import GlorotNormal
from keras.regularizers import l1, l2
from keras.losses import MAE, MSE

In [4]:
from functions import *
# Folder where the ModelCheckpoint files of every experiment are stored.
# os.path.join picks the right separator for the host OS; a hard-coded '\\'
# only yields a real sub-directory on Windows.
checkpoints_path = join(getcwd(), 'checkpoints')

In [5]:
# Load the insurance dataset and separate the predictors from the regression target.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
target_column = 'charges'

df = pd.read_csv('../../databases/insurance.csv')
x_df = df[feature_columns]
y_df = df[target_column]

# Preview the first rows of the raw frame
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Encode the two binary categorical features as 0/1 integers.
# DataFrame.replace returns a new frame, so x_df itself is left untouched.
replace = dict(
    sex=dict(female=1, male=0),
    smoker=dict(yes=1, no=0),
)
x_df_edit = x_df.replace(replace)
x_df_edit.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,1,southwest
1,18,0,33.77,1,0,southeast
2,28,0,33.0,3,0,southeast
3,33,0,22.705,0,0,northwest
4,32,0,28.88,0,0,northwest


In [8]:
# One-hot encode 'region' into four indicator columns.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to bool,
# and a frame mixing bool with numeric columns converts to an object array that
# TensorFlow typically refuses to turn into a tensor.
x_df_encoded = pd.get_dummies(x_df_edit, columns=['region'], dtype=int)
x_df_encoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.9,0,1,0,0,0,1
1,18,0,33.77,1,0,0,0,1,0
2,28,0,33.0,3,0,0,0,1,0
3,33,0,22.705,0,0,0,1,0,0
4,32,0,28.88,0,0,0,1,0,0


In [9]:
# Hold out 15% of the samples as the test set; the remaining 85% (x, y) is kept
# for training/validation. A fixed random_state makes the split repeatable, so
# the metrics reported further down can be reproduced after a kernel restart.
x, x_test, y, y_test = train_test_split(x_df_encoded, y_df, test_size=0.15, random_state=42)

In [10]:
# Split the non-test data into train (85%) and validation (15%).
# A fixed random_state keeps the partition, and therefore the reported metrics,
# repeatable across runs.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.15, random_state=42)

### Definición de la red

In [11]:
# Baseline MLP: one hidden ReLU layer (50 units) and a linear output, trained on MAE.
name = 'first_attempt'
checkfile = join(checkpoints_path, name)

# Build the network from a list of layers rather than successive .add() calls
n_features = x_train.shape[1]
mlp_model = Sequential(
    [
        Dense(50, activation='relu', input_shape=(n_features,)),
        Dense(1, activation='linear'),
    ],
    name=name,
)
mlp_model.compile(
    loss='mae',
    optimizer=Adam(learning_rate=0.02),
    metrics=['mae', 'mse'],
)

# Checkpoint: keep only the weights of the epoch with the lowest training loss.
# NOTE(review): this monitors 'loss' (training) rather than 'val_loss' — confirm
# that selecting the best epoch on the training set is intended.
checkpoint_callback = ModelCheckpoint(
    filepath=checkfile,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
callbacks = [checkpoint_callback]

mlp_model.summary()

Model: "first_attempt"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                500       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 551
Trainable params: 551
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train the baseline for 200 epochs, validating on the hold-out validation split.
history_mlp_0 = mlp_model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(x_valid, y_valid),
    callbacks=callbacks,
    verbose=1,
)

# Clear the per-epoch training log from the cell output
clear_output(wait=True)

# Restore the best weights written by the checkpoint callback, then evaluate
# with verify_model (helper from the local `functions` module).
mlp_model.load_weights(checkfile)
verify_model(mlp_model, x_train, y_train, x_valid, y_valid)

Unnamed: 0,Set,MAE,MSE
0,Train,3245.285393,48947870.0
1,Validacion,3681.269342,41365040.0


Vamos a probar utilizar MSE como función de pérdida (loss)

In [13]:
# Same architecture as the baseline model, but optimising MSE instead of MAE.
name = 'mse_loss'
checkfile = join(checkpoints_path, name)
# Define callbacks: keep only the weights of the epoch with the lowest training loss
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min',
                                      save_best_only=True)
callbacks = [checkpoint_callback]
# Defining model
mlp_model_2 = Sequential(name=name)

mlp_model_2.add(Dense(50, activation='relu', input_shape=(x_train.shape[1],)))
mlp_model_2.add(Dense(1, activation='linear'))

mlp_model_2.compile(optimizer=Adam(learning_rate=0.02), loss='mse', metrics=['mae', 'mse'])
# Summarise this cell's model (mlp_model_2, not the MAE baseline mlp_model).
# summary() prints on its own and returns None, so it needs no print() wrapper.
mlp_model_2.summary()

history_mlp_1 = mlp_model_2.fit(x_train, y_train, validation_data=(x_valid, y_valid),
                                batch_size=32, epochs=200,
                                verbose=1, callbacks=callbacks)
# Clear the training log, reload the best checkpointed weights and evaluate
clear_output(wait=True)
mlp_model_2.load_weights(checkfile)
verify_model(mlp_model_2, x_train, y_train, x_valid, y_valid)


Unnamed: 0,Set,MAE,MSE
0,Train,4194.965971,44575850.0
1,Validacion,4544.500522,35999120.0


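No hubo mejoras en el MAE (empeoró tanto en train como en validación), aunque el MSE sí disminuyó, como es esperable al optimizarlo directamente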

### Matriz de embedding
Otra técnica, en lugar de dividir una variable categórica en n columnas distintas, es utilizar una capa de embedding

In [14]:
# Separate the one-hot region columns from the remaining features, for both the
# training and the validation splits, so they can feed different model inputs.
cat_columns = ['region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']
num_columns = ['age', 'sex', 'bmi', 'children', 'smoker']

x_cat_train, x_num_train = x_train[cat_columns], x_train[num_columns]
x_cat_valid, x_num_valid = x_valid[cat_columns], x_valid[num_columns]

In [15]:
# Definiendo el modelo
name = 'embeded_test'
checkfile = join(checkpoints_path, name)
# Define callbacks
checkpoint_callback = ModelCheckpoint(filepath=checkfile, save_weights_only=True, monitor='loss', mode='min', 
                                           save_best_only=True)
callbacks = [checkpoint_callback]

# Input Layers
inp_cat = Input(shape=(4,))
inp_num = Input(shape=(x_num_train.shape[1],))
# Embedding
emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
flat = Flatten()(emb)
# Concatenate layers
conc = Concatenate()([flat, inp_num])
# Hidden layers
dense = Dense(50, activation='relu')(conc)
# Output Layer
out = Dense(1, activation='linear')(dense)
# Create model
emb_model = Model(inputs=[inp_cat, inp_num], outputs=out)

emb_model.compile(optimizer=Adam(learning_rate=0.02), loss='mae', metrics=['mae', 'mse'])
emb_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 4, 2)         8           input_1[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 8)            0           embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 5)]          0                                            
_______________________________________________________________________________________

In [16]:
# Train model
# Group the two model inputs (region columns, other features) for each split
train_inputs = [x_cat_train, x_num_train]
valid_inputs = [x_cat_valid, x_num_valid]

# Train the embedding model.
# NOTE(review): history_mlp_1 was already assigned by the MSE-loss run above;
# this assignment overwrites that history.
history_mlp_1 = emb_model.fit(
    train_inputs,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(valid_inputs, y_valid),
    callbacks=callbacks,
    verbose=1,
)

# Clear the training log, reload the best checkpointed weights and evaluate
clear_output(wait=True)
emb_model.load_weights(checkfile)
verify_model(emb_model, train_inputs, y_train, valid_inputs, y_valid)

Unnamed: 0,Set,MAE,MSE
0,Train,2447.682327,35557760.0
1,Validacion,2822.177543,27977230.0


En las 3 pruebas se pudo observar un comportamiento peculiar en el cual el MSE en el set de validación resulta mejor que en el set de train (aunque el MAE es algo peor), lo cual indica que las métricas obtenidas del modelo están sujetas a una elevada varianza según cómo se particionen los datos; para mejorar esto se utilizará k-folding

Ahora vamos a realizar una prueba por fuerza bruta probando utilizar 1 o 2 capas ocultas, variando la cantidad de neuronas por capa, la función de activación y el optimizador

In [18]:
# Brute-force search over hidden-layer width, activation and optimizer for the
# single-hidden-layer embedding model. Each configuration is trained through
# train_model_emb (helper from the local `functions` module) on the full
# train+validation data (x, y); results maps run name -> (mae, metrics).
results = {}
cat = ['region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']
num = ['age', 'sex', 'bmi', 'children', 'smoker']
for neuronas in [5, 10, 30]:
    for activation in ['linear', 'relu']:
        for optimizer in ['adam', 'sgd']:
            # The run name doubles as the checkpoint name and the results key
            name = 'n{}_a{}_op{}_1h'.format(neuronas, activation, optimizer)
            print('Testing ->', name)
            # Input layers
            inp_cat = Input(shape=(4,))
            inp_num = Input(shape=(len(num),))
            # Embedding
            emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
            flat = Flatten()(emb)
            # Concatenate layers
            conc = Concatenate()([flat, inp_num])
            # Hidden layer
            dense = Dense(neuronas, activation=activation)(conc)
            # Output layer
            out = Dense(1, activation='linear')(dense)
            # Create model
            model = Model(inputs=[inp_cat, inp_num], outputs=out)
            model.compile(optimizer=optimizer, loss='mae', metrics=['mae', 'mse'])

            # 'stopping_patiece' is spelled exactly as the call requires; it is
            # presumably the keyword declared by train_model_emb — verify there
            # before renaming.
            mae, metrics = train_model_emb(x, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
            results[name] = (mae, metrics)
clear_output(wait=True)

# Rank the runs by MAE, lowest first. sorted() already returns a list, and the
# key argument is named so it cannot be confused with the training frame `x`.
sorted_results = sorted(results.items(), key=lambda item: item[1][0])
for run_name, (run_mae, run_metrics) in sorted_results:
    print(run_name, '->', run_mae)

n5_arelu_opadam_1h -> 13361.66984182815
n30_alinear_opsgd_1h -> 8781.261079605918
n10_alinear_opsgd_1h -> 8482.53412679552
n10_arelu_opadam_1h -> 8047.180123396861
n5_arelu_opsgd_1h -> 8016.8898521002875
n30_arelu_opsgd_1h -> 7818.368202069211
n5_alinear_opadam_1h -> 7520.918518090471
n10_arelu_opsgd_1h -> 7512.249889325602
n30_arelu_opadam_1h -> 7406.511472331175
n5_alinear_opsgd_1h -> 7327.358086654835
n10_alinear_opadam_1h -> 7326.907737906154
n30_alinear_opadam_1h -> 6775.898474540145


In [19]:
# Repeat the single-hidden-layer search with 50 neurons, again varying the
# activation and the optimizer. Each configuration is trained through
# train_model_emb (helper from the local `functions` module) on (x, y);
# results_2 maps run name -> (mae, metrics).
results_2 = {}
cat = ['region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']
num = ['age', 'sex', 'bmi', 'children', 'smoker']
for neuronas in [50]:
    for activation in ['linear', 'relu']:
        for optimizer in ['adam', 'sgd']:
            # The run name doubles as the checkpoint name and the results key
            name = 'n{}_a{}_op{}_1h'.format(neuronas, activation, optimizer)
            print('Testing ->', name)
            # Input layers
            inp_cat = Input(shape=(4,))
            inp_num = Input(shape=(len(num),))
            # Embedding
            emb = Embedding(input_dim=4, output_dim=2)(inp_cat)
            flat = Flatten()(emb)
            # Concatenate layers
            conc = Concatenate()([flat, inp_num])
            # Hidden layer
            dense = Dense(neuronas, activation=activation)(conc)
            # Output layer
            out = Dense(1, activation='linear')(dense)
            # Create model
            model = Model(inputs=[inp_cat, inp_num], outputs=out)
            model.compile(optimizer=optimizer, loss='mae', metrics=['mae', 'mse'])

            # 'stopping_patiece' is spelled exactly as the call requires; it is
            # presumably the keyword declared by train_model_emb — verify there
            # before renaming.
            mae, metrics = train_model_emb(x, y, model, name, 32, checkpoints_path, stopping_patiece=50, cat=cat, num=num)
            results_2[name] = (mae, metrics)
clear_output(wait=True)

# Rank the runs by MAE, lowest first. sorted() already returns a list, and the
# key argument is named so it cannot be confused with the training frame `x`.
sorted_results = sorted(results_2.items(), key=lambda item: item[1][0])
for run_name, (run_mae, run_metrics) in sorted_results:
    print(run_name, '->', run_mae)

n50_alinear_opadam_1h -> 6574.412991094408
n50_arelu_opadam_1h -> 6887.853404898575
n50_alinear_opsgd_1h -> 7303.13533856638
n50_arelu_opsgd_1h -> 7404.519113190958


Los mejores resultados se obtuvieron con 50 neuronas, activación lineal y el optimizador Adam, por lo que se continuará avanzando para mejorar dicho modelo