In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tensorflow import keras

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, auc

In [50]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
from keras.metrics import AUC # Area under the curve, default: ROC
from keras.losses import BinaryCrossentropy
from keras.callbacks import EarlyStopping, LearningRateScheduler
import kerastuner as kt

In [4]:
df = pd.read_csv('../databases/diabetes.csv')

In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
def replace_outliers_zeros(data_frame, outlayers_keys: dict, zero_keys: list, mean_median = False):
    """
    Fuction to replace outliers from data_frame
    and replace them with median or mean value
    Don't afect the original data frame, returns a copy 
    with the parameters changed
    """
    data = data_frame.copy()
    if len(zero_keys) != 0:
        for key in zero_keys:
            data.loc[data[key] == 0, key] = np.NaN
            
    replace = {}
    for x in data.keys():
        if mean_median:
            replace[x] = np.mean(data[x].dropna())
        else:
            replace[x] = np.median(data[x].dropna())
    
    for key, val in outlayers_keys.items():
        data.loc[data[key] < val[0], key] = np.NaN
        data.loc[data[key] > val[1], key] = np.NaN
    
    for key, val in replace.items():
        data[key] = data[key].replace(np.NaN, replace[key])
        
    return data

def normalize(data_frame):
    data = data_frame.copy()
    for key in data:
        k_mean = np.mean(data[key])
        k_std = np.std(data[key])
        data[key] = (data[key]-k_mean)/k_std
    return data

def metrics(tn, fp, fn, tp):
    spec = tn / (tn + fp + 1e-15)
    neg_pred_val = tn / (tn + fn + 1e-15)
    sens = tp / (tp + fn + 1e-15)
    pos_pred_val = tp / (tp + fp + 1e-15)
    print('\tEspecificidad: {}\n\tSesitividad: {}\n\tValor predictivo positivo: {}\n\tValor predictivo negativo: {}'.format(spec, 
                                                                                                                    sens, 
                                                                                                                    neg_pred_val, 
                                                                                                                    pos_pred_val))
    
def print_metrics(y, y_pred):
    roc_auc = roc_auc_score(y, y_pred)
    print('PRINCIPAL\n\tArea bajo la curva ROC: {}\nSECUNDARIAS'.format(roc_auc))
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    metrics(tn, fp, fn, tp)
    

Tomando el EDA del Tp1, se reemplazan los valores no medidos por NaN y se quitan los outlayers para luego reemplazarlos por la media o la mediana

In [7]:
outlayers = {
    'BloodPressure': (40, np.Inf),
    'SkinThickness': (0, 80),
    'Insulin': (0, 400),
    'BMI': (0, 50)
}

zeros = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI'
]

In [8]:
x_data = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']]
y_data = df['Outcome']

In [9]:
# Split dataset into 15% test, 85% train 
x, x_test, y, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=0)
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.15, random_state=0)

In [10]:
print('shape train {}, {} '.format(x_train.shape, y_train.shape))
print('shape valid {}, {} '.format(x_valid.shape, y_valid.shape))
print('shape test {}, {} '.format(x_test.shape, y_test.shape))

shape train (554, 8), (554,) 
shape valid (98, 8), (98,) 
shape test (116, 8), (116,) 


In [34]:
x_train_clean = replace_outliers_zeros(x_train, outlayers, zeros, mean_median=True)
x_test_clean = replace_outliers_zeros(x_test, outlayers, zeros, mean_median=True)
x_valid_clean = replace_outliers_zeros(x_valid, outlayers, zeros, mean_median=True)

In [35]:
x_train_clean = normalize(x_train_clean)
x_valid_clean = normalize(x_valid_clean)
x_test_clean = normalize(x_test_clean)

In [36]:
x_train_clean.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,554.0,554.0,554.0,554.0,554.0,554.0,554.0,554.0
mean,1.042087e-17,-1.014832e-15,1.087177e-16,5.259331e-15,-1.890185e-15,-1.483771e-15,3.631271e-16,-8.296612e-17
std,1.000904,1.000904,1.000904,1.000904,1.000904,1.000904,1.000904,1.000904
min,-1.132323,-2.551563,-2.762256,-2.604498,-2.319631,-2.181602,-1.226701,-1.039051
25%,-0.8395991,-0.7297063,-0.7318759,-0.4614834,-0.4475001,-0.7538432,-0.6903276,-0.7910755
50%,-0.2541518,-0.1500246,-0.0190864,0.01504268,0.178663,0.01376913,-0.2925762,-0.3777826
75%,0.6240192,0.6284051,0.5868243,0.3719111,0.178663,0.6547254,0.4895369,0.6141204
max,3.843979,2.549636,4.103358,3.705489,4.31306,2.700412,5.865088,3.920464


In [14]:
def create_model(in_shape, optimizer_, metrics_, loss_, activation_='sigmoid'):
    model = Sequential()
    model.add(Dense(units=1, activation=activation_, input_shape=in_shape))
    model.compile(optimizer=optimizer_,
                  loss= loss_,
                  metrics=metrics_)
    return model

In [15]:
model = create_model(in_shape= (x_train_clean.shape[1],),
                     optimizer_= SGD(momentum=0.9, learning_rate=0.1),
                     metrics_ = [AUC()],
                     loss_ = BinaryCrossentropy(from_logits=True))

In [16]:
model.fit(x_train_clean, y_train, epochs=20,shuffle=True, validation_data=(x_valid_clean, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x25352bc45b0>

In [17]:
print('Sobre train')
y_train_pred = model.predict(x_train_clean)
print_metrics(y_train, np.rint(y_train_pred))
print('Sobre validation')
y_valid_pred = model.predict(x_valid_clean)
print_metrics(y_valid, np.rint(y_valid_pred))

Sobre train
PRINCIPAL
	Area bajo la curva ROC: 0.7145804387183697
SECUNDARIAS
	Especificidad: 0.9316239316239316
	Sesitividad: 0.4975369458128079
	Valor predictivo positivo: 0.7622377622377622
	Valor predictivo negativo: 0.808
Sobre validation
PRINCIPAL
	Area bajo la curva ROC: 0.6799687010954617
SECUNDARIAS
	Especificidad: 0.9154929577464789
	Sesitividad: 0.4444444444444444
	Valor predictivo positivo: 0.8125
	Valor predictivo negativo: 0.6666666666666666


In [18]:
model = create_model(in_shape= (x_train_clean.shape[1],),
                     optimizer_= Adam(learning_rate=0.1),
                     metrics_ = [AUC()],
                     loss_ = BinaryCrossentropy(from_logits=True))

In [19]:
model.fit(x_train_clean, y_train, epochs=20,shuffle=True, validation_data=(x_valid_clean, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2535552fdc0>

In [20]:
print('Sobre train')
y_train_pred = model.predict(x_train_clean)
print_metrics(y_train, np.rint(y_train_pred))
print('Sobre validation')
y_valid_pred = model.predict(x_valid_clean)
print_metrics(y_valid, np.rint(y_valid_pred))

Sobre train
PRINCIPAL
	Area bajo la curva ROC: 0.7307832652660239
SECUNDARIAS
	Especificidad: 0.9344729344729344
	Sesitividad: 0.5270935960591133
	Valor predictivo positivo: 0.7735849056603774
	Valor predictivo negativo: 0.823076923076923
Sobre validation
PRINCIPAL
	Area bajo la curva ROC: 0.6914449660928533
SECUNDARIAS
	Especificidad: 0.9014084507042254
	Sesitividad: 0.48148148148148145
	Valor predictivo positivo: 0.8205128205128205
	Valor predictivo negativo: 0.65


In [21]:
def model_builder(hp):
    model = Sequential()
    model.add(Dense(units=1, activation='sigmoid', input_shape=(x_train_clean.shape[1],)))
    
    # Optimizadores a porbar
    optimizer_type = hp.Choice('optimizer_type', values=['SGD', 'Adam']) 
    # Hyperparametros para porbar en el optimizador
    _learning_rate_ = hp.Choice('learning_rate', values=[1e-3, 1e-2, 0.1, 0.32, 0.56, 0.81, 1.0, 1.2, 1.5, 1.8, 2.0, 2.2, 2.7])
    _momentum_ = hp.Choice('momentum', values=[0.5, 0.75, 0.85, 0.9, 0.99, 0.999])
    
    if optimizer_type == 'SGD':
        _optimizer_ = SGD(learning_rate=_learning_rate_, momentum=_momentum_)
    else:
        _optimizer_ = Adam(learning_rate=_learning_rate_)
    
    _metrics_ = [AUC()]
    _loss_ = BinaryCrossentropy(from_logits=True)
    
    model.compile(optimizer=_optimizer_,
                  loss=_loss_,
                  metrics=_metrics_)
    
    return model


In [22]:
hyptester = kt.Hyperband(model_builder, kt.Objective('val_loss', direction='min'), max_epochs=15, factor=3)

In [23]:
hyptester.search(x_train_clean, y_train, epochs=50, validation_data=(x_valid_clean, y_valid))
hyperparams = hyptester.get_best_hyperparameters(num_trials=2)[0]

Trial 30 Complete [00h 00m 01s]
val_loss: 0.6863452196121216

Best val_loss So Far: 0.6806035041809082
Total elapsed time: 00h 00m 52s
INFO:tensorflow:Oracle triggered exit


In [24]:
print('best momentum',hyperparams.get('momentum'))
print('best learning_rate',hyperparams.get('learning_rate'))
print('best optimizer', hyperparams.get('optimizer_type'))

best momentum 0.9
best learning_rate 2.0
best optimizer Adam


In [25]:
best_hyp_model = hyptester.hypermodel.build(hyperparams)
best_hyp_model.fit(x_train_clean, y_train, epochs=20,shuffle=True, validation_data=(x_valid_clean, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x25356d07640>

In [26]:
print('Sobre train')
y_train_pred = best_hyp_model.predict(x_train_clean)
print_metrics(y_train, np.rint(y_train_pred))
print('Sobre validation')
y_valid_pred = best_hyp_model.predict(x_valid_clean)
print_metrics(y_valid, np.rint(y_valid_pred))

Sobre train
PRINCIPAL
	Area bajo la curva ROC: 0.7428599497565015
SECUNDARIAS
	Especificidad: 0.8945868945868946
	Sesitividad: 0.5911330049261084
	Valor predictivo positivo: 0.7909319899244333
	Valor predictivo negativo: 0.7643312101910829
Sobre validation
PRINCIPAL
	Area bajo la curva ROC: 0.6632759520083463
SECUNDARIAS
	Especificidad: 0.8450704225352113
	Sesitividad: 0.48148148148148145
	Valor predictivo positivo: 0.8108108108108109
	Valor predictivo negativo: 0.5416666666666666


# Early Stopping

Detiene el entrenamiento de forma anticipada si se llega a cierto valor de un parametro determinado, en principio permite agilizar las pruebas al buscar hyperparámetros, descartanto rapidamenete las combinacion que converjan rapidamente a un valor esperado

Los hyperparámetrso principales de este callback son:
- monitor: indica la variable a monitorear (valga la redundancia)
- min_delta: la minima diferencia que se considera como mejora
- patience: cantidad de epochs sin mejoras antes de parar

In [47]:
stop_early = EarlyStopping(monitor='val_loss', patience=15)

In [48]:
hyptester_es = kt.Hyperband(model_builder, kt.Objective('val_loss', direction='min'), max_epochs=50, factor=3)
hyptester_es.search(x_train_clean, y_train, epochs=50, validation_data=(x_valid_clean, y_valid), callbacks=[stop_early])
# Get the optimal hyperparameters
hyperparams_es = hyptester_es.get_best_hyperparameters(num_trials=2)[0]
print('best momentum',hyperparams_es.get('momentum'))
print('best learning_rate',hyperparams_es.get('learning_rate'))
print('best optimizer', hyperparams_es.get('optimizer_type'))

Trial 90 Complete [00h 00m 02s]
val_loss: 0.6901907920837402

Best val_loss So Far: 0.6796790361404419
Total elapsed time: 00h 03m 46s
INFO:tensorflow:Oracle triggered exit
best momentum 0.99
best learning_rate 1.0
best optimizer SGD


In [53]:
best_hyp_model_es = hyptester_es.hypermodel.build(hyperparams_es)
best_hyp_model_es.fit(x_train_clean, y_train, epochs=50,shuffle=True, validation_data=(x_valid_clean, y_valid), callbacks=[stop_early])
print('Sobre train')
y_train_pred = best_hyp_model_es.predict(x_train_clean)
print_metrics(y_train, np.rint(y_train_pred))
print('Sobre validation')
y_valid_pred = best_hyp_model_es.predict(x_valid_clean)
print_metrics(y_valid, np.rint(y_valid_pred))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Sobre train
PRINCIPAL
	Area bajo la curva ROC: 0.7271623650933995
SECUNDARIAS
	Especificidad: 0.9173789173789174
	Sesitividad: 0.5369458128078818
	Valor predictivo positivo: 0.7740384615384616
	Valor predictivo negativo: 0.7898550724637681
Sobre validation
PRINCIPAL
	Area bajo la curva ROC: 0.6473656755346896
SECUNDARIAS
	Especificidad: 0.8873239436619719
	Sesitividad: 0.4074074074074074
	Valor predictivo positivo: 0.7974683544303798
	Valor predictivo negativo: 0.5789473684210527


Se puede ver que los resultados sobre el set de validación son similares, lo que fue beneficioso al utilizar el earlystopping fue la posibilidad de probar una mayor cantidad de combinaciónes de hyperparámetros en un tiempo similar, obteniendo asi una combinación de hyperparámetros más confiable

# Learning Rate Scheduler

Permite ajustar el learning rate en funcion del epoch que se esta trabajando, por lo tanto dependiendo la funcion de costos minimizarla de forma más rapida y eficiente???

In [61]:
def schedule(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * np.exp(-0.8)

In [62]:
scheduler = LearningRateScheduler(schedule)

In [64]:
best_hyp_model_es_sched = hyptester_es.hypermodel.build(hyperparams_es)
best_hyp_model_es_sched.fit(x_train_clean, y_train, epochs=50,shuffle=True, validation_data=(x_valid_clean, y_valid), 
                            callbacks=[stop_early, scheduler])
print('\nSobre train')
y_train_pred = best_hyp_model_es_sched.predict(x_train_clean)
print_metrics(y_train, np.rint(y_train_pred))
print('Sobre validation')
y_valid_pred = best_hyp_model_es_sched.predict(x_valid_clean)
print_metrics(y_valid, np.rint(y_valid_pred))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

Sobre train
PRINCIPAL
	Area bajo la curva ROC: 0.7075772248186042
SECUNDARIAS
	Especificidad: 0.9373219373219374
	Sesitividad: 0.47783251231527096
	Valor predictivo positivo: 0.7563218390804598
	Valor predictivo negativo: 0.8151260504201681
Sobre validation
PRINCIPAL
	Area bajo la curva ROC: 0.6729264475743348
SECUNDARIAS
	Especificidad: 0.9014084507042254
	Sesitividad: 0.4444444444444444
	Valor predictivo positivo: 0.810126582278481
	Valor predictivo negativo: 0.631578947368421


# Model CheckPoint & Tensorboard

# Tecnicas de Regularización

*   Drop out

*   Regularización L1
*   Regularización L2
*   Batch Normalization
