# TP1 - 22.45 Redes Neuronales - Regresión Logística y Lineal

## Regresión Logística - Búsqueda de Modelos Óptimos

#### Import required libraries

In [None]:
from keras.datasets import fashion_mnist
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import datetime
from os.path import exists
tf.config.set_visible_devices([], 'GPU')

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import utils
import tensorflow.keras.optimizers as optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers.experimental import preprocessing


from sklearn.metrics import roc_curve, auc
import tensorflow_addons as tfa
from tensorboard.plugins.hparams import api as hp
import keras_tuner

# Load the TensorBoard notebook extension
%load_ext tensorboard

#### Download and load Fashion MNIST data

In [None]:
# Load the Fashion-MNIST train and test splits; each split unpacks into an
# (inputs, labels) pair that the rest of the notebook reads as globals.
(train_X, train_y), (test_X, test_y) = fashion_mnist.load_data()

#### Normalize the dataset

In [None]:
# Divide both splits by the largest value found in the training inputs, so the
# test set is scaled with exactly the same constant as the training set.
data_max = train_X.max()
train_X, test_X = (split.astype('float32') / data_max for split in (train_X, test_X))
# Displayed as the cell output: the maximum of the rescaled training inputs.
train_X.max()

#### Convert it to categorical vectors

In [None]:
# The number of classes is inferred from the largest training label; both
# label vectors are then one-hot encoded with that same width.
num_classes = train_y.max() + 1
train_y_cat, test_y_cat = (
    utils.to_categorical(labels, num_classes) for labels in (train_y, test_y)
)

#### Configuring preliminary optimizer analysis parameters and callbacks

In [None]:
# Stop a training run once validation accuracy has gone 5 consecutive epochs
# without improving.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
)

In [None]:
# Timestamped TensorBoard log directory for the optimizer sweep, plus a
# TensorBoard callback that writes into it.
# NOTE(review): there is no '/' between "logs/softmax" and the timestamp, so
# runs land in "logs/softmax<timestamp>" - confirm this is intended.
log_dir = f"logs/softmax{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
# Metrics registered with the TensorBoard HParams plugin, all in the
# "validation" group. Each entry is a (summary tag, display name) pair.
METRICS = [
    hp.Metric(tag, group="validation", display_name=display_name)
    for tag, display_name in (
        ("epoch_accuracy", "Accuracy (val.)"),
        ("epoch_loss", "Loss (val.)"),
        ("epoch_f1_score", "F1 Score Macro (val.)"),
        ("epoch_f1_score_micro", "F1 Score Micro (val.)"),
        ("epoch_recall", "Recall (val.)"),
        ("epoch_precision", "Precision (val.)"),
    )
]

# Candidate optimizers and the learning rates tried for each of them.
HP_OPTIMIZER = hp.HParam(
    'optimizer',
    hp.Discrete(['adam', 'sgd', 'rmsprop', 'adamw', 'nadam', 'adagrad', 'adadelta', 'ftrl', 'adamax', 'adafactor']),
)
HP_LEARN_RATE = hp.HParam('learning_rate', hp.Discrete([0.0001, 0.001, 0.01, 0.1]))

# Optimizer-specific knobs; each optimizer only uses the subset that applies to it.
HP_MOMENTUM = hp.HParam('momentum', hp.Discrete([ 0.9, 0.95, 0.99]))
HP_NESTEROV = hp.HParam('nesterov', hp.Discrete([True, False]))
HP_RHO = hp.HParam('rho', hp.Discrete([0.92, 0.95, 0.97]))
HP_BETA_1 = hp.HParam('beta_1', hp.Discrete([0.86, 0.9, 0.94]))
HP_BETA_2 = hp.HParam('beta_2', hp.Discrete([0.97, 0.99, 0.999]))
HP_BETA_2_DECAY = hp.HParam('beta_2_decay', hp.Discrete([-0.9, -0.8, -0.7]))
HP_WEIGHT_DECAY = hp.HParam('weight_decay', hp.Discrete([0.0001, 0.001, 0.004, 0.01]))
HP_LEARN_RATE_POWER = hp.HParam('learning_rate_power', hp.Discrete([-0.8, -0.5, -0.0]))

# Full list of hyperparameters declared to the HParams dashboard.
HPARAMS = [
    HP_OPTIMIZER,
    HP_LEARN_RATE,
    HP_MOMENTUM,
    HP_NESTEROV,
    HP_RHO,
    HP_BETA_1,
    HP_BETA_2,
    HP_BETA_2_DECAY,
    HP_WEIGHT_DECAY,
    HP_LEARN_RATE_POWER,
]

In [None]:
# Write the experiment-level HParams configuration (declared hyperparameters
# and metrics) into the root log directory.
with tf.summary.create_file_writer(log_dir).as_default():
    hp.hparams_config(hparams=HPARAMS, metrics=METRICS)

### Preliminary Optimizer Analysis

Se realizó una búsqueda preliminar de los optimizadores que mejor se comportaran, que a pesar de ser un análisis limitado permitió concentrarse en algunos optimizadores específicos para el análisis posterior.

#### Softmax function to get models for optimizer testing

In [None]:
def _build_optimizer(hparams):
  """Instantiate the Keras optimizer described by `hparams`.

  Args:
    hparams: dict keyed by the HP_* objects. HP_OPTIMIZER and HP_LEARN_RATE
      are always read; the remaining keys are read only by the optimizer that
      uses them.

  Returns:
    A configured `tensorflow.keras.optimizers` instance.

  Raises:
    ValueError: if the optimizer name has no branch below. HP_OPTIMIZER also
      declares 'adamw' and 'adafactor', which are not implemented here.
  """
  name = hparams[HP_OPTIMIZER]
  learning_rate = hparams[HP_LEARN_RATE]

  if name == 'sgd':
    return optimizers.SGD(learning_rate=learning_rate, momentum=hparams[HP_MOMENTUM], nesterov=hparams[HP_NESTEROV])
  if name == 'adam':
    return optimizers.Adam(learning_rate=learning_rate, beta_1=hparams[HP_BETA_1], beta_2=hparams[HP_BETA_2])
  if name == 'rmsprop':
    return optimizers.RMSprop(learning_rate=learning_rate, rho=hparams[HP_RHO], momentum=hparams[HP_MOMENTUM])
  if name == 'adadelta':
    return optimizers.Adadelta(learning_rate=learning_rate, rho=hparams[HP_RHO])
  if name == 'adagrad':
    return optimizers.Adagrad(learning_rate=learning_rate)
  if name == 'adamax':
    return optimizers.Adamax(learning_rate=learning_rate, beta_1=hparams[HP_BETA_1], beta_2=hparams[HP_BETA_2])
  if name == 'nadam':
    return optimizers.Nadam(learning_rate=learning_rate, beta_1=hparams[HP_BETA_1], beta_2=hparams[HP_BETA_2])
  if name == 'ftrl':
    return optimizers.Ftrl(learning_rate=learning_rate, learning_rate_power=hparams[HP_LEARN_RATE_POWER])

  # Previously an unknown name fell through the if/elif chain and surfaced
  # later as an UnboundLocalError on `optimizer`; fail with a clear message.
  raise ValueError("Unsupported optimizer for train_test_model: %r" % (name,))


def train_test_model(hparams, run_dir):
  """Train one softmax-regression model for a single hyperparameter combination.

  The model is Flatten(28x28) followed by a Dense softmax layer with
  `num_classes` outputs. Metrics and the hyperparameter values are logged to
  `run_dir` for TensorBoard.

  Args:
    hparams: dict keyed by the HP_* objects describing this trial.
    run_dir: directory that receives the TensorBoard logs of this trial.

  Returns:
    The Keras `History` object returned by `fit` (the original returned None;
    callers that ignore the result are unaffected).

  Raises:
    ValueError: if `hparams[HP_OPTIMIZER]` names an unsupported optimizer.
  """
  # Resolve the optimizer first so an unsupported name fails before any model is built.
  optimizer = _build_optimizer(hparams)

  softmax_model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),
    tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax),
  ])

  softmax_model.compile(
      optimizer=optimizer,
      loss='categorical_crossentropy',
      metrics=[
          "accuracy",
          tfa.metrics.F1Score(average='macro', num_classes=num_classes),
          tfa.metrics.F1Score(average='micro', num_classes=num_classes, name="f1_score_micro"),
          tf.keras.metrics.Precision(name="precision"),
          tf.keras.metrics.Recall(name="recall"),
      ],
  )

  # NOTE(review): early_stop_callback is configured elsewhere in the notebook;
  # if its patience is 5, it presumably never triggers within these 5 epochs.
  callbacks = [
      early_stop_callback,
      tf.keras.callbacks.TensorBoard(run_dir), # log metrics
      hp.KerasCallback(run_dir, hparams),  # log hparams
    ]

  return softmax_model.fit(train_X, train_y_cat, validation_data=(test_X, test_y_cat), batch_size = 64, epochs = 5, callbacks=callbacks)

#### Optimizer Analysis

In [None]:
# Hyperparameters swept for each optimizer in addition to the learning rate.
# The first entry of each list varies slowest, matching the nesting order of
# the hand-written loops this table replaces.
SWEPT_HPARAMS = {
  'adagrad': [],
  'adadelta': [HP_RHO],
  'ftrl': [HP_LEARN_RATE_POWER],
  'sgd': [HP_MOMENTUM, HP_NESTEROV],
  'rmsprop': [HP_MOMENTUM, HP_RHO],
  'adam': [HP_BETA_1, HP_BETA_2],
  'adamax': [HP_BETA_1, HP_BETA_2],
  'nadam': [HP_BETA_1, HP_BETA_2],
}


def _expand_grid(swept):
  """Return one {HParam: value} dict per combination of the swept domains.

  An empty `swept` list yields a single empty dict, so an optimizer with no
  extra hyperparameters still gets exactly one run per learning rate.
  """
  combos = [{}]
  for hparam in swept:
    combos = [{**combo, hparam: value} for combo in combos for value in hparam.domain.values]
  return combos


session_num = 0

for optimizer in HP_OPTIMIZER.domain.values:
  if optimizer not in SWEPT_HPARAMS:
    # The original loops skipped these optimizers without saying so; keep
    # skipping them, but make it visible in the cell output.
    print('--- Skipping optimizer without a sweep definition: %s' % optimizer)
    continue

  for learning_rate in HP_LEARN_RATE.domain.values:
    for extra in _expand_grid(SWEPT_HPARAMS[optimizer]):
      hparams = {
        HP_OPTIMIZER: optimizer,
        HP_LEARN_RATE: learning_rate,
        **extra,
      }
      run_name = "/run-%d" % session_num
      print('--- Starting trial: %s' % run_name)
      print({h.name: hparams[h] for h in hparams})
      train_test_model(hparams, log_dir + run_name)
      session_num += 1

Tenemos entonces un análisis preliminar de los distintos optimizadores disponibles, y observamos que los que mejor se comportan en términos de maximizar la accuracy son SGD, RMSprop, Adam y nAdam.

Estos resultados son claramente imperfectos ya que:

- Fueron realizados a solo 5 epochs (lo que le da una ventaja a learning rates altos, y no todos los optimizadores se comportan bien con los mismos).
- No se analizaron otras cuestiones como agregar capas de dropout, batch normalization o cambiar la función de costo, que podrían mejorar la performance de algunos optimizadores en específico.
- Solo se realizó para SoftmaxReg, y otros optimizadores podrían ser mejores para MLP.

Sin embargo, sirve como análisis preliminar para observar cuáles optimizadores se comportan bien en general para este problema y realizar un análisis más en profundidad de los mismos, para no requerir tanto tiempo de entrenamiento.

### Configuring Hyperparameter tuning callbacks

In [None]:
# Stop a trial once validation accuracy has gone 5 consecutive epochs without
# improving.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
)

In [None]:
# Timestamped root directory for the hyperparameter-tuning runs, plus a
# TensorBoard callback that writes into it.
log_dir = f"testing/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
# Checkpoint callback: writes weights only, and only when validation accuracy
# reaches a new maximum.
checkpoint_filepath = f"{log_dir}/checkpoint"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

### Hyperparameter tuning

#### Softmax

##### Function to build the models to test

In [None]:
def build_model_softmax(hp):
    """Build and compile one softmax-regression candidate for Keras Tuner.

    Search space:
      - optional Dropout on the flattened input (rate 0.1, 0.3 or 0.5),
      - optional BatchNormalization,
      - learning rate 1e-4, 1e-3 or 1e-2,
      - optimizer: adam, nadam, sgd or rmsprop, each with its own conditional
        hyperparameters.

    Args:
        hp: the `keras_tuner.HyperParameters` object supplied by the tuner.
            Inside this function it shadows the module-level `hp` alias.

    Returns:
        A compiled `keras.Sequential` model.

    Raises:
        ValueError: if the sampled optimizer name has no branch below.
    """
    model = keras.Sequential()
    model.add(layers.Flatten(input_shape=(28, 28)))

    # Test with and without dropout, with different rates (0.1, 0.3, 0.5)
    if hp.Boolean("dropout"):
        dropout_rate = hp.Float("dropout_rate", 0.1, 0.5, step=0.2, parent_name="dropout", parent_values=[True])
        model.add(layers.Dropout(dropout_rate))

    # Test with and without batch normalization
    if hp.Boolean("batch_normalization"):
        model.add(layers.BatchNormalization())

    model.add(layers.Dense(num_classes, activation='softmax'))

    # Test learning rates 0.0001, 0.001 and 0.01
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log", step=10)

    # Test the best optimizers from the preliminary optimizer analysis
    optimizer_name = hp.Choice("optimizer", ["adam", "nadam", "sgd", "rmsprop"])
    if optimizer_name in ("adam", "nadam"):
        beta_1 = hp.Float("beta_1", min_value=0.87, max_value=0.99, step=0.04, parent_name="optimizer", parent_values=["adam", "nadam"])
        beta_2 = hp.Float("beta_2", min_value=0.939, max_value=0.999, step=0.03, parent_name="optimizer", parent_values=["adam", "nadam"])
        # Bug fix: "nadam" used to be compiled with Adam as well, so Nadam was
        # never actually evaluated by the search.
        optimizer_cls = optimizers.Nadam if optimizer_name == "nadam" else optimizers.Adam
        optimizer = optimizer_cls(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)
    elif optimizer_name == "sgd":
        momentum_1 = hp.Float("momentum_1", min_value=0.90, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["sgd"])
        optimizer = optimizers.SGD(learning_rate=learning_rate, momentum=momentum_1)
    elif optimizer_name == "rmsprop":
        momentum_2 = hp.Float("momentum_2", min_value=0.90, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["rmsprop"])
        rho = hp.Float("rho", min_value=0.9, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["rmsprop"])
        optimizer = optimizers.RMSprop(learning_rate=learning_rate, rho=rho, momentum=momentum_2)
    else:
        # Guards against a name being added to the Choice without a branch here.
        raise ValueError("Unsupported optimizer: %r" % (optimizer_name,))

    # The loss function is fixed, we will test changing it after we decide which model to use
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[
            "accuracy",
            tfa.metrics.F1Score(average='macro', num_classes=num_classes),
            tfa.metrics.F1Score(average='micro', num_classes=num_classes, name="f1_score_micro"),
            tf.keras.metrics.Precision(name="precision"),
            tf.keras.metrics.Recall(name="recall"),
        ],
    )
    return model

##### Testing

Los resultados de este test se pueden encontrar en la carpeta testing/softmax

In [None]:
# Build the Hyperband tuner for the softmax-regression search space.
# The project name is specific to this search: tuners that share both
# `directory` and `project_name` with overwrite=True delete each other's
# trial results when they are constructed.
tuner_softmax = keras_tuner.Hyperband(
    hypermodel=build_model_softmax,
    objective="val_accuracy",
    max_epochs=50,
    overwrite=True,
    directory=log_dir + '/hparams',
    project_name="nn-tp1-softmax",
)

# Summary
tuner_softmax.search_space_summary()

In [None]:
# Run the softmax search. The batch size stays fixed for now; it is tuned
# after the model has been chosen.
# NOTE(review): the test split doubles as validation data here, so the selected
# hyperparameters have already seen it - confirm this is acceptable.
tuner_softmax.search(
    train_X,
    train_y_cat,
    validation_data=(test_X, test_y_cat),
    batch_size=64,
    callbacks=[early_stop_callback, tensorboard_callback],
)
tuner_softmax.results_summary()

Observamos que los mejores modelos para SoftmaxReg entonces resultan ser:
- adam: sin dropout ni batch normalization, con learning rate de 0.001, beta 1 de 0.95 y beta 2 de 0.939
- rmsprop: sin dropout ni batch normalization, con learning rate de 0.0001, momentum de 0.93 y rho de 0.93
- sgd: sin dropout pero con batch normalization, con learning rate de 0.001, momentum de 0.93

Por su rápida velocidad de convergencia, que limita la cantidad de tiempo gastado en entrenamiento, elegiremos el modelo Adam para SoftmaxReg.

#### MLP

##### Function to build the models to test

In [None]:
def build_model_mlp(hp):
    """Build and compile one MLP candidate for Keras Tuner.

    Search space:
      - optional Dropout on the flattened input (rate 0.1, 0.3 or 0.5),
      - optional BatchNormalization,
      - 1 to 3 hidden Dense layers with between 32 and 512 units each and one
        activation function shared by all hidden layers,
      - learning rate 1e-4, 1e-3 or 1e-2,
      - optimizer: adam, nadam, sgd or rmsprop, each with its own conditional
        hyperparameters.

    Args:
        hp: the `keras_tuner.HyperParameters` object supplied by the tuner.
            Inside this function it shadows the module-level `hp` alias.

    Returns:
        A compiled `keras.Sequential` model.

    Raises:
        ValueError: if the sampled optimizer name has no branch below.
    """
    model = keras.Sequential()
    model.add(layers.Flatten(input_shape=(28, 28)))

    if hp.Boolean("dropout"):
        dropout_rate = hp.Float("dropout_rate", 0.1, 0.5, step=0.2, parent_name="dropout", parent_values=[True])
        model.add(layers.Dropout(dropout_rate))

    if hp.Boolean("batch_normalization"):
        model.add(layers.BatchNormalization())

    # For MLP we add more Dense layers before the softmax one: 1, 2 or 3
    # layers ranging from 32 to 512 neurons.
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(
            layers.Dense(
                units=hp.Int(f"units_{i}", min_value=32, max_value=512, sampling="log", step=2),
                # Different activation functions are also tested for these Dense
                # layers; the "activation" choice is shared by every layer.
                activation=hp.Choice("activation", ["relu", "tanh", "sigmoid", "elu", "selu", "softplus", "softsign", "exponential"]),
            )
        )

    model.add(layers.Dense(num_classes, activation='softmax'))

    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log", step=10)

    optimizer_name = hp.Choice("optimizer", ["adam", "nadam", "sgd", "rmsprop"])
    if optimizer_name in ("adam", "nadam"):
        beta_1 = hp.Float("beta_1", min_value=0.87, max_value=0.99, step=0.04, parent_name="optimizer", parent_values=["adam", "nadam"])
        beta_2 = hp.Float("beta_2", min_value=0.939, max_value=0.999, step=0.03, parent_name="optimizer", parent_values=["adam", "nadam"])
        # Bug fix: "nadam" used to be compiled with Adam as well, so Nadam was
        # never actually evaluated by the search.
        optimizer_cls = optimizers.Nadam if optimizer_name == "nadam" else optimizers.Adam
        optimizer = optimizer_cls(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)
    elif optimizer_name == "sgd":
        momentum_1 = hp.Float("momentum_1", min_value=0.90, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["sgd"])
        optimizer = optimizers.SGD(learning_rate=learning_rate, momentum=momentum_1)
    elif optimizer_name == "rmsprop":
        momentum_2 = hp.Float("momentum_2", min_value=0.90, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["rmsprop"])
        rho = hp.Float("rho", min_value=0.9, max_value=0.99, step=0.03, parent_name="optimizer", parent_values=["rmsprop"])
        optimizer = optimizers.RMSprop(learning_rate=learning_rate, rho=rho, momentum=momentum_2)
    else:
        # Guards against a name being added to the Choice without a branch here.
        raise ValueError("Unsupported optimizer: %r" % (optimizer_name,))

    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[
            "accuracy",
            tfa.metrics.F1Score(average='macro', num_classes=num_classes),
            tfa.metrics.F1Score(average='micro', num_classes=num_classes, name="f1_score_micro"),
            tf.keras.metrics.Precision(name="precision"),
            tf.keras.metrics.Recall(name="recall"),
        ],
    )
    return model

##### Testing

Los resultados de este test se pueden encontrar en la carpeta testing/mlp

In [None]:
# Build the Hyperband tuner for the MLP search space.
# The project name is specific to this search: tuners that share both
# `directory` and `project_name` with overwrite=True delete each other's
# trial results when they are constructed.
tuner_mlp = keras_tuner.Hyperband(
    hypermodel=build_model_mlp,
    objective="val_accuracy",
    max_epochs=50,
    overwrite=True,
    directory=log_dir + '/hparams',
    project_name="nn-tp1-mlp",
)

# Summary
tuner_mlp.search_space_summary()

In [None]:
# Run the MLP search. The batch size stays fixed for now; it is tuned after
# the model has been chosen.
# NOTE(review): the test split doubles as validation data here, so the selected
# hyperparameters have already seen it - confirm this is acceptable.
tuner_mlp.search(
    train_X,
    train_y_cat,
    validation_data=(test_X, test_y_cat),
    batch_size=64,
    callbacks=[early_stop_callback, tensorboard_callback],
)
tuner_mlp.results_summary()

Observamos entonces que los mejores optimizadores para el modelo utilizando MLP son Adam y nAdam, con las siguientes características:
- nAdam: sin dropout ni batch normalization, con una sola layer Dense de 256 con activación softplus, un learning rate de 0.001, beta 1 de 0.95 y beta 2 de 0.969 (val_accuracy = 0.8952). También tuvo buen rendimiento con elu con batch normalization, utilizando 3 capas Dense de 256, 128 y 32, con un learning rate de 0.001, beta 1 de 0.91 y beta 2 de 0.939 (val_accuracy = 0.8929).
- Adam: sin dropout pero con batch normalization, con una sola layer Dense de 512 con activación softplus, un learning rate de 0.0001, beta 1 de 0.87 y beta 2 de 0.969 (val_accuracy = 0.8946). También tuvo buen rendimiento con elu, utilizando 3 capas Dense de 256, 32 y 512, con un learning rate de 0.001, beta 1 de 0.95 y beta 2 de 0.999 (val_accuracy = 0.8923).

Elegimos entonces para MLP el optimizador nAdam, ya que presenta bajo condiciones similares un mejor rendimiento que Adam.