## Optimizadores, Learning Rate Schedulers

En esta sesión vamos a comparar el resultado de diferentes optimizadores y learning rate schedulers.

## Leer, normalizar y particionar datos

In [None]:
## Importar y normalizar datos

from tensorflow import keras
from keras.datasets import mnist

# Load MNIST as (train, test) pairs of image arrays and integer labels.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print('training set', x_train.shape)
print('test set', x_test.shape)

# Flatten every image into a single feature vector. The sample count is taken
# from the array itself and the feature width is inferred (-1), so nothing
# depends on hard-coded dataset sizes.
x_train = x_train.reshape(len(x_train), -1).astype('float32')
x_test = x_test.reshape(len(x_test), -1).astype('float32')

# Normalize [0..255] --> [0..1]
x_train /= 255
x_test /= 255

# Convert integer class labels to one-hot (binary class matrix) rows.
num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

from sklearn.model_selection import train_test_split

# Hold out 20% of the training samples as a validation set; the fixed
# random_state keeps the split identical between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

print('training set', x_train.shape)
print('val set', x_val.shape)

## Optimizadores

Vamos a probar diferentes optimizadores:

In [None]:
from keras import Sequential
from keras.layers import Dense, Input
from keras.optimizers import SGD,Adam,Adagrad

# Training hyper-parameters shared by every run (later cells reuse them too).
batch_size = 128
epochs = 5
lr = 0.001

# One optimizer instance per strategy, all starting from the same learning rate.
opt = [
    SGD(learning_rate=lr, momentum=0.9),
    Adam(learning_rate=lr),
    Adagrad(learning_rate=lr),
]

best_acc = 0.0
bestopt = None  # remains None only if no run improves on the initial best_acc
for optim in opt:
    # A new model per optimizer, so no run continues from another run's weights.
    model = Sequential([
        # Keras documents `shape` as a tuple; a bare int is not portable
        # across Keras versions.
        Input(shape=(784,)),
        Dense(1024, activation='relu'),
        Dense(1024, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer=optim,
                  metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(x_val, y_val))  # NOTE: validation set, not test set

    print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")

    # Optimizers are ranked by the validation accuracy of their last epoch.
    final_val_acc = history.history['val_accuracy'][-1]
    if final_val_acc > best_acc:
        best_acc = final_val_acc
        bestopt = optim

print("=============================")
print("Best acc", best_acc)
print("Best optim", bestopt)
print("=============================")


## **EJERCICIO**

Añade más epochs a este ejemplo anterior, un early stopping y un model_checkpoint. Luego prueba el test sobre el model checkpoint almacenado.

## Learning Rates Schedulers

Los learning rate schedulers son mecanismos de modificación del learning rate. Normalmente lo que hacen es reducir el valor del learning rate, lo que se conoce como ***learning rate annealing***. Esta modificación se suele realizar al acabar cada epoch.

Keras ya dispone de algunos learning rate schedulers implementados, pero el usuario puede implementar su propia estrategia de annealing. Veamos ambos casos.

### LRS ya implementado en Keras

In [None]:
# Emplear un LRS estandard de Keras: ReduceLROnPlateau

from keras.callbacks import ReduceLROnPlateau

# Two-hidden-layer MLP over the flattened 784-feature inputs.
model = Sequential([
    # Keras documents `shape` as a tuple; a bare int is not portable
    # across Keras versions.
    Input(shape=(784,)),
    Dense(1024, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

opt = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Multiply the learning rate by 0.2 once val_loss has stopped improving for
# 2 epochs, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=0.00001)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[reduce_lr])  # <--- the scheduler is plugged in here



### LRS propio mediante función 

La función toma como entrada el epoch

In [None]:
## Emplear un LRS propio: LearningRateScheduler

from keras.callbacks import LearningRateScheduler

def scheduler(epoch):
    """Step schedule: return the learning rate to use for epoch index `epoch`.

    The base rate is the module-level `lr`, read at call time. Epoch indices
    below 5 get the base rate unchanged; every later epoch gets a tenth of it.
    """
    return lr if epoch < 5 else lr * 0.1

# Wrap the schedule function in a Keras callback; verbose=1 logs the learning
# rate chosen for each epoch.
LRS = LearningRateScheduler(scheduler, verbose=1)

model = Sequential([
    # Keras documents `shape` as a tuple; a bare int is not portable
    # across Keras versions.
    Input(shape=(784,)),
    Dense(1024, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# The callback sets the learning rate at the start of every epoch, so the
# value given here is never used for training. It is kept equal to the
# schedule's base rate to avoid suggesting otherwise.
opt = Adam(learning_rate=lr)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# `scheduler` only lowers the rate from epoch index 5 onwards. Train for at
# least 10 epochs so the drop is actually reached; a local name is used so the
# shared `epochs` variable is left untouched.
lrs_epochs = max(epochs, 10)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=lrs_epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[LRS])

 
                


### LRS propio tipo CosineAnnealing

![cosine annealing](cosine.png)


In [None]:
# Emplear un LRS propio, CosineAnnealingScheduler

from keras.callbacks import LearningRateScheduler
import math

# Schedule bounds and total training length; the function below reads them at
# call time.
lr_max = 0.001
lr_min = 0.00001
epochs = 100


def cosine_annealing(x):
    """Return the learning rate for epoch index `x`.

    The rate follows half a cosine wave that starts at `lr_max` for x = 0 and
    heads towards 0 at x = `epochs`; any value below `lr_min` is replaced by
    `lr_min`.
    """
    cosine_term = math.cos(math.pi * x / epochs)
    return max(lr_min, lr_max / 2 * (1 + cosine_term))

# Keras calls cosine_annealing with the epoch index at the start of each epoch.
LRS = LearningRateScheduler(cosine_annealing)

model = Sequential([
    # Keras documents `shape` as a tuple; a bare int is not portable
    # across Keras versions.
    Input(shape=(784,)),
    Dense(1024, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# The callback sets the learning rate at the start of every epoch, so this
# initial value is never used for training. It is kept equal to the schedule's
# starting point.
opt = Adam(learning_rate=lr_max)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[LRS])



### LRS propio tipo CosineAnnealing with restarts

![cosine annealing restarts](cosinerestart.png)

Este scheduler tiene sentido cuando queremos guardar cada uno de los estados del modelo alcanzados en el mínimo LR para luego combinarlos.

In [None]:
# Emplear un LRS propio, CosineAnnealingScheduler with restarts

from keras.callbacks import LearningRateScheduler
import math

# Schedule bounds, total training length and number of cosine cycles; the
# function below reads them at call time.
lr_max = 0.001
lr_min = 0.00001
epochs = 100
n_cycles = 5


def cosine_annealing_with_restarts(x):
    """Return the learning rate for epoch index `x`, restarting every cycle.

    Training is split into `n_cycles` cycles of `epochs // n_cycles` epochs.
    Inside each cycle the rate follows half a cosine wave that starts at
    `lr_max`; any value below `lr_min` is replaced by `lr_min`. At the first
    epoch of the next cycle the rate jumps back to `lr_max`.
    """
    # Integer cycle length (at least 1 epoch), so the period stays a whole
    # number of epochs even when `epochs` is not a multiple of `n_cycles`.
    cycle_len = max(1, epochs // n_cycles)
    # Position inside the current cycle, in [0, 1).
    phase = (x % cycle_len) / cycle_len
    return max(lr_min, lr_max / 2 * (1 + math.cos(math.pi * phase)))

# Keras calls cosine_annealing_with_restarts with the epoch index at the start
# of each epoch.
LRS = LearningRateScheduler(cosine_annealing_with_restarts)

model = Sequential([
    # Keras documents `shape` as a tuple; a bare int is not portable
    # across Keras versions.
    Input(shape=(784,)),
    Dense(1024, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# The callback sets the learning rate at the start of every epoch, so this
# initial value is never used for training. It is kept equal to the schedule's
# starting point.
opt = Adam(learning_rate=lr_max)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[LRS])

---

## EXTRA, estandaricemos la clase MLP

Vamos a implementar una clase para ahorrarnos escribir código recurrentemente. 
Para ello empleamos el **model subclassing** de Keras (una clase que hereda de `keras.Model`):

In [None]:
from tensorflow import keras
from keras.layers import Dense

class MLP(keras.Model):
    """Multi-layer perceptron: a stack of ReLU Dense layers plus a softmax output.

    Args:
        input_size: Number of input features. Not read by this class; kept in
            the signature so existing calls keep working.
        num_classes: Number of units of the softmax output layer.
        hidden: Sizes of the hidden Dense layers, in order. May be empty, in
            which case the model is just the softmax output layer.
    """

    def __init__(self, input_size, num_classes, hidden=(128,)):
        super().__init__()

        # The default is an immutable tuple: a list default would be one
        # object shared by every call that omits `hidden`.
        self.hidden = [Dense(units, activation='relu') for units in hidden]
        self.num_h = len(self.hidden)
        self.out = Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Run `inputs` through every hidden layer in order, then the output layer.

        `training` is accepted for Keras API compatibility and is not used.
        """
        x = inputs
        # Iterating the list directly also covers the case of no hidden layers.
        for layer in self.hidden:
            x = layer(x)
        return self.out(x)
    


# Instantiate the model: input width, number of classes and the list of
# hidden layer sizes.
model = MLP(input_size=784, num_classes=10, hidden=[1024, 512])

# Build with an explicit input shape (any batch size, 784 features) so that
# all layer shapes are defined before summary() is called.
model.build(input_shape=(None, 784))

model.summary()