# Optimizers with TensorFlow 2 and Keras - Part 2



# Setup


In [0]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
tf.__version__

In [0]:
tf.random.set_seed(42)

In [0]:
!nvidia-smi

In [0]:
# for RAdam
!pip install -q tensorflow_addons

In [0]:
# https://github.com/AndreasMadsen/python-lrcurve
!pip install -q lrcurve

In [0]:
from lrcurve import KerasLearningCurve    

In [0]:
import tensorflow_addons as tfa
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (20, 8)
mpl.rcParams['axes.titlesize'] = 24
mpl.rcParams['axes.labelsize'] = 20

figsize_3d = (12, 12)

In [0]:
# Adapted from:
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
# http://jponttuset.cat/xkcd-deep-learning/

from matplotlib.colors import ListedColormap

import numpy as np
import pandas as pd

cmap = ListedColormap(['#FF6666', '#6666FF'])

font_size=15
title_font_size=25

def meshGrid(x_data, y_data):
    """Build a regular 2-D grid that covers the given data with a small margin.

    Each axis runs from 0.1 below the minimum to 0.1 above the maximum of the
    corresponding input and is sampled every 0.05 units.

    Args:
        x_data: array-like with ``min()``/``max()``, values for the first axis.
        y_data: array-like with ``min()``/``max()``, values for the second axis.

    Returns:
        Tuple ``(xx, yy)`` of coordinate matrices as produced by ``np.meshgrid``.
    """
    step = .05    # distance between neighbouring grid points
    margin = .1   # padding added around the data range on every side

    x_axis = np.arange(x_data.min() - margin, x_data.max() + margin, step)
    y_axis = np.arange(y_data.min() - margin, y_data.max() + margin, step)

    grid_x, grid_y = np.meshgrid(x_axis, y_axis)
    return (grid_x, grid_y)
    
def plotPrediction(clf, x_data, y_data, x_label, y_label, ground_truth, title="",
                   size=(15, 8), n_samples=None, proba=True, prediction=True,
                   ax=None, marker_size=100
                  ):
    """Plot labelled samples on top of a classifier's probability surface.

    Args:
        clf: fitted binary classifier, or a falsy value to draw the samples only.
            Its ``predict_proba`` method is used when available, otherwise
            ``predict`` is called.
        x_data, y_data: 1-D arrays with the two input features of the samples.
        x_label, y_label: axis labels.
        ground_truth: 1-D array of 0/1 class labels, aligned with the samples.
        title: plot title.
        size: figure size, only used when a new figure has to be created.
        n_samples: if truthy, plot only a random subset of this many samples
            (drawn with a fixed random_state of 42).
        proba, prediction: accepted for compatibility with existing calls;
            not used by this function.
        ax: existing matplotlib axes to draw on; a new figure is created if None.
        marker_size: marker area passed to ``scatter``.

    Returns:
        The matplotlib axes that were drawn on.
    """
    xx, yy = meshGrid(x_data, y_data)
    if ax is None:
        _, ax = plt.subplots(figsize=size)

    if clf:
        grid_points = np.c_[xx.ravel(), yy.ravel()]
        # Newer TensorFlow releases dropped predict_proba() from Keras models;
        # with a sigmoid output layer predict() already returns the probability.
        predict_fn = getattr(clf, 'predict_proba', None)
        if not callable(predict_fn):
            predict_fn = clf.predict
        Z = np.asarray(predict_fn(grid_points))
        # Some estimators return one column per class; keep the column for class 1
        # so that the result can be reshaped onto the grid.
        if Z.ndim == 2 and Z.shape[1] == 2:
            Z = Z[:, 1]
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=.6)

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())

    # Columns: 0 = first feature, 1 = second feature, 2 = class label
    samples = pd.DataFrame(np.array([x_data, y_data, ground_truth]).T)
    if n_samples:
        samples = samples.sample(n_samples, random_state=42)

    classes = samples[2]
    # Two passes with different markers: multiplying the size by the class mask
    # gives the samples of the respective other class a marker size of 0.
    ax.scatter(samples[0], samples[1], c=classes, cmap=cmap, marker='o', edgecolors='k', s=classes*marker_size)
    ax.scatter(samples[0], samples[1], c=classes, cmap=cmap, marker='^', edgecolors='k', s=~classes.astype(bool)*marker_size)

    ax.set_xlabel(x_label, fontsize=font_size)
    ax.set_ylabel(y_label, fontsize=font_size)
    ax.set_title(title, fontsize=title_font_size)

    return ax


# Binary Classification Problem - Complex Decision Boundary

In [0]:
#@title Configure our example { run: "auto", display-mode: "form" }

# https://colab.research.google.com/notebooks/forms.ipynb

n = 2000 #@param {type:"slider", min:500, max:10000, step:500}
a = 20 #@param {type:"slider", min:10, max:20, step: 1} 
b = 0.7 #@param {type:"slider", min:0.1, max:1, step: 0.1}
noise_level = 0 #@param {type:"slider", min:0.0, max:1.0, step:0.05}

title = 'Categories expressed as colors' 
dim_1_label = 'x1' #@param {type:"string"}
dim_2_label = 'x2' #@param {type:"string"}

# Set every random seed that is available ;-)
# (TensorFlow for the models, NumPy for the data generated below)
tf.random.set_seed(42)
np.random.seed(42)

# all points: n samples with two features, each drawn uniformly from [0, 1)
X = np.random.uniform(0, 1, (n, 2))

# complex decision boundary based on sine function plus noise optional
# noise: one normally distributed offset per sample with standard deviation noise_level
noise = np.random.normal(0, noise_level, n)
# a sample is labelled True when its second feature lies below sin(a * x1**b) (+ noise)
y_bool = X[:, 1] < np.sin(a*X[:, 0]**b) + noise
# integer labels 0/1 for training and plotting
y = y_bool.astype(int)

# Train/test split: 20% of the samples are held out, stratified by class label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

size=100
plt.xlabel(dim_1_label)
plt.ylabel(dim_2_label)
plt.title(title)

plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.bwr, marker='o', edgecolors='k', s=y*size);
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.bwr, marker='^', edgecolors='k', s=~y_bool*size);

## Inspect two-dimensional data and class labels 

In [0]:
from mpl_toolkits.mplot3d import Axes3D

# 3-D view of the data: the two features span the ground plane,
# the class label (0 or 1) is drawn on the vertical axis.
fig = plt.figure(figsize=figsize_3d)
ax = fig.add_subplot(111, projection='3d')
ax.set_title('True categories 0/1')

# we can have the probability encoded in shade of color
# One scatter call per marker shape; the size array gives the samples of the
# respective other class a marker size of 0.
marker_sizes = (
    ('o', y*size),
    ('^', ~y_bool*size),
)
for marker, sizes in marker_sizes:
    ax.scatter(X[:,0], X[:,1], y,
               c=y,
               cmap=plt.cm.bwr,
               marker=marker,
               edgecolors='k',
               depthshade=False,
               s=sizes)

# https://en.wikipedia.org/wiki/Azimuth
ax.view_init(elev=20, azim=-40)

# SGD and a Simple Model

In [0]:
%%time 

tf.random.set_seed(42)

EPOCHS=100 # Change to 500/1000 later

# Most important SGD parameters, set to the Keras defaults and passed to the
# optimizer below - change them here to experiment
learning_rate = 1e-2
momentum = 0.0 # Momentum is one of the best heuristics, try value between 0.9 and 0.99
nesterov = False # Nesterov accelerated gradient

optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov)

# Simplest possible model: a single sigmoid unit on the two input features
model = tf.keras.Sequential()
model.add(Dense(units=1, input_dim=2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# The held-out split is used as validation data to track loss/accuracy per epoch
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X[:, 0], X[:, 1], 
               dim_1_label, dim_2_label, y,
                title="Classification probabilities (dark is certain)");

# Adam and a Simple Model

In [0]:
%%time 
 
tf.random.set_seed(42)

EPOCHS=100 # Change to 500/1000 later

# Most important Adam parameters, set to the Keras defaults and passed to the
# optimizer below - change them here to experiment
learning_rate = 1e-3
amsgrad = False
# We do not touch beta_1 or beta_2

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                     amsgrad=amsgrad)

# Simplest possible model: a single sigmoid unit on the two input features
model = tf.keras.Sequential()
model.add(Dense(units=1, input_dim=2, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()


In [0]:
%%time 

history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS, 
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X[:, 0], X[:, 1], 
               dim_1_label, dim_2_label, y,
                title="Classification probabilities (dark is certain)");

# RAdam and a Simple Model

* https://www.pyimagesearch.com/2019/09/30/rectified-adam-radam-optimizer-with-keras/
* https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers/RectifiedAdam

In [0]:
tfa.optimizers.RectifiedAdam?

In [0]:
%%time 

# Re-seed TensorFlow so that this run starts from the same random state as the other experiments
tf.random.set_seed(42)

EPOCHS = 20 # Change to 50/100/... later    

# Most important parameters for Rectified Adam - add to optimizer if you like
# NOTE: these two values are only suggestions; they are NOT passed to the
# optimizer constructed below, so changing them has no effect as written
lr = 1e-2 # learning_rate
warmup_proportion = 0.001
# Initialize RAdam with default values (vanilla)
optimizer = tfa.optimizers.RectifiedAdam()

# Simplest possible model: a single sigmoid unit on the two input features
model = tf.keras.Sequential()
model.add(Dense(units=1, input_dim=2, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer=optimizer,
              metrics=['accuracy'])

# The held-out split is used as validation data to track loss/accuracy per epoch
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS, 
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X[:, 0], X[:, 1], 
               dim_1_label, dim_2_label, y,
                title="Classification probabilities (dark is certain)");

# Overcomplex Model trained with SGD

In [0]:
%%time 
 
tf.random.set_seed(42)

EPOCHS = 200  # Try 100/300/1000/...

# Most important SGD parameters - all of them are passed to the optimizer below
learning_rate = 1e-2 # Keras default
momentum = 0.9 # Momentum is one of the best heuristics, try value between 0.9 and 0.99
# Nesterov accelerated gradient. Left off so that this run is plain momentum SGD,
# which is what the optimizer was effectively configured with; set to True to try it.
nesterov = False

optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov)

# Deliberately oversized network: three hidden ReLU layers with 100 units each.
# Uncomment the Dropout lines to add regularization between the layers.
model = tf.keras.Sequential()
model.add(Dense(units=100, input_dim=2, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# The held-out split is used as validation data to track loss/accuracy per epoch
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X_test[:, 0], X_test[:, 1], 
               dim_1_label, dim_2_label, y_test,
                title="Classification probabilities (dark is certain)");

# Overcomplex Model trained with Adam

In [0]:
%%time 
 
# Re-seed TensorFlow so that this run starts from the same random state as the other experiments
tf.random.set_seed(42)

EPOCHS=50  # Change to 50/100/... later   

# Adam constructed without arguments, i.e. with the library's default settings
optimizer = tf.keras.optimizers.Adam()     


from tensorflow.keras.layers import Dense    
    
# Deliberately oversized network: three hidden ReLU layers with 100 units each.
# Uncomment the Dropout lines to add regularization between the layers.
model = tf.keras.Sequential()
model.add(Dense(units=100, input_dim=2, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer=optimizer,
             metrics=['accuracy'])

# The held-out split is used as validation data to track loss/accuracy per epoch
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS, 
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X_test[:, 0], X_test[:, 1], 
               dim_1_label, dim_2_label, y_test,
                title="Classification probabilities (dark is certain)");

# Overcomplex Model trained with RAdam

In [0]:
%%time 
 
# Re-seed TensorFlow so that this run starts from the same random state as the other experiments
tf.random.set_seed(42)

EPOCHS=50 # Change to 50/100/... later 

# Most important parameters - add to optimizer if you like
# NOTE: these two values are only suggestions; they are NOT passed to the
# optimizer constructed below, so changing them has no effect as written
lr = 1e-2 # learning_rate
warmup_proportion = 0.001

# RAdam constructed without arguments, i.e. with the library's default settings
optimizer = tfa.optimizers.RectifiedAdam()

# Deliberately oversized network: three hidden ReLU layers with 100 units each.
# Uncomment the Dropout lines to add regularization between the layers.
model = tf.keras.Sequential()
model.add(Dense(units=100, input_dim=2, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=100, activation='relu'))
#model.add(Dropout(.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer=optimizer,
             metrics=['accuracy'])

# Uncomment to print the layer/parameter overview
#model.summary()

In [0]:
%%time 

history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS, 
                    callbacks=[KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X_test[:, 0], X_test[:, 1], 
               dim_1_label, dim_2_label, y_test,
                title="Classification probabilities (dark is certain)");

# Early Stopping with RAdam

* https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping

In [0]:
%%time 
 
tf.random.set_seed(42)

EPOCHS=200 # Change if required

optimizer = tfa.optimizers.RectifiedAdam()

# Choose with care (or not?)
PATIENCE = 10

# Stop once val_loss has not improved for PATIENCE epochs. restore_best_weights
# rolls the model back to the epoch with the lowest val_loss; without it the
# model keeps the weights of the last epoch, i.e. PATIENCE epochs past the best.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=PATIENCE,
                                                           restore_best_weights=True)

# Deliberately oversized network: three hidden ReLU layers with 100 units each
model = tf.keras.Sequential()
model.add(Dense(units=100, input_dim=2, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# EPOCHS is only an upper bound here, the callback may end training earlier
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    callbacks=[early_stopping_callback, KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X_test[:, 0], X_test[:, 1], 
               dim_1_label, dim_2_label, y_test,
                title="Classification probabilities (dark is certain)");

# Early Stopping with SGD

* https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping

In [0]:
%%time 

tf.random.set_seed(42)

EPOCHS=1250 # Change if required

optimizer = tf.keras.optimizers.SGD()

# Choose with care
PATIENCE = 10

# Stop once val_loss has not improved for PATIENCE epochs. restore_best_weights
# rolls the model back to the epoch with the lowest val_loss; without it the
# model keeps the weights of the last epoch, i.e. PATIENCE epochs past the best.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=PATIENCE,
                                                           restore_best_weights=True)

# Deliberately oversized network: three hidden ReLU layers with 100 units each
model = tf.keras.Sequential()
model.add(Dense(units=100, input_dim=2, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# EPOCHS is only an upper bound here, the callback may end training earlier
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    callbacks=[early_stopping_callback, KerasLearningCurve()],
                    verbose=0)

In [0]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
loss, accuracy

In [0]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy

In [0]:
# Range of epochs to display. None as the end keeps the final epoch;
# an end index of -1 would cut the last recorded epoch off the curves.
START_EPOCHE = 0
END_EPOCHE = None

plt.yscale('log')
plt.ylabel("loss")
plt.xlabel("epochs")
plt.title('Loss over time')

plt.plot(history.history['loss'][START_EPOCHE:END_EPOCHE]);
plt.plot(history.history['val_loss'][START_EPOCHE:END_EPOCHE]);

plt.legend(['Training', 'Test']);

In [0]:
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over time')

plt.plot(history.history['accuracy'][START_EPOCHE:END_EPOCHE])
plt.plot(history.history['val_accuracy'][START_EPOCHE:END_EPOCHE])

plt.legend(['Training', 'Test']);

In [0]:
plotPrediction(model, X[:, 0], X[:, 1], 
               dim_1_label, dim_2_label, y,
                title="Classification probabilities (dark is certain)");

# Takeaways

### Simple Models and complex decision boundaries

* Simple models are not able to capture complex decision boundaries anyway
* SGD is sufficient, Adam should be an improvement (runtime, number of iterations)
* No reason to use RAdam, only slows down runtime
* Level of noise has no impact, there is not much to learn anyway 
* In general: Fewer iterations indicate better numerical stability



### Advanced Models and complex decision boundaries

* Complex models can easily capture complex decision boundaries
* SGD has a hard time learning complex boundaries, information flow through deep network is limited
* Adam / RAdam are to be preferred, they can learn complex decision boundaries easily
* BUT: In case of **noise** they tend to overfit/overtrain, while SGD has an "implicit regularization" property, i.e. it cannot really overtrain because it learns so slowly
* RAdam performs well, sometimes slightly better than Adam, but is slower (runtime twice as long on average)



### Early Stopping and Dropout

* Early Stopping is very useful for Adam and RAdam
* SGD can stop way too early (patience requires fine-tuning, which is not what you want to do)
* Dropout slows down learning for Adam/RAdam but improves robustness in case of noise (additional regularization)
* Dropout has no big impact on SGD here