## **Gradient Vanishing and ResNet**

This Colab file demonstrates the vanishing gradient problem and how a ResNet can overcome it.

Some of the code in this Colab file is based on the examples in https://machinelearningmastery.com/how-to-fix-vanishing-gradients-using-the-rectified-linear-activation-function/



In [None]:
# Check version of Keras and Tensorflow
import tensorflow as tf
import keras
print('Keras version:', keras.__version__)
print('Tensorflow version:', tf.__version__)

### **1. Data preparation**

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Create a working folder and cd to it.
!mkdir -p /content/drive/MyDrive/Learning/EIE4122/lab3
%cd /content/drive/MyDrive/Learning/EIE4122/lab3

In [None]:
# Import packages
from sklearn.datasets import make_circles, make_moons
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras.optimizers import SGD
from keras.layers import Dense
from keras.models import Sequential
from keras.initializers import RandomUniform
from keras.callbacks import TensorBoard


In [None]:
# Create two 2D spirals
import numpy as np
def twospirals(n_points, noise=.5):
    """Generate two interleaved 2D spirals, one per class.

    Args:
        n_points: Number of points to draw for EACH spiral.
        noise: Scale of the uniform random offset (drawn from [0, noise))
            that is added to the x and y coordinates of the first spiral.

    Returns:
        A tuple ``(points, labels)``. ``points`` has shape
        ``(2 * n_points, 2)``: the first ``n_points`` rows are the first
        spiral and the remaining rows are the same points negated (the
        second spiral). ``labels`` has shape ``(2 * n_points,)`` and holds
        0.0 for the first spiral and 1.0 for the second.
    """
    # Angle in radians (at most 600 degrees); it is also used as the radius,
    # which is what makes the curve spiral outwards.
    theta = np.sqrt(np.random.rand(n_points,1)) * 600 * (2*np.pi)/360
    x_coord = -np.cos(theta)*theta + np.random.rand(n_points,1) * noise
    y_coord = np.sin(theta)*theta + np.random.rand(n_points,1) * noise

    first_arm = np.hstack((x_coord, y_coord))
    # The second spiral is the first one mirrored through the origin.
    points = np.vstack((first_arm, -first_arm))
    labels = np.concatenate((np.zeros(n_points), np.ones(n_points)))
    return (points, labels)

In [None]:
# Generate the 2D two-class dataset: twospirals(500) returns 2*500 = 1000 points (500 per spiral/class)
X, y = twospirals(500, noise=1.5)
# Rescale each of the two coordinates to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(X)

In [None]:
# Plot the two classes using two different colors
# (each scatter() call picks the next color of matplotlib's default color cycle)
from matplotlib import pyplot
for class_id in range(2):
  mask = (y == class_id)  # boolean row selector for the samples of this class
  pyplot.scatter(X[mask, 0], X[mask, 1], label=f"Class {class_id}")
pyplot.legend()
pyplot.show()

In [None]:
# Shuffle the samples, then use the first half for training and the second half for testing.
# Shuffling comes first because twospirals() returns all class-0 points before the class-1 points.
from sklearn.utils import shuffle
X, y = shuffle(X, y)
n_train = len(y) // 2
trainX, trainy = X[:n_train], y[:n_train]
testX, testy = X[n_train:], y[n_train:]

### **2. Network definition and training**

In [None]:
# Define and compile a deep feedforward net, using 'tanh' as non-linear activation function.
# The input shape is taken from the Input tensor, so the Dense layers are not given
# `input_dim` (it is redundant in a functional model and recent Keras versions warn about it).
act = 'tanh'
n_stacked = 30  # number of 5-unit hidden layers stacked on top of L1; vary this for Section 4
x_in = tf.keras.layers.Input((2,), name='Input')
x = Dense(5, activation=act, name='L1')(x_in)
for l in range(2, n_stacked + 2):  # hidden layers are named L2, L3, ...
  x = Dense(5, activation=act, name=f'L{l}')(x)
x_out = Dense(1, activation='sigmoid', name='Output')(x)  # single sigmoid unit for the two classes
ffnet = tf.keras.models.Model(inputs=x_in, outputs=x_out, name="FFNet")
opt = SGD(learning_rate=0.01, momentum=0.9)
ffnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
#ffnet.summary()
# Keep a copy of the untrained weights so that a later run can restart from the same initialization
ffnet_init_weights = ffnet.get_weights()
print(f"No. of layers = {len(ffnet.layers)}")

In [None]:
# Train the feedforward net without Tensorboard
ffnet_hist = ffnet.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)

In [None]:
# Report the final train/test accuracy of the ffnet, then plot accuracy against epoch.
# evaluate() returns the loss followed by the 'accuracy' metric the model was compiled with.
train_loss, train_acc = ffnet.evaluate(trainX, trainy, verbose=0)
test_loss, test_acc = ffnet.evaluate(testX, testy, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
# One curve per recorded metric: training accuracy and validation (test) accuracy
history = ffnet_hist.history
for metric, curve_label in (('accuracy', 'train'), ('val_accuracy', 'test')):
  pyplot.plot(history[metric], label=curve_label)
pyplot.xlabel('epoch')
pyplot.ylabel('accuracy')
pyplot.legend()
pyplot.show()

In [None]:
# Define and compile a ResNet, using 'tanh' as non-linear act function.
# The input shape is taken from the Input tensor, so the Dense layers are not given
# `input_dim` (it is redundant in a functional model and recent Keras versions warn about it).
import tensorflow as tf
from keras.layers import Add
act = 'tanh'
n_res_blocks = 25  # number of residual blocks (two Dense layers each); vary this for Section 4
x_in = tf.keras.layers.Input((2,), name='Input')
x = Dense(5, activation=act, name='L1')(x_in)
x = Dense(5, activation=act, name='L2')(x)
l = 3  # index used to name the next Dense layer
for _ in range(n_res_blocks):
  # Residual block: BatchNorm -> Dense -> Dense, then add the block's (pre-BatchNorm) input back
  x_skip = x
  x = keras.layers.BatchNormalization()(x)
  x = Dense(5, activation=act, name=f'L{l}')(x)
  x = Dense(5, activation=act, name=f'L{l+1}')(x)
  x = Add()([x, x_skip])
  l += 2
x = Dense(5, activation=act, name=f'L{l}')(x)
x_out = Dense(1, activation='sigmoid', name='Output')(x)  # single sigmoid unit for the two classes
resnet = tf.keras.models.Model(inputs=x_in, outputs=x_out, name="ResNet")
opt = SGD(learning_rate=0.01, momentum=0.9)
resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
#resnet.summary()
# Keep a copy of the untrained weights so that a later run can restart from the same initialization
resnet_init_weights = resnet.get_weights()
print(f"No. of layers = {len(resnet.layers)}")

In [None]:
# Train the resnet without Tensorboard
resnet_hist = resnet.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)

In [None]:
# Report the final train/test accuracy of the resnet, then plot accuracy against epoch.
# evaluate() returns the loss followed by the 'accuracy' metric the model was compiled with.
train_loss, train_acc = resnet.evaluate(trainX, trainy, verbose=0)
test_loss, test_acc = resnet.evaluate(testX, testy, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
# One curve per recorded metric: training accuracy and validation (test) accuracy
history = resnet_hist.history
for metric, curve_label in (('accuracy', 'train'), ('val_accuracy', 'test')):
  pyplot.plot(history[metric], label=curve_label)
pyplot.xlabel('epoch')
pyplot.ylabel('accuracy')
pyplot.legend()
pyplot.show()

### **3. Inspect the network using TensorBoard**

In [None]:
import tensorflow as tf

# keep track of the gradients using TensorBoard
bce = tf.keras.losses.BinaryCrossentropy()
class ExtendedTensorBoard(tf.keras.callbacks.TensorBoard):
  """TensorBoard callback that additionally logs gradient histograms.

  Besides everything the parent callback records, the gradient of the binary
  cross-entropy loss (module-level `bce`) over the whole training set
  (module-level `trainX` / `trainy`) with respect to every trainable weight is
  written as a histogram, on the same schedule as `histogram_freq`.
  """

  def _log_gradients(self, epoch):
    """Write one gradient histogram per trainable weight, tagged with `epoch` as the step."""
    step = tf.cast(epoch, dtype=tf.int64)
    train_writer = self._train_writer
    # Targets as a float32 column vector
    targets = np.asarray(trainy).astype('float32').reshape((-1,1))

    with train_writer.as_default(), tf.GradientTape() as tape:
      predictions = self.model(trainX)
      loss = bce(y_true=targets, y_pred=predictions)
      grads_per_weight = tape.gradient(loss, self.model.trainable_weights) # back-propagation

      # In "eager" mode the gradient tensors carry no name, so each histogram is
      # tagged with the path of the weight the gradient belongs to.
      # https://jonathan-hui.medium.com/tensorflow-eager-execution-v-s-graph-tf-function-6edaa870b1f1
      for weight, grad in zip(self.model.trainable_weights, grads_per_weight):
        tag = weight.path.replace('/', '_')+'_grads'
        tf.summary.histogram(tag, data=grad, step=step)

    train_writer.flush()

  def on_epoch_end(self, epoch, logs=None):
    """Run the parent's end-of-epoch logging, then log gradients when a histogram is due."""
    # The parent implementation must still run; this override only adds to it.
    super().on_epoch_end(epoch, logs=logs)
    freq = self.histogram_freq
    if not freq or epoch % freq != 0:
      return
    self._log_gradients(epoch)

In [None]:
!rm -rf ./logs/ffnet # Delete the logs from the previous run

In [None]:
# Train the ffnet and use TensorBoard callback to log training info
# histogram_freq=1 makes ExtendedTensorBoard write its histograms (including the gradients) every epoch
ffnet_tb = ExtendedTensorBoard(log_dir='./logs/ffnet', histogram_freq=1) # you can change the log directory to store info for different models
# Restore the weights saved right after the model was built, so this run restarts from the same initialization.
# NOTE(review): set_weights() restores the model weights only; if the model was already trained in an earlier
# cell, the optimizer state (e.g. SGD momentum) presumably carries over -- confirm, and recompile if a
# completely fresh start is needed.
ffnet.set_weights(ffnet_init_weights)
ffnet.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0, callbacks=[ffnet_tb])

Inspect the **HISTOGRAMS** tab in TensorBoard to see how the weights in the bottom layers change during training. In the histograms, the x-axis is the weight value, the y-axis is the epoch number, and the z-axis is the frequency of occurrence of each weight value.

Inspect the **DISTRIBUTIONS** tab in TensorBoard to see how the distributions of the weights and gradients at different layers evolve during training. In the figures, the x-axis is the epoch, and the y-axis is the weight or gradient value.


In [None]:
# Use Tensorboard to inspect the training info and gradients
%reload_ext tensorboard
%tensorboard --logdir='./logs/ffnet/'

In [None]:
# Delete logs from previous runs
!rm -rf ./logs/resnet

In [None]:
# Train the resnet and use TensorBoard callback to log training info
# histogram_freq=1 makes ExtendedTensorBoard write its histograms (including the gradients) every epoch
resnet_tb = ExtendedTensorBoard(log_dir='./logs/resnet', histogram_freq=1)
# Restore the weights saved right after the model was built, so this run restarts from the same initialization.
# NOTE(review): set_weights() restores the model weights only; if the model was already trained in an earlier
# cell, the optimizer state (e.g. SGD momentum) presumably carries over -- confirm, and recompile if a
# completely fresh start is needed.
resnet.set_weights(resnet_init_weights)
resnet.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0, callbacks=[resnet_tb])

In [None]:
# Use Tensorboard to inspect the training info and gradients
%reload_ext tensorboard
%tensorboard --logdir='./logs/resnet/' --port=8008

### **<font color="red">4. Further investigations</font>**

1.   Change the number of layers in *ffnet* to find the maximum number of layers that a feedforward network can have without suffering from the vanishing gradient problem.
2.   Change the number of layers in *resnet* to find the maximum number of layers that a residual network can have without suffering from the vanishing gradient problem.
3.   Change the 'tanh' to 'relu' and repeat (1) and (2) to see if ReLU can help mitigate the vanishing gradient problem.

### **<font color="red">5. What to include in your report</font>**
1.   Your discussion/observations on the "further investigations" suggested in Section 4.
2.   The screenshots of the histograms of the weights at the bottom and the upper layers.
3.   An illustration of the structure of the ResNet (you can infer this from the code or use resnet.summary()).
4.   An explanation of why the ResNet can be very deep without suffering from the vanishing gradient problem.
5.   An explanation of why ReLU can help mitigate the vanishing gradient problem.

