# Deep Learning best practices

In [1]:
!pip install -q tensorflow==2.8.2

In [2]:
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Flatten, MaxPool2D, Lambda, Conv2D, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.applications import VGG19
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np
import cv2

Hyperparameters

In [3]:
epochs = 10
batch_size = 32
num_classes = 10

Data loading and preprocessing

In [4]:
(x_train, y_train), (x_test, y_test) = cifar10.load_data() 
train_shape = x_train.shape
test_shape = x_test.shape
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

#If you use a pretrained model, it might expect inputs to be in the [0-255] range, so you will need to comment this line
# x_train, x_test = x_train / 255, x_test / 255 

# ImageDataGenerator requires one hot encoding and 'categorical_crossentropy' loss 
y_train = np.eye(num_classes)[y_train.reshape(-1)]
y_test = np.eye(num_classes)[y_test.reshape(-1)]

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
(50000, 32, 32, 3) (50000, 1) (10000, 32, 32, 3) (10000, 1)


Making the model: each layer with neurons numbers (units), activation, use_bias, ....

In [25]:
from tensorflow.keras.layers import Resizing, RandomFlip, RandomCrop, RandomTranslation, RandomRotation, RandomZoom, RandomContrast

# Task-1 *Parameters Tunning*
#Tune number of units in each layer
#Tune the activation (sigmoid, relu, tanh, linear, selu, .. visit: https://goo.gl/hdtK15
def model():
    model = Sequential()

    # https://keras.io/guides/preprocessing_layers/
    # https://keras.io/api/layers/preprocessing_layers/image_augmentation/
    model.add(Resizing(224, 224, input_shape=(32, 32, 3)))
    model.add(RandomFlip("horizontal"))
    model.add(RandomTranslation(0.1, 0.1))
    model.add(RandomRotation(0.1))
    model.add(RandomZoom((-0.1, 0.1)))
    model.add(RandomContrast(0.1))

    pretrained = VGG19(input_shape=(224, 224, 3), weights='imagenet', include_top=False)
    pretrained.trainable = False
    model.add(pretrained)
    model.add(Dropout(0.2))

    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D((5, 5)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    # model.add(Flatten(input_shape=(32, 32, 3)))

    model.add(Dense(units=128, use_bias=True))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(units=64, use_bias=True))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(Dense(units=num_classes, use_bias=True, activation='softmax'))
    #Try to tune the optimizer, visit: https://goo.gl/dHFJNy
    #Try to tune the loss func, visit: https://goo.gl/xMrooU
    #Try to tune learning rate (lr)
    #In your free time take a look at different variations of GD: https://goo.gl/YFa6XY
    sgd_optimizer = SGD(learning_rate=.01)
    adam_optimizer = Adam(learning_rate=.01, clipnorm=1.)
    model.compile(optimizer=adam_optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [26]:
model = model()
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing_9 (Resizing)       (None, 224, 224, 3)       0         
                                                                 
 random_flip_8 (RandomFlip)  (None, 224, 224, 3)       0         
                                                                 
 random_translation_8 (Rando  (None, 224, 224, 3)      0         
 mTranslation)                                                   
                                                                 
 random_rotation_8 (RandomRo  (None, 224, 224, 3)      0         
 tation)                                                         
                                                                 
 random_zoom_7 (RandomZoom)  (None, 224, 224, 3)       0         
                                                                 
 random_contrast_7 (RandomCo  (None, 224, 224, 3)     

Training and Evaluation on test set.

In [27]:
# Task-2 *Early Stopping*
# Add Early stopping, to stop the training when the accuracy doesn't improve after 2 epochs
# and restore the model weights which produced best training accuracy.

# Task-3 *Validation Dataset generation*
# - Have a validation set split of 20%.
# - update the early stopping to stop on the validation accuracy.

early_stop = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.3)

model.fit(x_train, y_train, 
          validation_split=0.2,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stop, reduce_lr])

loss, acc = model.evaluate(x_test, y_test)
print("Loss:", loss, ", Accuracy:", acc)
model.save_weights(filepath="my_model_weights.hdf5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.4584525525569916 , Accuracy: 0.8402000069618225


## Tasks
- Apply Dropout after the dense layers.
- Apply BatchNormalization
- Put Early Stopping.
- Add a learning rate scheduler.
- Add gradient clipping.
- Use a pretrained model, e.g. VGG19 (optional).
- Train with Augmentation (optional).


## Questions
- Why do we need gradient clipping?

> If a model is very deep and has activations that are not limited from the upper side, like relu (in contrast with sigmoid or tanh). As you remember, the backpropagation starts from the end and goes forward layer by layer. That means the gradient is multiplied repeatedly on those positive numbers in each layer during backpropagation. These repeated multiplications may lead to gradient values becoming too large for stable training, that is called gradient explosion.
>
> A similar but opposite problem is when the weight are around zero -- the gradient can become too low for further layers to train, that is called gradient vanishing (residual connection is another solution to this problem). 
>
> Gradient clipping, which is limiting its norm or components by value, is a solution to both described problems.

- What are your steps if you are out of memory?

> If you are out of GPU memory, you could, firstly, try **decreasing the batch size**, however, you should not make it too small, as it will affect training. Secondly, you can **decrease your model dimensions**, if you don't have to stick to the given specs. Apart from these general points, there are some advanced techniques: **gradient accumulation** or **automatic mixed precision**. And of course, you can buy or reserve more GPU memory.

- How to select an optimal learning rate?

> There is no general way to select a learning rate, but there are a couple of strategies. First, you try a very high learning rate (i.e. 0.1-0.5), expecting a model to fastly get into a local optima. Then you decrease the learning rate exponentially (from 0.1 to 0.01, 0.001, ...) and check the training speed by looking at the loss value decreasing with each epoch. What we want is to select such learning rate that the loss decreases during all epochs. We cannot do it precisely, but can have up to 5 runs to estimate a good learning rate.

Another strategy is to use learning rate schedulers, the most popular is the exponential decay. We start with a high learning rate, then epoch by epoch (step by step) the learning rate decreases leading a model to a emperically good local optima.

- Is image augmentation more effective with 32x32 or 224x224 size and why?

> 224x224 models benefit more from image augmentaion because not only there are more variations of images that can be generated by augmentation algorithms, but also some of the algorithms can corrupt 32x32 images, for instance, rotating such small image by 10 degrees. So, only a small portion of image augmentation techniques are beneficial for 32x32 images.

