## Nima Samadi
## Deep Learning HW3 - P2

In [1]:
import os
IS_COLAB_BACKEND = 'COLAB_GPU' in os.environ  # this is always set on Colab, the value is 0 or 1 depending on GPU presence
if IS_COLAB_BACKEND:
  from google.colab import auth
  # Authenticates the Colab machine and also the TPU using your
  # credentials so that they can access your private GCS buckets.
  auth.authenticate_user()

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
import seaborn as sn
import datetime
from sklearn.metrics import confusion_matrix

# Detect hardware
try:
  tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError:
  tpu_resolver = None
  gpus = tf.config.experimental.list_logical_devices("GPU")

# Select appropriate distribution strategy
if tpu_resolver:
  tf.config.experimental_connect_to_cluster(tpu_resolver)
  tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
  strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
  print('Running on TPU ', tpu_resolver.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
  strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
  print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
  strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
  print('Running on single GPU ', gpus[0].name)
else:
  strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
  print('Running on CPU')
  
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on single GPU  /device:GPU:0
Number of accelerators:  1


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/deepLearning
!ls

[Errno 2] No such file or directory: '/content/drive/MyDrive/deepLearning'
/content
drive  sample_data


## A)

Load dataset:

In [5]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


Load pretrained model:

In [6]:
# Cifar-10 images are (32, 32). ResNet input is (224, 224). 
# As it's wanted to train on Cifar-10, I change model input size
# and keep images' size
IMG_SHAPE = (32, 32, 3)
resnet50_base = tf.keras.applications.ResNet50(input_shape=IMG_SHAPE,
                                               include_top=False,
                                               weights='imagenet',
                                               pooling='avg')
resnet50_base.trainable = False
preproc_input = tf.keras.applications.resnet50.preprocess_input
# Data augmentation to improve results
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip('horizontal'),
    tf.keras.layers.RandomRotation(0.2)
])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Add one Dense layer on top of base model:

In [12]:
inputs = tf.keras.Input(shape=IMG_SHAPE)
inputs = data_augmentation(inputs)
inputs = tf.cast(inputs, tf.float32)
x = preproc_input(inputs)
x = resnet50_base(x, training=False)
outputs = tf.keras.layers.Dense(10, activation='relu')(x)

teacher_model = tf.keras.Model(inputs, outputs)

In [13]:
teacher_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 tf.__operators__.getitem_1   (None, 32, 32, 3)        0         
 (SlicingOpLambda)                                               
                                                                 
 tf.nn.bias_add_1 (TFOpLambd  (None, 32, 32, 3)        0         
 a)                                                              
                                                                 
 resnet50 (Functional)       (None, 2048)              23587712  
                                                                 
 dense_1 (Dense)             (None, 10)                20490     
                                                                 
Total params: 23,608,202
Trainable params: 20,490
Non-train

Train model:

In [15]:
lr = 1e-4
teacher_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=['accuracy'])

In [16]:
history = teacher_model.fit(x_train, y_train, epochs=50, batch_size=256, validation_data=(x_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [52]:
res = teacher_model.evaluate(x_test, y_test)



## B)

In [17]:
class Distiller(tf.keras.Model):
  def __init__(self, student, teacher):
    super().__init__()
    self.teacher = teacher
    self.student = student
  
  def compile(self, optimizer, metrics, student_loss_fn, distillation_loss_fn, alpha=0.1, temperature=3):
    super().compile(optimizer=optimizer, metrics=metrics)
    self.student_loss_fn = student_loss_fn
    self.distillation_loss_fn = distillation_loss_fn
    self.alpha = alpha
    self.temperature = temperature

  def train_step(self, data):
    x, y = data
    teacher_preds = self.teacher(x, training=False)
    with tf.GradientTape() as tape:
      student_preds = self.student(x, training=True)
      student_loss = self.student_loss_fn(y, student_preds)

      distillation_loss = (
          self.distillation_loss_fn(
              tf.nn.softmax(teacher_preds/self.temperature, axis=1),
              tf.nn.softmax(student_preds/self.temperature, axis=1)
          ) * self.temperature**2
      )
      loss = self.alpha*student_loss + (1-self.alpha)*distillation_loss

    trainable_vars = self.student.trainable_variables
    grads = tape.gradient(loss, trainable_vars)

    self.optimizer.apply_gradients(zip(grads, trainable_vars))
    self.compiled_metrics.update_state(y, student_preds)

    results = {m.name: m.result() for m in self.metrics}
    results.update(
        {"student_loss": student_loss, "distillation_loss": distillation_loss}
    )
    return results

  def test_step(self, data):
    x, y = data

    y_preds = self.student(x, training=False)
    student_loss = self.student_loss_fn(y, y_preds)
    self.compiled_metrics.update_state(y, y_preds)

    results = {m.name: m.result() for m in self.metrics}
    results.update({"student_loss": student_loss})

    return results

To download Resnet18 model, I use `image-classifiers` package

In [18]:
!pip install image-classifiers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting image-classifiers
  Downloading image_classifiers-1.0.0-py3-none-any.whl (19 kB)
Collecting keras-applications<=1.0.8,>=1.0.7
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-applications, image-classifiers
Successfully installed image-classifiers-1.0.0 keras-applications-1.0.8


In [19]:
from classification_models.tfkeras import Classifiers

In [20]:
resnet18, pre_input_resnet18 = Classifiers.get('resnet18')
resnet18_base = resnet18(IMG_SHAPE, weights=None, include_top=False)
# Note that I used None weights which means I only used the structure of resnet18

In [48]:
inputs = tf.keras.Input(shape=IMG_SHAPE)
inputs = tf.cast(inputs, tf.float32)
x = pre_input_resnet18(inputs)
x = resnet18_base(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(10, activation='relu')(x)
student_model = tf.keras.Model(inputs, outputs)

In [49]:
student_model.summary()

Model: "model_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_25 (InputLayer)       [(None, 32, 32, 3)]       0         
                                                                 
 model_2 (Functional)        (None, 1, 1, 512)         11186889  
                                                                 
 global_average_pooling2d_9   (None, 512)              0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_11 (Dense)            (None, 10)                5130      
                                                                 
Total params: 11,192,019
Trainable params: 11,184,077
Non-trainable params: 7,942
_________________________________________________________________


In [50]:
distiller = Distiller(student=student_model, teacher=teacher_model)
distiller.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
    student_loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    alpha=0.1,
    temperature=1
)

history = distiller.fit(x_train, y_train, epochs=20, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
res = distiller.evaluate(x_test, y_test)



## C) 
To explain differences fairly, same hyperparameters in part A) and B) are chosen for following parts.

In [55]:
inputs = tf.keras.Input(shape=IMG_SHAPE)
inputs = data_augmentation(inputs)
inputs = tf.cast(inputs, tf.float32)
x = pre_input_resnet18(inputs)
x = resnet18_base(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(10, activation='relu')(x)
scratch_student_model = tf.keras.Model(inputs, outputs)

In [56]:
lr = 1e-4
scratch_student_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                              metrics=['accuracy'])

In [57]:
scratch_student_model.summary()

Model: "model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_27 (InputLayer)       [(None, 32, 32, 3)]       0         
                                                                 
 model_2 (Functional)        (None, 1, 1, 512)         11186889  
                                                                 
 global_average_pooling2d_10  (None, 512)              0         
  (GlobalAveragePooling2D)                                       
                                                                 
 dense_12 (Dense)            (None, 10)                5130      
                                                                 
Total params: 11,192,019
Trainable params: 11,184,077
Non-trainable params: 7,942
_________________________________________________________________


In [58]:
history = scratch_student_model.fit(x_train, y_train, epochs=20, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [59]:
res = scratch_student_model.evaluate(x_test, y_test)



ResNet18 model that is trained from scratch achieves $58.6\%$ accuracy on the test set while the same model that was trained with knowledge distillation achieved $63.2\%$ accuracy on the test set. The main reason is the teacher model in knowledge distillation scenario. In that case, teacher model helps student model to update its paramater such that it minimizes sum of two losses. This will prevent overvitting that appears in training ResNet18 model from scratch and results in better performance on the test set.

 ## D)
In this part first the ResNet50 model is trained from scratch. Then it's used as teacher model in knowledge distillation. Again, ResNet18 model is used as the student model. Note that ResNet50 is trained on ImageNet which consists of 1000 classes. So, FC layer must be removed and replaced.

In [60]:
resnet50_base_none = tf.keras.applications.ResNet50(input_shape=IMG_SHAPE,
                                                    include_top=False,
                                                    weights=None,
                                                    pooling='avg')

In [61]:
inputs = tf.keras.Input(shape=IMG_SHAPE)
inputs = data_augmentation(inputs)
inputs = tf.cast(inputs, tf.float32)
x = preproc_input(inputs)
x = resnet50_base_none(x, training=True)
outputs = tf.keras.layers.Dense(10, activation='relu')(x)

resnet50_scratch = tf.keras.Model(inputs, outputs)

In [62]:
resnet50_scratch.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_30 (InputLayer)       [(None, 32, 32, 3)]       0         
                                                                 
 tf.__operators__.getitem_2   (None, 32, 32, 3)        0         
 (SlicingOpLambda)                                               
                                                                 
 tf.nn.bias_add_2 (TFOpLambd  (None, 32, 32, 3)        0         
 a)                                                              
                                                                 
 resnet50 (Functional)       (None, 2048)              23587712  
                                                                 
 dense_13 (Dense)            (None, 10)                20490     
                                                                 
Total params: 23,608,202
Trainable params: 23,555,082
Non-

In [63]:
lr = 1e-4
resnet50_scratch.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=['accuracy'])

In [64]:
history = resnet50_scratch.fit(x_train, y_train, epochs=50, batch_size=256, validation_data=(x_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [67]:
res = resnet50_scratch.evaluate(x_test, y_test)



Knowledge distillation:

In [65]:
inputs = tf.keras.Input(shape=IMG_SHAPE)
inputs = tf.cast(inputs, tf.float32)
x = pre_input_resnet18(inputs)
x = resnet18_base(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(10, activation='relu')(x)
student_model_with_resnet50_scratch = tf.keras.Model(inputs, outputs)

In [66]:
distiller = Distiller(student=student_model_with_resnet50_scratch, teacher=resnet50_scratch)
distiller.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
    student_loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    alpha=0.1,
    temperature=1
)

history = distiller.fit(x_train, y_train, epochs=20, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [68]:
res = distiller.evaluate(x_test, y_test)



As you can see, the teacher accuracy is lower than teacher model in part A which was linear tuned. The reason is that to train such big network, more epochs must be passed and regularization techniques must be used. But, to compare results fairly, I used same hyperparameters which results in lower accuracy.

When this teacher is used, it's obvious that we're going to have lower accuracy compared to part C. In this scenario, student's accuracy is $60.89\%$ but in part C, student's accuracy was $63.2\%$. Although there is difference in accuracy, but due to student model itself, the difference is not that high. 