In [1]:
# IMPORTS

import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras import layers, models
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from keras import metrics, losses
from keras.models import load_model
from sklearn.model_selection import train_test_split
from PIL import Image
import os
import random


In [2]:
# Load both the teacher and student model

scratch_student = load_model('student_model.h5')
student_model = load_model('student_model.h5')
teacher_model =  load_model('teacher_model.h5')




In [4]:
# Load train from v2(1000), test from google+bing(1000)
train_dir = "v2(1000)"
test_dir = "google+bing(1000)"
img_size = (32, 32)

classes = sorted(os.listdir(train_dir))  # e.g., airplane, automobile, etc.
class_indices = {cls: idx for idx, cls in enumerate(classes)}

# Mapping from v2 class names to corresponding test folder names
class_folder_map = {
    'airplane': 'airplanes_square',
    'automobile': 'automobiles_square',
    'ship': 'ships_square (1)',
    'dog': 'dog_square',
    'deer': 'deer_square',
    'cat': 'cat',
    'truck': 'truck',
    'bird': 'bird',
    'horse': 'horse',
    'frog': 'frog'
}

# Load training images
X_train, y_train = [], []
for cls in classes:
    cls_path = os.path.join(train_dir, cls)
    images = sorted(os.listdir(cls_path))
    selected = random.sample(images, 80)
    for fname in selected:
        img = Image.open(os.path.join(cls_path, fname)).convert("RGB").resize(img_size)
        X_train.append(np.array(img))
        y_train.append(class_indices[cls])

# Load testing images
X_test, y_test = [], []
for cls in classes:
    test_cls_folder = class_folder_map[cls]
    cls_path = os.path.join(test_dir, test_cls_folder)
    images = sorted(os.listdir(cls_path))
    selected = random.sample(images, 20)
    for fname in selected:
        img = Image.open(os.path.join(cls_path, fname)).convert("RGB").resize(img_size)
        X_test.append(np.array(img))
        y_test.append(class_indices[cls])

# Normalize and encode
X_train = np.array(X_train).astype("float32") / 255.0
X_test = np.array(X_test).astype("float32") / 255.0
train_labels = to_categorical(np.array(y_train), num_classes=10)
test_labels = to_categorical(np.array(y_test), num_classes=10)

print("Training images shape:", X_train.shape)
print("Testing images shape:", X_test.shape)


Training images shape: (800, 32, 32, 3)
Testing images shape: (200, 32, 32, 3)


In [5]:
# Compute student model metrics without KD

scratch_student.compile(optimizer = 'sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [6]:
# First, let us try to see what if we directly train the student model without using knowledge distillation

scratch_student.fit(X_train, train_labels, epochs=7, batch_size=32)


Epoch 1/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.0863 - loss: 3.5632
Epoch 2/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.1378 - loss: 2.9903
Epoch 3/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.1721 - loss: 2.8072
Epoch 4/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.2137 - loss: 2.4768
Epoch 5/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.2219 - loss: 2.4422
Epoch 6/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.2338 - loss: 2.2568
Epoch 7/7
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.3176 - loss: 2.0376


<keras.src.callbacks.history.History at 0x23edaa41180>

In [7]:
# We evaluate student model for its loss and accuracy, if the student model is trained without using knowledge distillation

scratch_student.evaluate(X_test, test_labels)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0783 - loss: 3.0053    


[2.915072441101074, 0.1550000011920929]

In [8]:
# KNOWLEDGE DISTILLATION CLASS, You can adjust alpha based on how much you want the student to learn from the teacher

class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.2,
        temperature=3,
    ):
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def compute_loss(
        self, x=None, y=None, y_pred=None, sample_weight=None, allow_empty=False
    ):
        teacher_pred = self.teacher(x, training=False)
        student_loss = self.student_loss_fn(y, y_pred)

        distillation_loss = self.distillation_loss_fn(
            tf.nn.softmax(teacher_pred / self.temperature, axis=1),
            tf.nn.softmax(y_pred / self.temperature, axis=1),
        ) * (self.temperature**2)

        loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
        return loss

    def call(self, x):
        return self.student(x)


In [9]:
# Initialize the distiller
# Train the student model using knowledge distillation

distiller = Distiller(student=student_model, teacher=teacher_model)

distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[metrics.CategoricalAccuracy()],
    student_loss_fn=losses.CategoricalCrossentropy(),
    distillation_loss_fn=losses.CategoricalCrossentropy(),
    alpha=0.2,
    temperature=1,
) 

# Fitting the student model receiving KD
history = distiller.fit(
    X_train,
    train_labels,
    epochs=7,
    batch_size=32,  
    validation_split=0.2,
)


Epoch 1/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 76ms/step - categorical_accuracy: 0.1172 - loss: 2.2770 - val_categorical_accuracy: 0.0000e+00 - val_loss: 2.7901
Epoch 2/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - categorical_accuracy: 0.1070 - loss: 2.2549 - val_categorical_accuracy: 0.0000e+00 - val_loss: 2.8581
Epoch 3/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - categorical_accuracy: 0.1103 - loss: 2.2504 - val_categorical_accuracy: 0.0000e+00 - val_loss: 2.9549
Epoch 4/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - categorical_accuracy: 0.1429 - loss: 2.2357 - val_categorical_accuracy: 0.0000e+00 - val_loss: 3.9543
Epoch 5/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - categorical_accuracy: 0.1885 - loss: 2.2348 - val_categorical_accuracy: 0.0000e+00 - val_loss: 3.2993
Epoch 6/7
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [10]:
# We evaluate student model again for its loss and accuracy,
# But this time the student model is trained using knowledge distillation
# You can compare this results with the results above

distiller.evaluate(X_test, test_labels)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 119ms/step - categorical_accuracy: 0.1744 - loss: 2.3130


[2.4698023796081543, 0.14000000059604645]