In [9]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
sys.path.append("..") # Adds higher directory to python modules path.
import branchingdnn as branching

In [10]:
class Distiller(keras.Model):
    """Knowledge-distillation trainer that fits a student to a teacher's soft targets.

    Only the student's trainable variables are updated; the teacher is called
    in inference mode and its weights are never touched here. ``train_step``
    applies a temperature-scaled softmax to the outputs of both models, so
    both are assumed to emit raw logits (no final softmax layer).
    """

    def __init__(self, student, teacher):
        """Store the two models.

        Args:
            student: Keras model to be trained.
            teacher: Keras model whose predictions supply the soft targets.
        """
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions. The
                distillation term is additionally multiplied by
                temperature**2 inside `train_step`.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        """Run one optimisation step on the student.

        Args:
            data: an ``(x, y)`` pair as yielded by ``fit``.

        Returns:
            Dict with the compiled metrics plus ``student_loss`` and the
            (unscaled) ``distillation_loss`` for this batch.
        """
        # Unpack data
        x, y = data

        # The teacher only provides targets: inference mode, outside the tape.
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Hard-label loss on the raw student outputs.
            student_loss = self.student_loss_fn(y, student_predictions)
            # Soft-label loss between the temperature-softened distributions.
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            # Gradients of the softened term shrink roughly as 1/T^2, so it is
            # rescaled by T^2 to keep both terms comparable when the
            # temperature changes. This replaces a hard-coded factor of 100,
            # which equals T^2 only for temperature=10.
            loss = (
                self.alpha * student_loss
                + (1 - self.alpha) * distillation_loss * self.temperature ** 2
            )

        # Compute gradients with respect to the student only.
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student on one batch; the teacher is not involved.

        Args:
            data: an ``(x, y)`` pair as yielded by ``evaluate``.

        Returns:
            Dict with the compiled metrics followed by ``student_loss``.
        """
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [3]:
def _mnist_convnet(model_name, first_filters, second_filters):
    """Build the two-convolution 28x28x1 classifier shared by teacher and student.

    The two models differ only in their name and in the number of filters of
    the two convolutions; the final Dense(10) has no activation.
    """
    return keras.Sequential(
        [
            keras.Input(shape=(28, 28, 1)),
            layers.Conv2D(first_filters, (3, 3), strides=(2, 2), padding="same"),
            layers.LeakyReLU(alpha=0.2),
            layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
            layers.Conv2D(second_filters, (3, 3), strides=(2, 2), padding="same"),
            layers.Flatten(),
            layers.Dense(10),
        ],
        name=model_name,
    )


# Create the teacher (wide variant)
teacher = _mnist_convnet("teacher", 256, 512)

# Create the student (narrow variant)
student = _mnist_convnet("student", 16, 32)

# Clone student for later comparison
student_scratch = keras.models.clone_model(student)

In [13]:
# Prepare the train and test dataset.
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Normalize data: cast to float32, divide by 255 and append a channel axis
# so each split has shape (-1, 28, 28, 1).
x_train = (x_train.astype("float32") / 255.0).reshape(-1, 28, 28, 1)
x_test = (x_test.astype("float32") / 255.0).reshape(-1, 28, 28, 1)



In [4]:
# Build the CIFAR-10 pipelines through the project's branchingdnn helper.
# NOTE(review): the positional values 32, 5000 and 22500 are presumably the
# batch size and split/shuffle sizes, and (227, 227) the resize target --
# confirm against branching.dataset.prepare.dataset.
cifar10_arrays = tf.keras.datasets.cifar10.load_data()
dataset = branching.dataset.prepare.dataset(
    cifar10_arrays,
    32,
    5000,
    22500,
    (227, 227),
    include_targets=False,
    categorical=True,
)
# The helper's result is unpacked in this order: train, test, validation.
train_ds, test_ds, validation_ds = dataset

augment Dataset
targetsis : False
trainSize 45000
testSize 10000


In [5]:
# Load the saved teacher model from disk and evaluate it on the test split.
teacher_checkpoint = "models/alexNetv6_logits_teacher.hdf5"
model_teacher = tf.keras.models.load_model(teacher_checkpoint)
model_teacher.evaluate(test_ds)



[0.6905297040939331, 0.7939703464508057]

In [14]:
# NOTE(review): this cell is fully commented out, so the epoch log and the
# evaluation result recorded beneath it come from an earlier run in which the
# code was active. On a fresh kernel the MNIST `teacher` stays untrained
# unless these lines are re-enabled.
# # Train teacher as usual
# teacher.compile(
#     optimizer=keras.optimizers.Adam(),
#     loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=[keras.metrics.SparseCategoricalAccuracy()],
# )

# # Train and evaluate teacher on data.
# teacher.fit(x_train, y_train, epochs=5)
# teacher.evaluate(x_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.11047188937664032, 0.9714999794960022]

In [32]:
# Initialize and compile distiller
# The same MNIST distillation experiment is repeated six times, each time with
# a freshly initialised student, to show the run-to-run spread. The loop
# index itself is not used.
for i in range(6):
    # The input shape must match the MNIST arrays fed to fit()/evaluate()
    # below, which are reshaped to (-1, 28, 28, 1) in the data-preparation
    # cell. It was previously declared as (227, 227, 1).
    student = keras.Sequential(
        [
            keras.Input(shape=(28, 28, 1)),
            layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
            layers.LeakyReLU(alpha=0.2),
            layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
            layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
            layers.Flatten(),
            layers.Dense(10),
        ],
        name="student",
    )

#     print("alpha: ",(i+1)/10)
    # Use the MNIST `teacher` (28x28x1 input). `model_teacher` is the saved
    # model evaluated on the 227x227 CIFAR pipeline and cannot consume these
    # batches.
    # NOTE(review): `teacher` has to be trained first (its training cell is
    # currently commented out) for the soft targets to be meaningful.
    distiller = Distiller(student=student, teacher=teacher)
    distiller.compile(
        optimizer=keras.optimizers.Adam(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
        student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        distillation_loss_fn=keras.losses.KLDivergence(),
        alpha=0.3,
        temperature=10,
    )
    # Distill teacher to student
    distiller.fit(x_train, y_train, epochs=3, verbose=0)
    # Evaluate student on test dataset
    print("results: ",distiller.evaluate(x_test, y_test))

results:  [0.9799000024795532, 0.0016497487667948008]
results:  [0.9763000011444092, 0.0008576142136007547]
results:  [0.9764000177383423, 0.002193244406953454]
results:  [0.9769999980926514, 9.805313311517239e-05]
results:  [0.9753999710083008, 0.0012869059573858976]
results:  [0.9769999980926514, 3.976887819590047e-05]


Train the student model with the teacher model supplying additional loss signals.
---

In [6]:
# Initialize and compile distiller
# for i in range(3):

# AlexNet-style student for the 227x227x3 pipeline.
# The classifier head emits raw logits (no softmax): the student loss below is
# built with from_logits=True and Distiller.train_step applies its own
# temperature-scaled softmax, so a softmax layer here would be applied twice.
inputs = keras.Input(shape=(227,227,3))
x = keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu')(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(10)(x)  # logits
student = keras.Model(inputs=inputs, outputs=[x], name="alexnet")

# Alternative, much smaller student kept for reference:
# student = keras.Sequential(
#         [
#             keras.Input(shape=(227, 227, 3)),
#             layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
#             layers.LeakyReLU(alpha=0.2),
#             layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
#             layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
#             layers.Flatten(),
#             layers.Dense(10),
#         ],
#         name="student",
#     )

distiller = Distiller(student=student, teacher=model_teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)
# Distill teacher to student
distiller.fit(train_ds, epochs=6,verbose=1)
# Evaluate student on test dataset; evaluate() reports the compiled metric
# followed by the student loss.
print("results: ",distiller.evaluate(test_ds))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
results:  [0.5899438858032227, 1.2809326648712158]


In [6]:
# Initialize and compile distiller
# for i in range(3):

# Reduced student: only the first convolution stage of the AlexNet layout is
# active; the remaining stages are kept below as comments.
inputs = keras.Input(shape=(227,227,3))
x = keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu')(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
# x = keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same")(x)
# x = keras.layers.BatchNormalization()(x)
# x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
# x = keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same")(x)
# x = keras.layers.BatchNormalization()(x)
# x = keras.layers.Conv2D(filters=384, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
# x = keras.layers.BatchNormalization()(x)
# x = keras.layers.Conv2D(filters=256, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
# x = keras.layers.BatchNormalization()(x)
# x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
# NOTE(review): flattening here yields 27*27*96 = 69984 features, so the first
# Dense(4096) below holds a 69984x4096 weight matrix (~287M values). That is
# the tensor the recorded ResourceExhaustedError failed to allocate; the
# narrower Dense layers commented out below, or re-enabling the pooling
# stages above, would shrink it.
x = keras.layers.Flatten()(x)
# x = keras.layers.Dense(124, activation='relu')(x)
# x = keras.layers.Dense(64, activation='relu')(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dense(4096, activation='relu')(x)
# Raw logits (no softmax): the student loss uses from_logits=True and
# Distiller.train_step applies its own temperature-scaled softmax.
x = keras.layers.Dense(10)(x)
student = keras.Model(inputs=inputs, outputs=[x], name="alexnet")

# student = keras.Sequential(
#         [
#             keras.Input(shape=(227, 227, 3)),
#             layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
#             layers.LeakyReLU(alpha=0.2),
#             layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
#             layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
#             layers.Flatten(),
#             layers.Dense(10),
#         ],
#         name="student",
#     )

distiller = Distiller(student=student, teacher=model_teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=.1,
    temperature=10,
)
print(distiller.alpha)
# Distill teacher to student
distiller.fit(train_ds, epochs=12,verbose=1)
# Evaluate student on test dataset
print("results: ",distiller.evaluate(test_ds))

0.1
Epoch 1/12


ResourceExhaustedError:  OOM when allocating tensor with shape[69984,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/alexnet/dense_3/MatMul_1 (defined at <ipython-input-2-51b0ad79b8c8>:56) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_138672]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/alexnet/dense_3/MatMul_1:
 alexnet/flatten_1/Reshape (defined at <ipython-input-2-51b0ad79b8c8>:44)

Function call stack:
train_function


Results from running (run number, then `[categorical accuracy, student loss]` as returned by `distiller.evaluate`):  <br>
1 [0.5782251358032227, 1.4982712268829346] <br>
2 [0.5210336446762085, 1.799507737159729] <br>
3 [0.5548878312110901, 1.3219722509384155] <br>
4 [0.5767227411270142, 1.3181259632110596] <br>
<br>
5 [0.5558894276618958, 1.3667815923690796] <br>
6 [0.5221354365348816, 1.5140984058380127] <br>
7 [0.5203325152397156, 1.4471924304962158] <br>
8 [0.5422676205635071, 1.8327810764312744] <br>


12 epochs <br>
0.59,
0.58, 0.588

Now train the same student model without the teacher
----

In [8]:
# Train student as done usually (hard labels only, no teacher)
# for i in range(6):
inputs = keras.Input(shape=(227,227,3))
x = keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu')(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
# Raw logits (no softmax): the loss below is configured with from_logits=True
# and would otherwise apply a second softmax on top of the layer's own.
x = keras.layers.Dense(10)(x)
student_scratch = keras.Model(inputs=inputs, outputs=[x], name="alexnet")

# student_scratch = keras.Sequential(
#     [
#         keras.Input(shape=(227, 227, 3)),
#         layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
#         layers.LeakyReLU(alpha=0.2),
#         layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
#         layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
#         layers.Flatten(),
#         layers.Dense(10),
#     ],
#     name="student",
# )
student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.CategoricalAccuracy()],
)

# Train and evaluate student trained from scratch.
student_scratch.fit(train_ds, epochs=6,verbose=1)
#     student_scratch.evaluate(x_test, y_test)
# evaluate() on a compiled model reports the loss followed by the metric.
print("results: ",student_scratch.evaluate(test_ds))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
results:  [1.0825001001358032, 0.6264022588729858]


Results from running: <br>
1 [0.5782251358032227, 1.4982712268829346] <br>
2 [1.342477798461914, 0.5252403616905212] <br>
3 0.5658053159713745 <br>
4 0.5724158883094788 <br>


In [50]:
# Re-evaluate the from-scratch student on the test split and print the result.
scratch_results = student_scratch.evaluate(test_ds)
print("results: ", scratch_results)

results:  [1.6138664484024048, 0.39222756028175354]


In [19]:
# Baseline: the small 28x28x1 student trained on the hard labels only.
student_2_layers = [
    keras.Input(shape=(28, 28, 1)),
    layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
    layers.LeakyReLU(alpha=0.2),
    layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
    layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
    layers.Flatten(),
    layers.Dense(10),  # no activation: the loss below takes logits
]
student_2 = keras.Sequential(student_2_layers, name="student")

student_2.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# Train and evaluate student trained from scratch.
student_2.fit(x_train, y_train, epochs=3)
# Left as the last expression so the notebook displays the evaluation result.
student_2.evaluate(x_test, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.06662680953741074, 0.9783999919891357]