In [11]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
sys.path.append("..") # Adds higher directory to python modules path.
import branchingdnn as branching

In [22]:
class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = tf.keras.models.clone_model(student)

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            student_loss = student_loss #* self.alpha
            distillation_loss = (distillation_loss *1000) #* (1 - self.alpha)
            #loss = self.alpha * student_loss + ((1 - self.alpha) * (distillation_loss *1000))
            loss=student_loss +distillation_loss
        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [14]:
dataset = branching.dataset.prepare.dataset(tf.keras.datasets.cifar10.load_data(),32,5000,22500,(227,227), include_targets=False, categorical=True)
train_ds, test_ds, validation_ds = dataset

augment Dataset
targetsis : False
trainSize 45000
testSize 10000


In [15]:
model_teacher = tf.keras.models.load_model("models/alexNetv6_logits_teacher.hdf5")
model_teacher.evaluate(test_ds)



[0.6905280947685242, 0.7939703464508057]

Train the student model with the teacher model supplying additional loss signals.
---

In [16]:
inputs = keras.Input(shape=(227,227,3))
x = keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu', input_shape=(227,227,3))(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(10, activation='softmax')(x)
student = keras.Model(inputs=(inputs), outputs=[x], name="alexnet")
student.compile( loss=keras.losses.CategoricalCrossentropy(from_logits=False),
            optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
            metrics=[keras.metrics.CategoricalAccuracy()])
student.save("student.hdf5")

In [28]:
# Initialize and compile distiller
for i in range(5):
    print("attempt ",i+1)


# for i in range (10):
    student_copy = tf.keras.models.load_model("student.hdf5")
    # student_copy.compile( loss=keras.losses.CategoricalCrossentropy(from_logits=False),
    #         optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
    #         metrics=[keras.metrics.CategoricalAccuracy()],
    #     )
    student_copy.evaluate(test_ds)
    student_copy.fit(train_ds,validation_data = validation_ds, epochs=6,verbose=1)
    student_copy.evaluate(test_ds)


attempt  1
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
attempt  2
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
attempt  3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
attempt  4
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
attempt  5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [27]:
# Initialize and compile distiller
# for i in range(3):



# for i in range (10):
loaded_student = tf.keras.models.load_model("student.hdf5")
student_copy = loaded_student
student_copy.compile( loss=keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
        metrics=[keras.metrics.CategoricalAccuracy()],
    )
student_copy.evaluate(test_ds)
student_copy.fit(train_ds,validation_data = validation_ds, epochs=6,verbose=1)
student_copy.evaluate(test_ds)


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


[0.8251988291740417, 0.7164463400840759]

In [20]:
def createDistiller(alpha=1):
    loaded_student = tf.keras.models.load_model("student.hdf5")
    distiller = Distiller(student=loaded_student, teacher=model_teacher)
    distiller.compile(
        optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
        metrics=[keras.metrics.CategoricalAccuracy()],
        student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=False),

        distillation_loss_fn=keras.losses.KLDivergence(),
        alpha=0.1,
        temperature=10,
    )
    return distiller


In [24]:
# for i in range(6):
# print("run: ",i+1)
distiller = createDistiller(0.3)
# Distill teacher to student
distiller.fit(train_ds, validation_data = validation_ds,epochs=6,verbose=1)
# Evaluate student on test dataset
print("results: ",distiller.evaluate(test_ds))
    

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
results:  [0.7398838400840759, 1.142507791519165]


In [25]:
# for i in range(6):
# print("run: ",i+1)
distiller = createDistiller()
# Distill teacher to student
distiller.fit(train_ds, validation_data = validation_ds,epochs=6,verbose=1)
# Evaluate student on test dataset
print("results: ",distiller.evaluate(test_ds))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
results:  [0.7330729365348816, 1.0371853113174438]


In [None]:
loaded_student = tf.keras.models.load_model("student.hdf5")
distiller.compile(
    optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
    metrics=[keras.metrics.CategoricalAccuracy()],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=False),

    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

Now train the same student model without the teacher
----

In [5]:
# Train student as doen usually
# for i in range(6):
inputs = keras.Input(shape=(227,227,3))
x = keras.layers.Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu', input_shape=(227,227,3))(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=384, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Conv2D(filters=256, kernel_size=(1,1), strides=(1,1), activation='relu', padding="same")(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2))(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(4096, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(10, activation='softmax')(x)
student_scratch = keras.Model(inputs=(inputs), outputs=[x], name="alexnet")

student_scratch.compile(
    optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.CategoricalAccuracy()],
)

# Train and evaluate student trained from scratch.
student_scratch.fit(train_ds, epochs=6,verbose=1)
#     student_scratch.evaluate(x_test, y_test)
print("results: ",student_scratch.evaluate(test_ds))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
results:  [1.0058716535568237, 0.6460336446762085]


Results from running: <br>
1 [0.5782251358032227, 1.4982712268829346] <br>
2 [1.342477798461914, 0.5252403616905212] <br>
3 0.5658053159713745 <br>
4 0.5724158883094788 <br>


In [50]:
print("results: ",student_scratch.evaluate(test_ds))

results:  [1.6138664484024048, 0.39222756028175354]


In [19]:
student_2 = keras.Sequential(
    [
        keras.Input(shape=(28, 28, 1)),
        layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.Flatten(),
        layers.Dense(10),
    ],
    name="student",
)
student_2.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
# Train and evaluate student trained from scratch.
student_2.fit(x_train, y_train, epochs=3)
student_2.evaluate(x_test, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.06662680953741074, 0.9783999919891357]