In [1]:
#imports
import keras
from keras.datasets import fashion_mnist,mnist,cifar10
from keras.layers import Activation, Input, Embedding, LSTM, Dense, Lambda, GaussianNoise, concatenate
from keras.models import Model
import numpy as np
from keras.utils import np_utils
from keras.layers.core import Dense, Dropout, Activation
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD, Adam, RMSprop
from keras.constraints import max_norm
from keras.layers import MaxPooling2D, Dropout, Dense, Flatten, Activation, Conv2D
from keras.models import Sequential
from keras.losses import categorical_crossentropy as logloss
from keras.metrics import categorical_accuracy
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from matplotlib import offsetbox

Using TensorFlow backend.


In [2]:
# Load Fashion-MNIST and carve a validation split out of the official training set.
nb_classes = 10
SPLIT_SEED = 42  # fixed seed so the train/validation partition is identical on every run

(X_Train, y_Train), (X_test, y_test) = fashion_mnist.load_data()
X_train, X_val, y_train, y_val = train_test_split(
    X_Train, y_Train, test_size=0.20, random_state=SPLIT_SEED)

# One-hot encode the training and validation labels.
# y_test is left as integer class ids: the evaluation cells compare it
# directly against argmax predictions.
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_val = np_utils.to_categorical(y_val, nb_classes)

Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


In [0]:
# Add a trailing channel axis: (n_samples, 28, 28) -> (n_samples, 28, 28, 1).
# -1 lets NumPy infer the sample count, so this cell keeps working if the
# split ratio (and therefore the 48000/12000 sizes) ever changes.
X_train = X_train.reshape(-1, 28, 28, 1)
X_val = X_val.reshape(-1, 28, 28, 1)

X_train = X_train.astype('float32')
X_val = X_val.astype('float32')

# Scale pixel values from [0, 255] to [0, 1].
# NOTE: this rescales in place, so run the cell only once per kernel session;
# re-running it would divide by 255 a second time.
X_train /= 255
X_val /= 255

In [4]:
# Teacher network: two convolutions, max-pooling, a 256-unit hidden Dense
# layer and a softmax classifier over nb_classes.
input_shape = (28, 28, 1)  # shape of a single image: height, width, channels

# Hyperparameters (these are now actually wired into the layers below)
nb_filters = 64        # filters in the second convolution
pool_size = (2, 2)     # max-pooling window
kernel_size = (3, 3)   # convolution kernel size

teacher = Sequential()
teacher.add(Conv2D(32, kernel_size=kernel_size,
                   activation='relu',
                   input_shape=input_shape))
teacher.add(Conv2D(nb_filters, kernel_size, activation='relu'))
teacher.add(MaxPooling2D(pool_size=pool_size))

teacher.add(Dropout(0.25))  # regularization

teacher.add(Flatten())
teacher.add(Dense(256, activation='relu'))  # hidden representation later distilled into the students
teacher.add(Dropout(0.5))  # regularization

# The classifier is a linear Dense layer followed by a separate softmax
# Activation, so the Dense weights can be read out on their own later.
teacher.add(Dense(nb_classes))
teacher.add(Activation('softmax'))

teacher.compile(loss='categorical_crossentropy',
                optimizer='adadelta',
                metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would only append a stray "None" line.
teacher.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2359552   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)              

In [5]:
# Fit the teacher on the one-hot labels, monitoring the held-out validation split.
epochs = 15
batch_size = 256
teacher.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7f3be4496ba8>

In [6]:
# Teacher accuracy on the official test set.
# Bring X_test into the same layout and scale as the training data:
# (n_samples, 28, 28, 1), float32, values in [0, 1].
# NOTE: X_test is overwritten in place and reused by the combined-model
# evaluation, so run this cell only once per kernel session.
X_test = X_test.reshape(-1, 28, 28, 1)

X_test = X_test.astype('float32')
X_test /= 255
preds = teacher.predict(X_test)

# The predicted class id is the index of the largest softmax probability
# in each row; a single vectorised argmax replaces the per-row Python loop.
preds2 = np.argmax(preds, axis=1)

print(y_test)
print(preds2)
print(np.sum(y_test == preds2))  # number of correctly classified test images

[9 2 1 ... 8 1 5]
[9 2 1 ... 8 1 5]
9195


In [0]:
# Feature extractor: the teacher truncated at its hidden Dense layer, so
# predict() returns the dense representation instead of class probabilities.
# The layer is selected by type (the first Dense in the teacher) rather than
# by its auto-generated name ('dense_1'): Keras numbers layer names with a
# global counter, so the name changes whenever a model-building cell is re-run.
teacher_hidden_dense = next(layer for layer in teacher.layers if isinstance(layer, Dense))
teacher_WO_Softmax = Model(teacher.input, teacher_hidden_dense.output)

In [0]:
# Run both splits through the truncated teacher; these features become the
# regression targets for the students.
# NOTE: despite its name, test_dense holds the features of the validation split (X_val).
train_dense, test_dense = (
    teacher_WO_Softmax.predict(images) for images in (X_train, X_val)
)

In [9]:
# Partition the training features column-wise among the four students:
# columns [0, 64), [64, 128), [128, 192) and [192, end).
s1Train, s2Train, s3Train, s4Train = np.split(train_dense, [64, 128, 192], axis=1)
for student_targets in (s1Train, s2Train, s3Train, s4Train):
    print(student_targets.shape)

(48000, 64)
(48000, 64)
(48000, 64)
(48000, 64)


In [10]:
# Partition the held-out features (test_dense, computed from X_val) among the
# four students using the same column boundaries as the training targets.
s1Test, s2Test, s3Test, s4Test = np.split(test_dense, [64, 128, 192], axis=1)
for student_targets in (s1Test, s2Test, s3Test, s4Test):
    print(student_targets.shape)

(12000, 64)
(12000, 64)
(12000, 64)
(12000, 64)


In [11]:
# Student 1: a slimmed-down CNN mapping an image to a 64-dimensional output.
# Unlike the other students, every weighted layer here uses
# kernel_initializer='normal'.
student1 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu',
           input_shape=(28, 28, 1), kernel_initializer='normal'),
    Conv2D(8, (3, 3), activation='relu', kernel_initializer='normal'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),  # regularization
    Flatten(),
    Dense(16, activation='relu', kernel_initializer='normal'),
    Dropout(0.5),  # regularization
    Dense(64, activation='relu', kernel_initializer='normal'),
])

print(student1.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)               

In [12]:
# Student 2: same layer stack as student 1, with Keras' default initializers.
student2 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),  # regularization
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),  # regularization
    Dense(64, activation='relu'),
])

print(student2.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_6 (Dropout)          (None, 16)               

In [13]:
# Student 3: identical architecture to student 2.
# TODO: look at multi-task networks.
student3 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),  # regularization
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),  # regularization
    Dense(64, activation='relu'),
])

print(student3.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)               

In [14]:
# Student 4: identical architecture to students 2 and 3.
# TODO: look at multi-task networks.
student4 = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),  # regularization
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),  # regularization
    Dense(64, activation='relu'),
])

print(student4.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)               

In [15]:
# Student 1 is trained as a regressor: mean-squared-error loss, optimised
# with Adam (AMSGrad variant enabled for this student only).
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=True)
student1.compile(optimizer=opt, loss='mean_squared_error')
print(student1.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)               

In [16]:
# Individual training: student 1 learns to reproduce columns 0-63 of the
# teacher's dense features; s1Test holds the matching validation-split targets.
history1 = student1.fit(
    x=X_train,
    y=s1Train,
    validation_data=(X_val, s1Test),
    epochs=20,
    batch_size=256,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
# Student 2: mean-squared-error regression with plain Adam (AMSGrad disabled).
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
student2.compile(optimizer=opt, loss='mean_squared_error')
print(student2.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_6 (Dropout)          (None, 16)               

In [18]:
# Student 2 learns columns 64-127 of the teacher's dense features;
# s2Test holds the matching validation-split targets.
history2 = student2.fit(
    x=X_train,
    y=s2Train,
    validation_data=(X_val, s2Test),
    epochs=20,
    batch_size=256,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Student 3: mean-squared-error regression with plain Adam (AMSGrad disabled).
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
student3.compile(optimizer=opt, loss='mean_squared_error')
print(student3.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)               

In [20]:
# Student 3 learns columns 128-191 of the teacher's dense features;
# s3Test holds the matching validation-split targets.
history3 = student3.fit(
    x=X_train,
    y=s3Train,
    validation_data=(X_val, s3Test),
    epochs=20,
    batch_size=256,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Student 4: mean-squared-error regression with plain Adam (AMSGrad disabled).
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
student4.compile(optimizer=opt, loss='mean_squared_error')
print(student4.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 24, 24, 8)         2312      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 12, 12, 8)         0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 12, 12, 8)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                18448     
_________________________________________________________________
dropout_10 (Dropout)         (None, 16)               

In [22]:
# Student 4 learns the remaining columns (192 onward) of the teacher's dense
# features; s4Test holds the matching validation-split targets.
history4 = student4.fit(
    x=X_train,
    y=s4Train,
    validation_data=(X_val, s4Test),
    epochs=20,
    batch_size=256,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
# Combine the learned chunks of knowledge: concatenate the four students'
# 64-dimensional outputs and put a fresh softmax classifier on top.
# Each student keeps its own input tensor, so the combined model takes a list
# of four image batches. (The unused stand-alone Input tensor that used to be
# created here was never wired into the graph and has been removed.)
student_outputs = [student1.output, student2.output, student3.output, student4.output]
output = concatenate(student_outputs)
print(output.shape)
output2 = Dropout(0.5)(output)  # regularization
# Linear Dense + separate softmax Activation, mirroring the teacher's head so
# the teacher's classifier weights have the same shape as this layer's.
output3 = Dense(nb_classes)(output2)
output4 = Activation('softmax')(output3)
multi_model = Model(
    [student1.input, student2.input, student3.input, student4.input],
    output4,
)

multi_model.summary()

(None, 256)
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
conv2d_3_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_5_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_7_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_9_input (InputLayer)     (None, 28, 28, 1)    0                                            
________________________________________________________________________________

In [0]:
# Extract the classification weights (kernel and bias of the final Dense
# layer, the one feeding the softmax) from the teacher network.
# The layer is selected by type and width rather than by its auto-generated
# name ('dense_2'), which depends on Keras' global layer counter and changes
# when model-building cells are re-run. The tuple unpacking raises ValueError
# unless exactly one layer matches.
(teacher_classifier,) = [
    layer for layer in teacher.layers
    if isinstance(layer, Dense) and layer.units == nb_classes
]
my_weights = teacher_classifier.get_weights()

In [0]:
# Use the teacher's classifier weights as the starting point for the combined
# student network's classification layer.
# The previous lookup by name ('dense_12') only succeeded after cells had been
# re-executed; on a fresh kernel the auto-generated name is different and
# get_layer() raises. The classifier is the only Dense layer in multi_model
# with nb_classes units (the students' Dense layers have 16 and 64), and the
# tuple unpacking raises ValueError unless exactly one layer matches.
(student_classifier,) = [
    layer for layer in multi_model.layers
    if isinstance(layer, Dense) and layer.units == nb_classes
]
student_classifier.set_weights(my_weights)

In [0]:
# Freeze every student layer so that only the new classification head is
# trained in the combined model. Iterating over the student models replaces
# the hard-coded slice multi_model.layers[:36], which was only valid for
# exactly four students with this exact layer count.
# The layer objects are shared, so this also marks them non-trainable inside
# the individual student models.
for student in (student1, student2, student3, student4):
    for layer in student.layers:
        layer.trainable = False

In [0]:
# Compile the combined classifier with the same loss, optimizer and metric as
# the teacher. This runs after the trainable flags were changed above.
multi_model.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Print the summary again after freezing and compiling
# (presumably to check the trainable vs. non-trainable parameter counts).
multi_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
conv2d_3_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_5_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_7_input (InputLayer)     (None, 28, 28, 1)    0                                            
__________________________________________________________________________________________________
conv2d_9_input (InputLayer)     (None, 28, 28, 1)    0                                            
____________________________________________________________________________________________

In [30]:
# Train the combined four-student model on the class labels.
# Every student branch has its own input, so the same image batch is supplied
# once per branch.
epochs = 10
batch_size = 256
mm_history = multi_model.fit(
    x=[X_train] * 4,
    y=Y_train,
    validation_data=([X_val] * 4, Y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 48000 samples, validate on 12000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
# Combined student model accuracy on the official test set.
# X_test must already be reshaped and scaled by the teacher evaluation cell;
# the same batch is fed to each of the four student inputs.
preds = multi_model.predict([X_test] * 4)

# The predicted class id is the index of the largest softmax probability
# in each row; a single vectorised argmax replaces the per-row Python loop.
preds2 = np.argmax(preds, axis=1)

print(y_test)
print(preds2)
print(np.sum(y_test == preds2))  # number of correctly classified test images

[9 2 1 ... 8 1 5]
[9 2 1 ... 8 1 5]
8614
