In [2]:
from __future__ import print_function, division

from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.datasets import cifar10
from keras.callbacks import LearningRateScheduler
from keras.models import model_from_json
from keras.models import load_model
from keras.losses import KLDivergence
from sklearn.metrics import accuracy_score
import numpy as np
import keras
from keract import get_activations
import math
import time
import matplotlib.pyplot as plt
import sys
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

#from wrnet_model import create_wide_residual_network
from prova_wrn import create_wide_residual_network
'''
Function that returns the trainand test data of the CIFAR10 already preprocessed
'''
def getCIFAR10():
    # input image dimensions
    img_rows, img_cols = 32, 32
    num_classes = 10

    # the data, split between train and test sets
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    
    # format of the tensor
    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
        input_shape = (3, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 3)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 3)
        input_shape = (img_rows, img_cols, 3)

    # convert in to float the images
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    
    # new normalization with z-score
    mean = np.mean(x_train,axis=(0,1,2,3))
    std = np.std(x_train,axis=(0,1,2,3))
    x_train = (x_train-mean)/(std+1e-7)
    x_test = (x_test-mean)/(std+1e-7)
    
    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    
    print('CIFAR10 loaded')
    return x_train,y_train,x_test,y_test

'''
Small function that returns the shape of the CIFAR10 images
'''
def getCIFAR10InputShape():
    img_rows, img_cols = 32, 32
    if K.image_data_format() == 'channels_first':
        input_shape = (3, img_rows, img_cols)
    else:
        input_shape = (img_rows, img_cols, 3)
        
    return input_shape

'''
Function that loads from a file the teacher
'''
def getTeacher(file_name):
    # Model reconstruction from JSON file
    with open(file_name + '.json', 'r') as f:
        model = model_from_json(f.read())

    # Load weights into the new model
    model.load_weights(file_name + '.h5')
    
    print('Teacher loaded from ' + file_name + '.h5')
    return model
    
'''
Function that loads from a file the teacher and test it on the CIRAF10 dataset
'''
def testTeacher(file_name):
    
    x_train,y_train,x_test,y_test = getCIFAR10()
    
    model = getTeacher(file_name)
    
    # define optimizer
    opt_rms = keras.optimizers.rmsprop(lr=0.001,decay=1e-6)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt_rms,
                  metrics=['accuracy'])

    # final evaluation on test
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Teacher test loss:', score[0])
    print('Teacher test accuracy:', score[1])

    
'''
Function that returns a simple student done by 2 convolutions, a maxpool and a final two fully connected layers
'''
def getStudent(input_shape):
    num_classes = 10
    model1,model2=create_wide_residual_network(input_shape, num_classes, N=2, k=1, dropout=0.)
    
    print('Student loaded')
    return model1,model2
    
'''
Function to try to train the simple sutdent in order to unerstand its capabilites
'''
def trainStudent(epochs):
    
    x_train,y_train,x_test,y_test = getCIFAR10()
    
    input_shape = getCIFAR10InputShape()
    
    model = getStudent(input_shape)
    
    model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
    
    batch_size = 128
    n_batches = math.floor( x_train.shape[0] / batch_size)

    for e in range(epochs):
    
        for i in range(0,n_batches):
            imgs = x_train[i*batch_size:(i+1)*batch_size]
            labels = y_train[i*batch_size:(i+1)*batch_size]
            loss = model.train_on_batch(imgs,labels)
            print("Epoch: " + str(e+1) + " batch " + str(i) + " loss: " + str(loss[0]) + " acc: " + str( 100*loss[1]))
            
        score = model.evaluate(x_test, y_test, verbose=0)
        print('After epoch ' + str(e+1) + ' test loss ' + str(score[0]) + ' test accuracy ' + str(score[1]))

'''
Function that returns a simple generator
'''
def getGenerator():

        noise_shape = (100,)

        model = Sequential()
        
        img_shape = getCIFAR10InputShape()

        model.add(Dense(128*8**2, input_shape=noise_shape))
        model.add(Reshape((8, 8, 128)))
        model.add(BatchNormalization())
        
        model.add(UpSampling2D())
        model.add(Conv2D(128, kernel_size=3, strides=1, padding="same"))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2)) 
                  
        model.add(UpSampling2D())
        model.add(Conv2D(64, kernel_size=(3,3), strides=1, padding="same"))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2))
        
        model.add(Conv2D(3, kernel_size=(3,3), strides=1, padding="same"))
        model.add(BatchNormalization())   
        
        #model.summary()
        print('Generator loaded')
        return model

def mypositiveloss(y_true,y_pred):
    
    #y_true = K.log(y_true)
    #y_pred = K.log(y_pred)
    
    #loss=  keras.losses.kullback_leibler_divergence(y_true, y_pred)
    loss= tf.keras.losses.KLD(y_true, y_pred)
    
    return loss


def mynegativeloss(y_true,y_pred):
    
    #y_true = K.log(y_true)
    #y_pred = K.log(y_pred)
    
    #loss= keras.losses.kullback_leibler_divergence(y_true, y_pred)
    loss= tf.keras.losses.KLD(y_true, y_pred)
    gen_loss = -loss
    
    return gen_loss

    
def main():
    
    x_train, y_train, x_test, y_test = getCIFAR10()
    
    teacher = getTeacher('../pretrained_models/model-16-2')
    teacher.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    
    input_shape = getCIFAR10InputShape()
    
    student_train, student_test = getStudent(input_shape)
    
    optim_stud = Adam(lr=2e-3, clipnorm=5.0)
    optim_gen = Adam(lr=1e-3, clipnorm=5.0)
    
    student_train.compile(loss=mypositiveloss ,
                optimizer=optim_stud)
    
    student_test.compile(loss="categorical_crossentropy",
                  optimizer='adam',
                  metrics=['accuracy'])
    
    generator = getGenerator()
    generator.compile(loss=mynegativeloss, optimizer=optim_gen)
    
    #student_train.trainable=False
    #teacher.trainable = False

    z = Input(shape=(100,))

    #Generator makes a prediction
    fake_imgs = generator(z)

    #Discriminator attempts to categorise prediction
    y = student_train(fake_imgs)

    #gan = Model(z, y)
    #gan.compile(loss=mynegativeloss,optimizer=optim_gen)
    
    t = teacher(fake_imgs)
    #gan2 = Model(z,t)
    #gan2.compile(loss=mypositiveswappedloss,optimizer=optim_gen)
    
    print('Student Summary:')
    student_train.summary()
    print('Generator Summary:')
    generator.summary()
    
    n_batches = 152
    batch_size = 128
    log_freq = 30
    ns = 10
    
    for i in range(n_batches):
        
        noise = np.random.normal(0, 1, (batch_size, 100))
        
        gen_imgs = generator.predict(noise)
        
        
        #this is the cycle for generator (for i=0..ng)
        t_predictions = teacher.predict(gen_imgs)
        
        s_predictions = student_train.predict(gen_imgs)
           
        gen_loss = generator.train_on_batch(noise,mynegativeloss(t_predictions,s_predictions))
        
        
        s_loss = 0
        for j in range(ns):
            
            t_predictions = teacher.predict(gen_imgs)
        
            s_predictions = student_train.predict(gen_imgs)
            
            s_loss += student_train.train_on_batch(gen_imgs,mynegativeloss(t_predictions,s_predictions))
        
        print('batch ' + str(i) + '/' + str(n_batches) + ' G loss: ' + str(g_loss) + ' S loss: ' + str(s_loss/ns))
        
        if (i % log_freq) == 0:
            score = student_test.evaluate(x_test, y_test, verbose=0)
            print('Student test loss: '  + str(score))
            
            model_json = student_test.to_json()
            with open('tmp-model' + str(i) + '.json','w') as json_file:
                json_file.write(model_json)
            student_test.save_weights('tmp-model' + str(i) + '.h5')
            print('saved model ' + str(i))
   
        
        
    score = student_test.evaluate(x_test, y_test, verbose=1)
    print('Final student test loss: '  + str(score))

    '''
    for i in range(0,n_batches):
        print('batch ' + str(i))
        imgs = x_train[i*batch_size:(i+1)*batch_size]
        labels = y_train[i*batch_size:(i+1)*batch_size]
        t_predictions = teacher.predict(imgs)
        s_predictions = student.predict(imgs)
        print('teacher predictions: ')
        print(t_predictions)
        print('student predictions: ')
        print(s_predictions)
        
        kl_div = tf.keras.losses.KLDivergence()
        
        
        # to print the KL divergence
        for j in range(batch_size):
            loss1 = kl_div(t_predictions[j],s_predictions[j])
            loss2 = tf.keras.losses.categorical_crossentropy(t_predictions[j],s_predictions[j])
            with tf.Session() as sess:
                init = tf.global_variables_initializer()
                sess.run(init)
                print('sample ' + str(j) + ' kl div: ' + str(loss1.eval()) + ' cat loss: ' + str(loss2.eval()))
        '''


main()


CIFAR10 loaded
Teacher loaded from ../pretrained_models/model-16-2.h5
Wide Residual Network-16-1 created.
Student loaded
Generator loaded
Student Summary:
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 32, 32, 3)    0                                            
__________________________________________________________________________________________________
conv2d_20 (Conv2D)              (None, 32, 32, 16)   432         input_3[0][0]                    
__________________________________________________________________________________________________
batch_normalization_18 (BatchNo (None, 32, 32, 16)   64          conv2d_20[0][0]                  
__________________________________________________________________________________________________
activation_14 (Activation)      (Non


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[128,128,16,16] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/Adam/gradients/AddN_6-1-TransposeNHWCToNCHW-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Mean_7/_1459]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted: OOM when allocating tensor with shape[128,128,16,16] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/Adam/gradients/AddN_6-1-TransposeNHWCToNCHW-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored.

In [12]:
# load the model we just trained
# if i use the last for CIFAR10
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.datasets import cifar10
from keras.callbacks import LearningRateScheduler
from keras.models import model_from_json
from keras.models import load_model
import numpy as np

batch_size = 128
num_classes = 10
epochs = 6

# input image dimensions
img_rows, img_cols = 32, 32

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
    input_shape = (3, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 3)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 3)
    input_shape = (img_rows, img_cols, 3)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

#old normalization
#x_train /= 255
#x_test /= 255

# new normalization with z-score
mean = np.mean(x_train,axis=(0,1,2,3))
std = np.std(x_train,axis=(0,1,2,3))
x_train = (x_train-mean)/(std+1e-7)
x_test = (x_test-mean)/(std+1e-7)

print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print('loading tmp-model0')
# Model reconstruction from JSON file
with open('tmp-model0.json', 'r') as f:
    model = model_from_json(f.read())

# Load weights into the new model
model.load_weights('tmp-model0.h5')

# define optimizer
opt_rms = keras.optimizers.rmsprop(lr=0.001,decay=1e-6)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt_rms,
              metrics=['accuracy'])

# final evaluation on test
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


print('loading tmp-mode150')
# Model reconstruction from JSON file
with open('tmp-model150.json', 'r') as f:
    model = model_from_json(f.read())

# Load weights into the new model
model.load_weights('tmp-model150.h5')

# define optimizer
opt_rms = keras.optimizers.rmsprop(lr=0.001,decay=1e-6)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt_rms,
              metrics=['accuracy'])

# final evaluation on test
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples
loading tmp-model0
Test loss: 2.7111117263793947
Test accuracy: 0.10000000149011612
loading tmp-mode150
Test loss: 3.3176475238800047
Test accuracy: 0.10000000149011612


In [24]:
# try to negate a tensor
import tensorflow as tf
from tensorflow.keras import backend as K

def myKLDiv(a,b):
    division = tf.math.divide(soft_a,soft_b)
    log_div = K.log(division)
    res = K.sum(log_div)
    
    return res


a = K.random_uniform(shape=(1,3))
b = K.random_uniform(shape=(1,3))

soft_a = tf.nn.softmax(a)
soft_b = tf.nn.softmax(b)

log_a = K.log(soft_a)
log_b = K.log(soft_b)
    
loss= keras.losses.kullback_leibler_divergence(log_a, log_b)

loss_soft= keras.losses.kullback_leibler_divergence(soft_a, soft_b)

neg_loss_soft = -loss_soft

neg_loss = -loss

mykl = myKLDiv(soft_a,soft_b)

print(loss)

with tf.Session() as sess:
                init = tf.global_variables_initializer()
                sess.run(init)
                print('a: ' + str(a.eval()))
                print('b: ' + str(b.eval()))
                
                print('soft_a: ' + str(soft_a.eval()))
                print('soft_b: ' + str(soft_b.eval()))
                
                print('log_a: ' + str(log_a.eval()))
                print('log_b: ' + str(log_b.eval()))
                
                print('loss_soft ' + str(loss_soft.eval()))
                print('neg_loss_soft ' + str(neg_loss_soft.eval()))
                
                print('loss_log: ' + str(loss.eval()))
                print('neg loss_log: ' + str(neg_loss.eval()))
                
                print('my loss: ' + str(mykl.eval()))
                


Tensor("Sum_12:0", shape=(1,), dtype=float32)
a: [[0.06661725 0.8389952  0.23997462]]
b: [[0.9655373  0.6230657  0.59269714]]
soft_a: [[0.27257267 0.32507604 0.40235132]]
soft_b: [[0.2674539  0.42506498 0.3074811 ]]
log_a: [[-0.66898495 -1.4593475  -1.3649876 ]]
log_b: [[-1.4161048  -1.35178    -0.69602454]]
loss_soft [0.00532334]
neg_loss_soft [-0.01719323]
loss_log: [0.]
neg loss_log: [-0.]
my loss: -0.044802185


In [22]:
# design our custom KL loss

# KL(a,b) = SUM( a * log( a / b))

from keras.layers import Lambda
import tensorflow as tf
from tensorflow.keras import backend as K

a = K.random_uniform(shape=(1,3))
b = K.random_uniform(shape=(1,3))

soft_a = tf.nn.softmax(a)
soft_b = tf.nn.softmax(b)


division = tf.math.divide(soft_a,soft_b)
#division2 = divResult = Lambda(lambda x: x[0]/x[1])([soft_a,soft_b])
log_div = K.log(division)
res = K.sum(log_div)

with tf.Session() as sess:
                init = tf.global_variables_initializer()
                sess.run(init)
                print('a: ' + str(a.eval()))
                print('b: ' + str(b.eval()))
                
                print('soft_a: ' + str(soft_a.eval()))
                print('soft_b: ' + str(soft_b.eval()))
                
                print('div: ' + str(division.eval()))
                print('div2: ' + str(division2.eval()))
                print('log_div: ' + str(log_div.eval()))
                print('res: ' + str(res.eval()))


a: [[0.40674496 0.6270155  0.20576179]]
b: [[0.82592905 0.99604785 0.34396696]]
soft_a: [[0.3309812  0.2238779  0.44514093]]
soft_b: [[0.4950692  0.24711792 0.2578129 ]]
div: [[0.69207877 0.8602267  1.6768394 ]]
div2: [[1.1801109 1.2576282 0.6115273]]
log_div: [[ 0.6486425   0.02256223 -0.8076298 ]]
res: 0.083834425


In [None]:


# pseudocodice per il paper

# for i in range(batches):
    # z = noise(100)
    # generated_images = Generator(z)
    # (output, teacher_activations) = teacher(generated_images)
    ''' Teacher is the pre-trained network that outputs its activations and the result 
    or it can outputs only the result and we can get the activations with K.function '''
    # combined.train_on_batch(generated_images, (outputs,teacher_activations)) 
    '''Combined is a network that has at the start the generator and then the freezed student.
    The labels it gets are the results and the activations of the teacher
    The loss is to increase the distance between the labels and the output of itself
    therefore this network has to output its intermediate activations'''
    
    # for j in range(ns):
        # student.train_on_batch(generated_images, (outputs,teacher_activations))
        ''' Student network that outputs its results and its intermediate activations, 
        and its loss is to match the output and activations of the teacher'''

    ''' if i don't find a way to output intermediate activations from a model we could always
        get the activations with K.fuction and the data, and then input them to the loss as the label.
        This method does not seems to increase too much the training time...
    '''


