# Initial setup

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

Import all needed packages

In [None]:
import os
import cv2
import tensorflow as tf
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import json
from six.moves import urllib
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

In [None]:
from keras import applications
preprocess_input = applications.mobilenet_v2.preprocess_input 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

Run models on GPU 1

In [None]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
# Pin training to the second GPU (index 1) when the machine has one.
# Indexing gpus[1] unconditionally raises IndexError on machines with fewer
# than two GPUs, so fall back to the first GPU, or to the CPU when none is visible.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    selected_gpu = gpus[1] if len(gpus) > 1 else gpus[0]
    tf.config.experimental.set_visible_devices(selected_gpu, 'GPU')
    # Let TensorFlow grow its GPU memory allocation on demand.
    tf.config.experimental.set_memory_growth(selected_gpu, True)
else:
    print("No GPU available: the model will run on CPU")

## Set useful paths

The folder structure is the following: a main folder *Dataset* contains all the sub-folders, namely the one from which the pictures to pre-process are read (*path_data*) and the one where the pre-processed pictures are written (*path_train*).

The class we want to recognize among all is the *target class Persona*. 

Inside the training folder (*path_train*) we also define a *path_reference*, which holds all the classes of the reference dataset, and a *path_target*, which holds the target class.

In [None]:
# Root folder that holds both the raw and the pre-processed images.
path_ds = "Dataset"
path_data = os.path.join(path_ds, "data")             # Dataset/data: raw images, one sub-folder per class
#path_train = os.path.join(path_ds, "trainBN")
path_train = os.path.join(path_ds, "trainBN200")      # Dataset/trainBN200: pre-processed images
target = "Persona"                                    # folder name of the target class

path_reference = os.path.join(path_train, "reference")  # Dataset/trainBN200/reference
path_target = os.path.join(path_train, "target")        # Dataset/trainBN200/target

# exist_ok=True keeps the cell re-runnable without a separate existence check.
for output_dir in (path_train, path_reference, path_target):
    os.makedirs(output_dir, exist_ok=True)

We print how many people images we have to pre-process and also how many classes of the reference dataset are present

In [None]:
print("There are ", len(os.listdir(os.path.join(path_data, target))), " people images to process inside the target dataset")

In [None]:
n_classes_ref = len(os.listdir(path_data))-1
print("There are " + str(n_classes_ref) + " classes belonging to the reference dataset")

# Training part


## Pre-process images

The *reference dataset* used here is a subset of the training set of ILSVRC2012. Our set is composed of 20 randomly chosen classes (out of the original 1000), each containing 500 images, for a total of 10 thousand images.

Since ILSVR2012 is organized in *synsets*, multiple words or phrases that describe a meaningful concept, we create the *synset_to_human* dictionary to map synsets to human-readable names. This helps in creating the correct folder structure.

In [None]:
def create_readable_names_for_imagenet_labels(synsets_path='imagenet_lsvrc_2015_synsets.txt',
                                              metadata_path='imagenet_metadata.txt'):
    """Create a dict mapping ILSVRC synset ids to human readable strings.

    Args:
            synsets_path: path of the file listing the valid synset labels used by
                the ILSVRC competition. There is one synset per line, eg.
                  #   n01440764
                  #   n01443537
                It must contain exactly 1000 lines.
            metadata_path: path of the file mapping synsets to human-readable names
                for every synset in Imagenet. These are stored in a tsv format, as follows:
                  #   n02119247    black fox
                  #   n02119359    silver fox
                It must contain exactly 21842 lines.

    Returns:
            synset_to_human: dictionary mapping synsets and human-readable names,
            restricted to the synsets listed in `synsets_path`.

    Raises:
            AssertionError: if a file does not have the expected number of lines
                or a metadata line is not made of exactly two tab-separated fields.

    Code is based on
    https://github.com/tensorflow/models/blob/master/inception/inception/data/build_imagenet_data.py#L463
    """
    # Context managers guarantee the file handles are closed after reading.
    with open(synsets_path) as synsets_file:
        synset_list = [line.strip() for line in synsets_file]
    assert len(synset_list) == 1000

    with open(metadata_path) as metadata_file:
        synset_to_human_list = metadata_file.readlines()
    assert len(synset_to_human_list) == 21842

    # A set gives constant-time membership tests; the list would be scanned
    # once for each of the 21842 metadata rows.
    valid_synsets = set(synset_list)

    synset_to_human = {}
    for line in synset_to_human_list:
        parts = line.strip().split('\t')
        assert len(parts) == 2
        synset, human = parts
        if synset in valid_synsets:
            synset_to_human[synset] = human

    return synset_to_human

synset_to_human = create_readable_names_for_imagenet_labels()

In [None]:
print(len(synset_to_human))

In the pre-processing part for the training, we take pictures from *data* which is composed of:
<pre>
<b>data</b>
|__ <b>n01443537</b>
|__ <b>n01484850</b>
|__ <b>n01532829</b>
|__ <b>n01882714</b>
|__ <b>n--------</b>
|__ <b>n--------</b>
|__ <b>Persona</b>
</pre>

and we put the pre-processed images into *trainBN*, realizing this structure:

<pre>
<b>trainBN</b>
|__ <b>target</b>
    |__ <b>Persona</b>
|__ <b>reference</b>
    |__ <b>acoustic guitar</b>
    |__ <b>African elephant, Loxodonta africana</b>
    |__ <b>analog clock</b>
    |__ <b>backpack, back pack, knapsack, packsack, rucksack, haversack</b>
    |__ <b>beer glass</b>
    |__ --------   
</pre>

All pictures in *data* are color (RGB) images, but the training is carried out using grayscale images. The reason is that a future goal of this work is to extend Deep One-class Classification to InfraRed images, in order to recognize people in frames coming from surveillance videos, even at night.
Problems in training networks directly with InfraRed examples arise because
there are no large datasets made of enough images like ImageNet dataset or
Open Images Dataset. That’s why a training with grayscale pictures is proposed and, then, a generalization to IR datasets through a further transfer learning is suggested.

RGB datasets are transformed to grayscale datasets using the library *OpenCV*. Steps in the pre-processing part are:

• each image is centrally cropped along its smaller size. In this way we
can resize it without altering the image aspect ratio and the properties
of objects within;

• each picture is resized to square format of 224×224 with a bilinear interpolation;

• each image is made a grayscale image with size of (224, 224, 1), having a
single channel;

• each grayscale image is brought back on three channels, repeating the single channel three times. This operation is done since the structure of
most of networks presents a three channel configuration.

In [None]:
# Pre-process every class folder of the raw dataset: central square crop,
# resize to img_size x img_size, grayscale conversion, then replication of the
# gray channel on three channels. Images already present in the output folder
# are left untouched, so the cell can be re-run incrementally.
img_size = 224
for folder in os.listdir(path_data):
    path_folder = os.path.join(path_data, folder)

    # Stray files and folders that are neither the target class nor a known
    # synset (e.g. .ipynb_checkpoints) would otherwise crash the loop.
    if not os.path.isdir(path_folder):
        continue
    if folder != target and folder not in synset_to_human:
        print("\nSkipping folder ", folder, ": neither the target class nor a known synset")
        continue

    print("\n------------------------------------------------------")
    print("\nFolder ", folder, " with ", len(os.listdir(path_folder)), "images inside")

    # Target images go under <path_target>/<target>; reference images go under
    # <path_reference>/<human readable class name>.
    if folder == target:
        path_out = os.path.join(path_target, folder)
    else:
        path_out = os.path.join(path_reference, synset_to_human[folder])
    os.makedirs(path_out, exist_ok=True)

    i=0   #new images
    j=0   #images already pre-processed
    unreadable=0   #files that OpenCV could not decode
    for file in os.listdir(path_folder):
        path_in_file = os.path.join(path_folder, file)
        path_out_file = os.path.join(path_out, file)

        if os.path.exists(path_out_file):
            j+=1
            print("Image " + file + " already pre-processed" )
            continue

        print("Processing ... ", file)

        #read the image; cv2.imread returns None when the file cannot be decoded
        image = cv2.imread(path_in_file)
        if image is None:
            unreadable+=1
            print("Skipping " + file + ": not a readable image")
            continue
        i+=1

        #crop image -> square image along its min dimension
        h, w = image.shape[:2]
        if w>h:
            start = (w-h)//2
            image = image[:, start:start+h]
        else:
            start = (h-w)//2
            image = image[start:start+w,:]
        #resize
        image = cv2.resize(image, (img_size, img_size), interpolation=cv2.INTER_LINEAR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)     #gray image
        image = cv2.merge((image, image, image))            #gray image on 3 channels
        #write the pre-proc image in train folder
        cv2.imwrite(path_out_file, image)

    print("\nImages that have been previously pre-processed: " + str(j))
    print("\nNewly pre-processed images: " + str(i))
    if unreadable:
        print("\nFiles skipped because they could not be read: " + str(unreadable))

Classes of the reference dataset are therefore printed

In [None]:
# Every entry of the reference folder is the name of one reference class.
classes_ref = list(os.listdir(path_reference))
print("Classes of reference dataset are ", classes_ref)

## Build the model


We instantiate a unique MobileNetV2 in *base_model*. MobileNetV2 by Google belongs to MobileNets family, efficient and optimized architectures for mobile devices. It is fast and provides high accuracy, requiring few parameters and low computational power, also compared to previous versions.

• We load *weights* pre-trained on ImageNet, without including the default top part with 1000 neurons;

• The *input shape* of images is set to (224, 224, 3);

• The hyperparameter alpha, belonging to range (0, 1] and known as the width multiplier that determines the number of filters at each layer, is set to its default value 1;

• A *global average pooling layer* is inserted after the last convolutional block, passing from a 4D output tensor of shape (batch_size, 7, 7, 1280) to a flattened 2D output tensor of shape (batch_size, 1280). These 1280 numbers are the features extracted from the input images, from which we minimize the compactness loss and on which we base the classification of objects.

In [None]:
classes = len(os.listdir(path_data))-1
alpha = 1.0 #for MobileNetV2
base_model = applications.MobileNetV2(include_top=False, 
                                      input_shape=(224, 224, 3), 
                                      alpha=alpha, 
                                      weights='imagenet',
                                      pooling="avg")

In [None]:
base_model.summary()

• A *fully connected layer* with a softmax activation function and with
a number of units equal to the total classes of the reference dataset (20) is
attached, in order to compute the descriptiveness loss.

In [None]:
predictions = Dense(classes, activation='softmax')(base_model.output)

The keras model is defined, having:

• 1 input, the base model input that is an image batch of size (batch_size, 224, 224, 3);

• 2 outputs, that are the *global average pooling layer output*, representing the extracted features from MobileNetV2 and the *fully connected layer output*, representing the classification predictions. 

In [None]:
#We extract features from the average pooling layer to take advantages from loaded pre-trained weights. Layer predictions is not pre-trained!
#base_model is built with pooling="avg", so base_model.output is the output of the
#global average pooling layer. Using it directly avoids a lookup by auto-generated
#layer name ("global_average_pooling2d", "global_average_pooling2d_1", ...), which
#changes every time the model-building cell is re-run in the same kernel.
model = keras.Model(inputs=base_model.input, outputs=[base_model.output, predictions])

In [None]:
#Second version with two extra fully connected layers (FC1, FC2) before the softmax classifier
#FC1 = Dense(540, activation='relu')(base_model.output) #540 neurons because we don't want sharp variation from 1280 to n_classes_ref
#FC2 = Dense(200, activation='relu')(FC1)              #we can add other layers later
#predictions = Dense(n_classes_ref, activation = 'softmax')(FC2)

#model = keras.Model(inputs=base_model.input,outputs=[base_model.get_layer("global_average_pooling2d").output, predictions])

We print the number of features extracted from each single image

In [None]:
# base_model ends with the global average pooling layer (pooling="avg"), so its
# output width is the number of extracted features. Reading it from
# base_model.output does not depend on the auto-generated layer name.
n_features = base_model.output.shape[1]
print(n_features)   #1280

We print the output shape

In [None]:
print(model.output) #shape=(None, 1280) shape=(None, 20)

We visualize properties of all layers that are part of the model and their number

In [None]:
model.summary()

In [None]:
print("Number of layers in the base model: ", len(base_model.layers))

In [None]:
print("Number of layers in the model: ", len(model.layers))

In the training phase some layers of the network are *frozen*, to preserve
imported parameters pre-trained on ImageNet. This means we take low-level
features learned in a different classification task, by leveraging them
in our problem.

In MobileNetV2 we choose to initially freeze all blocks until block 13, having 40 unfrozen layers over the whole 157 layers.

In [None]:
#Fixed weights: freeze every layer that precedes the first trainable one
layer_names = [layer.name for layer in model.layers]
#first_trainable = "Conv_1"
first_trainable = "block_13_expand"
# When the layer name is not found, all layers end up frozen.
if first_trainable in layer_names:
    n_frozen = layer_names.index(first_trainable)
else:
    n_frozen = len(layer_names)
for layer in model.layers[:n_frozen]:
    layer.trainable = False

In [None]:
#model.trainable = True

In [None]:
# Number of layers whose weights will be updated during training.
k = len([layer for layer in model.layers if layer.trainable == True])
print("Layers with trainable=True: ", k, "")

In [None]:
for layer in model.layers:
    print(layer, layer.trainable)

## Losses computation

The network is fed with a big input batch of size *batch_size*, that is composed of two smaller batches of the same size *sub_batch_size*, called *target sub-batch* and the *reference sub-batch*.

Quantities *batch_size* and *sub_batch_size* are defined, with also the constant *beta* which is used in the *compactness loss* function

In [None]:
# Full batch fed to the network: first half target images, second half reference images.
batch_size = 32
sub_batch_size = batch_size // 2
# Correction factor of the compactness loss: n^2 / (n-1)^2 with n = sub_batch_size
# (1.0158 with batch_size=256).
beta = (sub_batch_size * sub_batch_size) / ((sub_batch_size - 1) * (sub_batch_size - 1))
print("beta = ", beta)

The *compactness loss* is aimed at minimizing the variance of features of each batch and is computed exclusively considering objects from the target dataset, that are pictures with people inside.

The two input quantities are:

• *y_true*: the true labels of the batch, of size (batch_size, n_classes_ref).
This quantity is not used in the lc computation because it has no role in
imposing similarity among person features;

• *y_pred*: predictions of the intermediate features for each element in the
batch, of size (batch_size, n_features). It is produced by the average pooling layer, so the number of features is 1280. We choose this layer because it is fed by layers whose weights are pre-trained on ImageNet, which speeds up the learning process compared to layers with random initialization.

In order to consider only features of person images, the first half part of the batch is isolated. Then, the following operations are performed: the variance of the feature distribution along the batch for each feature and the mean of all variances. This number is then multiplied by a correction factor beta.

Minimizing the mean of the variance of all the features implies having similar
characteristics for all images representing people.

In [None]:
def compactness_loss(y_true, y_pred):
    """Compactness loss: mean per-feature variance of the target sub-batch, scaled by beta.

    Args:
        y_true: labels of the batch; unused, present only because Keras passes it
            to every loss function.
        y_pred: features of the batch, one row per image. The first
            `sub_batch_size` rows are expected to be the target images.

    Returns:
        Scalar tensor: mean over the features of the variance computed across
        the target images, multiplied by the module-level `beta`.
    """
    # Slice with sub_batch_size instead of a hard-coded 16 so the loss keeps
    # selecting exactly the target half when batch_size is changed.
    y_pred_target = y_pred[:sub_batch_size]   #shape (sub_batch_size, n_features)
    # axis = 0 -> variance of each feature across the images of the sub-batch,
    # giving one value per feature. (axis = 1 would be wrong here: it would give
    # the variance across the features of each single image.)
    feature_variances = tf.keras.backend.var(y_pred_target, axis = 0, keepdims=False)
    l_c = tf.keras.backend.mean(feature_variances)

    return l_c * beta

#when features are extraxted from convolutional layer: apply average pooling layer ->  compute loss 

The *descriptiveness loss* is computed to have high accuracy in classification and is evaluated considering instances coming only from the reference dataset. It uses the *cross-entropy loss* to state the descriptiveness of features, that is defined at the beginning.

In [None]:
#Categorical crossentropy loss used in the descriptiveness loss
cce = tf.keras.losses.CategoricalCrossentropy(from_logits=False) 

#**Note - Using from_logits=True is more numerically stable.** -> remove softmax layer
#used default redution: reduction=losses_utils.ReductionV2.AUTO

The two input quantities are:

• *y_true*: the true labels of the batch, of size (batch_size, n_classes_ref).
This quantity is provided by the inputgenerator, later defined;

• *y_pred*: predictions coming from the last fully connected layer, of size
(batch_size, n_classes_ref). The second dimension n_classes_ref is 20,
corresponding to the categorical label of classes from the reference dataset.
The label of the person class is not included because this is not a multiclass classification problem.

The descriptiveness loss is computed with respect to only elements of the reference dataset. Therefore, the second half part of the batch is considered both in *y_true* and in *y_pred*. The first part of them contains meaningless numbers, because we don’t care about person image labels.
Then, the categorical cross-entropy loss is evaluated between the predicted labels and the desired ones and it is minimized to realize a good classification.

In this way, features are characterized by the property of descriptiveness, in addition to compactness.

In [None]:
def descriptiveness_loss(y_true, y_pred):
    """Descriptiveness loss: categorical cross-entropy on the reference sub-batch only.

    Args:
        y_true: one-hot labels of the batch, one row per image.
        y_pred: class predictions of the batch, one row per image.

    Returns:
        Scalar tensor: cross-entropy (module-level `cce`) between labels and
        predictions of the rows that follow the first `sub_batch_size` ones,
        i.e. the reference images.
    """
    # Slice with sub_batch_size instead of the hard-coded 16:32 so the loss keeps
    # selecting exactly the reference half when batch_size is changed.
    reference_rows = slice(sub_batch_size, 2 * sub_batch_size)
    y_true_reference = y_true[reference_rows]  #shape (sub_batch_size, n_classes_ref)
    y_pred_reference = y_pred[reference_rows]  #shape (sub_batch_size, n_classes_ref)
    l_d = cce(y_true_reference, y_pred_reference)
    return l_d

## Compile the model

The model is compiled defining:

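• the *optimizer* as the gradient descent algorithm, employed with a very
low learning rate lr = 0.00005 and a learning-rate decay of 0.00005 (the `decay` argument of the Keras SGD optimizer decays the learning rate over the updates; it is not a weight-decay regularizer);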

• the two losses and related weights controlled by lambda.

In [None]:
# Weight of the compactness loss with respect to the descriptiveness loss.
lambd = 10

# Plain SGD with a very small learning rate.
# NOTE(review): `decay` looks like the legacy Keras learning-rate decay
# argument -- confirm against the installed TensorFlow version.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.00005, decay=0.00005)

# One loss per model output, in the same order as the outputs:
# features -> compactness loss, predictions -> descriptiveness loss.
output_losses = [compactness_loss, descriptiveness_loss]
output_loss_weights = [lambd, 1]

model.compile(
    optimizer=sgd_optimizer,
    loss=output_losses,
    loss_weights=output_loss_weights,
    metrics=["accuracy"],
)

## Create a multiple ImageDataGenerator

We use a function with multiple Keras ImageDataGenerator objects to handle
the training with a big input batch, composed of two smaller batches that are the *target sub-batch* and the *reference sub-batch*.

In this way all images are provided to the network "on the fly", without storing
all matrices in memory and causing related memory issues.

In [None]:
!rm -rf `find -type d -name .ipynb_checkpoints`

In [None]:
# the output of inputgenerator is a tuple : ((batch_size, 224, 224, 3), (batch_size, 20))

input_imgen = ImageDataGenerator(preprocessing_function = preprocess_input)

def generate_generator_multiple(generator, dir1, dir2, sub_batch_size, img_height, img_width, n_classes):
    """Endlessly yield big batches made of a target sub-batch followed by a reference sub-batch.

    Args:
        generator: object exposing `flow_from_directory` (an ImageDataGenerator).
        dir1: directory of the target dataset (first half of each batch).
        dir2: directory of the reference dataset (second half of each batch).
        sub_batch_size: number of images drawn from each directory per batch.
        img_height, img_width: size the images are resized to.
        n_classes: width of the one-hot labels generated for the images of dir1,
            so that they can be concatenated with the labels of dir2.

    Yields:
        (images, labels): the two sub-batches concatenated along the first axis,
        and their concatenated one-hot labels.

    Raises:
        ValueError: if one of the directories holds fewer than `sub_batch_size` images.
    """
    def _flow(directory):
        # Same settings for both datasets; only the source directory changes.
        return generator.flow_from_directory(directory,
                                             target_size = (img_height,img_width),
                                             class_mode = 'categorical',
                                             batch_size = sub_batch_size,
                                             shuffle=True)

    def _next_full_batch(flow):
        # The last batch of a pass over a directory may be smaller than
        # sub_batch_size. The losses split each batch at a fixed position, so a
        # short sub-batch would shift that boundary and mix target and reference
        # images: draw again until the sub-batch is complete.
        while True:
            images, labels = next(flow)
            if len(images) == sub_batch_size:
                return images, labels

    genX1 = _flow(dir1)
    genX2 = _flow(dir2)

    # Without this guard _next_full_batch would loop forever on a tiny dataset.
    for flow, directory in ((genX1, dir1), (genX2, dir2)):
        if flow.samples < sub_batch_size:
            raise ValueError("Directory " + str(directory) + " contains fewer images than sub_batch_size")

    while True:
        X1_images, X1_labels = _next_full_batch(genX1)
        X2_images, X2_labels = _next_full_batch(genX2)
        # Labels of the first sub-batch are re-encoded on n_classes columns
        # (the parameter, not a global) to match the width of the second one.
        X1_labels = to_categorical(np.argmax(X1_labels, axis=1), num_classes=n_classes)
        #Yield 2 concatenated batches and their categorical concatenated labels
        yield np.concatenate([X1_images, X2_images]), np.concatenate([X1_labels, X2_labels])
            
inputgenerator = generate_generator_multiple(generator = input_imgen,
                                             dir1 = path_target,
                                             dir2 = path_reference,
                                             sub_batch_size = sub_batch_size,
                                             img_height = 224,
                                             img_width = 224,
                                             n_classes = n_classes_ref)       

Epochs are delineated from the size of the target dataset.

The number of epochs is set to 400, taking care to save intermediate models every 50 epochs to properly study the evolution of tested metrics.

In [None]:
train_size = len(os.listdir(os.path.join(path_target, target))) #6000
epochs = 400

In [None]:
print(train_size)

## Train with *fit*

In [None]:
history = model.fit(inputgenerator,
                    epochs = epochs,
                    steps_per_epoch = train_size // sub_batch_size,
                    #use_multiprocessing=True,
                    #shuffle=False
                    )

In [None]:
model.metrics_names

Retrieve losses and accuracy from history

In [None]:
# Retrieve losses and accuracy
# History keys are built from the names of the model outputs, so read them from
# the model instead of hard-coding 'avg_pool' / 'dense': hard-coded keys do not
# match the pooling layer name used when the model was built.
feature_output_name, dense_output_name = model.output_names
total_loss = history.history['loss']
l_c = history.history[feature_output_name + '_loss']
l_d = history.history[dense_output_name + '_loss']
acc_dense = history.history[dense_output_name + '_accuracy']


# Get number of epochs
# Stored under its own name: overwriting `epochs` (an int) with a range would
# break the later `range(epochs)` of the train_on_batch loop.
epoch_range = range(len(total_loss))

print("Total loss = ", total_loss)
print("Compactness loss = ", l_c)
print("Descriptiveness loss = ", l_d)
print("Accuracy (dense) = ", acc_dense)

## Train with *train_on_batch* (suggested)

Here, values of losses are stored every 10 batch iterations, in order to understand what happens during each epoch.

In [None]:
# Keep the curves recorded so far so that a new training run appends to them
# (the next cell resets them instead). The original self-assignments raised
# NameError on a fresh kernel because `acc` is not defined anywhere earlier:
# missing curves now start empty, and `acc` falls back to `acc_dense` when the
# *fit* section has been run.
total_loss = list(globals().get("total_loss", []))
l_c = list(globals().get("l_c", []))
l_d = list(globals().get("l_d", []))
acc = list(globals().get("acc", globals().get("acc_dense", [])))

In [None]:
# Start from empty curves: one independent list per tracked quantity.
total_loss, l_c, l_d, acc = [], [], [], []

In [None]:
# Manual training loop: one train_on_batch call per big batch, losses recorded
# every 10 iterations and a checkpoint saved every 50 epochs.
n_batches = train_size // sub_batch_size
print("Number of batches : ", n_batches)

for epoch in range(epochs):
    print("\nEpoch ", epoch+1 , "/", epochs)

    # Defined up front so the end-of-epoch print cannot fail when n_batches is 0.
    loss = None
    for i in range(n_batches):
        print("Processing batch...  ", i)
        batch = next(inputgenerator)
        #print(type(batch), batch[0].shape, batch[1].shape)
        # The per-batch values get their own names: unpacking into
        # `compactness_loss` would overwrite the loss *function* of the same
        # name and break any later model.compile call.
        # NOTE(review): order assumed to follow model.metrics_names (total loss,
        # loss of each output, accuracy of each output) -- confirm.
        loss, batch_l_c, batch_l_d, features_accuracy, predictions_accuracy = model.train_on_batch(batch[0], batch[1])
        #Print the total loss every 10 iterations
        if i % 10 == 0:
            print("\nTotal loss after iteration ", i, " is ", loss)
            print("\nCompact loss after iteration ", i, " is ", batch_l_c)
            print("\nDescript loss after iteration ", i, " is ", batch_l_d)
            total_loss.append(loss)
            l_c.append(batch_l_c)
            l_d.append(batch_l_d)
            acc.append(predictions_accuracy)

    # Intermediate checkpoint every 50 epochs
    if (epoch+1) % 50 == 0:
        my_model = "my_model_l400_"+ str(epoch+1) +".h5"
        path_model = os.path.join(path_ds, my_model)  #/content/drive/My Drive/my_model.h5
        model.save(path_model)

    print("Total loss at the end of epoch " , epoch+1, ": ", loss)

## Retrieve losses stored in folder *Dataset* if needed

In [None]:
path_loss = os.path.join(path_ds, "loss19.json")
path_lc = os.path.join(path_ds, "l_c19.json")
path_ld = os.path.join(path_ds, "l_d19.json")

In [None]:
def _load_json(path):
    """Return the content of the JSON file stored at `path`."""
    with open(path, 'r') as fp:
        return json.load(fp)

# Reload the three loss curves saved by a previous run.
total_loss = _load_json(path_loss)
l_c = _load_json(path_lc)
l_d = _load_json(path_ld)

## Plot compactness, descriptiveness and total losses

In [None]:
print(np.mean(total_loss))
print(np.mean(l_c))
print(np.mean(l_d))

In [None]:
len(l_c)

Plot all losses

In [None]:
import matplotlib.pyplot as plt
plt.plot(total_loss, label="Total loss")
plt.plot(l_c, label="Compacteness loss")
plt.plot(l_d, label="Descriptiveness loss")
plt.xlabel("training steps")
plt.legend()
plt.show()

Plot total loss

In [None]:
plt.plot(total_loss, label="Total loss")
plt.xlabel("training steps")
plt.legend()
#plt.xscale('log')
plt.show()

Plot compactness loss

In [None]:
plt.plot(l_c, label="Compacteness loss")
plt.xlabel("training steps")
#plt.xscale('log')
plt.legend()
plt.show()

Plot descriptiveness loss

In [None]:
plt.plot(l_d, label="Descriptiveness loss")
plt.xlabel("training steps")
#plt.xscale('log')
plt.legend()
plt.show()

Plot dense accuracy (maximum value is 0.5)

In [None]:
plt.plot(acc, label="Dense accuracy")
plt.xlabel("training steps")
#plt.xscale('log')
plt.legend()
plt.show()

## Save losses on folder *Dataset* if needed

In [None]:
path_loss = os.path.join(path_ds, "loss20.json") 
path_lc = os.path.join(path_ds, "l_c20.json")
path_ld = os.path.join(path_ds, "l_d20.json")

In [None]:
# Write each loss curve to its own JSON file.
# Values are converted to plain Python floats first: depending on the
# TensorFlow version, train_on_batch may return NumPy scalars, which json
# cannot serialize.
for curve, path in ((total_loss, path_loss), (l_c, path_lc), (l_d, path_ld)):
    with open(path, 'w') as fp:
        json.dump([float(value) for value in curve], fp)

## Save trained model

In [None]:
path_model = os.path.join(path_ds, "my_model_400.h5")  #/content/drive/My Drive/my_model.h5
model.save(path_model)  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model