# VIT

MS COCO 2014

## Imports

In [None]:
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, losses, callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import Sequence
from sklearn.metrics import roc_curve, auc
from IPython.display import clear_output
from pycocotools.coco import COCO
import tensorflow as tf
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import sys
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import datetime

# Turn on memory growth for every visible GPU and list the devices found,
# then report the TensorFlow version in use.
# Iterating an empty device list is a no-op, so no explicit emptiness guard
# is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    print(gpu)
print(tf.__version__)



## Constantes et variables globales

In [None]:
# Paths
ANNOTDIR = 'annotations_trainval2014'
DATADIR = 'train2014'
INSTANCEFILE = f'{ANNOTDIR}/annotations/instances_{DATADIR}.json'

# Hyper-parameters
RATIO_TRAIN = 0.8   # Fraction of the image ids used for training
RATIO_VAL = 0.15    # Fraction used for validation
RATIO_TEST = 0.05   # Fraction used for testing
BATCH_SIZE = 32
EPOCHS = 200        # Upper bound; early stopping may end training sooner
PATIENCE = 3        # Early-stopping patience, in epochs
COCO_INSTANCES = COCO(INSTANCEFILE)
NUM_TOTAL_CLASSES = 91  # 80 classes + 10 omitted classes + 1 because ids are 1-based
# NOTE: 224 is not a multiple of 12. Patches are extracted with 'VALID'
# padding, so only 18 x 18 patches are produced and the trailing
# 224 - 18 * 12 = 8 pixels along each axis are not covered by any patch.
PATCHES_SIZE = 12
INPUT_SHAPE = (224, 224, 3)  # Image size fed to the model (height, width, channels)
GLOBAL_THRESHOLD = 0.25  # Single probability cut-off shared by all classes

# Data-augmentation configuration
DATA_GEN = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.05,
    zoom_range=0.05,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Sanity checks
# The ratios are floats, so compare with a tolerance instead of `==`; raise
# explicitly so the check is not stripped when Python runs with -O.
if not np.isclose(RATIO_TRAIN + RATIO_VAL + RATIO_TEST, 1.0):
    raise ValueError(
        'RATIO_TRAIN + RATIO_VAL + RATIO_TEST must sum to 1, got '
        f'{RATIO_TRAIN + RATIO_VAL + RATIO_TEST}'
    )

num_classes = len(COCO_INSTANCES.getCatIds())
print(f'Nombre de classes dans le dataset COCO: {num_classes}')

# Metrics
#F1_SCORE = tf.keras.metrics.F1Score()

# Couches custom

class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``),
            forwarded to ``layers.Layer`` so the layer can be rebuilt from
            the full config returned by ``get_config``.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the patches of ``images`` as a (batch, num_patches, patch_dims) tensor.

        ``images`` must be a 4-D tensor whose axes 1 and 2 (the spatial axes)
        have statically known sizes, because the number of patches is
        computed from ``images.shape`` with Python integer arithmetic.
        """
        batch_size = tf.shape(images)[0]
        # Stride == patch size, so patches do not overlap; 'VALID' padding
        # means any remainder that does not fill a whole patch is dropped.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID'
        )
        patch_dims = patches.shape[-1]
        # Computed from the static shape so the output keeps a known
        # patch count; only the batch axis stays dynamic.
        num_patches = (images.shape[1] // self.patch_size) * (images.shape[2] // self.patch_size)
        patches = tf.reshape(patches, [batch_size, num_patches, patch_dims])
        return patches

    def get_config(self):
        """Return the layer config, including the base ``Layer`` settings."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

class PatchEncoder(layers.Layer):
    """Project flattened patches linearly and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output size of the linear projection and of each
            position embedding.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``),
            forwarded to ``layers.Layer`` so the layer can be rebuilt from
            the full config returned by ``get_config``.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus their position embeddings."""
        # One index per patch position: 0 .. num_patches - 1.
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        # The position embeddings have no batch axis; the addition
        # broadcasts them over the batch.
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the layer config, including the base ``Layer`` settings."""
        config = super().get_config()
        config.update({
            "num_patches": self.num_patches,
            "projection_dim": self.projection.units,
        })
        return config


## Chargement des données

In [None]:
class DatasetGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of COCO images with multi-hot labels.

    The image ids returned by ``COCO_INSTANCES.getImgIds()`` are cut into
    three contiguous slices (train / val / test) according to ``RATIO_TRAIN``
    and ``RATIO_VAL``. Each batch is a tuple ``(images, labels)`` where
    ``labels`` has one row per image and ``NUM_TOTAL_CLASSES`` columns,
    indexed directly by COCO ``category_id``.

    Args:
        ensemble: Which split to serve: ``'train'``, ``'val'`` or ``'test'``.
        **kwargs: Forwarded to ``Sequence.__init__``.

    Raises:
        ValueError: If ``ensemble`` is not one of the three known splits.
    """

    def _getsplit(self, ensemble):
        """Return the ``(start, stop)`` slice bounds of ``ensemble`` in ``self.imgIds``."""
        total = len(self.imgIds)
        train_end = int(RATIO_TRAIN * total)
        val_end = int((RATIO_TRAIN + RATIO_VAL) * total)
        bounds = {
            'train': (0, train_end),
            'val': (train_end, val_end),
            'test': (val_end, total),
        }
        try:
            return bounds[ensemble]
        except KeyError:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(
                f"Unknown split {ensemble!r}; expected one of {sorted(bounds)}"
            ) from None

    def __init__(self, ensemble, **kwargs):
        super().__init__(**kwargs)
        self.ensemble = ensemble

        # Build the list of all image ids, then keep only this split's slice.
        self.imgIds = COCO_INSTANCES.getImgIds()
        start, stop = self._getsplit(ensemble)
        self.ids = self.imgIds[start:stop]

    def __len__(self):
        """Return the number of batches; the last one may be partial."""
        return int(np.ceil(len(self.ids) / BATCH_SIZE))

    def __getitem__(self, index):
        """Load and return batch number ``index`` as ``(images, labels)`` arrays."""
        batch_ids = self.ids[index * BATCH_SIZE:(index + 1) * BATCH_SIZE]
        batch_images = []
        batch_labels = []
        for img_id in batch_ids:
            # Load the image; the context manager closes the file handle
            # as soon as the pixels have been copied into an array.
            file_name = COCO_INSTANCES.imgs[img_id]['file_name']
            with Image.open(f'{DATADIR}/{file_name}') as img:
                # PIL expects (width, height); INPUT_SHAPE is (height, width, channels).
                resized = img.resize((INPUT_SHAPE[1], INPUT_SHAPE[0]))
                image = np.array(resized.convert('RGB'))
            # Random augmentation is applied to the training split only.
            if self.ensemble == 'train':
                image = np.array(DATA_GEN.random_transform(image))
            batch_images.append(image)

            # Build the multi-hot label vector from the image's annotations.
            ann_ids = COCO_INSTANCES.getAnnIds(imgIds=img_id)
            labels = [0.0] * NUM_TOTAL_CLASSES
            for ann in COCO_INSTANCES.loadAnns(ann_ids):
                labels[ann['category_id']] = 1.0
            batch_labels.append(labels)

        batch_labels = np.array(batch_labels)
        batch_images = np.array(batch_images)

        return (batch_images, batch_labels)

    def on_epoch_end(self):
        """Reshuffle the ids of this split so batches differ between epochs."""
        self.ids = np.random.permutation(self.ids)

# One generator per split, followed by a size report (batches and items).
train_generator, val_generator, test_generator = (
    DatasetGenerator('train'),
    DatasetGenerator('val'),
    DatasetGenerator('test'),
)

print(
    f"Taille du dataset d'entrainement: {len(train_generator)} batches, "
    f"{len(train_generator.ids)} items"
)
print(
    f"Taille du dataset de validation: {len(val_generator)} batches, "
    f"{len(val_generator.ids)} items"
)
print(
    f"Taille du dataset de test: {len(test_generator)} batches, "
    f"{len(test_generator.ids)} items"
)

### Tests

#### Test de performance du générateur de données

In [None]:
%%timeit
generator = train_generator # Temps négligeable
r_index = np.random.randint(len(generator)) # Temps négligeable
generator.__getitem__(r_index-1)

#### Test unitaire du générateur de données

In [None]:
generator = train_generator
# Fetch one random batch of images and labels.
# randint's upper bound is exclusive, so len(generator) covers every batch
# (including the last one) and also works when there is a single batch.
r_index = np.random.randint(len(generator))
images, labels = generator[r_index]
# Pick one image of the batch together with its label vector.
r_index = np.random.randint(len(images))
image = images[r_index]
label = labels[r_index]
# Indices set to 1 in the multi-hot vector are COCO category ids.
label_ids = [int(i) for i in np.where(label == 1)[0]]
label_str = ', '.join(COCO_INSTANCES.cats[i]['name'] for i in label_ids)
# Show the image with its class names.
plt.imshow(image)
plt.title(f'Classes: {label_str}')
plt.axis('off')
plt.show()

#### Test du patching

In [None]:
generator = train_generator
# Fetch one random batch of images and labels.
# randint's upper bound is exclusive, so len(generator) covers every batch
# (including the last one) and also works when there is a single batch.
r_index = np.random.randint(len(generator))
images, labels = generator[r_index]
# Pick one image of the batch and split it into patches.
r_index = np.random.randint(len(images))
image = images[r_index]
patches = Patches(patch_size=PATCHES_SIZE)(np.expand_dims(image, axis=0))
print(f"Patch size: {PATCHES_SIZE} X {PATCHES_SIZE}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")

# Lay the patches out on an n x n grid (the patch grid is square here
# because the input images are square).
n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for i, patch in enumerate(patches[0]):
    ax = plt.subplot(n, n, i + 1)
    # Each patch arrives flattened; restore its (height, width, channels) shape.
    patch_img = np.array(patch).reshape(PATCHES_SIZE, PATCHES_SIZE, 3)
    plt.imshow(patch_img)
    plt.axis('off')
plt.show()

## Modèle

### Création du modèle

In [None]:
### CUSTOM MODELS ###

def VIT_v1():
    """Build and compile the Vision Transformer used for multi-label classification.

    Input:
        image tensor of shape ``INPUT_SHAPE``.
    Output:
        vector of ``NUM_TOTAL_CLASSES`` independent sigmoid probabilities.

    Returns:
        A Keras ``Model`` named ``'VIT_v1'``, compiled with the Adam
        optimizer and a binary cross-entropy loss.
    """
    # Hyper-parameters
    input_dropout = 0.20
    head_dropout = 0.55
    block_dropout = 0.05
    embed_dim = 128 + 32
    mlp_dim = embed_dim * 6
    n_heads = 4
    depth = 48

    def encoder_block(tokens):
        """One pre-norm transformer block: self-attention, then MLP, each with a residual."""
        # Layer normalization followed by multi-head self-attention.
        normed = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=n_heads, key_dim=embed_dim
        )(normed, normed)
        residual = layers.Add()([attended, tokens])

        # Layer normalization followed by the two-layer feed-forward network.
        hidden = layers.LayerNormalization(epsilon=1e-6)(residual)
        for units in (mlp_dim, embed_dim):
            hidden = layers.Dense(units=units, activation='gelu')(hidden)
            hidden = layers.Dropout(block_dropout)(hidden)
        return layers.Add()([hidden, residual])

    # Patch grid geometry derived from the global constants.
    grid_rows = INPUT_SHAPE[0] // PATCHES_SIZE
    grid_cols = INPUT_SHAPE[1] // PATCHES_SIZE
    num_patches = grid_rows * grid_cols
    size_patch = PATCHES_SIZE * PATCHES_SIZE * INPUT_SHAPE[-1]

    # Stem: normalize the raw pixels, apply input dropout, cut into patches.
    image_input = layers.Input(shape=INPUT_SHAPE)
    tokens = layers.BatchNormalization()(image_input)
    tokens = layers.Dropout(input_dropout)(tokens)
    tokens = Patches(PATCHES_SIZE)(tokens)
    print(f'Nombre de patches: {num_patches}')
    print(f"Taille d'un patch: {size_patch}")
    tokens = PatchEncoder(num_patches=num_patches, projection_dim=embed_dim)(tokens)

    # Transformer encoder stack.
    for _ in range(depth):
        tokens = encoder_block(tokens)

    # Classification head: every token is kept (Flatten) before the final
    # dense layer; sigmoid gives one independent probability per class.
    features = layers.LayerNormalization(epsilon=1e-6)(tokens)
    features = layers.Flatten()(features)
    features = layers.Dropout(head_dropout)(features)
    output = layers.Dense(NUM_TOTAL_CLASSES, activation='sigmoid')(features)

    model = Model(inputs=image_input, outputs=output, name='VIT_v1')
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy')
    return model

# Build the compiled model and print its layer-by-layer summary.
model = VIT_v1()
model.summary()

#### Visualisation du modèle

In [None]:
# Render the architecture diagram to '<model name>.png', annotated with
# tensor shapes and layer names.
plot_model(model, to_file=f'{model.name}.png', show_shapes=True, show_layer_names=True)

### Entrainement du modèle

In [None]:

# Each run logs to its own timestamped TensorBoard directory.
log_dir = "logs/{}".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# Stop once the validation loss has not improved for PATIENCE epochs and
# roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=[early_stopping, tensorboard_callback],
    verbose=1,
)

# Plot the training and validation loss curves.
plt.figure(figsize=(12, 6))
for curve_key, curve_label in (('loss', 'Train loss'), ('val_loss', 'Validation loss')):
    plt.plot(history.history[curve_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### Sauvegarde du modèle

In [29]:
# Save the whole model to '<model name>.keras' in the working directory.
model.save(f'{model.name}.keras')

## Evaluation

### Chargement d'un modèle pré-existant

In [5]:
# Reload a previously saved model. `custom_objects` maps the names of the
# custom layers to their classes so Keras can rebuild them.
# NOTE(review): this file name differs from the one the save cell writes
# (f'{model.name}.keras'); presumably an earlier, renamed checkpoint - confirm.
model = tf.keras.models.load_model('VIT_v1_2210.keras', custom_objects={'Patches': Patches, 'PatchEncoder': PatchEncoder})

### Courbe ROC
Permet d'obtenir le meilleur seuil (threshold) pour chaque classe.

In [None]:

# Calibrate the thresholds on the validation split: the training generator
# applies random augmentation and contains the images the model was fitted
# on, so thresholds tuned on it would not reflect held-out behaviour.
generator = val_generator
# Collect the labels and the predictions for the whole split.
y_true = []
y_pred = []

num_batches = len(generator)
for i in range(num_batches):
    images, labels = generator[i]
    # predict_on_batch runs a single forward pass without the per-call
    # setup that model.predict performs, which matters inside a Python loop.
    predictions = model.predict_on_batch(images)
    y_true.append(labels)
    y_pred.append(predictions)
    # wait=True replaces the previous line only when the new one is ready,
    # which avoids flicker.
    clear_output(wait=True)
    print(f'Batch {i+1}/{num_batches}')

y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)

# Compute the ROC curve, the AUC and the best threshold for each class.
fpr = dict()
tpr = dict()
roc_auc = dict()
thresholds = dict()
best_thresholds = dict()

for i in range(NUM_TOTAL_CLASSES):
    num_positives = y_true[:, i].sum()
    if num_positives == 0 or num_positives == len(y_true):
        # The ROC curve is undefined when only one label value is present
        # (e.g. label indices that match no COCO category). Use an
        # unreachable threshold so such an index is never predicted.
        best_thresholds[i] = np.inf
        continue
    fpr[i], tpr[i], thresholds[i] = roc_curve(y_true[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    # Best threshold = the one maximizing the geometric mean of
    # sensitivity (TPR) and specificity (1 - FPR).
    gmeans = np.sqrt(tpr[i] * (1 - fpr[i]))
    best_thresholds[i] = thresholds[i][np.argmax(gmeans)]

# Clear the progress output.
clear_output()

# Plot the ROC curve of every class that has one.
plt.figure(figsize=(12, 8))
for i in range(NUM_TOTAL_CLASSES):
    if i in COCO_INSTANCES.cats and i in fpr:
        plt.plot(fpr[i], tpr[i], label=f'{COCO_INSTANCES.cats[i]["name"]} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC')
#plt.legend(loc='lower right') # Too many classes to display the legend
plt.show()

print("Meilleurs seuils pour chaque classe:")
print(best_thresholds)

### Test unitaire du modèle

In [None]:
# True: use GLOBAL_THRESHOLD for every class.
# False: use the per-class `best_thresholds` (requires the ROC cell to have run).
use_global_threshold = True

# NOTE: this draws from the (augmented) training split; switch to
# test_generator to inspect predictions on held-out images.
generator = train_generator
# Fetch one random batch of images and labels.
# randint's upper bound is exclusive, so len(generator) covers every batch
# (including the last one) and also works when there is a single batch.
r_index = np.random.randint(len(generator))
images, labels = generator[r_index]

# Pick a random image of the batch (same exclusive-bound remark: using
# batch_size - 1 would never pick the last image and would raise on a
# single-image batch).
batch_size = images.shape[0]
r_index = np.random.randint(batch_size)
image = images[r_index]
label = labels[r_index]

label_str = ', '.join(COCO_INSTANCES.cats[int(i)]['name'] for i in np.where(label == 1)[0])

image_expanded = np.expand_dims(image, axis=0)  # Add the batch axis expected by the model
predictions = model.predict(image_expanded)

# Predictions for the selected image.
predicted_label = predictions[0]
# Sort the classes by decreasing model confidence.
predicted_classes = np.argsort(predicted_label)[::-1]
if use_global_threshold:
    predicted_classes = [i for i in predicted_classes if predicted_label[i] > GLOBAL_THRESHOLD]
else:
    predicted_classes = [i for i in predicted_classes if predicted_label[i] > best_thresholds[i]]
# Some output indices match no COCO category (index 0 and the omitted ids);
# skip them instead of failing with a KeyError on the name lookup.
predicted_str = ', '.join(
    COCO_INSTANCES.cats[int(i)]['name']
    for i in predicted_classes
    if int(i) in COCO_INSTANCES.cats
)

# Show the image with the predicted and the true classes.
plt.imshow(image)
plt.title(f'Classes prédites: {predicted_str} \nClasses réelles: {label_str}')
plt.axis('off')
plt.show()

