# VIT

MS COCO 2014

## Imports

In [None]:
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, losses, callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import Sequence
from pycocotools.coco import COCO
import tensorflow as tf
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import sys

# Report the GPUs TensorFlow detected (printed only when the list is non-empty)
# and the installed TensorFlow version.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    print(gpus)
print(tf.__version__)

## Constantes et variables globales

In [None]:
# Paths
ANNOTDIR = 'annotations_trainval2014'
DATADIR = 'train2014'
INSTANCEFILE = f'{ANNOTDIR}/annotations/instances_{DATADIR}.json'

# Hyper-parameters
RATIO_TRAIN = 0.8
RATIO_VAL = 0.15
RATIO_TEST = 0.05
BATCH_SIZE = 32
EPOCHS = 200
PATIENCE = 3
NUM_TOTAL_CLASSES = 91  # 80 classes + 10 omitted ids + 1 because category ids are 1-based
PATCHES_SIZE = 16
INPUT_SHAPE = (224, 224, 3)  # Image size (height, width, channels)

# Sanity check, done before the annotation file is loaded.
# The ratios are floats, so their sum is compared to 1 with a tolerance:
# an exact `== 1` can fail on rounding error for other, equally valid, ratio values.
assert abs(RATIO_TRAIN + RATIO_VAL + RATIO_TEST - 1.0) < 1e-9, \
    'RATIO_TRAIN + RATIO_VAL + RATIO_TEST must sum to 1'

# COCO annotation index, shared by every cell below
COCO_INSTANCES = COCO(INSTANCEFILE)

num_classes = len(COCO_INSTANCES.getCatIds())
print(f'Nombre de classes dans le dataset COCO: {num_classes}')

# Metrics
#F1_SCORE = tf.keras.metrics.F1Score()

## Chargement des données

In [None]:
class DatasetGenerator(Sequence):
    """Keras ``Sequence`` serving batches of COCO images with multi-hot labels.

    The image ids returned by ``COCO_INSTANCES.getImgIds()`` are cut, in that
    order, into three contiguous ranges sized by ``RATIO_TRAIN``, ``RATIO_VAL``
    and the remainder; one instance serves one of those ranges.

    Each batch is a pair ``(images, labels)``:
    - ``images``: the batch images, resized to the height/width of
      ``INPUT_SHAPE`` and converted to RGB, stacked in one array;
    - ``labels``: one row of ``NUM_TOTAL_CLASSES`` floats per image, set to 1.0
      at the index of every category id annotated on that image.
    """

    def _getsplit(self, ensemble):
        """Return the ``(start, stop)`` bounds of ``self.imgIds`` for a split.

        Args:
            ensemble: 'train', 'val' or 'test'.

        Raises:
            ValueError: if ``ensemble`` is not one of the three split names.
        """
        total = len(self.imgIds)
        train_end = int(RATIO_TRAIN * total)
        val_end = int((RATIO_TRAIN + RATIO_VAL) * total)
        bounds = {
            'train': (0, train_end),
            'val': (train_end, val_end),
            'test': (val_end, total),
        }
        if ensemble not in bounds:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(
                f"Unknown split {ensemble!r}: expected 'train', 'val' or 'test'"
            )
        return bounds[ensemble]

    def __init__(self, ensemble, **kwargs):
        """Select the image ids of the requested split.

        Args:
            ensemble: 'train', 'val' or 'test'.
            **kwargs: forwarded unchanged to ``Sequence.__init__``.
        """
        super().__init__(**kwargs)
        self.ensemble = ensemble

        # List every image id, then keep the slice belonging to this split
        self.imgIds = COCO_INSTANCES.getImgIds()
        start, stop = self._getsplit(ensemble)
        self.ids = self.imgIds[start:stop]

    def __len__(self):
        """Return the number of batches; the last one may be partial."""
        return int(np.ceil(len(self.ids) / BATCH_SIZE))

    def __getitem__(self, index):
        """Load and return batch number ``index`` as ``(images, labels)``."""
        batch_ids = self.ids[index * BATCH_SIZE:(index + 1) * BATCH_SIZE]
        # PIL sizes are (width, height) while INPUT_SHAPE is (height, width, channels)
        target_size = (INPUT_SHAPE[1], INPUT_SHAPE[0])
        batch_images = []
        batch_labels = []
        for img_id in batch_ids:
            # Load the image; the context manager closes the file handle
            file_name = COCO_INSTANCES.imgs[img_id]['file_name']
            with Image.open(f'{DATADIR}/{file_name}') as image:
                pixels = np.array(image.resize(target_size).convert('RGB'))
            batch_images.append(pixels)
            # Build the label row from the image annotations
            ann_ids = COCO_INSTANCES.getAnnIds(imgIds=img_id)
            anns = COCO_INSTANCES.loadAnns(ann_ids)
            labels = [0.0] * NUM_TOTAL_CLASSES
            for ann in anns:
                labels[ann['category_id']] = 1.0
            batch_labels.append(labels)

        batch_labels = np.array(batch_labels)
        batch_images = np.array(batch_images)

        return (batch_images, batch_labels)

    def on_epoch_end(self):
        """Shuffle the image ids of this split (Keras end-of-epoch hook)."""
        self.ids = np.random.permutation(self.ids)

# One generator per split, built in train / val / test order.
train_generator, val_generator, test_generator = (
    DatasetGenerator(split) for split in ('train', 'val', 'test')
)

# Report the number of batches and of images held by each split.
print(f'Taille du dataset d\'entrainement: {len(train_generator)} batches, {len(train_generator.ids)} items')
print(f'Taille du dataset de validation: {len(val_generator)} batches, {len(val_generator.ids)} items')
print(f'Taille du dataset de test: {len(test_generator)} batches, {len(test_generator.ids)} items')

### Test de performance du générateur de données

In [None]:
%%timeit
generator = train_generator # Temps négligeable
r_index = np.random.randint(len(generator)) # Temps négligeable
generator.__getitem__(r_index-1)

### Test unitaire du générateur de données

In [None]:
generator = train_generator
# Fetch one batch of images and labels
r_index = np.random.randint(len(generator)-1)
images, labels = generator[r_index]
# Pick one image of the batch together with its label row
r_index = np.random.randint(len(images))
image = images[r_index]
label = labels[r_index]
# Indices set to 1 in the label row are the COCO category ids of the image
label_ids = np.where(label == 1)[0]
label_str = ', '.join(COCO_INSTANCES.cats[int(i)]['name'] for i in label_ids)
# Show the image with its class names as title
plt.imshow(image)
plt.title(f'Classes: {label_str}')
plt.axis('off')
plt.show()

## Modèle

### Création du modèle

In [None]:

### CUSTOM LAYERS ###

class Patches(layers.Layer):
    """Split images into non-overlapping square patches and flatten each patch.

    Args:
        patch_size: side length, in pixels, of one square patch.
        **kwargs: standard layer arguments (e.g. ``name``), forwarded to
            ``layers.Layer`` so the layer can be rebuilt from its config.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the patches of ``images`` as ``(batch, num_patches, patch_dims)``."""
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID'
        )
        patch_dims = patches.shape[-1]
        # Uses the static height/width of the input, so both must be known
        num_patches = (images.shape[1] // self.patch_size) * (images.shape[2] // self.patch_size)
        patches = tf.reshape(patches, [batch_size, num_patches, patch_dims])
        return patches

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({'patch_size': self.patch_size})
        return config

class PatchEncoder(layers.Layer):
    """Project each patch with a dense layer and add a learned position embedding.

    Args:
        num_patches: number of patches per image (size of the position table).
        projection_dim: size of the projected patch vectors and of the
            position embeddings.
        **kwargs: standard layer arguments (e.g. ``name``), forwarded to
            ``layers.Layer`` so the layer can be rebuilt from its config.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim  # kept only for get_config
        self.projection_layer = layers.Dense(units=projection_dim)
        self.position_embedding_layer = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return ``projection(patch) + position_embedding``."""
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        projection = self.projection_layer(patch)
        position_embedding = self.position_embedding_layer(positions)
        # Add a leading axis so the embedding broadcasts over the batch
        position_embedding = tf.expand_dims(position_embedding, axis=0)
        encoded = projection + position_embedding
        return encoded

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            'num_patches': self.num_patches,
            'projection_dim': self.projection_dim,
        })
        return config

### CUSTOM MODELS ###

def VIT_v1(num_blocks=20, projection_dim=128, num_heads=8,
           ff_dim_multiplier=4, dropout=0.0):
    '''
    Build and compile the ViT multi-label classifier.

    inputs :
    - image : INPUT_SHAPE, i.e. (224, 224, 3)
    outputs :
    - class : (NUM_TOTAL_CLASSES), i.e. (91), one sigmoid score per class

    parameters (the defaults reproduce the original hard-coded values) :
    - num_blocks : number of stacked transformer blocks
    - projection_dim : size of the patch embeddings
    - num_heads : number of attention heads per block
    - ff_dim_multiplier : feed-forward width = projection_dim * ff_dim_multiplier
    - dropout : dropout rate used in attention, feed-forward and before the head

    returns :
    - the compiled Keras model, named 'VIT_v1'
    '''
    def transformer_block(x, num_heads, projection_dim, ff_dim, dropout):
        # Normalisation and multi-head self-attention.
        # The residual is taken from the normalised tensor `start`.
        start = layers.LayerNormalization(epsilon=1e-6)(x)
        # NOTE(review): key_dim is the per-head size in Keras; here every head
        # gets the full projection_dim - confirm this is intended.
        x = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=dropout
        )(start, start)
        x = x + start
        res1 = layers.LayerNormalization(epsilon=1e-6)(x)

        # Feed-forward, with a residual taken from the normalised tensor `res1`
        x = layers.Dense(units=ff_dim, activation='gelu')(res1)
        x = layers.Dense(units=projection_dim, activation='gelu')(x)
        x = layers.Dropout(dropout)(x)
        res2 = x + res1
        return res2

    # Patch extraction and embedding
    image_input = layers.Input(shape=INPUT_SHAPE)
    x = layers.Rescaling(1./255)(image_input)
    x = Patches(PATCHES_SIZE)(x)
    num_patches = (INPUT_SHAPE[0] // PATCHES_SIZE) * (INPUT_SHAPE[1] // PATCHES_SIZE)
    size_patch = PATCHES_SIZE * PATCHES_SIZE * INPUT_SHAPE[-1]
    print(f'Nombre de patches: {num_patches}')
    print(f'Taille d\'un patch: {size_patch}')
    x = PatchEncoder(num_patches=num_patches, projection_dim=projection_dim)(x)

    # Transformer
    for _ in range(num_blocks):
        x = transformer_block(
            x,
            num_heads=num_heads,
            projection_dim=projection_dim,
            ff_dim=projection_dim * ff_dim_multiplier,
            dropout=dropout,
        )

    # Classification head: average over the patches, then one sigmoid per class
    x = layers.GlobalAveragePooling1D()(x)
    #x = layers.Lambda(lambda x: x[ :, -1, :])(x)
    x = layers.Dropout(dropout)(x)
    output = layers.Dense(NUM_TOTAL_CLASSES, activation='sigmoid')(x) # sigmoid for multi-label classification

    model = Model(inputs=image_input, outputs=output, name='VIT_v1')
    optimizer = tf.keras.optimizers.Adam()
    # The string 'accuracy' is resolved from the output shape and, with more
    # than one output unit, becomes an argmax-based categorical accuracy, which
    # is meaningless for multi-label targets. Request the per-label binary
    # accuracy explicitly, keeping the reported name 'accuracy'.
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    return model

# Build the compiled model with its default settings and print its summary.
model = VIT_v1()
model.summary()

#### Visualisation du modèle

In [None]:
# Draw the model graph into '<model name>.png', with tensor shapes and layer names shown.
plot_model(model, to_file=f'{model.name}.png', show_shapes=True, show_layer_names=True)

### Entrainement du modèle

In [None]:
# Early stopping watches the validation loss with a patience of PATIENCE epochs
# and is asked to restore the best weights.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

fit_options = dict(
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(train_generator, **fit_options)

# Training and validation loss curves on a single figure
plt.figure(figsize=(12, 6))
for history_key, curve_label in (('loss', 'Train loss'), ('val_loss', 'Validation loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### Sauvegarde du modèle

In [None]:
# Save the model as '<model name>.keras', a path relative to the working directory.
model.save(f'{model.name}.keras')

#### Test unitaire du modèle

In [None]:
generator = train_generator
# Fetch one batch of images and labels
r_index = np.random.randint(len(generator)-1)
images, labels = generator[r_index]

# Pick a random image of the batch; every position, including the last, can be drawn
batch_size = images.shape[0]
r_index = np.random.randint(batch_size)
image = images[r_index]
label = labels[r_index]  # ground-truth row, kept for manual comparison

image_expanded = np.expand_dims(image, axis=0)  # Add a leading batch axis of size 1
predictions = model.predict(image_expanded)

# Scores predicted for the selected image
predicted_label = predictions[0]

# Multi-label output with sigmoid scores: keep every class above a threshold
threshold = 0.3
predicted_classes = np.where(predicted_label > threshold)[0]
# Some output indices have no COCO category (index 0 and the omitted ids);
# skip them instead of failing with a KeyError on the name lookup.
predicted_str = ', '.join(
    COCO_INSTANCES.cats[int(i)]['name']
    for i in predicted_classes
    if int(i) in COCO_INSTANCES.cats
)


# Show the image with the predicted class names as title
plt.imshow(image)
plt.title(f'Classes prédites: {predicted_str}')
plt.axis('off')
plt.show()

