## Imports

In [None]:
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, losses, callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import Sequence
from pycocotools.coco import COCO
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
from tensorflow.keras.models import load_model
import sys

# Fetch the NLTK resources used by the tokenizer and the stop-word filter.
for _resource in ('punkt_tab', 'stopwords', 'punkt'):
    nltk.download(_resource)

# Report the visible GPUs (if any) and the TensorFlow runtime configuration.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(gpus)
print(tf.__version__)
print("Eager execution:", tf.executing_eagerly())

## Constantes, variables et fonctions globales

In [None]:
# --- Dataset locations ---
ANNOTDIR = 'annotations_trainval2014'
DATADIR = 'train2014'
CAPFILE = f'{ANNOTDIR}/annotations/captions_{DATADIR}.json'
INSTANCEFILE = f'{ANNOTDIR}/annotations/instances_{DATADIR}.json'

# --- Text / embedding settings ---
ALLOW_STOPWORD = True  # when False, English stop words are removed from captions
TEXT_VECTOR_SIZE = 512  # VOCAB_SIZE = 24918
WORD2VEC_PATH = f'word2vec_captions_{TEXT_VECTOR_SIZE}.txt'
MAX_LEN_SEQUENCE = 60  # 57 was obtained from the training data
START_TOKEN = '<sos>'
END_TOKEN = '<eos>'
UNK_TOKEN = '<unk>'
PADDING_TOKEN = '<pad>'

# --- COCO indexes (captions and instances of the same image set) ---
coco_captions = COCO(CAPFILE)
coco_instances = COCO(INSTANCEFILE)

# --- Training settings ---
BATCH_SIZE = 32
EPOCHS = 200
RATIO_TRAIN = 0.8
RATIO_VAL = 0.15
RATIO_TEST = 0.05
PATIENCE = 3

# Load the caption embeddings (word2vec text format) when the file exists.
# On success this defines:
#   vec           -- gensim KeyedVectors, extended with an all-zero PADDING_TOKEN vector
#   PADDING_INDEX -- index of PADDING_TOKEN inside `vec`
#   VOCAB_SIZE    -- number of keys in `vec`, padding token included
# When the file is missing or cannot be read, `vec` is set to None so the
# checks below (and later cells) can tell that no embeddings are available.
if os.path.exists(WORD2VEC_PATH):
    try:
        vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=False)
        print(f"KeyedVectors loaded from {WORD2VEC_PATH}")
        PADDING_INDEX = vec.add_vector(PADDING_TOKEN, np.zeros(TEXT_VECTOR_SIZE))
        print(f"Padding index : {PADDING_INDEX}")
        VOCAB_SIZE = len(vec.index_to_key)
        print("VOCAB_SIZE :", VOCAB_SIZE)
    except Exception as e:
        # Best effort: report the problem and carry on without embeddings.
        print(f"Error loading KeyedVectors from {WORD2VEC_PATH} error : {e}")
        vec = None
else:
    print(f"No model found at {WORD2VEC_PATH}")
    vec = None

# The split ratios must add up to 1 (tolerant comparison, since they are floats).
assert np.isclose(RATIO_TRAIN + RATIO_VAL + RATIO_TEST, 1.0)
# The configured embedding width must match the loaded vectors.
if vec is not None:
    assert TEXT_VECTOR_SIZE == vec.vector_size

def find_closest_word(vector):
    """Return the vocabulary word whose embedding is most cosine-similar to `vector`.

    An all-zero vector is treated as padding and maps to PADDING_TOKEN.
    The lookup uses the module-level `vec` KeyedVectors.
    """
    is_padding = np.all(vector == 0)
    if is_padding:
        return PADDING_TOKEN
    # A single row of scores: `vector` against every embedding stored in `vec`.
    scores = cosine_similarity([vector], vec.vectors)
    best_row = scores[0].argmax()
    return vec.index_to_key[best_row]

def find_closest_vector(vector, vec, printable=False):
    """Return the row of `vec.vectors` with the highest cosine similarity to `vector`.

    Args:
        vector: Query embedding.
        vec: Object exposing a `vectors` matrix (one embedding per row).
        printable: When True, print the full similarity row before returning.
    """
    scores = cosine_similarity([vector], vec.vectors)[0]
    if printable:
        print(f'Similarities : {scores}')
    best_row = scores.argmax()
    return vec.vectors[best_row]

def get_positional_encoding(max_seq_length, embed_size, scale=10000):
    """Build the sinusoidal positional-encoding table.

    For position `pos` and column `i` the angle is
    ``pos / scale ** (2 * (i // 2) / embed_size)``; even columns store
    ``sin(angle)`` and odd columns store ``cos(angle)``.

    Args:
        max_seq_length: Number of positions (rows) in the table.
        embed_size: Number of columns in the table.
        scale: Base of the frequency term.

    Returns:
        A float32 tensor of shape (max_seq_length, embed_size). An empty table
        is returned when `max_seq_length` is 0.
    """
    # Broadcast a column of positions against a row of per-column divisors
    # instead of looping in Python; position 0 needs no special case because
    # its angles are all 0 anyway.
    positions = np.arange(max_seq_length, dtype=np.float64)[:, np.newaxis]
    exponents = 2 * (np.arange(embed_size) // 2) / embed_size
    angles = (positions / np.power(float(scale), exponents)).astype(np.float32)
    angles[:, 0::2] = np.sin(angles[:, 0::2])  # columns 2i
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # columns 2i+1
    return tf.cast(angles, dtype=tf.float32)

class PositionalEncodingLayer(tf.keras.layers.Layer):
    """Add a fixed sinusoidal positional encoding to a sequence of embeddings.

    The table is computed once by `get_positional_encoding` and sliced to the
    actual sequence length (axis 1 of the input) at call time.

    Args:
        max_seq_length: Number of positions in the precomputed table.
        embed_size: Width of each encoding row (added to the input's last axis).
        scale: Base of the frequency term, forwarded to `get_positional_encoding`.
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, max_seq_length, embed_size, scale=10000, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        self.embed_size = embed_size
        # Kept so that get_config() can restore the same table after a reload.
        self.scale = scale
        self.positional_encoding = get_positional_encoding(max_seq_length, embed_size, scale=scale)

    def call(self, x, mask=None):
        """Return `x` plus the encoding rows for its sequence length.

        When a mask is supplied, the encoding is zeroed on masked positions so
        those positions are returned unchanged.
        """
        seq_length = tf.shape(x)[1]
        # Shrink the table to the real sequence length when it is shorter than max_seq_length.
        pe = self.positional_encoding[:seq_length, :]

        if mask is not None:
            # Give the mask a trailing axis so it broadcasts against the encoding.
            mask = tf.expand_dims(tf.cast(mask, tf.float32), axis=-1)
            pe = pe * mask

        return x + pe

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({
            "max_seq_length": self.max_seq_length,
            "embed_size": self.embed_size,
            "scale": self.scale,
        })
        return config

class PaddingTruncatingLayer(tf.keras.layers.Layer):
    """Force the sequence axis (axis 1) of a rank-3 input to exactly `maxlen` steps.

    Longer sequences keep their last `maxlen` steps; shorter ones are extended
    at the end with `padding_value`.

    Args:
        maxlen: Target length of axis 1.
        padding_value: Value used to fill the added steps (cast to float32).
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, maxlen=MAX_LEN_SEQUENCE, padding_value=0.0, **kwargs):
        super().__init__(**kwargs)
        self.maxlen = maxlen
        self.padding_value = padding_value

    def call(self, inputs):
        # Dynamic shape of the incoming batch.
        input_shape = tf.shape(inputs)

        # Keep only the last `maxlen` steps. The start offset is clamped at 0:
        # a negative start would be read as "count from the end" and would drop
        # the beginning of sequences whose length lies between maxlen / 2 and maxlen.
        start = tf.maximum(input_shape[1] - self.maxlen, 0)
        inputs = inputs[:, start:, :]

        # Number of steps still missing to reach maxlen.
        padding_size = self.maxlen - tf.shape(inputs)[1]

        # Padding block of shape [batch_size, padding_size, features_dim].
        padding = tf.fill([input_shape[0], padding_size, input_shape[2]], self.padding_value)
        padding = tf.cast(padding, tf.float32)

        # Append the padding after the (possibly truncated) input.
        out_tensor = tf.concat([inputs, padding], axis=1)
        return tf.reshape(out_tensor, self.compute_output_shape(input_shape))

    def compute_output_shape(self, input_shape):
        """Return the input shape with axis 1 replaced by `maxlen`."""
        return (input_shape[0], self.maxlen, input_shape[2])

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "padding_value": self.padding_value,
        })
        return config


## Dataset

### Visualisation des données

In [None]:

# Pick a random image id among the images indexed by the instances file.
imgIds = coco_instances.getImgIds()
print(' Number of images found in instances :', len(imgIds))
randomImgId = np.random.choice(imgIds)
found_img = coco_instances.imgs[randomImgId]
file_name = found_img['file_name']
print(f" Filename : {file_name}")

# Display the picture without axes (they carry no information for an image).
image = Image.open(f'{DATADIR}/{file_name}')
plt.imshow(image)
plt.axis('off')
plt.show()

# Look up the caption annotations of that image and list them.
annIds = coco_captions.getAnnIds(imgIds=randomImgId)
anns = coco_captions.loadAnns(annIds)
print("Captions for the selected image:")
for ann in anns:
    print(f"- {ann['caption']}")

### Word2Vec

#### Pré-Traitement pour Word2Vec

In [None]:
# Caption tokenization helper
def process_text(text):
    """Tokenize a caption and wrap it with the start / end markers.

    The text is lower-cased and split with `word_tokenize`; no character
    filtering is applied. When ALLOW_STOPWORD is False, English stop words are
    removed before the markers are added.

    Args:
        text: Raw caption string.

    Returns:
        A list of tokens beginning with START_TOKEN and ending with END_TOKEN.
    """
    tokens = word_tokenize(text.lower())
    if not ALLOW_STOPWORD:
        # Build the stop-word set once instead of re-reading the list for every token.
        english_stopwords = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in english_stopwords]
    return [START_TOKEN, *tokens, END_TOKEN]

count_captions = 0            # number of images visited
count_invidual_captions = 0   # total number of captions over all images
max_captions = 0              # largest number of captions attached to a single image
raw_captions = []             # flat list of tokenized captions (one token list per caption)
for img_id in imgIds:
    caption_ids = coco_captions.getAnnIds(imgIds=img_id)
    captions_data = coco_captions.loadAnns(caption_ids)
    captions = [process_text(caption['caption']) for caption in captions_data]
    count_invidual_captions += len(captions)
    count_captions += 1
    max_captions = max(max_captions, len(captions))
    raw_captions += captions
# Longest caption in tokens, start / end markers included.
max_len_captions = max((len(tokens) for tokens in raw_captions), default=0)
mean_captions = count_invidual_captions / count_captions if count_captions else 0
print('Attention, statistique avec captions altérés (ajout des tokens de début et de fin)')
print('count_captions :', count_captions)
print('count_invidual_captions :', count_invidual_captions)
print('mean number of caption per image :', mean_captions)
print('max number of captions :', max_captions)
print('max caption length (tokens) :', max_len_captions)

#### Entrainement de Word2Vec

In [None]:
# Train a Word2Vec model on the tokenized captions.
wordvec = Word2Vec(
    raw_captions,
    vector_size=TEXT_VECTOR_SIZE,
    window=4,
    min_count=1,
    workers=3,
    epochs=100,
)

# Report how many distinct words ended up in the vocabulary.
vocab_size = len(wordvec.wv.key_to_index)
print(f"Nombre de mots dans le vocabulaire : {vocab_size}")

#### Sauvegarde de Word2Vec (données uniquement)

In [9]:
# Export only the word vectors (`wordvec.wv`) in the non-binary word2vec format.
wordvec.wv.save_word2vec_format(WORD2VEC_PATH, binary=False)

#### Chargement de Word2Vec

In [5]:
# WORD2VEC_PATH is written with `save_word2vec_format` (vectors only, text
# format). `Word2Vec.load` cannot read such a file, so reload it as
# KeyedVectors and rebuild the padding entry that the rest of the notebook
# reads through `vec`, `PADDING_INDEX` and `VOCAB_SIZE`.
vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=False)
PADDING_INDEX = vec.add_vector(PADDING_TOKEN, np.zeros(TEXT_VECTOR_SIZE))
VOCAB_SIZE = len(vec.index_to_key)

#### Test unitaire de Word2Vec

In [None]:
# Interactive check: list the nearest neighbours of a word typed by the user.
word = input('Quel mot souhaitez-vous avoir de similaire ? :')

if word not in vec.key_to_index:
    # Unknown word: report it instead of querying the vectors.
    print("Désolé, le mot '{}' n'est pas dans le vocabulaire.".format(word))
else:
    similar_words = vec.most_similar(word)
    print("Mots similaires à '{}':".format(word))
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")


### Générateur de données

#### Création des générateurs

In [3]:
class DatasetGenerator(Sequence):
    """Keras Sequence producing next-word training batches from COCO captions.

    Each item is ``((images, caption_prefixes), labels)``:
      * images: ResNet50-preprocessed array, one 224x224 RGB picture per sample;
      * caption_prefixes: float32 matrix of vocabulary indices, post-padded with
        PADDING_INDEX to the longest prefix of the batch;
      * labels: vocabulary index of the token that follows each prefix.

    For every image one caption is drawn at random and cut at a random position,
    so the same index yields different samples from one call to the next.

    Args:
        ensemble: Which split to serve: 'train', 'val' or 'test'.
        **kwargs: Forwarded to `Sequence`.
    """

    def _getsplit(self, ensemble):
        """Return the (start, stop) bounds of `ensemble` inside `self.imgIds`.

        Raises:
            ValueError: If `ensemble` is not 'train', 'val' or 'test'.
        """
        n_images = len(self.imgIds)
        train_end = int(RATIO_TRAIN * n_images)
        val_end = int((RATIO_TRAIN + RATIO_VAL) * n_images)
        if ensemble == 'train':
            return 0, train_end
        if ensemble == 'val':
            return train_end, val_end
        if ensemble == 'test':
            return val_end, n_images
        raise ValueError(f"Unknown split {ensemble!r}: expected 'train', 'val' or 'test'")

    def _clean_text(self, text):
        """Tokenize a caption and wrap it with START_TOKEN / END_TOKEN.

        The text is lower-cased and split with `word_tokenize`; English stop
        words are removed only when ALLOW_STOPWORD is False.
        """
        tokens = word_tokenize(text.lower())
        if not ALLOW_STOPWORD:
            # Build the stop-word set once instead of re-reading the list per token.
            english_stopwords = set(stopwords.words('english'))
            tokens = [w for w in tokens if w not in english_stopwords]
        return [START_TOKEN, *tokens, END_TOKEN]

    def __init__(self, ensemble, **kwargs):
        super().__init__(**kwargs)
        self.ensemble = ensemble

        # All image ids, then the slice that belongs to the requested split.
        self.imgIds = coco_instances.getImgIds()
        start, stop = self._getsplit(ensemble)
        self.ids = self.imgIds[start:stop]
        # Caption annotation ids of every image of the split, looked up once.
        self.captions_ids = {img_id: coco_captions.getAnnIds(imgIds=img_id) for img_id in self.ids}

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return int(np.ceil(len(self.ids) / BATCH_SIZE))

    def __getitem__(self, index):
        """Build batch number `index`.

        Raises:
            KeyError: If a caption token is missing from `vec`.
        """
        batch_ids = self.ids[index * BATCH_SIZE:(index + 1) * BATCH_SIZE]
        batch_images = []
        batch_captions = []
        batch_labels = []
        for img_id in batch_ids:
            # Load the picture; the context manager releases the file handle.
            file_name = coco_instances.imgs[img_id]['file_name']
            with Image.open(f'{DATADIR}/{file_name}') as picture:
                pixels = np.array(picture.resize((224, 224)).convert('RGB'))
            batch_images.append(pixels)

            # Draw one caption of the image at random.
            chosen_id = np.random.choice(self.captions_ids[img_id])
            # Direct access to the annotation table (the original author reports the API as buggy).
            caption = self._clean_text(coco_captions.anns[chosen_id]['caption'])

            # Cut after at least the start token: the prefix is the model input
            # and the token right after the cut is the label to predict.
            r_index = np.random.randint(1, len(caption))
            caption_crop = caption[:r_index]
            caption_label = caption[r_index]
            batch_captions.append([vec.key_to_index[token] for token in caption_crop])
            batch_labels.append(vec.key_to_index[caption_label])

        max_len_captions = max((len(indices) for indices in batch_captions), default=0)
        batch_images = preprocess_input(np.array(batch_images).copy())
        batch_captions = pad_sequences(batch_captions, maxlen=max_len_captions, padding='post', value=PADDING_INDEX, dtype='float32')
        batch_labels = np.array(batch_labels)

        return ((batch_images, batch_captions), batch_labels)

    def on_epoch_end(self):
        """Shuffle the image ids of the split between epochs."""
        self.ids = np.random.permutation(self.ids)

# One generator per split; the split bounds are derived from RATIO_TRAIN,
# RATIO_VAL and RATIO_TEST inside DatasetGenerator._getsplit.
train_generator = DatasetGenerator('train')
val_generator = DatasetGenerator('val')
test_generator = DatasetGenerator('test')


#### Test unitaire du générateur de données

In [None]:
generator = train_generator
# Fetch one random batch of images and captions. The index is used as drawn:
# subtracting 1 would request batch -1 (an empty slice) whenever 0 is drawn and
# would never reach the last batch.
r_index = np.random.randint(len(generator))
x, labels = generator[r_index]
images, captions = x
print(f"Images shape: {images.shape}")
print(f"Captions shape: {captions.shape}")
print(f"Labels shape: {labels.shape}")
# Pick one sample of the batch to display with its caption.
r_index = np.random.randint(0, images.shape[0])
selected_image = images[r_index]
# Captions come back as float32 indices; int32 cannot overflow on the vocabulary size.
selected_caption = np.array(captions[r_index], dtype='int32')
selected_label = vec.index_to_key[labels[r_index]]

# Rescale the preprocessed pixel values to the 0..255 range.
selected_image = (selected_image - np.min(selected_image)) / (np.max(selected_image) - np.min(selected_image)) * 255
selected_image = selected_image.astype('uint8')
# Reverse the channel axis for display.
selected_image = selected_image[..., ::-1]

# Turn the caption indices back into words.
selected_caption_words = [vec.index_to_key[index] for index in selected_caption]
selected_caption_str = ' '.join(selected_caption_words)
# Show the picture with its input prefix and expected next word.
plt.figure(figsize=(8, 8))
plt.imshow(selected_image)
plt.title(f"(Input: {selected_caption_str}) (Label: {selected_label})")
plt.axis('off')  # axes are not useful for an image
plt.show()


## Modèle

### Tests

#### Test de la layer d'embedding

In [None]:
def test_embedding(word):
    """Check that a frozen Keras Embedding returns the same vector as `vec`.

    Prints the vocabulary index of `word`, the first values of both embeddings
    and whether they match. Unknown words are reported and skipped.
    """
    known = word in vec.key_to_index
    if not known:
        print(f"Le mot '{word}' n'est pas dans le vocabulaire.")
        return

    word_index = vec.key_to_index[word]
    embedding = vec.get_vector(word)
    print(f"Index du mot '{word}' dans le vocabulaire : {word_index}")
    print(f"Embedding du mot '{word} (wordvec)' : {embedding[0:5]}..")

    # Single-token model: one index in, one embedding row out.
    model = models.Sequential([
        layers.Input(shape=(1,)),
        layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False),
    ])
    model.summary()

    embedded_word = model.predict(np.array([[word_index]]))
    computed = embedded_word[0][0]
    print(f"Embedding du mot '{word}' (calculé) : {computed[0:5]}..")

    print("Les deux embeddings sont-ils égaux ? :", np.allclose(embedding, computed))

test_embedding(input('Quel mot souhaitez-vous tester ? :'))

#### Test de la layer de positional encoding

In [None]:

def test_pe(length, scale):
    """Smoke-test PositionalEncodingLayer and plot the encoding it produces.

    First runs the layer behind an Embedding on a random training batch, then
    feeds an all-zero input so that the output is the encoding itself, and plots
    the even and odd columns of the first sample.

    Args:
        length: Sequence length of the all-zero input.
        scale: `scale` argument given to PositionalEncodingLayer.
    """
    model = models.Sequential()
    model.add(layers.Input(shape=(None,)))
    model.add(layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False))
    model.add(PositionalEncodingLayer(max_seq_length=MAX_LEN_SEQUENCE, embed_size=TEXT_VECTOR_SIZE, scale=scale))
    # Run the encoding on a real batch of caption indices.
    r_index = np.random.randint(len(train_generator))
    x, y = train_generator.__getitem__(r_index)
    x0, x1 = x
    try:
        model.predict(x1, verbose=0)
        print("PE fonctionne correctement sur un batch de données.")
    except Exception as e:
        print(f"Erreur lors de l'application de PE sur un batch de données : {e}")

    # Run the encoding alone on an all-zero input.
    model = models.Sequential()
    model.add(layers.Input(shape=(None, TEXT_VECTOR_SIZE)))
    model.add(PositionalEncodingLayer(max_seq_length=MAX_LEN_SEQUENCE, embed_size=TEXT_VECTOR_SIZE, scale=scale))
    pe = None  # stays None when the prediction below fails
    try:
        zero_mask = np.zeros((BATCH_SIZE, length, TEXT_VECTOR_SIZE), dtype='float32')
        pe = model.predict(zero_mask, verbose=0)
        print("PE fonctionne correctement avec un masque de zéros.")
    except Exception as e:
        print(f"Erreur lors de l'application de PE avec un masque de zéros : {e}")

    if pe is not None:
        # Plot the even and odd columns of the first sample side by side.
        pe0 = pe[0]
        fig, ax = plt.subplots(1, 2, figsize=(15, 5))
        ax[0].plot(pe0[:, 0::2])
        ax[0].set_title('Dimensions paires')
        ax[1].plot(pe0[:, 1::2])
        ax[1].set_title('Dimensions impaires')
        plt.show()

test_pe(length=60, scale=100)



#### Test de la layer de padding / troncature

In [None]:

def test_padding():
    """Smoke-test PaddingTruncatingLayer on one caption of a random training batch.

    The caption goes through Embedding + PaddingTruncatingLayer; every output
    row is mapped back to its closest word so the padded result can be compared
    with the original caption. The matching image is displayed at the end.
    """
    model = models.Sequential()
    model.add(layers.Input(shape=(None,)))
    model.add(layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False))
    model.add(PaddingTruncatingLayer(maxlen=MAX_LEN_SEQUENCE))
    model.summary()

    # Draw a random batch, then a random sample inside it.
    batch_index = np.random.randint(len(train_generator))
    (x0, x1), _ = train_generator.__getitem__(batch_index)
    sample_index = np.random.randint(x1.shape[0])
    selected_caption = x1[sample_index]
    selected_image = x0[sample_index]

    # Rescale the preprocessed pixels to 0..255 and reverse the channel axis for display.
    lowest, highest = np.min(selected_image), np.max(selected_image)
    selected_image = (selected_image - lowest) / (highest - lowest) * 255
    selected_image = np.array(selected_image).astype('uint8')[..., ::-1]

    # The model expects a batch axis, so wrap the single caption.
    selected_caption = np.expand_dims(selected_caption, axis=0)
    res = model.predict(selected_caption, verbose=0)[0]

    true_caption = [vec.index_to_key[int(index)] for index in selected_caption[0]]
    true_caption_str = ' '.join(true_caption)
    print(f'True caption : {true_caption_str}')

    # Recover one word per output row from its embedding.
    selected_caption_words = [find_closest_word(row) for row in res]
    selected_caption_words_str = ' '.join(selected_caption_words)
    print(f'Model caption : {selected_caption_words_str}')
    print(f"Lenght of true caption : {len(true_caption)}")
    print(f"Lenght of Model caption : {len(selected_caption_words)}")

    # Show the image that goes with the caption.
    plt.figure(figsize=(25, 5))
    plt.imshow(selected_image)
    plt.axis('off')
    plt.show()

test_padding()

#### Test de la layer d'attention

In [None]:
def test_attention(shape1, shape2):
    """Wire `layers.Attention` between two inputs and print the model summary.

    Args:
        shape1: Shape (without batch axis) of the first input.
        shape2: Shape (without batch axis) of the second input.
    """
    first_in = layers.Input(shape=shape1)
    second_in = layers.Input(shape=shape2)
    attended = layers.Attention()([first_in, second_in])
    Model(inputs=[first_in, second_in], outputs=attended).summary()

test_attention((64, 256), (63, 196))

#### Test de la layer Multi-Head Attention

In [None]:
def test_multi_head_attention(shape1, shape2, key_dim=128, num_heads=1):
    """Wire `layers.MultiHeadAttention` between two inputs and print the model summary.

    Args:
        shape1: Shape (without batch axis) of the first (query) input.
        shape2: Shape (without batch axis) of the second (value) input.
        key_dim: `key_dim` of the attention layer.
        num_heads: `num_heads` of the attention layer.
    """
    query_in = layers.Input(shape=shape1)
    value_in = layers.Input(shape=shape2)
    mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
    attended = mha(query_in, value_in)
    Model(inputs=[query_in, value_in], outputs=attended).summary()


test_multi_head_attention((60,256),(49,256),1,1)

### Création du modèle

In [None]:

def caption_modelv1():
    """Build and compile captioning model v1 (predicts the next word vector).

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None, TEXT_VECTOR_SIZE), all-zero steps are masked
    outputs :
        output : (batch_size, TEXT_VECTOR_SIZE)
    results :
        cosine similarity loss : -0.48
    """
    # Image branch: frozen ResNet50 features projected to 256 values.
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False
    flat_features = layers.Flatten()(backbone.output)
    image_data = layers.Dense(256)(flat_features)

    # Text branch: the masking layer is essential so padded steps are ignored by the LSTM.
    text_input = layers.Input(shape=(None, TEXT_VECTOR_SIZE))
    masked_text = layers.Masking(mask_value=0.0)(text_input)
    text_data = layers.LSTM(512)(masked_text)

    # Fusion head.
    context = layers.Concatenate()([image_data, text_data])
    context = layers.BatchNormalization()(context)
    context = layers.Dense(2024)(context)
    context = layers.BatchNormalization()(context)
    output = layers.Dense(TEXT_VECTOR_SIZE, activation='relu')(context)

    model = Model(inputs=(backbone.input, text_input), outputs=[output], name='caption_modelv1')
    model.compile(loss=losses.CosineSimilarity(), optimizer='adam')
    return model

def caption_modelv2():
    """Build and compile captioning model v2 (softmax over the vocabulary).

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) vocabulary indices
    outputs :
        output : (batch_size, VOCAB_SIZE)
    results :
        sparse_categorical_crossentropy : 2.6
        sparse_categorical_crossentropy : 2.9   # heavy version
        sparse_categorical_crossentropy : 2.55 # V2
    """
    # Image branch: frozen ResNet50 (unfreeze for fine-tuning if memory allows).
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False
    image_branch = layers.Flatten()(backbone.output)
    image_branch = layers.Dense(512)(image_branch)
    image_branch = layers.BatchNormalization()(image_branch)
    image_data = layers.Activation('leaky_relu')(image_branch)

    # Text branch: frozen word2vec embedding, masking of all-zero (padding)
    # vectors, then a single LSTM that returns only its last output.
    text_input = layers.Input(shape=(None,))
    text_branch = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    text_branch = layers.Masking(mask_value=0.0)(text_branch)  # essential for padded batches
    text_branch = layers.LSTM(1024, return_sequences=False)(text_branch)
    text_branch = layers.BatchNormalization()(text_branch)
    text_data = layers.Activation('leaky_relu')(text_branch)

    # Fusion head.
    context = layers.Concatenate()([image_data, text_data])
    context = layers.Dropout(0.1)(context)
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(context)

    model = Model(inputs=(backbone.input, text_input), outputs=[output], name='caption_modelv2')
    model.compile(loss=losses.sparse_categorical_crossentropy, optimizer='adam')
    return model

def caption_modelv3():
    """Build and compile captioning model v3 (image features appended to every text step).

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) vocabulary indices
    outputs :
        output : (batch_size, VOCAB_SIZE)
    results :
        sparse_categorical_crossentropy : 5.2137
    """
    # Image branch: frozen ResNet50 (unfreeze for fine-tuning if memory allows).
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False
    flat_features = layers.Flatten()(backbone.output)
    image_data = layers.Dense(256)(flat_features)
    # NOTE(review): raw tf.* ops are applied to symbolic Keras tensors below;
    # confirm that the Keras version in use accepts this.
    image_data_expanded = tf.expand_dims(image_data, 1)  # add a sequence axis

    # Text branch: repeat the image vector along the sequence axis and append
    # it to the embedding of every token.
    text_input = layers.Input(shape=(None,))
    n_steps = tf.shape(text_input)[1]
    image_data_tiled = tf.tile(image_data_expanded, [1, n_steps, 1])
    text_features = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    text_features = layers.Masking(mask_value=0.0)(text_features)  # essential for padded batches
    combined_features = layers.Concatenate(axis=-1)([text_features, image_data_tiled])

    # Sequence encoder followed by the classification head.
    context = layers.Bidirectional(layers.LSTM(1024))(combined_features)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(context)

    model = Model(inputs=(backbone.input, text_input), outputs=[output], name='caption_modelv3')
    model.compile(loss=losses.sparse_categorical_crossentropy, optimizer='adam')
    return model

def caption_modelv4():
    '''
    Build and compile captioning model v4 (causal self-attention + stacked BiLSTMs).

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) vocabulary indices
    outputs :
        output : (batch_size, VOCAB_SIZE)
    results (recorded while the causal flag was set on the output tensor and
    therefore had no effect; re-measure with the mask actually applied) :
        sparse_categorical_crossentropy : 2.98     # self-attention with causal_mask=False, no PE
        sparse_categorical_crossentropy : 2.88     # self-attention with causal_mask=True, no PE
        sparse_categorical_crossentropy : 3.17     # self-attention with causal_mask=True, with PE
        sparse_categorical_crossentropy : 3.47     # self-attention with causal_mask=True, no PE, heavy model (head=64 LSTM 1024-512-256 + 2x Dense 256)
    '''
    # Image processing: frozen ResNet50 (set trainable to True for fine-tuning if memory allows).
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False
    x = base_model.output
    x = layers.Flatten()(x)
    image_data = layers.Dense(256)(x)  # dense projection of the image features

    # Text processing: frozen word2vec embedding, all-zero (padding) vectors masked.
    text_input = layers.Input(shape=(None,))
    text_features = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    #text_features = PositionalEncodingLayer(max_seq_length=MAX_LEN_SEQUENCE, embed_size=TEXT_VECTOR_SIZE, scale=100)(text_features)
    text_features = layers.Masking(mask_value=0.0)(text_features)  # essential for padded batches

    # Self-attention. The causal mask is an argument of the layer call: setting
    # an attribute on the returned tensor does not mask anything.
    self_attention_text = layers.MultiHeadAttention(num_heads=64, key_dim=TEXT_VECTOR_SIZE)(
        text_features, text_features, use_causal_mask=True
    )
    self_attention_text = layers.Add()([text_features, self_attention_text])  # residual connection
    self_attention_text = layers.LayerNormalization()(self_attention_text)
    self_attention_text = layers.Activation('leaky_relu')(self_attention_text)

    # Stacked bidirectional LSTMs; only the last one collapses the sequence axis.
    text_data = layers.Bidirectional(layers.LSTM(1024, return_sequences=True))(self_attention_text)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)
    text_data = layers.Bidirectional(layers.LSTM(512, return_sequences=True))(text_data)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)
    text_data = layers.Bidirectional(layers.LSTM(256))(text_data)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)

    # Fusion head.
    context = layers.Concatenate()([image_data, text_data])
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(context)

    model = Model(inputs=(base_model.input, text_input), outputs=[output], name='caption_modelv4')
    loss = losses.sparse_categorical_crossentropy
    model.compile(loss=loss, optimizer='adam')
    return model

def caption_modelv5():
    '''
    Build and compile captioning model v5 (causal self-attention, then
    cross-attention on the image, then stacked BiLSTMs).

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) vocabulary indices
    outputs :
        output : (batch_size, VOCAB_SIZE)
    results (recorded while the causal mask was not applied and the
    cross-attention output was not fed to the LSTMs; re-measure) :
        sparse_categorical_crossentropy : 4.16
    '''
    # Image processing: frozen ResNet50 (set trainable to True for fine-tuning if memory allows).
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False
    x = base_model.output
    x = layers.Flatten()(x)
    image_data = layers.Dense(256)(x)  # dense projection of the image features

    # Text processing: frozen word2vec embedding, all-zero (padding) vectors masked.
    text_input = layers.Input(shape=(None,))
    text_features = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    #text_features = PositionalEncodingLayer(max_seq_length=MAX_LEN_SEQUENCE, embed_size=TEXT_VECTOR_SIZE, scale=100)(text_features)
    text_features = layers.Masking(mask_value=0.0)(text_features)  # essential for padded batches

    # Self-attention. The causal mask is an argument of the layer call: setting
    # an attribute on the returned tensor does not mask anything.
    self_attention_text = layers.MultiHeadAttention(num_heads=64, key_dim=TEXT_VECTOR_SIZE)(
        text_features, text_features, use_causal_mask=True
    )
    self_attention_text = layers.Add()([text_features, self_attention_text])  # residual connection
    self_attention_text = layers.LayerNormalization()(self_attention_text)
    self_attention_text = layers.Activation('leaky_relu')(self_attention_text)

    # Cross-attention: every text position queries the image vector, exposed as
    # a key/value sequence of length 1. Repeating that vector along the text
    # length gives the same attention output (the softmax averages identical
    # values), so no tiling is needed.
    image_features_for_attention = layers.Reshape((1, 256))(image_data)
    cross_attention_text = layers.MultiHeadAttention(num_heads=64, key_dim=TEXT_VECTOR_SIZE)(self_attention_text, image_features_for_attention)
    cross_attention_text = layers.Add()([self_attention_text, cross_attention_text])  # residual connection
    cross_attention_text = layers.LayerNormalization()(cross_attention_text)
    cross_attention_text = layers.Activation('leaky_relu')(cross_attention_text)

    # Stacked bidirectional LSTMs over the cross-attended sequence; only the
    # last one collapses the sequence axis.
    text_data = layers.Bidirectional(layers.LSTM(1024, return_sequences=True))(cross_attention_text)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)
    text_data = layers.Bidirectional(layers.LSTM(512, return_sequences=True))(text_data)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)
    text_data = layers.Bidirectional(layers.LSTM(256))(text_data)
    text_data = layers.BatchNormalization()(text_data)
    text_data = layers.Activation('leaky_relu')(text_data)

    # Fusion head.
    context = layers.Concatenate()([image_data, text_data])
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(context)

    model = Model(inputs=(base_model.input, text_input), outputs=[output], name='caption_modelv5')
    loss = losses.sparse_categorical_crossentropy
    model.compile(loss=loss, optimizer='adam')
    return model

def caption_modelv6():
    """
    Build and compile the v6 captioning model: a frozen ResNet50 image encoder
    merged with three (self-attention + bidirectional LSTM) text blocks to
    predict the next token of a caption.

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) token indices fed to the Embedding layer
    outputs :
        output : (batch_size, VOCAB_SIZE) softmax distribution over the vocabulary
    results :
        sparse_categorical_crossentropy : 3.6
        (value from the original notes, recorded before the residual connection
        and the causal mask were actually wired in)
    """
    def text_block(x, n_head, d_lstm, d_model=TEXT_VECTOR_SIZE, return_sequences=True):
        """Causal self-attention with a residual connection, then a Bi-LSTM."""
        # The causal mask is a call-time argument of MultiHeadAttention; setting
        # an attribute on the output tensor has no effect on the computation.
        attention = layers.MultiHeadAttention(num_heads=n_head, key_dim=d_model)(
            x, x, use_causal_mask=True)  # Self-attention, future tokens masked
        # Residual connection: add the block input to the attention output.
        x = layers.Add()([x, attention])
        x = layers.LayerNormalization()(x)
        x = layers.Activation('leaky_relu')(x)
        x = layers.Bidirectional(layers.LSTM(d_lstm, return_sequences=return_sequences))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('leaky_relu')(x)
        return x

    # Image processing
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False  # Set to True for fine-tuning or when plenty of memory is available
    x = base_model.output
    x = layers.Flatten()(x)
    image_data = layers.Dense(256)(x)  # New dense layer producing the image features

    # Text processing
    text_input = layers.Input(shape=(None,))
    text_features = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    #text_features = PositionalEncodingLayer(max_seq_length=MAX_LEN_SEQUENCE, embed_size=TEXT_VECTOR_SIZE, scale=100)(text_features)
    text_features = layers.Masking(mask_value=0.0)(text_features)  # Extremely important: masks all-zero (padding) embeddings

    text_data = text_block(text_features, 64, 1024)
    text_data = text_block(text_data, 32, 512)
    # The last block collapses the sequence into a single vector.
    text_data = text_block(text_data, 16, 256, return_sequences=False)

    # Merge image and text features, then classify over the vocabulary.
    context = layers.Concatenate()([image_data, text_data])
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    context = layers.Dense(256)(context)
    context = layers.BatchNormalization()(context)
    context = layers.Activation('leaky_relu')(context)
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(context)

    model = Model(inputs=(base_model.input, text_input), outputs=[output], name='caption_modelv6')
    loss = losses.sparse_categorical_crossentropy
    model.compile(loss=loss, optimizer='adam')
    return model

def caption_modelv7():
    """
    Build and compile the v7 captioning model: ResNet50 feature map encoded by
    two attention layers, a stack of four causal self-attention layers over the
    text, one cross-attention layer (text queries, image keys/values), and a
    softmax over the vocabulary computed from the last sequence position.

    inputs :
        image : (batch_size, 224, 224, 3)
        text : (batch_size, None) token indices fed to the Embedding layer
    outputs :
        output : (batch_size, VOCAB_SIZE) softmax distribution over the vocabulary
    """
    def transformer_decoder_layer(query, key_value, key_dim, num_heads, dff, dropout=0.1,use_causal_mask=False):
        """
        inputs :
            query : (batch_size, query_seq_len, dim)
            key_value : (batch_size, key_value_seq_len, dim)
            key_dim : size of the keys/values in the attention layer; also the
                      output width of the feed-forward network, so it must equal
                      the last dimension of `query` for the residual additions
            num_heads : number of heads in the attention layer
            dff : multiplier applied to key_dim for the hidden feed-forward layer
            dropout : dropout rate
            use_causal_mask : whether the attention layer applies a causal mask
        outputs :
            tensor of shape (batch_size, query_seq_len, key_dim)
        """
        # Multi-Head Attention. The causal mask (used to respect word order in
        # the generated text) is a call-time argument; assigning it as an
        # attribute on the output tensor would not mask anything.
        attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(
            query, key_value, use_causal_mask=use_causal_mask)
        attention = layers.Dropout(dropout)(attention)
        attention = layers.Add()([attention, query])
        attention = layers.LayerNormalization(epsilon=1e-6)(attention)

        # Feed-Forward Network
        ffn_output = layers.Dense(dff*key_dim, activation='leaky_relu')(attention)
        ffn_output = layers.Dense(key_dim)(ffn_output)
        ffn_output = layers.Dropout(dropout)(ffn_output)
        ffn_output = layers.Add()([ffn_output, attention])
        ffn_output = layers.LayerNormalization(epsilon=1e-6)(ffn_output)
        return ffn_output

    # Image processing
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    image_input = base_model.input
    for layer in base_model.layers:
        layer.trainable = False  # Set to True for fine-tuning or when plenty of memory is available
    x = base_model.output
    # Turn the 7x7 spatial grid into a sequence of 49 feature vectors.
    x = layers.Reshape((7*7, 2048))(x)
    IMAGE_VECTOR_SIZE = TEXT_VECTOR_SIZE  # Width of the Dense projection; matches the text width for cross-attention
    image_data = layers.Dense(IMAGE_VECTOR_SIZE)(x)
    image_data = layers.BatchNormalization()(image_data)
    image_data = layers.Activation('leaky_relu')(image_data)
    image_data = transformer_decoder_layer(image_data, image_data, IMAGE_VECTOR_SIZE, num_heads=4, dff=4, use_causal_mask=False)
    image_data = transformer_decoder_layer(image_data, image_data, IMAGE_VECTOR_SIZE, num_heads=4, dff=4, use_causal_mask=False)

    # Text processing
    text_input = layers.Input(shape=(None,))
    text_features = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=TEXT_VECTOR_SIZE, weights=[vec.vectors], trainable=False)(text_input)
    text_features = PaddingTruncatingLayer()(text_features)
    text_features = PositionalEncodingLayer(max_seq_length=60, embed_size=TEXT_VECTOR_SIZE, scale=100)(text_features)

    # Transformer Decoder
    global_dff = 4
    global_num_heads = 4

    # Four causal self-attention layers over the text sequence.
    x = transformer_decoder_layer(text_features, text_features, TEXT_VECTOR_SIZE, num_heads=global_num_heads, dff=global_dff, use_causal_mask=True)
    x = transformer_decoder_layer(x,x, TEXT_VECTOR_SIZE, num_heads=global_num_heads, dff=global_dff, use_causal_mask=True)
    x = transformer_decoder_layer(x,x, TEXT_VECTOR_SIZE, num_heads=global_num_heads, dff=global_dff, use_causal_mask=True)
    x = transformer_decoder_layer(x,x, TEXT_VECTOR_SIZE, num_heads=global_num_heads, dff=global_dff, use_causal_mask=True)

    # Cross-attention: text positions attend to the image feature sequence.
    x = transformer_decoder_layer(x, image_data, TEXT_VECTOR_SIZE, num_heads=global_num_heads, dff=global_dff)

    # Keep only the last sequence position as the decoder summary.
    decoder_output = layers.Lambda(lambda x: x[:, -1, :])(x)

    # Final output layer
    output = layers.Dense(VOCAB_SIZE, activation='softmax')(decoder_output)

    model = Model(inputs=(image_input, text_input), outputs=[output], name='caption_modelv7')
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    return model

# Build the v7 captioning model and print its layer-by-layer summary.
model = caption_modelv7()
model.summary()

### Chargement d'un modèle pré-existant

In [4]:
# Replace the in-memory model with a previously saved one loaded from disk.
model = load_model('Livrable3_caption_modelv2.keras')

### Visualisation du modèle

In [None]:
# Render the model graph (with tensor shapes and layer names) to a PNG file named after the model.
plot_model(model, to_file=f'Livrable3_{model.name}.png', show_shapes=True, show_layer_names=True)

### Entrainement du modèle

In [None]:
# Stop training once val_loss has not improved for PATIENCE epochs and
# roll the model back to the best weights seen.
early_stopping = callbacks.EarlyStopping(monitor='val_loss',
                                         patience=PATIENCE,
                                         restore_best_weights=True)

# One checkpoint file per improving epoch, e.g. checkpoints/<model name>-0007.keras.
# The '{epoch:04d}' placeholder is filled in by ModelCheckpoint, hence no f-string here.
checkpoint_path = f'checkpoints/{model.name}'
checkpoint_path = checkpoint_path + '-{epoch:04d}.keras'
checkpoint_dir = os.path.dirname(checkpoint_path)
os.makedirs(checkpoint_dir, exist_ok=True)
model_checkpoint = callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1
)

# The checkpoint callback must be registered here, otherwise nothing is saved
# during training.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=EPOCHS,
                    callbacks=[early_stopping, model_checkpoint],
                    verbose=1)

# Plot the training and validation loss curves
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### Sauvegarde du modèle

In [13]:
# Save the current model to disk in the .keras format, in a file named after the model.
model.save(f'Livrable3_{model.name}.keras')

#### Fine-tuning

In [None]:
# Fall back to the saved model when no model exists in this session.
# A bare `if not model` raises NameError when `model` was never defined,
# which is precisely the situation this fallback is meant to handle.
try:
    model
except NameError:
    model = None
if model is None:
    model = load_model('Livrable3_caption_modelv2.keras')

# Unfreeze every layer so the whole network is updated during fine-tuning.
for layer in model.layers:
    layer.trainable = True

model.summary()

# Recompile the model: changes to `trainable` only take effect after compile,
# and a small learning rate is used for fine-tuning.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(loss=losses.sparse_categorical_crossentropy, optimizer=adam)  # Make sure the loss matches the model output

# Fine-tuning
early_stopping = callbacks.EarlyStopping(monitor='val_loss',
                                         patience=PATIENCE,
                                         restore_best_weights=True)

history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=EPOCHS,
                    callbacks=[early_stopping],
                    verbose=1)



## Evaluation

### Test unitaire

In [None]:
def caption_model_inferencev1(model, vec, image):
    """
    Generate a caption word by word with a model that predicts word vectors.

    inputs :
        model : keras model called as model.predict([image batch, caption vectors])
        vec : word2vec KeyedVectors (key_to_index, vectors, item lookup)
        image : a single image, wrapped into a batch of one before prediction
                (presumably (224, 224, 3) RGB in [0, 255] - confirm with caller)
    outputs :
        caption : str, the generated words, each followed by a space
    """
    sos_vector = vec.vectors[vec.key_to_index[START_TOKEN]]
    model_image = preprocess_input(image.copy())
    # Running sequence of word vectors, shape (1, sequence_length, vector_size).
    sequence = np.expand_dims([sos_vector], axis=0)
    caption_words = ''

    while True:
        model.reset_states()  # Reset the recurrent (LSTM) states
        prediction = model.predict(
            [np.array([model_image]), np.array(sequence)], verbose=0
        )[0]
        nearest = find_closest_vector(prediction, vec)

        # Stop as soon as the prediction resolves to the end-of-sequence vector.
        if np.array_equal(nearest, vec[END_TOKEN]):
            return caption_words

        caption_words += find_closest_word(nearest, vec) + ' '
        # Append the chosen vector along the sequence axis.
        sequence = np.concatenate(
            [sequence, [np.expand_dims(nearest, axis=0)]], axis=1
        )

        # Stop once the caption reaches the maximum number of words.
        if len(caption_words.split()) >= MAX_LEN_SEQUENCE:
            return caption_words

def _sample_token_index(probabilities, temperature):
    """Sample an index from `probabilities` after temperature re-scaling."""
    # float64 avoids np.random.choice rejecting float32 probabilities whose
    # sum is off from 1 by rounding error.
    probabilities = np.asarray(probabilities, dtype=np.float64)
    if temperature <= 0:
        # Zero temperature is the greedy limit; also avoids a division by zero.
        return int(np.argmax(probabilities))
    # Temperature is applied in log space: the model already outputs softmax
    # probabilities, so softmax(p / T) would not be a temperature scaling.
    scaled = np.log(np.clip(probabilities, 1e-12, None)) / temperature
    scaled -= scaled.max()  # Numerical stability before exponentiation
    weights = np.exp(scaled)
    weights /= weights.sum()
    return int(np.random.choice(len(weights), p=weights))


def caption_model_inferencev2(model,vec, image, temperature=1e-5):
    """
    Generate a caption token by token with a model that outputs a probability
    distribution over the vocabulary.

    inputs :
        model : keras model called as model.predict([image batch, token indices])
        vec : word2vec KeyedVectors (key_to_index / index_to_key)
        image : a single image, wrapped into a batch of one before prediction
                (presumably (224, 224, 3) RGB in [0, 255] - confirm with caller)
        temperature : sampling temperature; values close to 0 (the default)
                      are almost greedy, values <= 0 are exactly greedy
    outputs :
        caption : str, the generated words, each followed by a space
    """
    start_index = vec.key_to_index[START_TOKEN]
    end_index = vec.key_to_index[END_TOKEN]
    image_batch = np.array([preprocess_input(image.copy())])
    # Running sequence of token indices, shape (1, sequence_length).
    caption = np.expand_dims([start_index], axis=0)
    words = []

    # At most MAX_LEN_SEQUENCE words are generated.
    for _ in range(MAX_LEN_SEQUENCE):
        model.reset_states()  # Reset the recurrent (LSTM) states
        probabilities = model.predict([image_batch, np.array(caption)], verbose=0)[0]
        index_result = _sample_token_index(probabilities, temperature)

        if index_result == end_index:
            break

        words.append(vec.index_to_key[index_result])
        caption = np.concatenate([caption, [[index_result]]], axis=1)

    # Same format as before: every word is followed by a single space.
    return ''.join(word + ' ' for word in words)

# Choose the generator to draw the test image from
generator = val_generator
# Test the model on one random image of one random batch
r_index = np.random.randint(len(generator))
x, _ = generator[r_index]
images, _ = x
# randint's upper bound is exclusive: use the batch size itself so that the
# last image can be drawn and a batch of a single image does not raise.
r_index = np.random.randint(images.shape[0])
selected_image = images[r_index]
# Rescale the image values to the [0, 255] range
selected_image = ( selected_image - np.min(selected_image) ) / ( np.max(selected_image) - np.min(selected_image) ) * 255
# Convert to uint8 and reverse the channel order to get RGB for display
# (presumably the generator yields BGR images - confirm against the generator)
selected_image = np.array(selected_image)
selected_image = selected_image.astype('uint8')
selected_image = selected_image[...,::-1]

# Predict the caption
predicted_caption = caption_model_inferencev2(model, vec, selected_image)

# Plot the image with the predicted caption as title
plt.figure(figsize=(8, 8))
plt.imshow(selected_image)
plt.title(predicted_caption)
plt.axis('off')  # Hide the axes for better readability
plt.show()