## Library setup

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk
import re
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, add
from tensorflow.keras.layers import Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from collections import Counter
from PIL import Image

# Pre-Processing

In [None]:
def preprocess_image(image_path):
    """Load an image from disk and turn it into a single-image InceptionV3 input batch.

    The image is resized to 420x420 while loading, converted to an array,
    given a leading batch axis and finally passed through InceptionV3's
    own ``preprocess_input``.
    """
    pixels = img_to_array(load_img(image_path, target_size=(420, 420)))
    batch = np.expand_dims(pixels, axis=0)  # leading batch axis
    return tf.keras.applications.inception_v3.preprocess_input(batch)

def extract_image_features(model, image_path):
    """Return ``model.predict`` (silent mode) for the preprocessed image at ``image_path``."""
    return model.predict(preprocess_image(image_path), verbose=0)

# Loading the pre-trained InceptionV3 model from a local weights file
inception_v3_model = InceptionV3(weights = '//tf_on_gpu/CNN/Pre_Trained/inception_v3.h5', input_shape=(420, 420, 3))
# NOTE: a former `inception_v3_model.layers.pop()` call was removed here. In tf.keras,
# `Model.layers` builds a new list on every access, so popping from it never changed the model.
# The classification head is dropped by re-wiring the model to stop at the second-to-last layer
# (presumably the global-average-pooling layer, whose output size matches cnn_output_dim = 2048
# used further below - confirm with the summary printed by this cell).
inception_v3_model = Model(inputs=inception_v3_model.inputs, outputs=inception_v3_model.layers[-2].output)

inception_v3_model.summary()

In [None]:
def load_captions(file_path):
    """Read ``file_path`` and return its lines, lower-cased.

    Each returned element keeps its trailing newline, exactly as read.
    """
    with open(file_path, 'r') as caption_file:
        # Iterating the file yields the same lines as readlines()
        return [line.lower() for line in caption_file]

def tokenize_captions(captions):
    """Create a Keras ``Tokenizer``, fit it on ``captions`` and return it."""
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(captions)
    return fitted_tokenizer

# Reading the raw (lower-cased) caption lines of the dataset
captions_path = '//tf_on_gpu/LSTMs/Image_Caption/data_ImageCaption/Flickr8k.token.txt'
captions = load_captions(file_path=captions_path)
# Previewing every third line among the first fifteen
captions[0:15:3]

### Removing some characters
#### On this dataset, removing stop words damaged the final results. Additionally, removing rare words didn’t actually help model accuracy.

In [None]:
def clean_text(text):
    """Return ``text`` without punctuation or digits and with whitespace normalised.

    The substitutions run in order: drop every character that is neither a
    word character nor whitespace, drop digit runs, collapse whitespace runs
    to one space; the result is then stripped at both ends.
    """
    substitutions = (
        (r'[^\w\s]', ''),   # punctuation marks
        (r'\d+', ''),       # numbers
        (r'\s+', ' '),      # extra whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Cleaning the caption text, i.e. the part after the tab on every raw line
cleaned_captions = list(map(lambda line: clean_text(line.split('\t')[1]), captions))
cleaned_captions[0:15:2]

In [None]:
# Flattening all cleaned captions into a single list of word tokens
all_words = [token for text in cleaned_captions for token in nltk.word_tokenize(text)]

# Frequency of every word
word_counts = Counter(all_words)

# (word, count) pairs ordered from most to least frequent
sorted_words = word_counts.most_common()

# A sample of the most common words
sorted_words[0:10:2]

In [None]:
# Word-frequency table, most frequent word first, with a fresh 0..n-1 index
unique_words = list(word_counts.keys())
count = list(word_counts.values())
df = pd.DataFrame({'words':unique_words, 'count':count})
df = df.sort_values(by='count', ascending=False).reset_index(drop=True)
df

## Final captions

In [None]:
# Building "<image id>\tstart <cleaned caption> end\n" lines
captions_IDs = []
for raw_line, cleaned in zip(captions, cleaned_captions):
    # The ID is the part before the tab, minus its last two characters
    image_name = raw_line.split('\t')[0][:-2]
    # 'start' and 'end' mark the beginning and the end of every caption
    captions_IDs.append(f'{image_name}\tstart {cleaned} end\n')

captions_IDs[:20:3], len(captions_IDs)

### Visualizing some of the images with their first and last captions.

In [None]:
images_directory = '//tf_on_gpu/LSTMs/Image_Caption/data_ImageCaption/Images/' # The path where the images are stored

# Grouping the first 24*5 caption lines by image ID
captions_dictionary = {}
for item in captions_IDs[:(24)*5]:
    image_id, caption = item.split('\t')
    captions_dictionary.setdefault(image_id, []).append(caption)
list_captions = list(captions_dictionary.items())

plt.figure(figsize=[14, 22], dpi=200)
for i in range(24):
    image_name, image_captions = list_captions[i]
    image_path = os.path.join(images_directory, image_name) # image_directory + ID = exact path
    # Slicing off the first 6 and last 4 characters removes 'start ' and 'end\n'
    captions_list = [text[6:-4].split() for text in image_captions]

    plt.subplot(6, 4, i+1)
    # Every image is resized to the same size to keep the grid tidy
    img = Image.open(image_path).resize((140, 140))
    plt.imshow(img)
    plt.axis('off')

    # The first and the last caption of the image, each quoted and broken into two lines
    quoted = []
    for words in (captions_list[0], captions_list[-1]):
        split = len(words)//2
        quoted.append(f"\"{' '.join(words[:split])}\n{' '.join(words[split:])}\"")
    title = f"{quoted[0]}\n{quoted[1]}"

    plt.title(title, fontsize=8, fontweight='bold')

plt.show()

### Tokenizing captions and defining vocab size

In [None]:
# Tokenizing captions and creating word-to-index mapping.
# The tokenizer is fitted on the captions wrapped in 'start'/'end', the same form in which they
# are later converted to sequences for training and generation. Fitting on the bare captions
# would leave the two markers in the vocabulary only if they happened to occur in some caption;
# a Tokenizer without an OOV token silently drops unknown words.
tokenizer = tokenize_captions([f'start {caption} end' for caption in cleaned_captions])
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
vocab_size

### Defining train, validation and test captions

In [None]:
# Storing all image IDs.
# os.listdir returns entries in arbitrary order, so the list is sorted first; otherwise the
# fixed random_state below would not give the same split on every machine/run.
all_image_ids = sorted(os.listdir(images_directory))

# Separating image IDs
train_image_ids, val_image_ids = train_test_split(all_image_ids, test_size=0.12, random_state=42)
val_image_ids, test_image_ids = train_test_split(val_image_ids, test_size=0.04, random_state=42)
test_image_ids.extend(val_image_ids[100:200:10]) # The test set and the validation set have ten samples in common

# Sets make each membership test O(1) instead of a scan over a list of thousands of IDs
train_id_set = set(train_image_ids)
val_id_set = set(val_image_ids)
test_id_set = set(test_image_ids)

# Filtering train, validation and test captions in a single pass.
# The checks are independent (not elif) because validation and test share some IDs.
train_captions, val_captions, test_captions = [], [], []
for caption in captions_IDs:
    image_id, _ = caption.split('\t')
    if image_id in train_id_set:
        train_captions.append(caption)
    if image_id in val_id_set:
        val_captions.append(caption)
    if image_id in test_id_set:
        test_captions.append(caption)

train_captions[0], val_captions[0], test_captions[0], len(train_captions)/5, len(val_captions)/5, len(test_captions)/5

In [None]:
labels = ['Train', 'Validation', 'Test']
# IDs that appear in both the validation set and the test set
common_items = set(val_image_ids) & set(test_image_ids)
num_data = [len(ids) for ids in (train_image_ids, val_image_ids, test_image_ids)]
# Shared IDs are counted once in the total shown in the title
distinct_total = sum(num_data) - len(common_items)

plt.figure(figsize=[4, 4], dpi=100)
sns.set_theme()

plt.pie(
    num_data,
    labels=labels,
    explode=(0.02, 0.03, 0.03),
    autopct='%1.0f%%',
    colors=sns.color_palette('Set2'),
)

plt.title(f'All Data: {distinct_total}\n({len(all_image_ids)} Unique samples)', fontweight='bold')
plt.axis('off')

plt.show()

## Extracting Image features using InceptionV3 model

In [None]:
# Maps each training image ID to its flattened feature vector
train_image_features = {}
print('Train images features\n==> Extracted: ', end=' ')
# counter is 1-based and only drives the progress output (extraction can take long)
for counter, caption in enumerate(train_image_ids, start=1):
    image_id = caption.split('\t')[0]  # takes the part before any tab in the entry
    image_path = os.path.join(images_directory, image_id)
    # Extracting the features and flattening them
    train_image_features[image_id] = extract_image_features(inception_v3_model, image_path).flatten()
    if counter%300 == 0:
        print(counter, end=', ')

# Saving the image features (avoid re-extracting at each run)
np.save('train_features_InceptionV3.npy', train_image_features)
print(f'{len(train_image_features)}, Done.')

In [None]:
# Restoring the training features saved by the extraction cell.
# NOTE: allow_pickle=True unpickles the file's contents - only load files produced by this notebook.
train_image_features = np.load('train_features_InceptionV3.npy', allow_pickle=True)
train_image_features = train_image_features.item()  # unwrap the stored dictionary
print("Features successfully loaded.")

In [None]:
# Maps each validation image ID to its flattened feature vector
val_image_features = {}
print('Validation images features\n==> Extracted: ', end=' ')
# counter is 1-based and only drives the progress output (extraction can take long)
for counter, image_id in enumerate(val_image_ids, start=1):
    image_path = os.path.join(images_directory, image_id)
    # Extracting the features and flattening them
    val_image_features[image_id] = extract_image_features(inception_v3_model, image_path).flatten()
    if counter%250 == 0:
        print(counter, end=', ')

# Saving the image features (avoid re-extracting at each run)
np.save('val_features_InceptionV3.npy', val_image_features)
print(f'{len(val_image_features)}, Done.')

In [None]:
# Restoring the validation features saved by the extraction cell.
# NOTE: allow_pickle=True unpickles the file's contents - only load files produced by this notebook.
val_image_features = np.load('val_features_InceptionV3.npy', allow_pickle=True)
val_image_features = val_image_features.item()  # unwrap the stored dictionary
print("Features successfully loaded.")

In [None]:
# Maps each test image ID to its features, stored exactly as returned by the extractor
# (no flattening here, unlike the train/validation cells)
test_image_features = {}
print('Test images features\n==> Extracted: ', end=' ')
# counter is 1-based and only drives the progress output (extraction can take long)
for counter, image_id in enumerate(test_image_ids, start=1):
    image_path = os.path.join(images_directory, image_id)
    test_image_features[image_id] = extract_image_features(inception_v3_model, image_path)
    if counter%40 == 0:
        print(counter, end=', ')

# Saving the image features (avoid re-extracting at each run)
np.save('test_features_InceptionV3.npy', test_image_features)
print(f'{len(test_image_features)}, Done.')

In [None]:
# Restoring the test features saved by the extraction cell.
# NOTE: allow_pickle=True unpickles the file's contents - only load files produced by this notebook.
test_image_features = np.load('test_features_InceptionV3.npy', allow_pickle=True)
test_image_features = test_image_features.item()  # unwrap the stored dictionary
print("Features successfully loaded.")

In [None]:
def data_generator(captions, image_features, tokenizer, max_caption_length, batch_size):
    """Endlessly yield ``([images, caption_prefixes], next_words)`` training batches.

    Args:
        captions: list of ``"<image id>\\t<caption text>\\n"`` strings.
        image_features: mapping from image ID to that image's feature vector.
        tokenizer: fitted tokenizer used to turn caption text into word indices.
        max_caption_length: length every caption prefix is padded to.
        batch_size: number of *captions* per batch. Each caption of ``n`` tokens
            expands into ``n - 1`` (prefix -> next word) samples, so the number
            of rows in a yielded batch is usually much larger than ``batch_size``.

    Yields:
        ``[X_images, X_captions], y`` as NumPy arrays, where ``y`` holds the
        one-hot encoded next word for each prefix.

    Raises:
        ValueError: if ``captions`` is empty (the generator could never yield).
    """
    num_samples = len(captions)
    if num_samples == 0:
        raise ValueError('data_generator needs at least one caption')
    # Derived from the tokenizer instead of a notebook-level global, so the
    # generator does not depend on variables outside its arguments
    num_classes = len(tokenizer.word_index) + 1
    while True:
        # Visit the captions in a new random order on every pass. The previous code
        # shuffled a list of image IDs that was never used, so batches were not shuffled.
        order = np.random.permutation(num_samples)
        for start_idx in range(0, num_samples, batch_size):
            X_images, X_captions, y = [], [], []
            for caption_idx in order[start_idx:start_idx + batch_size]:
                image_id, caption_text = captions[caption_idx].split('\t')
                caption_text = caption_text.rstrip('\n')
                seq = tokenizer.texts_to_sequences([caption_text])[0] # Tokenizing the caption
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i] # X_caption, Y
                    in_seq = pad_sequences([in_seq], maxlen=max_caption_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=num_classes)[0]
                    X_images.append(image_features[image_id])
                    X_captions.append(in_seq)
                    y.append(out_seq)

            yield [np.array(X_images), np.array(X_captions)], np.array(y)
         
        
# Longest cleaned caption (in words) plus one
max_caption_length = 1 + max(len(caption.split()) for caption in cleaned_captions)
# Size of the feature vector coming out of InceptionV3
cnn_output_dim = 2048
# Number of captions per batch
batch_size_train = 270
batch_size_val = 150

# Generators feeding the training and the validation loop
train_data_generator = data_generator(
    captions=train_captions,
    image_features=train_image_features,
    tokenizer=tokenizer,
    max_caption_length=max_caption_length,
    batch_size=batch_size_train,
)
val_data_generator = data_generator(
    captions=val_captions,
    image_features=val_image_features,
    tokenizer=tokenizer,
    max_caption_length=max_caption_length,
    batch_size=batch_size_val,
)

In [None]:
# Pulling one batch from each generator and printing the array shapes
shape_checks = (
    ("Training sample batch shapes:", train_data_generator),
    ("Validation sample batch shapes:", val_data_generator),
)
for position, (header, generator) in enumerate(shape_checks):
    if position:
        print('=========================')
    sample_batch = next(generator)
    print(header)
    print("X_images:", sample_batch[0][0].shape)
    print("X_captions:", sample_batch[0][1].shape)
    print("y:", sample_batch[1].shape)

# Model number 1

In [None]:
def build_model(vocab_size, max_caption_length, cnn_output_dim):
    """Build the captioning network that merges an image branch with a caption branch.

    Args:
        vocab_size: size of the embedding input and of the softmax output.
        max_caption_length: length of the (padded) caption-prefix input.
        cnn_output_dim: length of the image feature vector input.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image_features, caption_prefix]``
        and producing a softmax over the vocabulary.
    """
    # Image branch: normalise the CNN features and project them to 256 units
    # so they can be added to the LSTM output
    image_input = Input(shape=(cnn_output_dim,))
    image_branch = BatchNormalization()(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)
    image_branch = BatchNormalization()(image_branch)

    # Caption branch: embed the word indices (index 0 is masked) and summarise them with an LSTM
    caption_input = Input(shape=(max_caption_length,))
    caption_branch = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_branch = LSTM(256)(caption_branch)

    # Decoder: element-wise sum of both branches, one hidden layer, softmax over the words
    merged = add([image_branch, caption_branch])
    hidden = Dense(256, activation='relu')(merged)
    word_probabilities = Dense(vocab_size, activation='softmax')(hidden)

    return Model(inputs=[image_input, caption_input], outputs=word_probabilities)
    
# Instantiating model number 1
model_1 = build_model(
    vocab_size=vocab_size,
    max_caption_length=max_caption_length,
    cnn_output_dim=cnn_output_dim,
)

# Adam with gradient-norm clipping; the targets are one-hot, hence categorical cross-entropy
optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
model_1.compile(optimizer=optimizer, loss='categorical_crossentropy')

model_1.summary()

### Training the model

In [None]:
# Early stopping on the validation loss (to prevent overfitting); the best weights are restored
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

def lr_scheduler(epoch, lr):
    """Return the learning rate for the next epoch: the current one times ``exp(-0.7)``.

    Args:
        epoch: index of the epoch about to start (unused; the decay is the same every epoch).
        lr: current learning rate.

    Returns:
        The decayed learning rate as a plain Python ``float``. The scheduler
        callback expects a float; the previous implementation returned a
        ``tf.Tensor``, which newer Keras versions reject.
    """
    return float(lr * np.exp(-0.7))

# Callback that asks lr_scheduler for a new learning rate at every epoch
lr_schedule = LearningRateScheduler(schedule=lr_scheduler)

In [None]:
# Whole batches per pass over the training / validation captions
train_steps = len(train_captions) // batch_size_train
val_steps = len(val_captions) // batch_size_val

history_1 = model_1.fit(
    train_data_generator,
    steps_per_epoch=train_steps,
    validation_data=val_data_generator,
    validation_steps=val_steps,
    epochs=10,
    callbacks=[early_stopping, lr_schedule],
)

model_1.save('best_model_1.h5')

### Visualizing the model performance

In [None]:
# Training and validation loss of model 1 per epoch (epochs numbered from 1)
train_loss_1 = history_1.history['loss']
val_loss_1 = history_1.history['val_loss']
epoch_numbers_1 = list(range(1, len(train_loss_1) + 1))

plt.figure(figsize=(15, 7), dpi=200)
sns.set_style('whitegrid')
plt.plot(epoch_numbers_1, train_loss_1, color='#E74C3C', marker='o')
plt.plot(epoch_numbers_1, val_loss_1, color='#641E16', marker='h')
plt.title('Train VS Validation', fontsize=15, fontweight='bold')
plt.xticks(fontweight='bold')
plt.yticks(fontweight='bold')
plt.xlabel('Epoch', fontweight='bold')
plt.ylabel('Loss', fontweight='bold')
plt.legend(['Train Loss', 'Validation Loss'], loc='best')
# plt.savefig('frequency_barplot 1.png')
plt.show()

# Generating captions on test set

In [None]:
def generate_caption(model, image_features, tokenizer, max_caption_length): # A function to generate captions
    """Greedily generate a caption for one image, one word at a time.

    Args:
        model: trained captioning model taking ``[image_features, padded_sequence]``.
        image_features: features of the image, passed to ``model.predict`` as they are.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.
        max_caption_length: padding length of the input sequence and the maximum
            number of words that will be generated.

    Returns:
        The generated text as a single-space separated string that begins with
        ``'start'`` and, when the model predicted it, finishes with ``'end'``.
    """
    # Each caption is started with the word 'start'
    in_text = 'start'
    for _ in range(max_caption_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_caption_length)
        prediction = model.predict([image_features, sequence], verbose=0)
        idx = int(np.argmax(prediction))
        # Index 0 is the padding index and has no word; stop instead of raising KeyError
        word = tokenizer.index_word.get(idx)
        if word is None:
            break
        # Adding the predicted word to the sequence
        in_text += ' ' + word
        # 'end' is the closing marker of every caption, so generation stops once it is predicted
        if word == 'end':
            break
    return in_text

In [None]:
# Selecting the model
selected_model = load_model('best_model_1.h5')
# A Dictionary to store each image's generated caption with its corresponding IDs
Generated_output = {}
print('Generating captions:\n==> Generated: ', end=' ')
# counter is 1-based, so the progress output shows how many captions are actually done
# (the previous version incremented before checking and reported one too many)
for counter, image_id in enumerate(test_image_features, start=1):
    # Adding the generated caption with its corresponding ID to the dictionary
    Generated_output[image_id] = generate_caption(selected_model, test_image_features[image_id], tokenizer, max_caption_length)
    if counter%20 == 0:
        print(counter, end=', ')
print(f'{len(Generated_output)}, Done.')

# Checking the result
list(Generated_output.items())[:5]

## Evaluating model on test images

In [None]:
# Most common text evaluation methods
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge

In [None]:
# Grouping the reference captions of the test set by image ID
test_captions_dictionary = {}
for item in test_captions:
    image_id, caption = item.split('\t')
    test_captions_dictionary.setdefault(image_id, []).append(caption)

In [None]:
# BLEU score
def calculate_bleu(reference, candidate):
    """Return the ``corpus_bleu`` score of ``candidate`` against the single ``reference``.

    Both strings are split on whitespace into tokens.
    """
    references_per_hypothesis = [[reference.split()]]
    hypotheses = [candidate.split()]
    return corpus_bleu(references_per_hypothesis, hypotheses)

# ROUGE score
def calculate_rouge(reference, candidate):
    """Return the ROUGE-L ``'f'`` value of ``candidate`` measured against ``reference``."""
    first_pair_scores = Rouge().get_scores(candidate, reference)[0]
    return first_pair_scores['rouge-l']['f']

# METEOR score
def calculate_meteor(reference, candidate):
    """Return the METEOR score of ``candidate`` against the single ``reference`` (whitespace tokens)."""
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return meteor_score([reference_tokens], candidate_tokens)

# Getting the average score of each image
def Average(lst):
    """Return the arithmetic mean of ``lst``; an empty ``lst`` raises ``ZeroDivisionError``."""
    total = sum(lst)
    how_many = len(lst)
    return total / how_many

In [None]:
# Per-image average BLEU / ROUGE / METEOR scores, keyed by image ID
bleu_scores, rouge_scores, meteor_scores = {}, {}, {}
for image_id in Generated_output:
    if image_id not in test_captions_dictionary:
        print('Wrong image id!')
        continue

    candidate = Generated_output[image_id]
    # One score per reference caption of this image
    bleu, rouge, meteor = [], [], []
    for reference in test_captions_dictionary[image_id]:
        bleu.append(calculate_bleu(reference, candidate))
        rouge.append(calculate_rouge(reference, candidate))
        meteor.append(calculate_meteor(reference, candidate))

    # The image's score is the mean over its reference captions
    bleu_scores[image_id] = Average(bleu)
    rouge_scores[image_id] = Average(rouge)
    meteor_scores[image_id] = Average(meteor)

# Overall model performance: the mean of the per-image averages
average_bleu = sum(bleu_scores.values()) / len(bleu_scores)
average_rouge = sum(rouge_scores.values()) / len(rouge_scores)
average_meteor = sum(meteor_scores.values()) / len(meteor_scores)


print(f"Average BLEU Score: {round(average_bleu, 5)}")
print(f"Average ROUGE Score: {round(average_rouge, 5)}")
print(f"Average METEOR Score: {round(average_meteor, 5)}")

### Visualizing the model performance on 24 images

In [None]:
list_captions = list(Generated_output.items())

plt.figure(figsize=[12, 22], dpi=200)
# Slicing (instead of indexing with range(24)) keeps the cell working with fewer than 24 captions
for i, (image_id, generated_text) in enumerate(list_captions[:24]):
    image_path = os.path.join(images_directory, image_id) # image_directory + ID = exact path

    # Dropping the 'start'/'end' markers word-wise; a fixed character slice would cut into
    # the last word of a caption that never reached 'end'
    caption = generated_text.split()
    if caption and caption[0] == 'start':
        caption = caption[1:]
    if caption and caption[-1] == 'end':
        caption = caption[:-1]

    # Average scores of each image
    BLEU_score = round(bleu_scores[image_id], 5)
    ROUGE_score = round(rouge_scores[image_id], 5)
    METEOR_score = round(meteor_scores[image_id], 5)

    plt.subplot(6, 4, i+1)
    img = Image.open(image_path)
    img = img.resize((140, 140)) # To have a clean plot, it's necessary to resize all images to a uniform size
    plt.imshow(img)
    plt.axis('off')
    # Captions might be too large (due to a bad generation issue) so we ignore them
    if len(caption) > 27:
        caption = 'None'
    else:
        # Captions might be large so we write them in two lines
        split = len(caption)//2
        caption = f"\"{' '.join(caption[:split])}\n{' '.join(caption[split:])}\""
    # The label previously read 'BELU', a typo for BLEU
    title = f"{caption}\nROUGE score: {ROUGE_score}\nMETEOR score: {METEOR_score}\nBLEU score: {BLEU_score}"
    plt.title(title, fontsize=8, fontweight='bold')

plt.savefig('Generated Captions 1.png')
plt.show()

In [None]:
# Counting the generated captions that are longer than 20 words
counter = sum(1 for _, generated_text in list_captions if len(generated_text.split()) > 20)
print(f'{counter} Nonsensical outputs')

## --------------------------------------------------------------------------------------------------------------------------------------

#

# Model number 2

In [None]:
def build_model_2(vocab_size, max_caption_length, cnn_output_dim):
    """Build the second captioning network: as model 1 plus batch normalisation before the softmax.

    Args:
        vocab_size: size of the embedding input and of the softmax output.
        max_caption_length: length of the (padded) caption-prefix input.
        cnn_output_dim: length of the image feature vector input.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image_features, caption_prefix]``
        and producing a softmax over the vocabulary.
    """
    # Image branch: normalise the CNN features and project them to 256 units
    # so they can be added to the LSTM output
    image_input = Input(shape=(cnn_output_dim,))
    image_branch = BatchNormalization()(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)
    image_branch = BatchNormalization()(image_branch)

    # Caption branch: embed the word indices (index 0 is masked) and summarise them with an LSTM
    caption_input = Input(shape=(max_caption_length,))
    caption_branch = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_branch = LSTM(256)(caption_branch)

    # Decoder: sum of both branches, a hidden layer followed by batch normalisation, softmax
    merged = add([image_branch, caption_branch])
    hidden = Dense(256, activation='relu')(merged)
    hidden = BatchNormalization()(hidden)
    word_probabilities = Dense(vocab_size, activation='softmax')(hidden)

    return Model(inputs=[image_input, caption_input], outputs=word_probabilities)

# Instantiating and compiling model number 2 (Adam with gradient-norm clipping)
optimizer = Adam(learning_rate=0.015, clipnorm=1.0)
model_2 = build_model_2(
    vocab_size=vocab_size,
    max_caption_length=max_caption_length,
    cnn_output_dim=cnn_output_dim,
)
model_2.compile(optimizer=optimizer, loss='categorical_crossentropy')
model_2.summary()

### Training the model

In [None]:
# Whole batches per pass over the training / validation captions
train_steps_2 = len(train_captions) // batch_size_train
val_steps_2 = len(val_captions) // batch_size_val

history_2 = model_2.fit(
    train_data_generator,
    steps_per_epoch=train_steps_2,
    validation_data=val_data_generator,
    validation_steps=val_steps_2,
    epochs=15,
    callbacks=[early_stopping, lr_schedule],
)

model_2.save('best_model_2.h5')

### Visualizing the model performance

In [None]:
# Training and validation loss of model 2 per epoch (epochs numbered from 1)
train_loss_2 = history_2.history['loss']
val_loss_2 = history_2.history['val_loss']
epoch_numbers_2 = list(range(1, len(train_loss_2) + 1))

plt.figure(figsize=(15, 7), dpi=200)
sns.set_style('whitegrid')
plt.plot(epoch_numbers_2, train_loss_2, color='#E74C3C', marker='o')
plt.plot(epoch_numbers_2, val_loss_2, color='#641E16', marker='h')
plt.title('Train VS Validation', fontsize=15, fontweight='bold')
plt.xticks(fontweight='bold')
plt.yticks(fontweight='bold')
plt.xlabel('Epoch', fontweight='bold')
plt.ylabel('Loss', fontweight='bold')
plt.legend(['Train Loss', 'Validation Loss'], loc='lower right')
# plt.savefig('frequency_barplot 2.png')
plt.show()

# Generating captions on test set

In [None]:
# Selecting the model
selected_model_2 = load_model('best_model_2.h5')
# A Dictionary to store each image's generated caption with its corresponding IDs
Generated_output_2 = {}
print('Generating captions:\n==> Generated: ', end=' ')
# counter is 1-based, so the progress output shows how many captions are actually done
# (the previous version incremented before checking and reported one too many)
for counter, image_id in enumerate(test_image_features, start=1):
    # Adding the generated caption with its corresponding ID to the dictionary
    Generated_output_2[image_id] = generate_caption(selected_model_2, test_image_features[image_id], tokenizer, max_caption_length)
    if counter%20 == 0:
        print(counter, end=', ')
print(f'{len(Generated_output_2)}, Done.')

# Checking the result
list(Generated_output_2.items())[:5]

## Evaluating model on test images

In [None]:
# Per-image average BLEU / ROUGE / METEOR scores of model 2, keyed by image ID
bleu_scores_2, rouge_scores_2, meteor_scores_2 = {}, {}, {}
for image_id in Generated_output_2:
    if image_id not in test_captions_dictionary:
        print('Wrong image id!')
        continue

    candidate = Generated_output_2[image_id]
    # One score per reference caption of this image
    bleu_2, rouge_2, meteor_2 = [], [], []
    for reference in test_captions_dictionary[image_id]:
        bleu_2.append(calculate_bleu(reference, candidate))
        rouge_2.append(calculate_rouge(reference, candidate))
        meteor_2.append(calculate_meteor(reference, candidate))

    # The image's score is the mean over its reference captions
    bleu_scores_2[image_id] = Average(bleu_2)
    rouge_scores_2[image_id] = Average(rouge_2)
    meteor_scores_2[image_id] = Average(meteor_2)

# Overall model performance: the mean of the per-image averages
average_bleu_2 = sum(bleu_scores_2.values()) / len(bleu_scores_2)
average_rouge_2 = sum(rouge_scores_2.values()) / len(rouge_scores_2)
average_meteor_2 = sum(meteor_scores_2.values()) / len(meteor_scores_2)


print(f"Average BLEU Score: {round(average_bleu_2, 5)}")
print(f"Average ROUGE Score: {round(average_rouge_2, 5)}")
print(f"Average METEOR Score: {round(average_meteor_2, 5)}")

### Visualizing the model performance on 24 images

In [None]:
list_captions_2 = list(Generated_output_2.items())

plt.figure(figsize=[12, 22], dpi=200)
# Slicing (instead of indexing with range(24)) keeps the cell working with fewer than 24 captions
for i, (image_id, generated_text) in enumerate(list_captions_2[:24]):
    image_path = os.path.join(images_directory, image_id) # image_directory + ID = exact path

    # Dropping the 'start'/'end' markers word-wise; a fixed character slice would cut into
    # the last word of a caption that never reached 'end'
    caption = generated_text.split()
    if caption and caption[0] == 'start':
        caption = caption[1:]
    if caption and caption[-1] == 'end':
        caption = caption[:-1]

    # Average scores of each image
    BLEU_score = round(bleu_scores_2[image_id], 5)
    ROUGE_score = round(rouge_scores_2[image_id], 5)
    METEOR_score = round(meteor_scores_2[image_id], 5)

    plt.subplot(6, 4, i+1)
    img = Image.open(image_path)
    img = img.resize((140, 140)) # To have a clean plot, it's necessary to resize all images to a uniform size
    plt.imshow(img)
    plt.axis('off')
    # Captions might be too large (due to a bad generation issue) so we ignore them
    if len(caption) > 27:
        caption = 'None'
    else:
        # Captions might be large so we write them in two lines
        split = len(caption)//2
        caption = f"\"{' '.join(caption[:split])}\n{' '.join(caption[split:])}\""
    # The label previously read 'BELU', a typo for BLEU
    title = f"{caption}\nROUGE score: {ROUGE_score}\nMETEOR score: {METEOR_score}\nBLEU score: {BLEU_score}"
    plt.title(title, fontsize=8, fontweight='bold')

plt.savefig('Generated Captions 2.png')
plt.show()

### Sometimes the model generates one or two words repeatedly, which makes long nonsensical sentences, so we need to count how many times this occurs.

In [None]:
# Counting the generated captions that are longer than 20 words
counter = sum(1 for _, generated_text in list_captions_2 if len(generated_text.split()) > 20)
print(f'{counter} Nonsensical outputs')

## --------------------------------------------------------------------------------------------------------------------------------------

#

# Comparing the two models

In [None]:
# Each report holds: 4 - validation loss, ROUGE, BLEU*10, METEOR, followed by the mean of those four.
# The validation loss used is the lowest one reached during training. The previous hard-coded
# epoch indices ([4] and [5]) raised IndexError whenever early stopping ended training sooner.
# NOTE(review): this assumes the saved model corresponds to the lowest-val_loss epoch, which is
# what EarlyStopping(restore_best_weights=True) aims for - confirm against the training logs.
best_val_loss_1 = min(history_1.history['val_loss'])
model_1_report = [4-best_val_loss_1, average_rouge, average_bleu*10, average_meteor]
model_1_report.append(sum(model_1_report)/len(model_1_report))

best_val_loss_2 = min(history_2.history['val_loss'])
model_2_report = [4-best_val_loss_2, average_rouge_2, average_bleu_2*10, average_meteor_2]
model_2_report.append(sum(model_2_report)/len(model_2_report))

#### To normalize the numbers for better visualization in the bar plot, we subtract val_loss from 4 and multiply bleu score by 10.

In [None]:
opacity = 0.6
bar_width = 0.4

sns.set_theme()

plt.figure(figsize=(14, 13), dpi=200)
plt.title('Comparing Models', fontweight='bold')

plt.xlabel('Metrics', fontweight='bold')
plt.ylabel('Scores', fontweight='bold')

# Tick labels: 'Metero' was a typo for Meteor, and the first bar shows 4 - val_loss
# rather than the raw validation loss, so the label now says so
metric_labels = ('4 - Val_loss', 'Rouge', 'Bleu*10', 'Meteor', 'Mean')
positions_1 = list(range(len(model_1_report)))
positions_2 = [x + bar_width for x in range(len(model_2_report))]

# Ticks sit midway between the two bars of each metric
plt.xticks([x + bar_width/2 for x in positions_1], metric_labels, rotation=0, fontweight='bold')
plt.yticks(fontweight='bold')

bar1 = plt.bar(positions_1, model_1_report,
        bar_width, align='center', alpha=opacity, color='g', label='Model 1')

bar2 = plt.bar(positions_2, model_2_report,
        bar_width, align='center', alpha=opacity, color='b', label='Model 2')

# Writing each bar's value on top of it
for rect in (*bar1, *bar2):
    height = rect.get_height()
    plt.text(rect.get_x()+rect.get_width()/2.0, height, f'{height:.02f}', ha='center', va='bottom', fontweight='bold')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=[12, 22], dpi=200)
# The second group of 24 captions of model 1; slicing (instead of indexing with i+24)
# keeps the cell working when fewer than 48 captions are available
for i, (image_id, generated_text) in enumerate(list_captions[24:48]):
    image_path = os.path.join(images_directory, image_id) # image_directory + ID = exact path

    # Dropping the 'start'/'end' markers word-wise; a fixed character slice would cut into
    # the last word of a caption that never reached 'end'
    caption = generated_text.split()
    if caption and caption[0] == 'start':
        caption = caption[1:]
    if caption and caption[-1] == 'end':
        caption = caption[:-1]

    # Average scores of each image
    BLEU_score = round(bleu_scores[image_id], 5)
    ROUGE_score = round(rouge_scores[image_id], 5)
    METEOR_score = round(meteor_scores[image_id], 5)

    plt.subplot(6, 4, i+1)
    img = Image.open(image_path)
    img = img.resize((140, 140)) # To have a clean plot, it's necessary to resize all images to a uniform size
    plt.imshow(img)
    plt.axis('off')
    # Captions might be too large (due to a bad generation issue) so we ignore them
    if len(caption) > 27:
        caption = 'None'
    else:
        # Captions might be large so we write them in two lines
        split = len(caption)//2
        caption = f"\"{' '.join(caption[:split])}\n{' '.join(caption[split:])}\""
    # The label previously read 'BELU', a typo for BLEU
    title = f"{caption}\nROUGE score: {ROUGE_score}\nMETEOR score: {METEOR_score}\nBLEU score: {BLEU_score}"
    plt.title(title, fontsize=8, fontweight='bold')

plt.savefig('Generated Captions 1_2.png')
plt.show()