# Visionizer

In [None]:
import os

try:
    from google.colab import drive
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    drive_root = mount + "/My Drive/NLP/Visionizer/"

    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
        print("\nColab: making sure ", drive_root, " exists.")
        os.makedirs(drive_root, exist_ok=True)

    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    % cd $drive_root

In [None]:
%%capture
if IN_COLAB:
    !pip install bert-score
    !pip install matplotlib==3.4

import matplotlib.pyplot as plt
import gensim.downloader as gloader
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import time
import pickle
import h5py
import copy
import time
import re
import io

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score, meteor_score
from bert_score import score as bert_score
from nltk.translate.chrf_score import sentence_chrf
from operator import itemgetter
from tqdm.auto import tqdm
from numba import cuda
from PIL import Image

tf.get_logger().setLevel('WARNING')

If running in Google Colab, restart runtime after running the previous cell to update libraries

In [None]:
EPOCHS = 2
BASELINE = False
LSTM = True
DATASET_NAME = "FACAD"
BATCH_SIZE = 64
EMBEDDING_SIZE = 300
VAL_TEST_PER = 0.2
units = 512
# 2 are extra for the start and end tokens
MAX_SEQ_LEN = 22 

np.random.seed(42)

assert DATASET_NAME in ["FashionGen", "InFashAI_DeepFashion", "FACAD"]

gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"GPUs: {gpus}")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    
!nvidia-smi

## Import Libraries

## Load Dataset and Clean Captions

In [None]:
# Per-caption statistics filled in by clean_captions (module-level on purpose:
# the following cells plot and summarise them).
s_lens = []
c_lens = []

def clean_captions(sentences, fashiongen=False):
    """Normalise raw captions and wrap them in <start>/<end> markers.

    sentences : iterable of byte strings (decoded here as ISO-8859-1)
    fashiongen : when True keep only the text before the first comma
                 (periods have already been turned into commas at that point,
                 so this keeps the first sentence/clause only)

    Returns the list of cleaned captions. As a side effect, appends the word
    count and the character count of every caption (markers excluded) to the
    module-level lists `s_lens` and `c_lens`.
    """
    cleaned = []
    for s in tqdm(sentences):
        sen = s.decode('ISO-8859-1').lower()
        sen = re.sub(r'\.', ' ,', sen)
        sen = re.sub('<br>', ' ', sen)
        # Put spaces around commas. The replacement MUST be a raw string:
        # in a normal string '\1' is the control character \x01, not a
        # back-reference, which silently deleted the letter next to each comma.
        sen = re.sub(r',(\w)', r', \1', sen)
        sen = re.sub(r'(\w),', r'\1 ,', sen)
        # NOTE(review): inside the class `\,-<` is a character RANGE (',' to
        # '<'), so '/', ':' and ';' also survive this filter. Left unchanged;
        # confirm whether only ',', '-', '<' and '>' were meant to be kept.
        sen = re.sub(r'[^\w\s\,-<>]', '', sen)
        sen = re.sub('-', ' ', sen)
        if fashiongen:
            # consider only first comma comment
            sen = re.sub(r',.+$', '', sen)
        sen = re.sub(',', '', sen)
        sen = '<start> ' + sen + ' <end>'
        # collapse the runs of spaces produced by the substitutions above
        sen = re.sub(' +', ' ', sen)
        cleaned.append(sen)
        s_lens.append(len(sen.split()[1:-1])) # [1:-1] to not consider start end
        c_lens.append(len(sen) - 14) # 14 = len('<start> ') + len(' <end>')
    return cleaned

# The file handle is left open on purpose: later cells read images from it.
dataset = h5py.File(f"{DATASET_NAME}/dataset/{DATASET_NAME}.h5", 'r')
if DATASET_NAME == "FashionGen":
    cleaned_captions = clean_captions(dataset['input_description'], fashiongen=True)
elif DATASET_NAME in ["InFashAI_DeepFashion", "FACAD"]:
    cleaned_captions = clean_captions(dataset['input_description'])
else:
    raise NotImplementedError

plt.boxplot(s_lens)  # looking for outliers in sentence length
plt.show()

# Position of each caption in the dataset; used later to fetch the matching image
img_indexes = list(range(len(cleaned_captions)))
# MAX_SEQ_LEN counts the <start>/<end> tokens while s_lens does not, so the
# longest caption occupies max(s_lens) + 2 positions. Clamping to max(s_lens)
# alone would make the filter below discard the longest captions.
longest_with_markers = max(s_lens) + 2
if longest_with_markers < MAX_SEQ_LEN:
    MAX_SEQ_LEN = longest_with_markers

print()
print(f"Lengths before start end [mean, min, max, median]:\n "
      f"{np.mean(s_lens):.3F}\t {min(s_lens)}\t {max(s_lens)}\t {np.median(s_lens):.3F} ")
print(f"Characters before start end [mean, min, max, median]:\n "
      f"{np.mean(c_lens):.3F}\t {min(c_lens)}\t {max(c_lens)}\t {np.median(c_lens):.3F} ")

# not including sentences longer than the MAX_SEQ_LEN
# (length measured in tokens, <start> and <end> included)
img_indexes_maxthr = []
cleaned_captions_maxthr = []

for i, c in enumerate(cleaned_captions):
    if len(c.split()) <= MAX_SEQ_LEN:
        img_indexes_maxthr.append(img_indexes[i])
        cleaned_captions_maxthr.append(c)

print("Removed: ", len(img_indexes) - len(img_indexes_maxthr), " items")
print(len(img_indexes_maxthr), len(cleaned_captions_maxthr))
print()

# Recompute the statistics on the retained captions only
s_lens = [len(s.split()[1:-1]) for s in cleaned_captions_maxthr]
c_lens = [len(s) - 14 for s in cleaned_captions_maxthr]
print(f"Lengths after max threshold [mean, min, max, median]:\n "
      f"{np.mean(s_lens):.3F}\t {min(s_lens)}\t {max(s_lens)}\t {np.median(s_lens):.3F} ")
print(f"Characters after max threshold [mean, min, max, median]:\n "
      f"{np.mean(c_lens):.3F}\t {min(c_lens)}\t {max(c_lens)}\t {np.median(c_lens):.3F} ")

In [None]:
# Copy to COLAB local disk to have faster access
if IN_COLAB:
    data_path = f"{DATASET_NAME}/dataset/{DATASET_NAME}.h5"
    !cp -v $data_path /content/
    dataset = h5py.File(f"{DATASET_NAME}/dataset/{DATASET_NAME}.h5", 'r')


In [None]:
# Sanity check: display one randomly chosen image together with its caption.
# Increase the range to preview more samples.
for i in range(1):
    idx = np.random.randint(len(dataset['input_image']))
    img = tf.io.decode_jpeg(dataset['input_image'][idx]).numpy()
    plt.matshow(img)
    # cleaned_captions is the unfiltered list, indexed by caption position
    # (assumes images and descriptions are stored in the same order - TODO confirm)
    print(f"{idx}, Shape: {img.shape}, {cleaned_captions[idx]}")

# idx was only needed for the preview
del idx

## Features Extraction: InceptionV3

### Load InceptionV3 pretrained weights

To extract image features we use InceptionV3 pretrained on ImageNet. The features are taken from the output of the network's last convolutional layer.

In [None]:
%%capture
# InceptionV3 without its classification head, initialised with ImageNet weights
image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
new_input = image_model.input
# The output of the network's last layer is used as the image feature map
hidden_layer = image_model.layers[-1].output
# Stand-alone extractor: image batch in, feature map out
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

### Preprocess the images using InceptionV3

Before processing images with InceptionV3, we convert them into InceptionV3's expected format

In [None]:
@tf.function
def preprocess_image(img):
    """Apply InceptionV3's `preprocess_input` to a single image.

    No resizing happens here: the images are expected to be stored already
    resized, which makes batch generation faster. If they are not, enable
    the line below.
    """
    # img = tf.keras.layers.experimental.preprocessing.Resizing(299, 299)(img)
    return tf.keras.applications.inception_v3.preprocess_input(img)

@tf.function
def forward_in_inception(images):
    """Turn a batch of images into flattened InceptionV3 feature maps.

    Every image is preprocessed, pushed through the feature extractor and the
    two middle axes of the 4-D output are merged, giving a tensor of shape
    (batch, locations, channels).
    """
    preprocessed = tf.map_fn(preprocess_image, images)
    feature_maps = image_features_extract_model(preprocessed)
    n_images = feature_maps.shape[0]
    n_channels = feature_maps.shape[3]
    return tf.reshape(feature_maps, (n_images, -1, n_channels))

## Caption Processing

Once the images are prepared, we have to tokenize the captions. 

### Tokenizer (Class)

`Tokenizer` is the class that handles text tokenization. It builds the embedding matrix and the dictionaries used to map words to tokens and vice versa. 
The embeddings used are GloVe embeddings. 

In [None]:
class Tokenizer(object):
    """
        Maps words to integer tokens and builds the matching embedding matrix.

        The tokenizer is incremental: `tokenize()` followed by
        `build_embedding_matrix()` can be repeated after replacing
        `dataset_sentences`; words not seen before receive token ids that
        continue after the rows of the existing embedding matrix.
    """

    def __init__(self, dataset_sentences, embedding_dim, glove_dict, glove_matrix):
        """
            dataset_sentences : sentences to be tokenized in form of strings
            embedding_dim : size of the embeddings
            glove_dict : glove dictionary (word -> row index in glove_matrix)
            glove_matrix : glove matrix
        """
        self.embedding_matrix = None
        # word to token dictionary
        self.value_to_key = {}
        # useful for implementation, contains only the words added by the last
        # tokenize() call, mapped to their row in the temporary embedding matrix
        self.value_to_key_new = {}
        self.num_unique_words = 0
        # token to word dictionary
        self.key_to_value = {}
        self.dataset_sentences = dataset_sentences
        self.embedding_dim = embedding_dim
        self.glove_dict = glove_dict
        self.glove_matrix = glove_matrix
        # set containing all the words of the vocabulary, update after the processing of each split
        self.unique_words = set()

    def get_val_to_key(self):
        """Return an independent copy of the word -> token dictionary."""
        return copy.deepcopy(self.value_to_key)

    def tokenize(self):
        """
            create and/or update the tokenizer. Every word of
            `dataset_sentences` that has not been tokenized yet gets a new
            token and is added to the vocabulary.
        """
        self.value_to_key_new = {}
        unique_words = {w for sen in self.dataset_sentences for w in sen.split()}
        # Sort the new words: set iteration order depends on string hashing,
        # which varies between interpreter runs, so without sorting the same
        # captions would get different token ids in every session (and a
        # model reloaded from disk would see a different vocabulary).
        new_unique = sorted(unique_words - self.unique_words)
        # When an embedding matrix already exists (e.g. validation/test after
        # train) the new tokens continue after its last row.
        offset = len(self.embedding_matrix) if self.embedding_matrix is not None else 0
        for i, word in enumerate(new_unique):
            self.key_to_value[i + offset] = word
            self.value_to_key[word] = i + offset
            self.value_to_key_new[word] = i

        self.num_unique_words = len(new_unique)
        # update unique words with new unique
        self.unique_words = self.unique_words.union(new_unique)

    def __build_embedding_matrix_glove(self):
        """
            create the embedding rows for the words added by the last
            tokenize() call. Rows of words contained in glove are filled; a
            word that is not in glove is returned together with its final row
            index so that it can be processed later.
        """
        oov_words = []
        tmp_embedding_matrix = np.zeros((self.num_unique_words, self.embedding_dim))
        len_old_emb_matrix = len(self.embedding_matrix) if self.embedding_matrix is not None else 0
        for word, idx in tqdm(self.value_to_key_new.items()):
            try:
                embedding_vector = self.glove_matrix[self.glove_dict[word]]
                # create tmp embedding matrix to be concatenated to the original embedding matrix
                tmp_embedding_matrix[idx] = embedding_vector
            except (KeyError, TypeError):
                oov_words.append((word, idx + len_old_emb_matrix))

        if self.embedding_matrix is not None:
            # concatenate old embedding matrix and new one (new OOVs)
            self.embedding_matrix = np.vstack((self.embedding_matrix, tmp_embedding_matrix))
        else:
            self.embedding_matrix = copy.deepcopy(tmp_embedding_matrix)
        return oov_words

    def build_embedding_matrix(self):
        """
            extend the embedding matrix with the newly tokenized words.
            Out-of-vocabulary words get a vector drawn from a uniform
            distribution in [-0.05, 0.05). Returns a copy of the matrix.
        """
        oov_words = self.__build_embedding_matrix_glove()
        for _, idx in oov_words:
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim)
            self.embedding_matrix[idx] = embedding_vector
        return copy.deepcopy(self.embedding_matrix)

### Preprocess and tokenize the captions

Transform the text captions into integer sequences using the Tokenizer.
Captions in the dataset that are longer than a given threshold are removed.

In the following code we first load Glove Embeddings. After that we use the Tokenizer methods to tokenize the text (map each word to an integer, the token) and we create the embedding matrix. 

In [None]:
# Load Glove Embeddings from a pickle file or download if necessary
# NOTE: pickle.load can execute arbitrary code - only load a cache file that
# was written by this notebook.
glove_cache_path = f"./GloVe/glove-{EMBEDDING_SIZE}.pkl"
try:
    with open(glove_cache_path, 'rb') as f:
        emb_model = pickle.load(f)
except Exception:
    # No usable cache: download, then cache for the next run. The directory
    # is created first, otherwise the write fails after the download.
    emb_model = gloader.load(f"glove-wiki-gigaword-{EMBEDDING_SIZE}")
    os.makedirs(os.path.dirname(glove_cache_path), exist_ok=True)
    with open(glove_cache_path, 'wb') as f:
        pickle.dump(emb_model, f)
####################################################################

glove_dict = emb_model.key_to_index
glove_matrix = emb_model.vectors

tokenizer = Tokenizer(cleaned_captions_maxthr, EMBEDDING_SIZE, glove_dict, glove_matrix)
tokenizer.tokenize()

# create embedding matrix
emb_matrix = tokenizer.build_embedding_matrix()
# first row corresponding to padding (token 0)
emb_matrix = np.vstack((np.zeros((1, EMBEDDING_SIZE)), emb_matrix))

# Shifts tokens by 1 because of padding (to adapt to the network embedding layer)
val_to_key = {word: token + 1 for word, token in tokenizer.get_val_to_key().items()}

# Translation dictionary to retranslate from tokens to words: token 0 is the
# padding symbol, every tokenizer token is shifted by 1 like in val_to_key
key_val_list_items = list(tokenizer.key_to_value.items())
key_to_val = {0: '<PAD>'}
key_to_val.update((token + 1, word) for token, word in key_val_list_items)


def word_to_index(word: str) -> int:
    """Return the token id of `word` in `val_to_key` (ids are shifted by 1, 0 being padding).

    Raises KeyError when the word is not in the vocabulary.
    """
    return val_to_key[word]


def index_to_word(index) -> str:
    """Return the word stored in `key_to_val` for token id `index` ('<PAD>' for 0).

    Raises KeyError when the id is not in the vocabulary.
    """
    return key_to_val[index]

In [None]:
# Create the tokenized vectors: one list of token ids per retained caption
cap_vector = [[word_to_index(word) for word in caption.split()]
              for caption in cleaned_captions_maxthr]

## Training Preparation and Model Setting

### Split the data into training, validation and testing

In [None]:
# image index -> tokenized caption
img_to_cap_vector = dict(zip(img_indexes_maxthr, cap_vector))

# Create the training, validation and test sets: VAL_TEST_PER of the data is
# held out and divided evenly between validation and test.
img_keys = list(img_to_cap_vector.keys())

# It is important not to shuffle the FashionGen dataset all together
# since there are multiple images per caption in order
if DATASET_NAME != "FashionGen":
    random.shuffle(img_keys)
print(len(img_keys))

slice_index = int(len(img_keys) * (1 - VAL_TEST_PER))
slice_index_test = int(len(img_keys) * (1 - (VAL_TEST_PER/2)))

img_name_train_keys = img_keys[:slice_index]
img_name_val_keys = img_keys[slice_index:slice_index_test]
img_name_test_keys = img_keys[slice_index_test:]


def _pad_and_shuffle_split(keys):
    """Zero-pad the captions of `keys` to MAX_SEQ_LEN and shuffle the split.

    The padded caption is also written back into `img_to_cap_vector`.
    Returns two aligned tuples (image keys, padded captions); both are empty
    when `keys` is empty.
    """
    names, caps = [], []
    for key in keys:
        caption = img_to_cap_vector[key]
        caption = caption + [0] * (MAX_SEQ_LEN - len(caption))
        img_to_cap_vector[key] = caption
        names.append(key)
        caps.append(caption)
    # Shuffle inside the split, keeping every image paired with its caption
    pairs = list(zip(names, caps))
    random.shuffle(pairs)
    if not pairs:
        return (), ()
    names, caps = zip(*pairs)
    return names, caps


# TRAIN SPLIT
img_name_train, cap_train = _pad_and_shuffle_split(img_name_train_keys)

# VAL SPLIT
img_name_val, cap_val = _pad_and_shuffle_split(img_name_val_keys)

# TEST SPLIT
img_name_test, cap_test = _pad_and_shuffle_split(img_name_test_keys)


In [None]:
# Sizes of the train / validation / test splits (image keys and captions, pairwise)
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val), len(img_name_test), len(cap_test)

In [None]:
# Number of full batches per epoch (integer division: a trailing partial batch is not counted)
num_steps_per_epoch = len(img_name_train) // BATCH_SIZE
num_steps_val_per_epoch = len(img_name_val) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
# (64 locations per image, 2048 channels per location)
features_shape = 2048
attention_features_shape = 64

print(f"Train Steps: {num_steps_per_epoch} | Validation Steps: {num_steps_val_per_epoch}")

In [None]:
print('first sentence check: \n', [index_to_word(i) for i in cap_test[0]])
print('test shape: ', len(cap_test))
plt.matshow(tf.io.decode_jpeg(dataset['input_image'][img_name_test[0]]).numpy())
plt.show()

In [None]:
# Yield also the following to debug the order of the indexes:
# indexes = [el for el in img_name_train[i * bs : (i * bs) + bs]]

def dataset_generator(img_names, captions):
    """Yield (image features, captions) batches of exactly BATCH_SIZE items.

    img_names : dataset indexes of the images, aligned with `captions`
    captions : padded token sequences

    Images are decoded from the open HDF5 `dataset` and passed through
    InceptionV3 on the fly. Only full batches are produced: a trailing
    partial batch is dropped.
    """
    bs = BATCH_SIZE
    # range() stops before the first start index that cannot hold a full
    # batch; this replaces re-slicing the whole remaining caption list on
    # every iteration just to measure its length.
    for start in range(0, len(captions) - bs + 1, bs):
        stop = start + bs
        images_in_batch = np.array([tf.io.decode_jpeg(dataset["input_image"][el], fancy_upscaling=False)
                                    for el in img_names[start:stop]], dtype=np.float32)
        features_in_batch = forward_in_inception(images_in_batch)
        captions_in_batch = np.array(captions[start:stop], dtype=np.int32)
        yield features_in_batch, captions_in_batch

In [None]:
%%time
# Smoke test / timing of the generator: only the first batch is produced,
# the loop always stops at batch 0.
for (batch, (img_tensor, target)) in enumerate(dataset_generator(img_name_train, cap_train)):
    # Some Debugging
    # print(type(batch), batch)
    # print(type(img_tensor), img_tensor.shape)
    # print(len(target))
    # print(' '.join([index_to_word(w) for w in target[0]]))
    if batch == 0:
        break

### Model


In [None]:
class BahdanauAttention(tf.Module):
    """Additive (Bahdanau) attention over the encoded image locations."""

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    # The signature is derived from the notebook configuration (like the one
    # of CNN_Encoder) instead of repeating the literals 64 / 300 / 512, so it
    # cannot drift when EMBEDDING_SIZE or units are changed.
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, attention_features_shape, EMBEDDING_SIZE], dtype=tf.float32),
                                  tf.TensorSpec(shape=[None, units], dtype=tf.float32)])
    def __call__(self, features, hidden):
        """Return (context_vector, attention_weights) for one decoding step.

        features : encoder output, (batch_size, 64, embedding_dim)
        hidden : decoder state, (batch_size, hidden_size)
        """
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # attention_hidden_layer shape == (batch_size, 64, units)
        attention_hidden_layer = (tf.nn.tanh(self.W1(features) +
                                             self.W2(hidden_with_time_axis)))

        # score shape == (batch_size, 64, 1)
        # This gives you an unnormalized score for each image feature.
        score = self.V(attention_hidden_layer)

        # attention_weights shape == (batch_size, 64, 1)
        # softmax over axis 1, i.e. over the image locations
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, embedding_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [None]:
class NoAttention(tf.Module):
    """Baseline replacement for attention: an unweighted sum over the image locations."""

    def __init__(self):
        super(NoAttention, self).__init__()

    # Signature derived from the notebook configuration (like CNN_Encoder)
    # instead of the literals 64 / 300.
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, attention_features_shape, EMBEDDING_SIZE], dtype=tf.float32)])
    def __call__(self, features):
        """Sum `features` over axis 1: (batch, 64, embedding_dim) -> (batch, embedding_dim)."""
        return tf.reduce_sum(features, axis=1)

In [None]:
class CNN_Encoder(tf.Module):
    """Projects pre-extracted image features to the embedding dimension.

    The convolutional features are computed beforehand; this encoder only
    applies one fully connected layer followed by a ReLU.
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # output of fc has shape (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, attention_features_shape, features_shape], dtype=tf.float32),])
    def __call__(self, x):
        projected = self.fc(x)
        return tf.nn.relu(projected)

In [None]:
class RNN_Decoder(tf.Module):
    """One-step caption decoder (embedding -> GRU/LSTM -> two dense layers).

    units : size of the recurrent state
    embedding_matrix : initial embedding weights (ignored when `baseline`)
    vocab_size : number of tokens, padding included
    lstm_enabled : use the LSTM instead of the GRU
    baseline : no attention and randomly initialised embeddings
    """

    def __init__(self, units, embedding_matrix, vocab_size, lstm_enabled=False, baseline=False):
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.lstm_enabled = lstm_enabled
        self.baseline = baseline

        # Both variants share the same embedding configuration; only the
        # non-baseline model starts from the pretrained embedding matrix.
        embedding_kwargs = dict(input_dim=vocab_size,
                                output_dim=EMBEDDING_SIZE,
                                input_length=MAX_SEQ_LEN,
                                mask_zero=True,
                                trainable=True)
        if not self.baseline:
            embedding_kwargs['weights'] = tf.expand_dims(embedding_matrix, axis=0)
        self.embedding = tf.keras.layers.Embedding(**embedding_kwargs)

        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.lstm = tf.keras.layers.LSTM(self.units,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')

        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.units)
        self.no_attention = NoAttention()

    # Credits to MariaMsu and ant0nisk for explaining how to save
    # Check: https://stackoverflow.com/questions/59417433/typeerror-when-using-tf-keras-models-save-model-to-save-multi-inputs-tf-2-x-su
    # And: https://stackoverflow.com/questions/62250441/saving-a-tensorflow-keras-model-encoder-decoder-to-savedmodel-format
    # However, to date, there is no possible way to save and reload on TF2.5 this architecture
    #
    # The signature is derived from the notebook configuration (like the one
    # of CNN_Encoder) instead of repeating the literals 64 / 300 / 512.
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, 1], dtype=tf.int32),
                                  tf.TensorSpec(shape=[None, attention_features_shape, EMBEDDING_SIZE], dtype=tf.float32),
                                  tf.TensorSpec(shape=[None, units], dtype=tf.float32)])
    def __call__(self, x, features, hidden):
        """Run one decoding step.

        x : previous token, (batch_size, 1)
        features : encoder output, (batch_size, 64, embedding_dim)
        hidden : previous decoder state, (batch_size, hidden_size)

        Returns (logits, state, attention_weights, probs); attention_weights
        is None in baseline mode, probs is the softmax of the logits.
        """
        # defining attention as a separate model
        if self.baseline:
            context_vector = self.no_attention(features)
            attention_weights = None
        else:
            context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        if self.baseline:
            # hidden is rank 2 and needs a time axis before it can be
            # concatenated with the rank-3 context and embedding tensors
            x = tf.concat([tf.expand_dims(context_vector, 1), tf.expand_dims(hidden, 1), x], axis=-1)
        else:
            # x shape after concatenation == (batch_size, 1, 2 * embedding_dim)
            x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # NOTE: the recurrent layer is called without an initial state, so
        # `hidden` influences this step only through the attention (or, in
        # baseline mode, the concatenation above).
        if self.lstm_enabled:
            # passing the concatenated vector to the LSTM
            output, state, _ = self.lstm(x)
        else:
            # passing the concatenated vector to the GRU
            output, state = self.gru(x)

        # shape == (batch_size, 1, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * 1, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * 1, vocab)
        x = self.fc2(x)
        probs = tf.nn.softmax(x)

        return x, state, attention_weights, probs

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so padding can be masked out
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

@tf.function
def loss_function(real, pred):
    """Cross-entropy between `real` tokens and `pred` logits, ignoring padding.

    Positions whose target is 0 (padding) contribute a loss of zero; the
    result is the mean over all positions, padded ones included.
    """
    per_example = loss_object(real, pred)
    not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * not_padding)

### Checkpoint

Set checkpoints to save and load model

In [None]:
# One directory per (dataset, recurrent cell, number of epochs, baseline flag)
# combination; the training cell uses its existence to decide between
# training from scratch and loading a saved model.
checkpoint_path = f"checkpoints/{DATASET_NAME}/train_{'lstm' if LSTM else 'gru'}_{EPOCHS}epochs{'_Baseline' if BASELINE else ''}"
print(checkpoint_path)

In [None]:
encoder = CNN_Encoder(EMBEDDING_SIZE)
# Forward the run configuration to the decoder. Without these two arguments
# the decoder always used the GRU with attention, whatever LSTM / BASELINE
# said (while the checkpoint name and the evaluation code do depend on them).
decoder = RNN_Decoder(units,
                      tf.convert_to_tensor(emb_matrix),
                      len(val_to_key.keys()) + 1,  # + 1 for the padding token
                      lstm_enabled=LSTM,
                      baseline=BASELINE)

## Training

In [None]:
# Loss histories live in their own cell so that re-running the training cell
# does not reset them: per-epoch train loss, per-epoch validation loss and
# per-step train loss.
loss_plot, loss_plot_val, loss_plot_by_steps = [], [], []

In [None]:
@tf.function
def train_step(img_tensor, target):
    """Run one optimisation step with teacher forcing.

    img_tensor : image features of one batch
    target : padded token sequences, (batch_size, sequence_length)

    Returns (loss, total_loss): the loss summed over the time steps and the
    same value divided by the sequence length.
    """
    loss = 0
    batch_size = target.shape[0]

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    # (sized from the batch itself, like dec_input, not from BATCH_SIZE)
    hidden = tf.zeros((batch_size, units))
    dec_input = tf.expand_dims([word_to_index('<start>')] * batch_size, 1)

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _, _ = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = (loss / int(target.shape[1]))
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    # gradients are taken on the summed (not the averaged) loss
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss


@tf.function
def val_step(img_tensor, target):
    """Compute the teacher-forced loss of one batch without updating weights.

    Takes the same arguments and returns the same (loss, total_loss) pair as
    the training step.
    """
    loss = 0
    batch_size = target.shape[0]
    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    # (sized from the batch itself, like dec_input, not from BATCH_SIZE)
    hidden = tf.zeros((batch_size, units))
    dec_input = tf.expand_dims([word_to_index('<start>')] * batch_size, 1)
    features = encoder(img_tensor)

    for i in range(1, target.shape[1]):
        # passing the features through the decoder
        predictions, hidden, _, _ = decoder(dec_input, features, hidden)
        loss += loss_function(target[:, i], predictions)
        # using teacher forcing
        dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = (loss / int(target.shape[1]))
    return loss, total_loss

In [None]:
# Train from scratch when no checkpoint directory exists for this
# configuration, otherwise show the stored loss plots and load the models.
if not os.path.exists(checkpoint_path):
    # makedirs (not mkdir): the parent "checkpoints/<dataset>" directories
    # may not exist yet
    os.makedirs(checkpoint_path)
    progress_bar = tqdm(range(num_steps_per_epoch * EPOCHS))
    print_str = f'Epoch {1} | Batch {0} | Loss ----- | '
    progress_bar.set_description_str(print_str)

    for epoch in range(0, EPOCHS):
        start = time.time()
        total_loss = 0
        total_loss_val = 0

        for (batch, (img_tensor, target)) in enumerate(dataset_generator(img_name_train, cap_train)):
            batch_loss, t_loss = train_step(img_tensor, target)
            total_loss += t_loss
            loss_plot_by_steps.append(t_loss)

            average_batch_loss = batch_loss.numpy() / int(target.shape[1])
            # refresh the progress bar every 5 batches
            if batch % 5 == 0:
                print_str = f'Epoch {epoch + 1} | Batch {batch + 5} | Loss {average_batch_loss:.4F} | '
                progress_bar.set_description_str(print_str)
                progress_bar.update(5)

        for (batch, (img_tensor, target)) in enumerate(dataset_generator(img_name_val, cap_val)):
            batch_loss, t_loss = val_step(img_tensor, target)
            total_loss_val += t_loss

        # storing the epoch end loss value to plot later
        loss_plot.append(total_loss / num_steps_per_epoch)
        loss_plot_val.append(total_loss_val / num_steps_val_per_epoch)

        print()
        print(f'Epoch {epoch + 1} Loss {total_loss / num_steps_per_epoch:.4F} Val Loss {total_loss_val / num_steps_val_per_epoch:.4F}')
        print(f'Time taken for 1 epoch {time.time() - start:.2f} sec\n')

    plt.plot(loss_plot)
    plt.plot(loss_plot_val)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.legend(['train', 'validation'])
    plt.savefig(f"{checkpoint_path}/loss.png")
    plt.show()

    plt.plot(loss_plot_by_steps)
    plt.xlabel('Num Steps')
    plt.ylabel('Loss')
    plt.title('Loss Plot by Steps')
    plt.legend(['train'])
    plt.savefig(f"{checkpoint_path}/loss_plot_by_steps.png")
    plt.show()

    tf.saved_model.save(encoder, f"{checkpoint_path}/encoder")
    tf.saved_model.save(decoder, f"{checkpoint_path}/decoder")
    #     encoder.save_weights(f"{checkpoint_path}/encoder")
    #     decoder.save_weights(f"{checkpoint_path}/decoder")

else:
    # NOTE(review): the directory alone selects this branch, so a training
    # run that was interrupted before saving leaves a directory without
    # models - delete it to train again.
    image = Image.open(f"{checkpoint_path}/loss.png")
    image.show()
    image = Image.open(f"{checkpoint_path}/loss_plot_by_steps.png")
    image.show()

    encoder = tf.saved_model.load(f"{checkpoint_path}/encoder")
    decoder = tf.saved_model.load(f"{checkpoint_path}/decoder")
    #     encoder.load_weights(f"{checkpoint_path}/encoder")
    #     decoder.load_weights(f"{checkpoint_path}/decoder")

## Evaluation

* The evaluate function is similar to the training loop, except you don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the end token.
* And store the attention weights for every time step.

In [None]:
def predict(sent_predict, features, hidden):
    '''
    Run the decoder for a single step.

    ::params
    sent_predict: token(s) fed to the decoder for this step
    features: image features extracted
    hidden: decoder state from the previous step
    return : new hidden state and the softmax probabilities of the first
             row of the decoder output
    '''
    _, new_hidden, _, probs = decoder(sent_predict, features, hidden)
    first_row_probs = probs[0]
    return new_hidden, first_row_probs

In [None]:
def beam_search(features, beam_size=2, top_k=2, sequence_max_len=MAX_SEQ_LEN):
    """Generate a caption for one image with beam search.

    features : encoded image features (encoder output for a single image)
    beam_size : number of hypotheses kept alive at every step
    top_k : number of continuations considered for each hypothesis
    sequence_max_len : a hypothesis is closed once it reaches this many
                       tokens (the <start> token included)

    Returns the words (from '<start>' on) of the finished hypothesis with the
    highest length-normalised log-probability.
    """
    start_token = word_to_index('<start>')
    end_token = word_to_index('<end>')

    # A running beam is (cumulative log-prob, token sequence, hidden state).
    # The hidden state travels with its hypothesis: previously the states were
    # stored by beam slot and were not reassigned when a hypothesis moved to a
    # different slot, so it could be extended with another beam's state.
    # All beams would start out identical, so only one seed beam is expanded
    # on the first step.
    k_beams_running = [(0, [start_token], tf.zeros((1, units)))]

    # completed sentences as (cumulative log-prob, token sequence)
    k_beams_ended = []

    # `and k_beams_running` guards against spinning forever when fewer
    # candidates than beam_size exist (top_k < beam_size)
    while beam_size > 0 and k_beams_running:
        # all the possible sentences obtained by appending one of the top_k
        # next tokens to each running sentence
        candidates = []
        for logprob, tokens, hidden in k_beams_running:
            new_hidden, probs = predict(tf.expand_dims([tokens[-1]], 0), features, hidden)
            predicted = tf.math.top_k(tf.math.log(probs), k=top_k)
            for token, token_logprob in zip(predicted.indices.numpy(), predicted.values.numpy()):
                candidates.append((logprob + token_logprob, tokens + [token], new_hidden))

        # best candidates first; the sort is stable, so ties keep their
        # generation order
        candidates.sort(key=itemgetter(0), reverse=True)

        k_beams_running = []
        for candidate in candidates[:beam_size]:
            logprob, tokens, _ = candidate
            # a sentence is finished on <end> or when it reaches the max length
            if tokens[-1] == end_token or len(tokens) >= sequence_max_len:
                k_beams_ended.append((logprob, tokens))
                beam_size -= 1
            else:
                k_beams_running.append(candidate)

    # normalize by length so that longer sentences are not penalised
    normalize_probs = [logprob / len(tokens) for logprob, tokens in k_beams_ended]
    # extract sentence with maximum normalised log-probability
    tokenized_sen = k_beams_ended[np.argmax(normalize_probs)][1]
    return [index_to_word(tokenized_word) for tokenized_word in tokenized_sen]

In [None]:
def beam_search_evaluate(image_features):
    """Encode the image features and decode a caption with beam search.

    `image_features` is passed through the notebook-level `encoder`; the
    encoded result is handed to `beam_search`, whose return value is
    returned unchanged.
    """
    encoded = encoder(image_features)
    caption_tokens = beam_search(encoded)
    return caption_tokens

In [None]:
def evaluate(image_features):
    """Generate a caption for one image by sampling one token per step.

    The image features are encoded once, then the decoder is run for at most
    MAX_SEQ_LEN steps starting from '<start>'. At every step the next token
    is *sampled* with `tf.random.categorical` (so repeated calls can return
    different captions); generation stops as soon as '<end>' is produced.

    Args:
        image_features: input accepted by the notebook-level `encoder`.

    Returns:
        If BASELINE is true: the list of generated words (including the
        final '<end>' when it was produced).
        Otherwise: a tuple `(words, attention_plot)` where `attention_plot`
        has exactly one row per generated word. Both exit paths (early
        '<end>' and length limit) now return the trimmed matrix; previously
        the early-exit path returned all MAX_SEQ_LEN rows.
    """
    attention_plot = np.zeros((MAX_SEQ_LEN, attention_features_shape))
    hidden = tf.zeros((1, units))
    features = encoder(image_features)
    dec_input = tf.expand_dims([word_to_index('<start>')], 0)
    result = []

    for i in range(MAX_SEQ_LEN):
        # The fourth decoder output is not needed here.
        predictions, hidden, attention_weights, _ = decoder(dec_input,
                                                            features,
                                                            hidden)

        if not BASELINE:
            attention_plot[i] = tf.reshape(attention_weights, (-1,)).numpy()

        # Sampling, not argmax: `predictions` is presumably a row of logits.
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        predicted_word = tf.compat.as_text(index_to_word(predicted_id))
        result.append(predicted_word)

        if predicted_word == '<end>':
            break

        # Feed the sampled token back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    if BASELINE:
        return result
    # Keep only the rows that belong to generated words.
    return result, attention_plot[:len(result), :]


In [None]:
def plot_attention(image, result, attention_plot):
    """Show the image once per generated word with its attention overlaid.

    Each word in `result` gets its own subplot, titled with the word. The
    matching row of `attention_plot` is resized to 8x8 and drawn
    semi-transparently over the image.
    """
    figure = plt.figure(figsize=(18, 18))

    n_words = len(result)
    # Square grid of subplots; at least 2x2.
    side = max(int(np.ceil(n_words / 2)), 2)

    for position, word in enumerate(result):
        attention_map = np.resize(attention_plot[position], (8, 8))
        axis = figure.add_subplot(side, side, position + 1)
        axis.set_title(word, fontsize=19)
        shown = axis.imshow(image)
        # Stretch the 8x8 map over the full extent of the image.
        axis.imshow(attention_map, cmap='gray', alpha=0.3, extent=shown.get_extent())

    plt.tight_layout()
    plt.show()

In [None]:
# captions on the validation set

def print_one_output(rid, image, cap_split):
    """Print the reference caption and the generated captions for one image.

    Args:
        rid: index into `cap_split` of the reference caption.
        image: decoded image; it is wrapped in a batch of one and passed
            through `forward_in_inception`.
        cap_split: indexable collection of tokenized captions; token id 0 is
            skipped when rebuilding the text.

    Prints the image and feature shapes, the reference caption, the caption
    produced by `evaluate` and the one produced by `beam_search_evaluate`.
    When BASELINE is false the attention maps are plotted as well.
    """
    print(image.shape)
    batch = np.array([image], dtype=np.float32)
    image_features = forward_in_inception(batch)
    print(image_features.shape)

    words = []
    for token_id in cap_split[rid]:
        if token_id in [0]:
            continue
        words.append(tf.compat.as_text(index_to_word(token_id)))
    real_caption = ' '.join(words)

    generated = evaluate(image_features)
    if BASELINE:
        result, attention_plot = generated, None
    else:
        result, attention_plot = generated

    beam_words = beam_search_evaluate(image_features)

    print()
    # [8:-6] cuts the leading '<start> ' and the trailing ' <end>'.
    print('Real Caption::\t\t', real_caption[8:-6])
    print('Prediction Caption::\t', ' '.join(result[:-1]))
    print('Beam search output::\t', ' '.join(beam_words[1:-1]))

    if not BASELINE:
        plot_attention(image, result, attention_plot)

    print()
    
# Pick one validation sample at random and show its reference and generated captions.
rid = np.random.randint(len(img_name_val))        
# dataset['input_image'] is indexed by the image name/id and decoded as JPEG.
image = tf.io.decode_jpeg(dataset['input_image'][img_name_val[rid]])

print_one_output(rid, image, cap_val)

 ## Error Analysis


In [None]:
# BEST, WORST, AND AVERAGE CAPTIONS based on BLEU scores (first 100 test samples)

bleu_argmax = []
bleu_beam = []

y_argmax = []
y_beam = []
y_true = []

# Test-set index of every sample that was scored successfully. The score lists
# only grow on success, so a position in `bleu_argmax` is NOT a test-set index
# once any sample has been skipped; this list maps one to the other.
scored_ids = []

# Build the smoothing function once instead of once per sentence_bleu call.
smoothing = SmoothingFunction().method4

for el_id in tqdm(range(min(len(img_name_test), 100))):
    image = tf.io.decode_jpeg(dataset['input_image'][img_name_test[el_id]])
    image_features = forward_in_inception(np.array([image], dtype=np.float32))

    real_caption = ' '.join([tf.compat.as_text(index_to_word(j)) for j in cap_test[el_id] if j not in [0]])
    real_caption = re.sub(r'(<start> )|( <end>)', '', real_caption)
    beam = ' '.join(beam_search_evaluate(image_features)[1:-1])
    if BASELINE:
        argmax = ' '.join(evaluate(image_features)[:-1])
    else:
        argmax = ' '.join(evaluate(image_features)[0][:-1])

    try:
        # sentence_bleu expects token lists; passing plain strings makes it
        # iterate over characters and yields a character-level score.
        reference_tokens = real_caption.split()
        beam_score = sentence_bleu([reference_tokens], beam.split(), smoothing_function=smoothing)
        argmax_score = sentence_bleu([reference_tokens], argmax.split(), smoothing_function=smoothing)
    except Exception as e:
        # Best effort: report the failing sample and keep going.
        print(e)
        print(real_caption)
        continue

    # Append only after both scores exist so all lists stay aligned.
    bleu_beam.append(beam_score)
    bleu_argmax.append(argmax_score)
    y_argmax.append(argmax)
    y_beam.append(beam)
    y_true.append(real_caption)
    scored_ids.append(el_id)


# NOTE: print_one_output calls evaluate() again, and evaluate() samples its
# tokens, so the caption shown below can differ from the one that was scored.

# BEST
print(f"Max Bleu {max(bleu_argmax)}")
index = scored_ids[np.argmax(bleu_argmax)]
image = tf.io.decode_jpeg(dataset['input_image'][img_name_test[index]])
print_one_output(index, image, cap_test)

# WORST
print(f"Min Bleu {min(bleu_argmax)}")
index = scored_ids[np.argmin(bleu_argmax)]
image = tf.io.decode_jpeg(dataset['input_image'][img_name_test[index]])
print_one_output(index, image, cap_test)

# AVERAGE (sample whose score is closest to the mean)
average = np.mean(bleu_argmax)
print(f"Average Bleu {average}")
index = scored_ids[np.argmin([np.abs(average - s) for s in bleu_argmax])]
image = tf.io.decode_jpeg(dataset['input_image'][img_name_test[index]])
print_one_output(index, image, cap_test)

In [None]:
def print_element_from_caption(caption):
    """Show every dataset image whose cleaned caption equals `caption`.

    The caption is wrapped in '<start> ' / ' <end>' and compared for exact
    equality against the entries of `cleaned_captions_maxthr`. The number of
    matches is printed; for each match the reference caption, a caption
    generated by `evaluate` and the image itself are displayed. If nothing
    matches, a notice is printed instead.
    """
    caption = re.sub(r'^', '<start> ', caption)
    caption = re.sub(r'$', ' <end>', caption)

    matches = [pos for pos, candidate in enumerate(cleaned_captions_maxthr) if candidate == caption]
    print(len(matches))

    if not matches:
        print("Caption inserted is NOT present in the dataset")
        return

    # Same text for every match, so strip the markers once.
    real_caption = re.sub(r'(<start> )|( <end>)', '', caption)

    for pos in matches:
        image = tf.io.decode_jpeg(dataset['input_image'][img_indexes_maxthr[pos]])
        image_features = forward_in_inception(np.array([image], dtype=np.float32))

        generated = evaluate(image_features)
        # Outside baseline mode evaluate() also returns the attention matrix.
        tokens = generated if BASELINE else generated[0]
        argmax = ' '.join(tokens[:-1])

        print('Real Caption::\t\t', real_caption)
        print('Prediction Caption::\t', argmax)

        plt.figure(figsize=(6,6))
        plt.imshow(image.numpy())
        plt.show()
    
# Example lookup: display the image(s) whose cleaned caption matches this sentence exactly.
print_element_from_caption("the lady is wearing a western multicolor long sleeved dress")

## Evaluation with BLEU, CHRF, METEOR and BERTScore

In [None]:
import nltk
# WordNet data is downloaded for the METEOR computation below.
nltk.download('wordnet')
nltk.download('omw-1.4')

y_pred = []
y_pred_nobeam = []
y_true = []

chrf_scores = []
bleu_scores = []
meteor_scores = []
chrf_scores_nobeam = []
bleu_scores_nobeam = []
meteor_scores_nobeam = []

# Build the smoothing function once instead of once per sentence_bleu call.
smoothing = SmoothingFunction().method4

for i in tqdm(range(len(img_name_test))):
    decoded_img = np.array([tf.io.decode_jpeg(dataset['input_image'][img_name_test[i]])], dtype=np.float32)
    image = forward_in_inception(decoded_img)
    real_caption = ' '.join([tf.compat.as_text(index_to_word(w)) for w in cap_test[i] if w not in [0]])
    real_caption = re.sub(r'(<start> )|( <end>)', '', real_caption)
    output = ' '.join(beam_search_evaluate(image)[1:-1])
    if BASELINE:
        output_nobeam = ' '.join(evaluate(image)[:-1])
    else:
        output_nobeam = ' '.join(evaluate(image)[0][:-1])

    try:
        # BLEU and METEOR work on token lists; passing plain strings to
        # sentence_bleu would score characters instead of words.
        reference_tokens = real_caption.split()
        output_tokens = output.split()
        output_nobeam_tokens = output_nobeam.split()

        chrf_b = sentence_chrf(real_caption, output)
        bleu_b = sentence_bleu([reference_tokens], output_tokens, smoothing_function=smoothing)
        meteor_b = meteor_score([reference_tokens], output_tokens)

        chrf_nb = sentence_chrf(real_caption, output_nobeam)
        bleu_nb = sentence_bleu([reference_tokens], output_nobeam_tokens, smoothing_function=smoothing)
        meteor_nb = meteor_score([reference_tokens], output_nobeam_tokens)
    except Exception as e:
        # Best effort: report the failing sample and skip it entirely.
        print(e)
        print(real_caption)
        continue

    # Append only once every metric succeeded, so that all score lists and
    # the y_* lists keep the same length and ordering.
    chrf_scores.append(chrf_b)
    bleu_scores.append(bleu_b)
    meteor_scores.append(meteor_b)

    chrf_scores_nobeam.append(chrf_nb)
    bleu_scores_nobeam.append(bleu_nb)
    meteor_scores_nobeam.append(meteor_nb)

    y_pred.append(output)
    y_pred_nobeam.append(output_nobeam)
    y_true.append(real_caption)

print("BEAM:")
print(y_pred[1:5])

print("NO BEAM:")
print(y_pred_nobeam[1:5])

print("TRUE:")
print(y_true[1:5])

print()
print()

In [None]:
# BERTScore is computed in the next cell and may not fit in GPU memory while
# the captioning models are still loaded.
# Resetting the CUDA device through numba is the only concise way found so far
# to release NVIDIA GPU memory held by TensorFlow models.
# NOTE(review): after the reset, TensorFlow in this process presumably cannot
# use the GPU again without a runtime restart - confirm before re-running
# earlier cells.

device = cuda.get_current_device()
device.reset()
# Drop the notebook's references to the models; cells that call `encoder` or
# `decoder` cannot be run again after this point.
del encoder
del decoder

In [None]:
def _mean_score(scores):
    """Return the arithmetic mean of a list of per-sample scores."""
    return sum(scores) / len(scores)


# Corpus-level averages for the beam-search captions.
bleu_scr = _mean_score(bleu_scores)
chrf_scr = _mean_score(chrf_scores)
meteor_scr = _mean_score(meteor_scores)
# bert_score returns three values, unpacked here as precision, recall and F1.
bert_scr_prec, bert_scr_recall, bert_scr_f1 = bert_score(
    y_pred, y_true, lang="en", verbose=False
)

# Same averages for the captions generated without beam search.
bleu_scr_nobeam = _mean_score(bleu_scores_nobeam)
chrf_scr_nobeam = _mean_score(chrf_scores_nobeam)
meteor_scr_nobeam = _mean_score(meteor_scores_nobeam)
bert_scr_prec_nobeam, bert_scr_recall_nobeam, bert_scr_f1_nobeam = bert_score(
    y_pred_nobeam, y_true, lang="en", verbose=False
)

print()
beam_summary = (
    f"BLEU beam: {bleu_scr:.3F} "
    f"CHRF beam: {chrf_scr:.3F} "
    f"METEOR beam: {meteor_scr:.3F} "
    f"BERTscore beam: {bert_scr_f1.mean():.3F} "
)
print(beam_summary)
nobeam_summary = (
    f"BLEU: {bleu_scr_nobeam:.3F} "
    f"CHRF: {chrf_scr_nobeam:.3F} "
    f"METEOR: {meteor_scr_nobeam:.3F} "
    f"BERTscore: {bert_scr_f1_nobeam.mean():.3F}"
)
print(nobeam_summary)

In [None]:
def plot_scores(labels: list, values: list, title: str):
    """Draw a horizontal bar chart of metric scores, save it and show it.

    Args:
        labels: one tick label per bar.
        values: one numeric score per label; each is rounded to 3 decimals
            before plotting.
        title: figure title, also used as the file name of the saved image
            (``plots/<title>.png``).

    The ``plots`` directory is created when it does not exist yet, so the
    save no longer fails on a fresh checkout or a new Colab drive folder.
    """
    x = np.array(labels)
    y = np.array([round(val, 3) for val in values])

    fig = plt.figure(figsize=(12, 5))
    ax = fig.add_subplot(111)
    yvals = range(len(y))

    ax.barh(yvals, y, align='center', alpha=0.4, color=['red', 'blue', 'yellow', 'green'])
    plt.yticks(yvals, x)
    plt.xlim([0, 1.1])
    # Thin vertical reference line at x = 1.
    plt.axvline(x = 1, color='black', label='axvline - full height', lw=0.3)
    plt.title(title)
    plt.tight_layout()
    # Annotate every bar with its value. Rotation is given as a number: the
    # string form '60' is rejected by newer matplotlib releases.
    for container in ax.containers:
        ax.bar_label(container, fontsize=10, rotation=60)
    # savefig does not create missing directories.
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/{title}.png')
    plt.show()

In [None]:
# Bar chart of the averaged metrics for the beam-search captions
# (the BERT bar is the mean F1).
labels = ["BLEU", "CHRF", "METEOR", "BERT"]
values = [
    bleu_scr,
    chrf_scr,
    meteor_scr,
    bert_scr_f1.mean().item(),
]

plot_scores(
    labels,
    values,
    f"Scores with BEAM search on {DATASET_NAME} {EPOCHS}epochs "
    f"{'LSTM' if LSTM else 'GRU'} {EMBEDDING_SIZE}emb_size "
    f"{'Baseline' if BASELINE else ''}",
)

In [None]:
# Bar chart of the averaged metrics for the captions generated without beam
# search. The BERT bar uses the mean F1, matching the beam-search chart and
# the printed summary (this cell previously plotted the precision instead).
values = [bleu_scr_nobeam, chrf_scr_nobeam, meteor_scr_nobeam, bert_scr_f1_nobeam.mean().item()]
labels = ["BLEU", "CHRF", "METEOR", "BERT"]

plot_scores(labels, values, f"Scores with ARGMAX generation on {DATASET_NAME} {EPOCHS}epochs "
                            f"{'LSTM' if LSTM else 'GRU'} {EMBEDDING_SIZE}emb_size " 
                            f"{'Baseline' if BASELINE else ''}")