# Importing Libraries

In [None]:
import os
import math
import spacy
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from PIL import Image
from torch import nn
from matplotlib import pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision.models import vgg16, VGG16_Weights
from nltk.translate.bleu_score import sentence_bleu

spacy_eng = spacy.load("en_core_web_sm")

In [None]:
!pip install torchsummary
!pip install pycocoevalcap

from torchsummary import summary
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

### Setting the Device

In [None]:
# Choose the compute device once; models and batches below are moved onto it.
_cuda_available = torch.cuda.is_available()

if _cuda_available:
    # Request deterministic cuDNN kernels when running on the GPU.
    torch.backends.cudnn.deterministic = True

device = torch.device("cuda:0") if _cuda_available else torch.device("cpu")

print('Device:', device)

# Preparing the Flickr30k Dataset
### Preparing the Vocabulary

In [None]:
class Vocabulary:
    """Two-way mapping between lower-cased word tokens and integer ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; learned words are numbered from 4 upwards.
    """

    def __init__(self, freq_threshold):
        # A word is admitted at the moment its running count hits this value.
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {token: index for index, token in self.itos.items()}

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.itos)

    def build_vocab(self, sentence_list):
        """Scan ``sentence_list`` and register every sufficiently frequent word.

        Ids are handed out in the order in which words reach the threshold.
        """
        counts = {}
        next_index = 4

        for sentence in sentence_list:
            for word in self.tokenize(str(sentence)):
                seen = counts.get(word, 0) + 1
                counts[word] = seen

                # Only the exact crossing of the threshold registers the word,
                # so each word is added at most once.
                if seen != self.freq_threshold:
                    continue

                self.itos[next_index] = word
                self.stoi[word] = next_index
                next_index += 1

    def numericalize(self, sentence):
        """Return the list of ids for ``sentence``; unknown words map to ``<UNK>``."""
        unknown = self.stoi["<UNK>"]
        return [self.stoi.get(token, unknown) for token in self.tokenize(sentence)]

    @staticmethod
    def tokenize(sentence):
        """Split ``sentence`` with the spaCy tokenizer and lower-case each token."""
        doc = spacy_eng.tokenizer(str(sentence))
        return [piece.text.lower() for piece in doc]

### Defining a Custom Dataset

In [None]:
class Flickr(Dataset):
    """Image/caption dataset read from a ``|``-delimited captions file.

    Each item is ``(image_tensor, caption_tensor)`` where the caption is
    ``<SOS> tokens... <EOS>`` padded with ``<PAD>`` (or truncated) to exactly
    ``max_seq_length`` ids.

    Args:
        root_dir: Directory containing the image files.
        caption_path: CSV file with the columns ``image_name`` and
            `` comment`` (the leading space is part of the column name used
            here).
        transform: Callable applied to the loaded RGB PIL image.
        freq_threshold: Minimum word count for vocabulary membership.
        max_seq_length: Fixed caption length, including ``<SOS>``/``<EOS>``.

    Raises:
        ValueError: If ``max_seq_length`` cannot hold ``<SOS>`` and ``<EOS>``.
    """

    def __init__(self, root_dir, caption_path, transform, freq_threshold=5, max_seq_length=50):
        if max_seq_length < 2:
            raise ValueError("max_seq_length must be at least 2 to hold <SOS> and <EOS>")

        self.freq_threshold = freq_threshold
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.root_dir = root_dir

        self.df = pd.read_csv(caption_path, delimiter='|')

        self.images = self.df['image_name']
        self.captions = self.df[' comment']

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())

    def __len__(self):
        return len(self.df)

    def _encode_caption(self, caption):
        """Return ``caption`` as a list of exactly ``max_seq_length`` ids."""
        stoi = self.vocab.stoi

        # Truncate the words, not the framed sequence, so that an over-long
        # caption still ends with <EOS> instead of losing it.
        words = self.vocab.numericalize(caption)[: max(self.max_seq_length - 2, 0)]

        encoded = [stoi["<SOS>"], *words, stoi["<EOS>"]]
        encoded += [stoi["<PAD>"]] * (self.max_seq_length - len(encoded))

        return encoded

    def __getitem__(self, index):
        """Return the transformed image and the encoded caption for ``index``."""
        image_name, caption = self.images[index], self.captions[index]

        # The context manager releases the file handle once the RGB copy exists.
        with Image.open(os.path.join(self.root_dir, image_name)) as raw_image:
            image = raw_image.convert("RGB")

        image = self.transform(image)

        return image, torch.tensor(self._encode_caption(caption))

    def get_label(self, index):
        """Return the caption at ``index`` as vocabulary words joined by spaces.

        ``<SOS>``, ``<EOS>`` and ``<PAD>`` are removed; out-of-vocabulary words
        appear as ``<UNK>``. The image is not loaded.
        """
        stoi = self.vocab.stoi
        framing = {stoi["<SOS>"], stoi["<EOS>"], stoi["<PAD>"]}

        token_ids = self._encode_caption(self.captions[index])

        return ' '.join(self.vocab.itos[token] for token in token_ids if token not in framing)

    def captions_list(self):
        """Return every raw caption as a plain Python list."""
        return self.captions.tolist()
        

### Defining a Custom Caption Collate Function for Padding

In [None]:
class CapCollat:
    """Collate function turning a list of ``(image, caption)`` samples into batch tensors.

    Args:
        pad_seq: Value used to pad captions to the longest one in the batch.
        batch_first: Forwarded to ``pad_sequence``; controls whether the
            caption batch is laid out batch-major or sequence-major.
    """

    def __init__(self, pad_seq, batch_first=False):
        self.pad_seq = pad_seq
        self.batch_first = batch_first

    def __call__(self, batch):
        """Return ``(images, captions)`` for one batch of samples."""
        # Stacking adds the new leading batch axis in one step.
        image_batch = torch.stack([sample[0] for sample in batch], dim=0)

        caption_batch = pad_sequence(
            [sample[1] for sample in batch],
            batch_first=self.batch_first,
            padding_value=self.pad_seq,
        )

        return image_batch, caption_batch

### Defining Custom Scorer 

In [None]:
class Scorer():
    """Runs the BLEU, METEOR, ROUGE-L and CIDEr scorers over caption dictionaries.

    Args:
        references: Mapping from sample key to the reference captions.
        candidates: Mapping from the same keys to the generated captions.

    Both result methods return a dict of lists keyed ``"Bleu1"``..``"Bleu4"``,
    ``"METEOR"``, ``"ROUGE_L"`` and ``"CIDEr"`` (note: the BLEU keys have no
    underscore, unlike the scorer's own method names).
    """

    def __init__(self, references, candidates):
        self.references = references
        self.candidates = candidates

        bleu_methods = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]

        # (scorer, method) pairs; a list-valued method marks the BLEU scorer,
        # whose score is a sequence of four values.
        self.word_based_scorers = [
            (Bleu(4), bleu_methods),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
        ]

    def _accumulate(self, totals, references, candidates):
        """Run every scorer on the given dictionaries and append to ``totals``."""
        bleu_keys = ("Bleu1", "Bleu2", "Bleu3", "Bleu4")

        for scorer, method in self.word_based_scorers:
            score, _ = scorer.compute_score(references, candidates)

            if isinstance(method, list):
                for position, key in enumerate(bleu_keys):
                    totals[key].append(score[position])
            else:
                totals[method].append(score)

    def compute_scores(self):
        """Score the whole corpus at once; each list receives one value."""
        totals = {
            name: []
            for name in ("Bleu1", "Bleu2", "Bleu3", "Bleu4", "METEOR", "ROUGE_L", "CIDEr")
        }

        self._accumulate(totals, self.references, self.candidates)

        return totals

    def compute_scores_iterative(self):
        """Score each candidate key on its own; each list receives one value per key.

        The ``"SPICE"`` entry is present in the result but no scorer fills it.
        """
        totals = {
            name: []
            for name in ("Bleu1", "Bleu2", "Bleu3", "Bleu4", "METEOR", "ROUGE_L", "CIDEr", "SPICE")
        }

        for key in self.candidates:
            self._accumulate(
                totals,
                {key: self.references[key]},
                {key: self.candidates[key]},
            )

        return totals

### Loading and Testing the Dataset 

In [None]:
# Locations of the images and of the '|'-delimited captions file.
root_folder = "/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/"
csv_file = "/kaggle/input/flickr-image-dataset/flickr30k_images/results.csv"

# Resize, crop to the 224x224 input the encoder expects, then normalise.
transform = T.Compose([
                T.Resize((256, 256), interpolation=T.InterpolationMode.BILINEAR),
                T.CenterCrop((224, 224)),
                T.ToTensor(),
                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ])

batch_size = 37
num_workers = 2
freq_threshold = 5
batch_first = True
pin_memory = True

dataset = Flickr(root_folder, csv_file, transform, freq_threshold)
all_labels = dataset.captions_list()
pad_idx = dataset.vocab.stoi["<PAD>"]

# Contiguous 90/10 split: the first rows train, the remaining rows validate.
data_size = len(dataset)
train_size = int(0.9 * data_size)
val_size = data_size - train_size

# Dataset-index bounds of the validation rows as a half-open range
# [val_set_start, val_set_end). They must coincide with the Subset below,
# otherwise captions looked up through these bounds belong to the wrong row.
val_set_start = train_size
val_set_end = data_size

train_set = torch.utils.data.Subset(dataset, range(0, train_size))
val_set = torch.utils.data.Subset(dataset, range(val_set_start, val_set_end))

train_loader = DataLoader(train_set,
                            batch_size=batch_size,
                            pin_memory=pin_memory,
                            num_workers=num_workers,
                            shuffle=True,
                            collate_fn=CapCollat(pad_seq=pad_idx, batch_first=batch_first))

# No shuffling: validation batches arrive in dataset order.
val_loader = DataLoader(val_set,
                            batch_size=batch_size,
                            pin_memory=pin_memory,
                            num_workers=num_workers,
                            shuffle=False,
                            collate_fn=CapCollat(pad_seq=pad_idx, batch_first=batch_first))

### Setting Vocabulary

In [None]:
# Shorthand for the vocabulary built by the dataset, and its size
# (special tokens included), which sizes the decoder's embedding table
# and output layer.
vocab = dataset.vocab
vocab_size = len(vocab)

### Testing Dataset

In [None]:
# Per-channel statistics used by T.Normalize in `transform`, shaped to
# broadcast over a (channels, height, width) image tensor.
_norm_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
_norm_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Show every tenth sample of a 100-row window starting at row `val_size`.
for idx in range(0, 100, 10):
    sample_idx = idx + val_size

    image, _ = dataset[sample_idx]

    label = all_labels[sample_idx]

    # Undo the normalisation: imshow clips float RGB data outside [0, 1],
    # so plotting the normalised tensor directly distorts the colours.
    image = (image * _norm_std + _norm_mean).clamp(0, 1)

    # (channels, height, width) -> (height, width, channels) for matplotlib.
    image = image.permute(1, 2, 0)

    plt.imshow(image)
    plt.title(label)

    plt.show()

# Encoder: Pre-Trained VGG16 

In [None]:
class Encoder(nn.Module):
    """Frozen VGG16 feature extractor followed by a trainable linear projection.

    Args:
        embed_size: Length of the feature vector produced per image.
    """

    def __init__(self, embed_size=224):
        super().__init__()

        backbone = vgg16(weights=VGG16_Weights.IMAGENET1K_FEATURES)
        # NOTE(review): confirm that the normalisation applied to the input
        # images matches the preprocessing these particular weights expect.

        # The pretrained weights are never updated during training.
        for weight in backbone.parameters():
            weight.requires_grad = False

        # Keep every child module except the last one.
        trunk = list(backbone.children())[:-1]

        # Maps the flattened 512 x 7 x 7 feature map to the embedding size.
        projection = nn.Linear(512 * 7 * 7, embed_size)

        self.encoder = nn.Sequential(*trunk, nn.Flatten(), projection)

    def forward(self, image):
        """Return the projected feature vector for each image in the batch."""
        return self.encoder(image)

# Decoder: Vanilla RNN

In [None]:
class Decoder(nn.Module):
    """Vanilla RNN language model whose initial hidden state comes from image features.

    Args:
        embed_size: Size of the word embeddings and of the incoming features.
        vocab_size: Number of vocabulary entries (embedding rows and output logits).
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, embed_size, vocab_size, hidden_size, num_layers):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

        # Projects the image features into the RNN's hidden-state space.
        self.features2hidden = nn.Linear(in_features=embed_size, out_features=hidden_size)

    def forward(self, features, captions):
        """Return vocabulary logits for every position of ``captions``."""
        # Every RNN layer starts from the same projected image features.
        start_state = self.features2hidden(features)
        start_state = start_state.unsqueeze(0).repeat(self.rnn.num_layers, 1, 1)

        token_embeddings = self.embedding(captions)
        hidden_sequence, _ = self.rnn(token_embeddings, start_state)

        return self.linear(hidden_sequence)


# Encoder-Decoder: VGG16 + Vanilla RNN

In [None]:
class EncoderDecoder(nn.Module):
    """Image-captioning model: image encoder plus RNN caption decoder.

    Args:
        embed_size: Size of the image feature vector and the word embeddings.
        vocab_size: Number of vocabulary entries.
        hidden_size: Size of the decoder RNN hidden state.
        num_layers: Number of decoder RNN layers.
    """

    def __init__(self, embed_size, vocab_size, hidden_size, num_layers):
        super(EncoderDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.encoder = Encoder(embed_size)
        self.decoder = Decoder(embed_size, vocab_size, hidden_size, num_layers)

    def forward(self, images, captions):
        """Return vocabulary logits for each caption position (teacher forcing)."""
        features = self.encoder(images)

        return self.decoder(features, captions)

    def caption(self, image, vocabulary, maxlength=50):
        """Greedily decode a caption for a single image.

        Decoding mirrors ``Decoder.forward``: the image features initialise
        the RNN hidden state and the first input token is ``<SOS>``.

        Args:
            image: Batch containing exactly one image.
            vocabulary: Object exposing ``stoi`` and ``itos`` mappings.
            maxlength: Maximum number of words to generate.

        Returns:
            The generated words, without the terminating ``<EOS>``.
        """
        decoder = self.decoder
        eos_idx = vocabulary.stoi["<EOS>"]
        token_ids = []

        with torch.no_grad():
            features = self.encoder(image)

            # Same hidden-state initialisation as the training-time forward pass.
            states = decoder.features2hidden(features).unsqueeze(0)
            states = states.repeat(decoder.rnn.num_layers, 1, 1)

            token = torch.tensor([vocabulary.stoi["<SOS>"]], dtype=torch.long, device=features.device)

            for _ in range(maxlength):
                # (1,) -> (1, embed) -> (1, 1, embed): one batch-first time step.
                step_input = decoder.embedding(token).unsqueeze(1)

                hiddens, states = decoder.rnn(step_input, states)
                logits = decoder.linear(hiddens.squeeze(1))

                token = logits.argmax(1)
                token_id = token.item()

                # Stop before recording <EOS> so it never leaks into the text.
                if token_id == eos_idx:
                    break

                token_ids.append(token_id)

        return [vocabulary.itos[token_id] for token_id in token_ids]


# Training and Evaluating

### Setting Hyperparameters

In [None]:
# Number of passes over the training loader.
num_epochs = 4
# NOTE(review): enc_dim is not referenced by any other visible code.
enc_dim = 2048
# Length of the encoder's output vector and of the decoder's word embeddings.
embed_size = 224
# Size of the decoder RNN hidden state.
hidden_size = 512
# Number of stacked RNN layers in the decoder.
num_layers = 1
# Step size handed to the Adam optimizer.
learning_rate = 3e-4

### Configuring Model

In [None]:
# Cross-entropy over vocabulary logits; positions whose target is <PAD>
# do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = vocab.stoi["<PAD>"]).to(device)

# Build the captioning model and move it to the selected device.
model = EncoderDecoder(embed_size, vocab_size, hidden_size, num_layers).to(device)

# All model parameters are handed to Adam; the VGG16 weights were marked
# requires_grad=False inside Encoder.
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
def _print_module_overview(module):
    """Print the module structure followed by its total/trainable parameter counts."""
    total = sum(param.numel() for param in module.parameters())
    trainable = sum(param.numel() for param in module.parameters() if param.requires_grad)

    print(module)
    print(f"Total params: {total:,}")
    print(f"Trainable params: {trainable:,}")


# torchsummary drives a model with random float tensors only. The full model
# and the decoder both require an integer `captions` argument in forward(),
# so they cannot be traced that way; their structure is printed instead.
print("ImageCap Model Summary: VGG16 + Vanilla RNN")
_print_module_overview(model)
print("\n--------------------------------------------------\n")
print("Encoder Model Summary: VGG16")
summary(model.encoder, (3, 224, 224))
print("\n--------------------------------------------------\n")
print("Decoder Model Summary: Vanilla RNN")
_print_module_overview(model.decoder)

### Training The Model

In [None]:
# Per-batch loss histories (one entry per training / validation batch).
train_losses = []
val_losses = []

# Per-epoch corpus-level metric histories.
bleu1_scores = []
bleu2_scores = []
bleu3_scores = []
bleu4_scores = []

cider_scores = []

meteor_scores = []

rougel_scores = []

# Tokens that frame a sequence and must not be scored as caption words.
special_tokens = ("<PAD>", "<SOS>", "<EOS>")

for epoch in range(num_epochs):
    model.train()

    for batch_idx, (images, captions) in enumerate(train_loader):
        images, captions = images.to(device), captions.to(device)

        # Teacher forcing with a one-step shift: the decoder reads tokens
        # 0..T-2 and must predict tokens 1..T-1. Feeding and predicting the
        # same position would only teach it to copy its input.
        # Captions are (batch, seq) because the loaders use batch_first=True.
        decoder_inputs, targets = captions[:, :-1], captions[:, 1:]

        optimizer.zero_grad()

        train_score = model(images, decoder_inputs)

        # reshape (not view): the shifted slices are not contiguous.
        train_loss = criterion(train_score.reshape(-1, vocab_size), targets.reshape(-1))
        train_losses.append(train_loss.item())

        train_loss.backward()

        optimizer.step()

    model.eval()

    references = {}
    candidates = {}

    # val_loader does not shuffle, so its samples arrive in the order of
    # val_set.indices; walking both together gives each image's dataset row.
    val_indices = iter(val_set.indices)

    with torch.no_grad():
        # A fresh pass over val_loader every epoch (a single shared iterator
        # would be exhausted after the first epoch).
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)

            decoder_inputs, targets = captions[:, :-1], captions[:, 1:]

            val_score = model(images, decoder_inputs)

            val_loss = criterion(val_score.reshape(-1, vocab_size), targets.reshape(-1))

            val_losses.append(val_loss.item())

            for image in images:
                idx = next(val_indices)

                val_pred = model.caption(image.unsqueeze(0), vocab)

                candidate = ' '.join(word for word in val_pred if word not in special_tokens)

                candidates[idx] = [candidate]

                # str(): a missing caption cell would otherwise be a float.
                references[idx] = [str(all_labels[idx])]

    metrics = Scorer(references, candidates).compute_scores()

    # compute_scores() returns one-element lists keyed "Bleu1".."Bleu4",
    # "METEOR", "ROUGE_L" and "CIDEr"; unwrap them into plain values.
    latest = {name: values[-1] for name, values in metrics.items() if values}

    bleu1_scores.append(latest['Bleu1'])
    bleu2_scores.append(latest['Bleu2'])
    bleu3_scores.append(latest['Bleu3'])
    bleu4_scores.append(latest['Bleu4'])

    # Unweighted mean of BLEU-1..4.
    cumulative_bleu_score = (latest['Bleu1'] + latest['Bleu2'] + latest['Bleu3'] + latest['Bleu4']) / 4

    cider_scores.append(latest['CIDEr'])

    rougel_scores.append(latest['ROUGE_L'])

    meteor_scores.append(latest['METEOR'])

    # The losses reported here are those of the last batch of each phase.
    print(f"Epoch [{epoch + 1}/{num_epochs}] | Training loss: {train_loss.item()} Validation loss: {val_loss.item()} \nCumulative BLEU Score: {cumulative_bleu_score}\tCIDEr Score: {latest['CIDEr']}\tMETEOR Score: {latest['METEOR']}\tROUGE_L: {latest['ROUGE_L']}\n")

    torch.save(model.state_dict(), f'/kaggle/working/ImageCap_{epoch + 1}.pth')


### Plotting Training/Validation Losses and Scores

In [None]:
def _plot_and_save(figure_id, curves, y_label, out_path):
    """Draw each ``(values, legend_label)`` curve on one figure and save it."""
    plt.figure(figure_id)

    for values, legend_label in curves:
        plt.plot(values, label=legend_label)

    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(out_path)


# Figure 0: loss histories.
_plot_and_save(
    0,
    [(train_losses, 'Training loss'), (val_losses, 'Validation loss')],
    'Cross Entropy Loss',
    f'/kaggle/working/{freq_threshold}_{batch_size}_{hidden_size}_{num_epochs}_losses.png',
)

# Figure 1: the four BLEU histories.
_plot_and_save(
    1,
    [
        (bleu1_scores, 'BLEU 1'),
        (bleu2_scores, 'BLEU 2'),
        (bleu3_scores, 'BLEU 3'),
        (bleu4_scores, 'BLEU 4'),
    ],
    'BLEU Scores',
    f'/kaggle/working/{freq_threshold}_{batch_size}_{hidden_size}_{num_epochs}_bleu_scores.png',
)

# Figure 2: the remaining metrics.
_plot_and_save(
    2,
    [(cider_scores, 'CIDEr'), (meteor_scores, 'METEOR'), (rougel_scores, 'ROUGE_L')],
    'Scores',
    f'/kaggle/working/{freq_threshold}_{batch_size}_{hidden_size}_{num_epochs}_scores.png',
)

plt.show()