## <font color="blue"> Importing Libraries</font>

In [None]:
import os
import json
import torch
import nltk

import numpy as np
import pandas as pd
import torchvision.transforms as transforms
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

from torch import nn
from collections import Counter
from PIL import Image
from string import punctuation
from tqdm import tqdm
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from keras.utils import pad_sequences
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from torchvision.models import resnet101, ResNet101_Weights
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

pd.set_option('display.max_colwidth', None)

In [None]:
# Download the GloVe 6B archive, unpack it in the current directory and list the result.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

## <font color="blue"> Reading the captions file and image directory</font>

In [None]:
# Directory holding the images and the CSV that maps each image file to its captions.
image_path = "/kaggle/input/flickr8k/Images/"
df = pd.read_csv("/kaggle/input/flickr8k/captions.txt")
# Number of distinct image file names referenced in the captions table.
df["image"].nunique()

## <font color="blue"> Load pretrained GloVe embeddings</font>

In [None]:
# Parse the GloVe text file into two parallel lists: `vocab[i]` is a token and
# `embeddings[i]` is its vector as a list of floats.
vocab, embeddings = [], []
# Decode explicitly as UTF-8 rather than relying on the platform default encoding.
with open('/kaggle/working/glove.6B.100d.txt', 'rt', encoding='utf-8') as fi:
    full_content = fi.read().strip().split('\n')
print(full_content[0])
for line in full_content:
    # Each line is "<token> <v1> <v2> ..."; split it once and reuse the pieces.
    i_word, *raw_values = line.split(' ')
    i_embeddings = [float(val) for val in raw_values]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [None]:
# Prepend the four special tokens to the GloVe vocabulary (rows 0-3, in this order)
# and build matching embedding rows for them.
vocab_npa = np.insert(np.array(vocab), 0, ['<pad>', '<unk>', '<start>', '<end>'])
embs_npa = np.array(embeddings)
emb_dim = embs_npa.shape[1]

pad_emb_npa = np.zeros((1, emb_dim))                 # padding: all zeros
start_emb_npa = np.random.rand(1, emb_dim)           # <start>: random vector
end_emb_npa = np.random.rand(1, emb_dim)             # <end>: random vector
unk_emb_npa = embs_npa.mean(axis=0, keepdims=True)   # <unk>: mean of all GloVe vectors

# Row order must mirror the token order inserted into `vocab_npa` above.
embs_npa = np.concatenate(
    (pad_emb_npa, unk_emb_npa, start_emb_npa, end_emb_npa, embs_npa), axis=0)

## <font color="blue"> Check sample images</font>

In [None]:
def show_image_and_its_caption(index):
    """Plot the image at positional row `index` of `df` and print its caption."""
    record = df.iloc[[index]]
    image_name = record["image"].values[0]
    caption = record["caption"].values[0]
    img = mpimg.imread(os.path.join(image_path, image_name))
    plt.imshow(img)
    print(caption)

In [None]:
# Visual sanity check on one arbitrary row of the captions table.
show_image_and_its_caption(98)

## <font color="blue"> Preprocess captions</font>

In [None]:
# Tokenise every caption, keep only lower-cased alphabetic tokens, drop captions
# longer than `max_caption_len` tokens, and count word frequencies over the
# captions that are kept.
unique_words = Counter()
max_caption_len = 30
token_lists = []
for caption in tqdm(df["caption"], total=df.shape[0]):
    # `isalpha()` already rejects punctuation-only tokens, so no extra check is needed.
    words = [word.lower() for word in nltk.word_tokenize(caption) if word.isalpha()]
    token_lists.append(words)
    if len(words) <= max_caption_len:
        unique_words.update(words)

df["tokens"] = pd.Series(token_lists, index=df.index, dtype=object)
# Filter once at the end instead of copying the frame on every dropped row;
# the original index labels of the surviving rows are preserved.
df = df[df["tokens"].map(len) <= max_caption_len].copy()

## <font color="blue"> Keep the more frequent words</font>

In [None]:
min_word_freq = 20
# Vocabulary = words seen strictly more than `min_word_freq` times.
words = [w for w, freq in unique_words.items() if freq > min_word_freq]
# Regular words get ids 1..len(words); the special tokens follow, and 0 is padding.
word_map = {w: idx for idx, w in enumerate(words, start=1)}
for special in ("<unk>", "<start>", "<end>"):
    word_map[special] = len(word_map) + 1
word_map["<pad>"] = 0
len(unique_words), len(words), len(word_map)

In [None]:
# Build the embedding matrix for the caption vocabulary: row `i` of
# `embeddings_focus` is the GloVe vector of the word whose id in `word_map` is `i`.
sorted_word_map = dict(sorted(word_map.items(), key=lambda item: item[1]))

# Index the GloVe vocabulary once (first occurrence wins) so each lookup is O(1)
# instead of a scan over the whole vocabulary array.
glove_row = {}
for row, token in enumerate(vocab_npa.tolist()):
    glove_row.setdefault(token, row)
unk_row = glove_row["<unk>"]

# Caption words that GloVe does not contain fall back to the <unk> vector
# rather than raising IndexError.
embeddings_focus = np.array(
    [embs_npa[glove_row.get(word, unk_row)] for word in sorted_word_map]
)

In [None]:
# Inverse lookup: token id -> word, used to turn predicted ids back into text.
index_to_word = {idx: word for word, idx in sorted_word_map.items()}

## <font color="blue"> Encode Captions</font>

In [None]:
# Turn each token list into a fixed-length id sequence:
#   <start> w1 ... wn <end> <pad> ... <pad>      (max_caption_len + 2 ids in total)
# and record the unpadded length (tokens plus <start> and <end>).
df["caption_encoded"] = None
df["caption_length"] = None
start_id = word_map["<start>"]
end_id = word_map["<end>"]
pad_id = word_map["<pad>"]
unk_id = word_map["<unk>"]
for index, tokens in tqdm(df["tokens"].items(), total=df.shape[0]):
    body = [word_map.get(token, unk_id) for token in tokens]
    padding = [pad_id] * (max_caption_len - len(tokens))
    df.at[index, "caption_encoded"] = [start_id, *body, end_id, *padding]
    df.at[index, "caption_length"] = len(tokens) + 2

In [None]:
# Keep exactly 4 randomly sampled captions per image. The lambda returns None for
# images with fewer than 4 captions, which is expected to leave them out of the result.
# NOTE(review): no random_state is passed, so the sample differs between runs.
df_ = df.groupby("image").apply(lambda x: x.sample(n=4) if x.shape[0] >= 4 else None).reset_index(drop=True)

## <font color="blue"> Encoder</font>

In [None]:
class ImageEncoder(nn.Module):
    """ResNet-101 backbone that turns an image batch into a grid of feature vectors.

    The last two children of the torchvision ResNet-101 are removed and the
    remaining feature map is average-pooled to
    `encoded_image_size x encoded_image_size`.
    """

    def __init__(self, encoded_image_size=14):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set that flag resolved to, so the loaded weights are unchanged.
        # The full model stays registered as `self.resnet101`, so state_dict keys
        # are the same as before.
        self.resnet101 = resnet101(weights=ResNet101_Weights.IMAGENET1K_V1)
        modules = list(self.resnet101.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        self.adaptive_pooling = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))
        self.fine_tune()

    def forward(self, image):
        """Return features with the channel axis moved last: (batch, size, size, channels)."""
        x = self.resnet(image)
        x = self.adaptive_pooling(x)
        x = x.permute(0, 2, 3, 1)
        return x

    def fine_tune(self, fine_tune=True):
        """Freeze the whole backbone, then set `requires_grad=fine_tune` on children 5 onwards."""
        for p in self.resnet.parameters():
            p.requires_grad = False
        for layer in list(self.resnet.children())[5:]:
            for p in layer.parameters():
                p.requires_grad = fine_tune

In [None]:
# Stand-alone encoder instance used only for the shape check below.
ie = ImageEncoder(14)

In [None]:
# Build a single-image batch by hand to check the encoder's input/output shapes.
# The context manager closes the file; convert("RGB") guarantees 3 channels even
# for grayscale or RGBA files.
with Image.open("/kaggle/input/flickr8k/Images/1000268201_693b08cb0e.jpg") as source:
    img = source.convert("RGB")
new_image = img.resize((224, 224))
np_image = np.array(new_image).astype('float32')/255
np_image = np.expand_dims(np_image, axis=0)
# NOTE(review): transpose(1, 3) maps (N, H, W, C) to (N, C, W, H), i.e. it also swaps
# height and width. Kept as-is because the rest of the notebook prepares images the
# same way; permute(0, 3, 1, 2) would keep the orientation.
tensor = torch.transpose(torch.from_numpy(np_image), 1, 3)
tensor.size()

In [None]:
# Run the sample tensor through the encoder on the available device and show the
# output size (the encoder permutes its output so that channels come last).
device = "cuda" if torch.cuda.is_available() else "cpu"
ie.to(device)
out = ie(tensor.to(device))
out.size()

## <font color="blue"> Attention module over the encoded image and the previous decoder state</font>

In [None]:
class Attention(nn.Module):
    """Additive attention over image locations, conditioned on the decoder state.

    Args:
        encoder_dim: feature size of each encoder location.
        decoder_dim: size of the decoder hidden state.
        attention_dim: size of the shared projection space.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        self.encoder_attn = nn.Linear(encoder_dim, attention_dim)
        self.decoder_attn = nn.Linear(decoder_dim, attention_dim)
        self.aggregate = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        # Normalise over the location axis explicitly; the implicit-dim form is
        # deprecated and emits a warning.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_out):
        """Return the attention-weighted encoding and the attention weights.

        Args:
            encoder_out: tensor of shape (batch, num_locations, encoder_dim).
            decoder_out: tensor of shape (batch, decoder_dim).

        Returns:
            (attention_weighted_encoding, alpha) with shapes
            (batch, encoder_dim) and (batch, num_locations).
        """
        encoder_attn = self.encoder_attn(encoder_out)
        decoder_attn = self.decoder_attn(decoder_out)
        # Broadcast the decoder projection over all locations, then score each one.
        attention = self.aggregate(self.relu(encoder_attn + decoder_attn.unsqueeze(1))).squeeze(2)
        alpha = self.softmax(attention)
        attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return attention_weighted_encoding, alpha

## <font color="blue"> Decoder with Attention</font>

In [None]:
class DecoderWithAttention(nn.Module):
    """LSTM caption decoder that attends over the encoder's image features.

    Word embeddings are initialised from the notebook-level `embeddings_focus`
    matrix and are left trainable.

    Args:
        decoder_dim: size of the LSTM hidden state.
        attention_dim: size of the attention projection space.
        embed_dim: word-embedding size (must match `embeddings_focus`).
        vocab_size: number of output classes.
        dropout: dropout probability applied before the classification layer.
        encoder_dim: feature size of each encoder location.
    """

    def __init__(self, decoder_dim, attention_dim, embed_dim, vocab_size, dropout=0.5,encoder_dim=2048):
        super().__init__()
        self.encoder_dim = encoder_dim
        self.decoder_dim = decoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.dropout = dropout

        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embeddings_focus).float())
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)
        self.lstmcell = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)
        self.classification = nn.Linear(decoder_dim, vocab_size)
        self.f_gate = nn.Linear(decoder_dim, encoder_dim)
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout_layer = nn.Dropout(self.dropout)
        self.fine_tune_embeddings()

    def init_h_init_c(self, encoder_out):
        """Initial LSTM state: project every location, then average over locations."""
        h = self.init_h(encoder_out).mean(dim=1)
        c = self.init_c(encoder_out).mean(dim=1)
        return h, c

    def fine_tune_embeddings(self):
        """Make the embedding weights trainable."""
        for p in self.embedding.parameters():
            p.requires_grad = True

    def forward(self, encoder_out, encoded_captions, captions_length):
        """Decode a batch of captions step by step, feeding the given caption tokens as inputs.

        Args:
            encoder_out: encoder features with batch first and features last;
                all middle axes are flattened into one location axis.
            encoded_captions: (batch, seq_len) token ids.
            captions_length: (batch, 1) caption lengths.

        Returns:
            predictions: (batch, max_steps, vocab_size) scores, zero where a
                caption has already ended.
            images: (batch, max_steps, num_locations) attention weights.
            sort_ind: permutation that sorted the batch by descending length.
            encoded_captions: the captions in sorted order.
            decoder_sequence_length: list of decoding steps per caption
                (caption length minus one).
        """
        batch_size = encoder_out.size(0)
        # Allocate outputs where the inputs live instead of relying on a global `device`.
        out_device = encoder_out.device
        encoder_out = encoder_out.view(batch_size, -1, encoder_out.size(-1))
        # Sort by descending length so the still-active captions form a prefix of the batch.
        captions_length, sort_ind = captions_length.squeeze(1).sort(dim=0, descending=True)
        encoded_captions = encoded_captions[sort_ind]
        encoder_out = encoder_out[sort_ind]
        embeddings = self.embedding(encoded_captions)
        h, c = self.init_h_init_c(encoder_out)
        # One step fewer than the caption length: the final token is never used as an input.
        decoder_sequence_length = (captions_length - 1).tolist()
        max_steps = max(decoder_sequence_length)
        num_pixels = encoder_out.size(1)
        predictions = torch.zeros(batch_size, max_steps, self.vocab_size, device=out_device)
        images = torch.zeros(batch_size, max_steps, num_pixels, device=out_device)

        for i in range(max_steps):
            # Number of captions that still need a prediction at step i.
            batch_size_i = sum([l > i for l in decoder_sequence_length])
            attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_i], h[:batch_size_i])
            gate = self.sigmoid(self.f_gate(h[:batch_size_i]))
            attention_weighted_encoding = gate * attention_weighted_encoding
            h, c = self.lstmcell(torch.cat([embeddings[:batch_size_i, i, :],attention_weighted_encoding], dim=1),
                                 (h[:batch_size_i], c[:batch_size_i]))
            preds = self.classification(self.dropout_layer(h))
            predictions[:batch_size_i, i, :] = preds
            images[:batch_size_i, i, :] = alpha
        return predictions, images, sort_ind, encoded_captions, decoder_sequence_length
            

## <font color="blue"> DataLoader</font>

In [None]:
class CaptionDataLoader(Dataset):
    """Dataset of (image, encoded caption, caption length) samples.

    Args:
        df: frame with `image`, `caption_encoded` and `caption_length` columns.
        image_path: directory containing the image files.
        dataset: "TRAIN" yields 3-tuples; any other value additionally yields
            all encoded captions that share the sample's image.
        transform: optional callable applied to the image tensor.
    """

    def __init__(self, df, image_path, dataset="TRAIN", transform=None):
        super().__init__()
        self.image_ids = df["image"].to_list()
        self.caption_encoded = df["caption_encoded"].to_list()
        self.caption_lengths = df["caption_length"].to_list()
        self.image_path = image_path
        self.dataset = dataset
        self.transform = transform
        # Group sample positions by image once, so `__getitem__` does not have to
        # rescan every image id for each sample.
        self._indices_by_image = {}
        for position, image_id in enumerate(self.image_ids):
            self._indices_by_image.setdefault(image_id, []).append(position)

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        image_id = self.image_ids[index]
        # Close the file promptly and force 3 channels (grayscale/RGBA files would
        # otherwise produce a tensor with the wrong channel count).
        with Image.open(os.path.join(self.image_path, image_id)) as img:
            new_image = img.convert("RGB").resize((224, 224))
        np_image = np.array(new_image).astype('float32')/255
        np_image = np.expand_dims(np_image, axis=0)
        # NOTE(review): transpose(1, 3) maps (1, H, W, C) to (1, C, W, H), i.e. it also
        # swaps height and width. Left unchanged because inference code elsewhere in
        # the notebook prepares images the same way; change both together if at all.
        image_tensor = torch.transpose(torch.from_numpy(np_image), 1, 3)
        image_tensor = image_tensor.squeeze(0)
        if self.transform is not None:
            image_tensor = self.transform(image_tensor)
        encoded_captions = torch.tensor(self.caption_encoded[index])
        captions_length = torch.tensor(np.array([self.caption_lengths[index]]))
        if self.dataset == "TRAIN":
            return image_tensor, encoded_captions, captions_length
        else:
            # Every encoded caption of this image, in dataset order (references for BLEU).
            all_captions = torch.tensor(
                [self.caption_encoded[j] for j in self._indices_by_image[image_id]])
            return image_tensor, encoded_captions, captions_length, all_captions

## <font color="blue"> Split data into train, validation and test</font>

In [None]:
# Split the image ids (in their existing order) into 60% train, 5% validation
# and the remaining 35% test.
image_ids = df_["image"].unique()
n_images = image_ids.shape[0]
train_end, val_end = int(.6*n_images), int(.65*n_images)
train = image_ids[:train_end]
validate = image_ids[train_end:val_end]
test = image_ids[val_end:]

In [None]:
# Select the caption rows of each split by image id, so that all captions of an
# image end up in the same split.
df_train, df_validate, df_test = (
    df_[df_["image"].isin(split_ids)] for split_ids in (train, validate, test)
)

## <font color="blue"> Check Data Loader</font>

In [None]:
# Per-channel normalisation applied to every image tensor.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
loader_options = dict(batch_size=32, num_workers=4, shuffle=True)

train_loader = CaptionDataLoader(df_train, image_path, transform=transforms.Compose([normalize]))
train_data_loader = DataLoader(train_loader, **loader_options)
valid_loader = CaptionDataLoader(df_validate, image_path, dataset="VAL", transform=transforms.Compose([normalize]))
valid_data_loader = DataLoader(valid_loader, **loader_options)

# Print the tensor sizes of the first validation batch only.
for image, caption, length, all_caps in valid_data_loader:
    for batch_part in (image, caption, length, all_caps):
        print(batch_part.size())
    break

In [None]:
class AverageMeter:
    """Running average of a metric accumulated over several updates.

    `update(val, n)` adds `val` to the running sum and `n` to the running count,
    so `val` is expected to be a total over `n` items and `avg` is `sum / count`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` (a total over `n` items) and refresh the running average."""
        # Remember the most recent value; it was previously left at 0 forever.
        self.val = val
        self.sum += val
        self.count += n
        # Guard against division by zero when only empty batches have been seen.
        self.avg = self.sum / self.count if self.count else 0

def accuracy(scores, targets, k):
    """Return how many targets appear among the `k` highest-scoring classes.

    The result is a count (as a Python float), not a fraction.
    """
    top_indices = scores.topk(k, dim=1, largest=True, sorted=True).indices
    hits = top_indices == targets.view(-1, 1).expand_as(top_indices)
    return hits.float().sum().item()
def clip_gradient(optimizer, grad_clip):
    """Clamp, in place, every gradient held by `optimizer` to [-grad_clip, grad_clip]."""
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is None:
                # Parameters that received no gradient are left alone.
                continue
            param.grad.data.clamp_(min=-grad_clip, max=grad_clip)

## <font color="blue"> Training the model</font>

In [None]:
# ---- Hyper-parameters -------------------------------------------------------
embed_dim = 100
attention_dim = 512
decoder_dim = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
start_epoch = 0
num_epochs = 8
encoder_lr = 1e-4
decoder_lr = 5e-4
fine_tune_encoder = True
grad_clip = 5

# ---- Models and optimisers --------------------------------------------------
decoder = DecoderWithAttention(
    decoder_dim=decoder_dim, embed_dim=embed_dim, vocab_size=len(word_map), attention_dim=attention_dim)

decoder_optim = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr)

encoder = ImageEncoder()
# Honour the flag instead of always enabling fine-tuning.
encoder.fine_tune(fine_tune_encoder)
encoder_optim = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr)

decoder.to(device)
encoder.to(device)
# `.to(device)` instead of `.cuda()` so the cell also runs on a CPU-only machine.
criterion = torch.nn.CrossEntropyLoss().to(device)
best_loss = np.inf

for i in range(num_epochs):
    # ---- Training ------------------------------------------------------------
    encoder.train()
    decoder.train()
    train_losses = AverageMeter()
    train_top5accs = AverageMeter()
    for image, captions, caption_length in tqdm(train_data_loader, total=len(train_data_loader)):
        image = image.to(device)
        captions = captions.to(device)
        caption_length = caption_length.to(device)

        encoder_out = encoder(image)
        predictions, images, sort_ind, encoded_captions, decoder_sequence_length = decoder(encoder_out, captions, caption_length)

        # The target at each step is the next token, so drop the first one.
        targets = encoded_captions[:, 1:]

        # Packing removes the padded positions so that they do not contribute to the loss.
        scores = pack_padded_sequence(predictions, decoder_sequence_length, batch_first=True).data
        targets = pack_padded_sequence(targets, decoder_sequence_length, batch_first=True).data

        loss = criterion(scores, targets)
        n_tokens = sum(decoder_sequence_length)
        # The meter divides its running sum by the token count, so feed it the summed
        # loss; `.item()` stops the meter from keeping every batch's autograd graph alive.
        train_losses.update(loss.item() * n_tokens, n_tokens)
        top5 = accuracy(scores, targets, 5)
        train_top5accs.update(top5, n_tokens)

        decoder_optim.zero_grad()
        encoder_optim.zero_grad()
        loss.backward()
        clip_gradient(decoder_optim, grad_clip)
        clip_gradient(encoder_optim, grad_clip)
        decoder_optim.step()
        encoder_optim.step()

    # ---- Validation ----------------------------------------------------------
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        losses = AverageMeter()
        top5accs = AverageMeter()
        for image, captions, caption_length, all_captions in tqdm(valid_data_loader, total=len(valid_data_loader)):
            image = image.to(device)
            captions = captions.to(device)
            caption_length = caption_length.to(device)
            encoder_out = encoder(image)
            predictions, images, sort_ind, encoded_captions, decoder_sequence_length = decoder(encoder_out, captions, caption_length)
            targets = encoded_captions[:, 1:]
            scores = pack_padded_sequence(predictions, decoder_sequence_length, batch_first=True).data
            targets = pack_padded_sequence(targets, decoder_sequence_length, batch_first=True).data
            loss = criterion(scores, targets)
            n_tokens = sum(decoder_sequence_length)
            losses.update(loss.item() * n_tokens, n_tokens)
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, n_tokens)

        # Compare the epoch's average validation loss (not the meter's `val` field).
        if losses.avg < best_loss:
            best_loss = losses.avg
            # state_dict() hands out references to the live weights; clone them so the
            # snapshot is not overwritten by later epochs.
            state_dict_encoder = {name: t.detach().clone() for name, t in encoder.state_dict().items()}
            state_dict_decoder = {name: t.detach().clone() for name, t in decoder.state_dict().items()}

    print(f"epoch {i+1}, Train loss = {round(float(train_losses.avg), 4)}, Train Top5 Accuracy = {round(float(train_top5accs.avg), 4)}, Valid loss = {round(float(losses.avg), 4)}, Valid top5 accuracy = {round(float(top5accs.avg), 4)}")

## <font color="blue"> Evaluation on test set</font>

In [None]:
# Persist the encoder/decoder state dicts captured during training.
torch.save(state_dict_encoder, "/kaggle/working/encoder.pth")
torch.save(state_dict_decoder, "/kaggle/working/decoder.pth")

In [None]:
# Load the captured state dicts back into the in-memory models and make sure
# both models are on the active device.
encoder.load_state_dict(state_dict_encoder)
decoder.load_state_dict(state_dict_decoder)
encoder.to(device)
decoder.to(device)

In [None]:
# Apply the same normalisation as the training and validation loaders; without it
# the encoder is evaluated on differently scaled inputs than it was trained on.
test_loader = CaptionDataLoader(
    df_test, dataset="TEST", image_path=image_path, transform=transforms.Compose([normalize]))
# Batch size 1: the beam search below decodes one image at a time.
test_data_loader = DataLoader(test_loader, batch_size=1, num_workers=1)

In [None]:
# Beam-search decoding of every test sample, scored with sentence-level BLEU
# against all captions of the same image.
beam_size = 5
bleu_score = []
special_ids = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
vocab_size = len(word_map)

encoder.eval()
decoder.eval()
# Inference only: do not build autograd graphs for every decoding step.
with torch.no_grad():
    for image, caps, caplens, allcaps in tqdm(test_data_loader, total=len(test_data_loader)):
        k = beam_size
        image = image.to(device)
        encoder_out = encoder(image)
        feature_size = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, feature_size)
        num_pixels = encoder_out.size(1)
        # Treat the beam as a batch of k copies of the same image.
        encoder_out = encoder_out.expand(k, num_pixels, feature_size)
        k_prev_words = torch.LongTensor([[word_map["<start>"]]] * k).to(device)
        seqs = k_prev_words
        topk_scores = torch.zeros(k, 1).to(device)
        complete_sequence = list()
        complete_sequence_scores = list()
        h, c = decoder.init_h_init_c(encoder_out)
        step = 1
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)
            attention_encoding, alpha = decoder.attention(encoder_out, h)
            gate = decoder.sigmoid(decoder.f_gate(h))
            attention_encoding = gate * attention_encoding
            h, c = decoder.lstmcell(torch.cat([embeddings, attention_encoding], dim=1), (h, c))
            output = decoder.classification(h)
            scores = nn.LogSoftmax(dim=1)(output)
            # Accumulate log-probabilities along each beam.
            scores = topk_scores.expand_as(scores) + scores
            if step == 1:
                # All k beams are identical at the first step; expand only one of them.
                topk_scores, topk_words_ind = scores[0].topk(k, 0, True, True)
            else:
                topk_scores, topk_words_ind = scores.view(-1).topk(k, 0, True, True)
            # Flat index -> (beam it came from, word it adds).
            prev_word_inds = topk_words_ind // vocab_size
            next_word_inds = topk_words_ind % vocab_size
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)

            next_words = next_word_inds.tolist()
            incomplete_ind = [ind for ind, word in enumerate(next_words) if word != word_map["<end>"]]
            complete_inds = [ind for ind, word in enumerate(next_words) if word == word_map["<end>"]]
            if len(complete_inds) > 0:
                k -= len(complete_inds)
                complete_sequence.extend(seqs[complete_inds].tolist())
                complete_sequence_scores.extend(topk_scores[complete_inds].tolist())
            if k == 0:
                # Every beam has produced <end>; nothing is left to expand.
                break

            seqs = seqs[incomplete_ind]
            h = h[prev_word_inds[incomplete_ind]]
            c = c[prev_word_inds[incomplete_ind]]
            encoder_out = encoder_out[prev_word_inds[incomplete_ind]]
            k_prev_words = next_word_inds[incomplete_ind].unsqueeze(1)
            topk_scores = topk_scores[incomplete_ind].unsqueeze(1)
            if step > 50:
                break
            step += 1

        if complete_sequence:
            best = complete_sequence_scores.index(max(complete_sequence_scores))
            seq = complete_sequence[best]
        else:
            # No beam reached <end> within the step limit: fall back to the
            # best-scoring unfinished sequence instead of failing on an empty list.
            seq = seqs[topk_scores.squeeze(1).argmax()].tolist()

        # References: every caption of this image with the special tokens removed.
        references = [[w for w in cap if w not in special_ids] for cap in allcaps[0].tolist()]
        hypothesis = [w for w in seq if w not in special_ids]
        bleu_score.append(sentence_bleu(references, hypothesis))

In [None]:
# Mean sentence-level BLEU over all decoded test samples.
sum(bleu_score)/len(bleu_score)

## <font color="blue"> Generate a caption from an image (pass the image path as input)</font>

In [None]:
def generate_caption(image_path):
    """Show the image at `image_path` and print a caption found by beam search.

    Uses the notebook-level `encoder`, `decoder`, `normalize`, `word_map`,
    `index_to_word` and `device`. Both models are switched to eval mode.

    Args:
        image_path: path of the image file to caption.
    """
    beam_size = 3
    k = beam_size
    # Close the file promptly and force 3 channels (grayscale/RGBA inputs).
    with Image.open(image_path) as source:
        img = source.convert("RGB")
    new_image = img.resize((224, 224))
    np_image = np.array(new_image).astype('float32')/255
    np_image = np.expand_dims(np_image, axis=0)
    # Same axis handling as the dataset class so inference matches training.
    image_tensor = torch.transpose(torch.from_numpy(np_image), 1, 3)
    # The training loader normalises its images; do the same here.
    image_tensor = normalize(image_tensor.squeeze(0)).unsqueeze(0).to(device)

    special_ids = {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
    vocab_size = len(word_map)

    encoder.eval()
    decoder.eval()
    # Inference only: do not build autograd graphs for every decoding step.
    with torch.no_grad():
        encoder_out = encoder(image_tensor)
        feature_size = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, feature_size)
        num_pixels = encoder_out.size(1)
        # Treat the beam as a batch of k copies of the same image.
        encoder_out = encoder_out.expand(k, num_pixels, feature_size)
        k_prev_words = torch.LongTensor([[word_map["<start>"]]] * k).to(device)
        seqs = k_prev_words
        topk_scores = torch.zeros(k, 1).to(device)
        complete_sequence = list()
        complete_sequence_scores = list()
        h, c = decoder.init_h_init_c(encoder_out)
        step = 1
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)
            attention_encoding, alpha = decoder.attention(encoder_out, h)
            gate = decoder.sigmoid(decoder.f_gate(h))
            attention_encoding = gate * attention_encoding
            h, c = decoder.lstmcell(torch.cat([embeddings, attention_encoding], dim=1), (h, c))
            output = decoder.classification(h)
            scores = nn.LogSoftmax(dim=1)(output)
            # Accumulate log-probabilities along each beam.
            scores = topk_scores.expand_as(scores) + scores
            if step == 1:
                # All k beams are identical at the first step; expand only one of them.
                topk_scores, topk_words_ind = scores[0].topk(k, 0, True, True)
            else:
                topk_scores, topk_words_ind = scores.view(-1).topk(k, 0, True, True)
            # Flat index -> (beam it came from, word it adds).
            prev_word_inds = topk_words_ind // vocab_size
            next_word_inds = topk_words_ind % vocab_size
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)

            next_words = next_word_inds.tolist()
            incomplete_ind = [ind for ind, word in enumerate(next_words) if word != word_map["<end>"]]
            complete_inds = [ind for ind, word in enumerate(next_words) if word == word_map["<end>"]]
            if len(complete_inds) > 0:
                k -= len(complete_inds)
                complete_sequence.extend(seqs[complete_inds].tolist())
                complete_sequence_scores.extend(topk_scores[complete_inds].tolist())
            if k == 0:
                break

            seqs = seqs[incomplete_ind]
            h = h[prev_word_inds[incomplete_ind]]
            c = c[prev_word_inds[incomplete_ind]]
            encoder_out = encoder_out[prev_word_inds[incomplete_ind]]
            k_prev_words = next_word_inds[incomplete_ind].unsqueeze(1)
            topk_scores = topk_scores[incomplete_ind].unsqueeze(1)
            if step > 50:
                break
            step += 1

    if complete_sequence:
        best = complete_sequence_scores.index(max(complete_sequence_scores))
        seq = complete_sequence[best]
    else:
        # No beam reached <end> within the step limit: fall back to the
        # best-scoring unfinished sequence instead of failing on an empty list.
        seq = seqs[topk_scores.squeeze(1).argmax()].tolist()

    caption = " ".join(index_to_word[index] for index in seq if index not in special_ids)
    plt.imshow(img)
    print(caption)
In [None]:
# Caption one image from the dataset as a demonstration.
generate_caption("/kaggle/input/flickr8k/Images/1020651753_06077ec457.jpg")