In [None]:
!pip install tensorboardX
!pip install torchmetrics
!pip install Pillow

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoderLayer, TransformerDecoder
import torchvision.models as models
import torchvision.transforms
from torch.utils.data import DataLoader
from PIL import Image
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Central configuration dictionary for the notebook. Several entries are not
# read by any code visible in this notebook; those are flagged below so they
# are not mistaken for active settings.
config = {
    # Not read by the visible code (train() uses hard-coded Drive paths).
    "dataset_dir": "dataset",
    "glove_dir": "glove_embeddings",
    "image_specs": {
        # Input width of Decoder.entry_mapping_img, i.e. the channel count of
        # the image features fed to the decoder.
        "img_feature_channels": 2048,
        # Not read by the visible code; train() hard-codes Resize((224, 224)).
        "image_size": 224
    },
    "embeddings": {
        # Does not size any layer in the visible code: the embedding layer is
        # created with d_model columns.
        "size": 17
    },
    # Decoder builds its embedding table and classifier with vocab_size + 1
    # entries, which equals the 18 tokens of the `vocabulary` list.
    "vocab_size": 17,

    # NOTE(review): the special-token names and indices below are not read by
    # the visible code. Decoding/evaluation use ids 0 / 13 / 16, which are the
    # positions of <PADDING> / <START> / <END> in `vocabulary`; apart from
    # PAD_idx these values do not match.
    "PAD_token": "<pad>",
    "PAD_idx": 0,

    "START_idx": 1,
    "START_token": "<start>",

    "END_idx": 2,
    "END_token": "<end>",

    "UNK_idx": 3,
    "UNK_token": "<unk>",

    # Fixed token-sequence length: used for padding in the dataset, the
    # positional-encoding table, the causal mask and the decoding step limit.
    "max_len": 256,

    # Not read by the visible code; the device is picked with
    # torch.cuda.is_available() in the run cell.
    "use_gpu": True,
    # Seeds torch and numpy at the start of train().
    "seed": 2021,

    "batch_size": {
        # Mini-batch size of the training DataLoader.
        "train": 128,
        # Number of samples drawn by evaluate() for each BLEU computation.
        "eval": 64
    },

    "model_configuration": {
        "decoder_layers": 6,
        # NOTE(review): with d_model == attention_heads each attention head
        # presumably works on a single feature (16 / 16 = 1) - confirm intended.
        "d_model": 16,
        "ff_dim": 1024,
        "attention_heads": 16,
        "dropout": 0.5
    },

    "train_config": {
        "num_of_epochs": 30,
        # Not read by the visible train().
        "warmup_steps": 2811,
        # Passed to AdamW as weight_decay.
        "l2_penalty": 0.5,
        "learning_rate": 0.0001,
        # Not read by the visible train() (no gradient clipping is applied).
        "gradient_clipping": 2.0,
        # Not read by the visible train() (evaluation runs every epoch).
        "eval_period": 3
    },

    # n-gram weights handed to nltk's corpus_bleu for BLEU-1 .. BLEU-4.
    "bleu_weights": {
        "bleu-1": [1.0],
        "bleu-2": [0.5, 0.5],
        "bleu-3": [0.333, 0.333, 0.333],
        "bleu-4": [0.25, 0.25, 0.25, 0.25]
    },

    # Not read by the visible code.
    "checkpoint": {
        "load": False,
        "checkpoint_path": "."
    }
}

In [None]:
class ResidualBlock(nn.Module):
    """Two linear layers with a LeakyReLU in between, wrapped in a skip connection.

    Both linear layers map ``input_dim`` features to ``input_dim`` features, so
    the transformed tensor can be added back onto the input.

    Arguments:
        input_dim (int): Size of the last dimension of the input tensor
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, input_dim),
            nn.LeakyReLU(),
            nn.Linear(input_dim, input_dim),
        ]
        # Kept as ``self.block`` (an nn.Sequential) so parameter names stay
        # ``block.0.*`` / ``block.2.*`` in the state dict.
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Returns the input plus its transformed version."""
        transformed = self.block(x)
        return x + transformed

In [None]:
class PositionalEncodings(nn.Module):
    """Fixed sinusoidal positional encodings followed by dropout.

    The table is computed once and stored as a non-trainable buffer of shape
    ``(1, seq_len, d_model)``. Even feature columns hold ``cos`` values and odd
    columns hold ``sin`` values, with the exponent ``2 * column / d_model``
    taken over the raw column index.

    NOTE(review): this layout differs from the formula in "Attention is All
    You Need" (sin on even columns, exponent ``2 * (column // 2) / d_model``).
    It is kept unchanged here so that numerical behaviour is preserved.

    Arguments:
        seq_len (int): Number of positions in the table (maximum input length)
        d_model (int): Feature size of the token representations
        p_dropout (float): Dropout probability applied after adding encodings
    """

    def __init__(self, seq_len, d_model, p_dropout):
        """Initializes the layer."""
        super(PositionalEncodings, self).__init__()
        token_positions = torch.arange(start=0, end=seq_len).view(-1, 1)
        dim_positions = torch.arange(start=0, end=d_model).view(1, -1)
        angles = token_positions / (10000 ** ((2 * dim_positions) / d_model))

        encodings = torch.zeros(1, seq_len, d_model)
        encodings[0, :, ::2] = torch.cos(angles[:, ::2])
        encodings[0, :, 1::2] = torch.sin(angles[:, 1::2])
        # Buffers never receive gradients, move with .to(device) and are saved
        # in the state dict under the name "positional_encodings".
        self.register_buffer("positional_encodings", encodings)

        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x):
        """Adds positional encodings to ``x`` and applies dropout.

        Arguments:
            x (torch.Tensor): Tensor of shape (batch, length, d_model) with
                ``length <= seq_len``
        Returns:
            torch.Tensor: Tensor of the same shape as ``x``
        """
        # Slice the table to the actual input length so inputs shorter than
        # the maximum length work instead of failing (or, for length 1,
        # silently broadcasting to the full table length).
        x = x + self.positional_encodings[:, :x.size(1)]
        x = self.dropout(x)
        return x

In [None]:
class Decoder(nn.Module):
    """Transformer decoder that predicts caption tokens from image features.

    Generates a prediction for the next caption token at every position, given
    the previously generated tokens and the image features extracted by a CNN.

    Arguments:
        config (dict): Notebook configuration. Reads ``model_configuration``
            (decoder_layers, attention_heads, d_model, ff_dim, dropout),
            ``vocab_size``, ``max_len`` and
            ``image_specs.img_feature_channels``.
    """

    def __init__(self, config):
        """Initializes the model."""
        super(Decoder, self).__init__()
        model_config = config["model_configuration"]
        decoder_layers = model_config["decoder_layers"]
        attention_heads = model_config["attention_heads"]
        d_model = model_config["d_model"]
        ff_dim = model_config["ff_dim"]
        dropout = model_config["dropout"]

        vocab_size = config["vocab_size"]
        img_feature_channels = config["image_specs"]["img_feature_channels"]

        # The table (and the classifier below) has vocab_size + 1 entries.
        self.embedding_layer = nn.Embedding(vocab_size+1, d_model)

        # Not used in forward(); kept so parameter names, the state dict and
        # the order of random initialisation stay unchanged.
        self.entry_mapping_tokens = nn.Linear(d_model, d_model)
        self.entry_mapping_img = nn.Linear(img_feature_channels, d_model)

        self.res_block = ResidualBlock(d_model)

        self.positional_encodings = PositionalEncodings(config["max_len"], d_model, dropout)
        transformer_decoder_layer = TransformerDecoderLayer(
            d_model=d_model,
            nhead=attention_heads,
            dim_feedforward=ff_dim,
            dropout=dropout
        )
        self.decoder = TransformerDecoder(transformer_decoder_layer, decoder_layers)
        self.classifier = nn.Linear(d_model, vocab_size + 1)

    def forward(self, x, image_features, tgt_padding_mask=None, tgt_mask=None):
        """Performs forward pass of the module.

        Arguments:
            x (torch.Tensor): Integer token ids; indexed as (batch, length)
            image_features (torch.Tensor): Image features whose last dimension
                is ``img_feature_channels``; indexed as (batch, patches, channels)
            tgt_padding_mask (torch.Tensor, optional): Passed to the transformer
                as ``tgt_key_padding_mask``
            tgt_mask (torch.Tensor, optional): Passed to the transformer as
                ``tgt_mask`` (e.g. a causal mask)
        Returns:
            torch.Tensor: Logits of shape (batch, length, vocab_size + 1)
        """
        # Adapt the dimensionality of the features for image patches
        image_features = self.entry_mapping_img(image_features)
        # The transformer is built with the default sequence-first layout, so
        # the batch dimension is moved to position 1.
        image_features = image_features.permute(1, 0, 2)
        image_features = F.leaky_relu(image_features)

        # Token ids -> d_model-sized vectors
        x = self.embedding_layer(x)

        x = self.res_block(x)
        x = F.leaky_relu(x)

        x = self.positional_encodings(x)

        # Get output from the decoder (sequence-first in, sequence-first out)
        x = x.permute(1, 0, 2)
        x = self.decoder(
            tgt=x,
            memory=image_features,
            tgt_key_padding_mask=tgt_padding_mask,
            tgt_mask=tgt_mask
        )
        x = x.permute(1, 0, 2)

        x = self.classifier(x)
        return x

In [None]:
def set_up_causal_mask(seq_len, device):
    """Builds the additive triangular (causal) attention mask.

    Entry ``[i, j]`` is ``0.0`` when position ``i`` may attend to position
    ``j`` (``j <= i``) and ``-inf`` when ``j`` lies after ``i``, which prevents
    the decoder from attending the tokens after the current one.

    Arguments:
        seq_len (int): Maximum length of input sequence
        device: Device on which to map the created tensor mask
    Returns:
        mask (torch.Tensor): Float mask of shape (seq_len, seq_len)
    """
    # True strictly above the diagonal, i.e. for "future" positions.
    future_positions = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
    blocked = float("-inf")
    mask = torch.zeros(seq_len, seq_len).masked_fill(future_positions, blocked)
    return mask.to(device)

In [None]:
import os
import pandas as pd
from torchvision.io import read_image
import re
from torchvision import transforms
import random

class CustomImageDataset():
    """Map-style dataset of GUI screenshots and their token sequences.

    Sample ``idx`` is read from ``<img_dir>/<idx>.png`` and
    ``<text_dir>/<idx>.gui``. The text is tokenized with the notebook-level
    ``tokenize`` function and padded to ``config['max_len']``.

    Arguments:
        img_dir (str): Directory with ``<idx>.png`` images
        text_dir (str): Directory with ``<idx>.gui`` text files
        transform (callable, optional): Applied to the image tensor
        target_transform (callable, optional): Stored but not applied
        num_samples (int): Value reported by ``len()``; defaults to the
            previously hard-coded 1700
    """

    def __init__(self, img_dir, text_dir, transform=None, target_transform=None, num_samples=1700):
        self.text_dir = text_dir
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    @staticmethod
    def _fit_to_length(tokens, length, pad_id=0):
        """Truncates or right-pads ``tokens`` so the result has exactly ``length`` items."""
        fitted = list(tokens[:length])
        fitted.extend([pad_id] * (length - len(fitted)))
        return fitted

    def __getitem__(self, idx):
        """Loads one sample.

        Returns:
            tuple: ``(image, input_tokens, tgt_tokens, tgt_padding_mask)`` where
            the token tensors are int64 of length ``config['max_len']``,
            ``tgt_tokens`` is ``input_tokens`` shifted left by one position,
            and the boolean mask is True at padded positions.
        """
        image_path = os.path.join(self.img_dir, f"{idx}.png")

        # Convert to RGB and to a tensor while the file is open, then close it
        # so file handles are not leaked across many samples.
        with Image.open(image_path) as image_file:
            image = transforms.ToTensor()(image_file.convert("RGB"))

        text_path = os.path.join(self.text_dir, f"{idx}.gui")
        with open(text_path, 'r') as file:
            content = file.read()

        # str.replace returns a new string; the result has to be assigned for
        # the commas to be separated from the neighbouring words.
        # NOTE(review): `tokenize` currently splits on ',' and drops the empty
        # pieces, so comma tokens still do not reach the model - confirm
        # whether that is intended.
        content = content.replace(',', '  , ')
        label = re.split(r'\s+|\n', content)
        label = list(filter(None, label))

        tokens = tokenize(['<START>']) + tokenize(label) + tokenize(['<END>'])

        # Teacher forcing: the model input is everything but the last token,
        # the target is everything but the first.
        input_tokens = tokens[:-1]
        tgt_tokens = tokens[1:]

        max_len = config['max_len']
        # Sequences longer than max_len are truncated so every sample has the
        # same length (otherwise batching fails); a truncated target no longer
        # ends with <END>.
        sample_size = min(len(input_tokens), max_len)

        # Id 0 is used for padding.
        input_tokens = torch.tensor(self._fit_to_length(input_tokens, max_len), dtype=torch.long)
        tgt_tokens = torch.tensor(self._fit_to_length(tgt_tokens, max_len), dtype=torch.long)

        # True marks padded positions, False marks real tokens.
        tgt_padding_mask = torch.arange(max_len) >= sample_size

        if self.transform:
            image = self.transform(image)

        return image, input_tokens, tgt_tokens, tgt_padding_mask

In [None]:
from torch.utils.data import Subset

class CustomSubset(Subset):
    """``Subset`` that can additionally hand out a random handful of its samples.

    Construction is inherited unchanged from ``Subset``: ``(dataset, indices)``.
    """

    def get_random_subset(self, subset_size):
        """Draws ``subset_size`` distinct samples from this subset.

        Arguments:
            subset_size (int): Number of samples to draw (without replacement)
        Returns:
            list: The loaded samples, in the order they were drawn
        """
        drawn_positions = random.sample(range(len(self)), subset_size)
        samples = []
        for position in drawn_positions:
            samples.append(self[position])
        return samples

In [None]:
# Token vocabulary of the GUI description language. The position of a word in
# this list is its token id.
vocabulary = [
    "<PADDING>",     # 0
    ",",             # 1
    "{",             # 2
    "}",             # 3
    "small-title",   # 4
    "text",          # 5
    "quadruple",     # 6
    "row",           # 7
    "btn-inactive",  # 8
    "btn-orange",    # 9
    "btn-green",     # 10
    "btn-red",       # 11
    "double",        # 12
    "<START>",       # 13
    "header",        # 14
    "btn-active",    # 15
    "<END>",         # 16
    "single",        # 17
]
# Space-separated form of the same vocabulary.
sentence = " ".join(vocabulary)

In [None]:
def tokenize(code, vocab=None):
    """Converts GUI-description words into vocabulary indices.

    Arguments:
        code (str or list): Either a JSON string that decodes to a sequence of
            words, or a list of words. In a list, each word is additionally
            split on ',' and the empty pieces are dropped, so a standalone ','
            never produces a token.
        vocab (list, optional): Vocabulary to index into. Defaults to the
            notebook-level ``vocabulary`` list.
    Returns:
        token_list (list of int): Position in the vocabulary of every word that
            is part of it. Unknown words are skipped silently.
    Raises:
        ValueError: If ``code`` is neither a string nor a list.
    """
    # Imported locally: the notebook only imports json in a later cell, so the
    # string branch would otherwise fail when called before that cell has run.
    import json

    if vocab is None:
        vocab = vocabulary

    if isinstance(code, str):
        # Assuming code is a JSON-like string, parse it into a list of words
        codels = json.loads(code)
    elif isinstance(code, list):
        codels = [piece for word in code for piece in word.split(',') if piece != '']
    else:
        raise ValueError("Unsupported type for 'code'. It should be either a string or a list.")

    return [vocab.index(item) for item in codels if item in vocab]

In [None]:
def untokenize(tokens):
    """Turns token ids back into text.

    Arguments:
        tokens (iterable): Indices into the notebook-level ``vocabulary`` list
    Returns:
        str: The corresponding words joined by single spaces
    """
    return " ".join(vocabulary[token] for token in tokens)

In [None]:
# Colab-only step: mounts Google Drive at /content/drive. The dataset
# directories and the checkpoint path used later in this notebook are located
# under /content/drive/MyDrive/Training, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')
!ls "/content/drive/"

In [None]:
def greedy_decoding(model, img_features_batched, sos_id, eos_id, pad_id, idx2word, max_len, device):
    """Performs greedy decoding for the caption generation.

    At each iteration model predicts the next word in the caption given the previously
    generated words and image features. For the next word we always take the most probable one.
    Decoding runs without gradient tracking.

    Arguments:
        model (callable): Transformer Decoder model which generates prediction for the next word.
            Called as ``model(tokens, img_features_batched, padding_mask)`` and expected to
            return scores of shape (batch, max_len, vocabulary size)
        img_features_batched (torch.Tensor): Image features generated by CNN encoder
            Stacked along 0-th dimension for each image in the mini-batch
        sos_id (int): Id of <start> token in the vocabulary
        eos_id (int): Id of <end> token in the vocabulary
        pad_id (int): Id of <pad> token in the vocabulary
        idx2word (dict): Mapping from ordinal number of token (i.e. class number) to the string of word
        max_len (int): Maximum length of the caption
        device (torch.device): Device on which to port used tensors
    Returns:
        generated_captions (list of list of str): Words generated for each image in the batch.
            Every caption ends with the word for ``eos_id``; it is appended if the model did
            not produce it within the step limit.
    """
    batch_size = img_features_batched.size(0)

    # Define the initial state of decoder input: <start> followed by padding
    x_words = torch.full((batch_size, max_len), pad_id, dtype=torch.long, device=device)
    x_words[:, 0] = sos_id
    # True marks positions the model must ignore; positions are revealed one per step
    padd_mask = torch.ones((batch_size, max_len), dtype=torch.bool, device=device)

    # Is each image from the batch decoded
    is_decoded = [False] * batch_size
    generated_captions = [[] for _ in range(batch_size)]

    with torch.no_grad():
        for i in range(max_len - 1):
            # Update the padding masks
            padd_mask[:, i] = False

            # Get the model prediction for the next word
            y_pred_prob = model(x_words, img_features_batched, padd_mask)
            # Take the scores at the current position and pick the most probable word
            y_pred = y_pred_prob[:, i].argmax(-1)

            # One transfer to Python per step instead of one per batch element
            for batch_idx, token_id in enumerate(y_pred.tolist()):
                if is_decoded[batch_idx]:
                    continue
                # Add the generated word to the caption
                generated_captions[batch_idx].append(idx2word[token_id])
                if token_id == eos_id:
                    # Caption has been fully generated for this image
                    is_decoded[batch_idx] = True

            if all(is_decoded):
                break

            # Update the input tokens for the next iteration
            x_words[:, i + 1] = y_pred

    # Complete the caption for images which haven't been fully decoded
    for batch_idx in range(batch_size):
        if not is_decoded[batch_idx]:
            generated_captions[batch_idx].append(idx2word[eos_id])

    return generated_captions

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate(subset, encoder, decoder, config, device):
    """Evaluates (BLEU score) caption generation model on a given subset.

    A random sample of ``config["batch_size"]["eval"]`` items (at most the
    size of the subset) is drawn, decoded greedily as one batch and compared
    with the reference token sequences.

    Arguments:
        subset (CustomSubset): Train/Val/Test subset; must provide
            ``get_random_subset`` and ``len()``
        encoder (nn.Module): CNN which generates image features
        decoder (nn.Module): Transformer Decoder which generates captions for images
        config (object): Contains configuration for the evaluation pipeline
        device (torch.device): Device on which to port used tensors
    Returns:
        bleu (list of float): BLEU-{1:4} scores (scaled to 0-100) on the drawn
            sample - corpus bleu. All zeros if the subset is empty.
    """
    batch_size = config["batch_size"]["eval"]
    max_len = config["max_len"]
    bleu_w = config["bleu_weights"]

    # Mapping from vocab index to string representation
    idx2word = dict(enumerate(vocabulary))
    # Ids for special tokens, taken from their position in the vocabulary
    sos_id = vocabulary.index("<START>")
    eos_id = vocabulary.index("<END>")
    pad_id = vocabulary.index("<PADDING>")

    print("Evaluating model.")
    samples = subset.get_random_subset(min(batch_size, len(subset)))
    if not samples:
        return [0.0, 0.0, 0.0, 0.0]

    # get_random_subset returns individual samples, so the images are stacked
    # into one batch before they are passed to the CNN.
    x_img = torch.stack([sample[0] for sample in samples]).to(device)

    with torch.no_grad():
        # Extract image features: (batch, channels, H, W) -> (batch, H*W, channels)
        img_features = encoder(x_img)
        img_features = img_features.view(img_features.size(0), img_features.size(1), -1)
        img_features = img_features.permute(0, 2, 1)
        img_features = img_features.detach()

    # Get the caption prediction for each image in the mini-batch
    predictions_total = greedy_decoding(decoder, img_features, sos_id, eos_id, pad_id, idx2word, max_len, device)

    # corpus_bleu expects, for every hypothesis, a list of reference token
    # lists. Target ids are mapped back to words and padding is removed so
    # references and predictions use the same representation.
    references_total = []
    for sample in samples:
        target_ids = sample[2].tolist()
        reference = [idx2word[token_id] for token_id in target_ids if token_id != pad_id]
        references_total.append([reference])

    # Evaluate BLEU score of the generated captions
    bleu = [
        corpus_bleu(references_total, predictions_total, weights=bleu_w[name]) * 100
        for name in ("bleu-1", "bleu-2", "bleu-3", "bleu-4")
    ]
    return bleu


In [None]:
import time
from torchsummary import summary
from sklearn.model_selection import train_test_split

def train(decoder, config, writer, device):
    """Performs the training of the model.

    Builds the dataset and a 90/10 train/test split, sets up a frozen ResNet-50
    backbone as image encoder, trains ``decoder`` with teacher forcing and,
    after every epoch, logs BLEU scores for random samples of both splits.
    The trained decoder weights are saved to Google Drive at the end.

    The ``warmup_steps``, ``gradient_clipping`` and ``eval_period`` entries of
    ``config["train_config"]`` are not used by this function.

    Arguments:
        decoder (nn.Module): Caption decoder to train (moved to ``device``)
        config (object): Contains configuration of the pipeline
        writer: tensorboardX writer object
        device: device on which to map the model and data
    """
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])

    # Create the dataset and its train/test split
    full_set = CustomImageDataset(img_dir='/content/drive/MyDrive/Training/img_dir', text_dir='/content/drive/MyDrive/Training/gui_dir', transform=torchvision.transforms.Resize((224, 224)))

    train_indices, test_indices = train_test_split(
        range(len(full_set)),
        test_size=0.1,
        random_state=42
    )

    train_split = CustomSubset(full_set, train_indices)
    test_split = CustomSubset(full_set, test_indices)

    train_loader = DataLoader(train_split, batch_size=config["batch_size"]["train"], shuffle=True, num_workers=4)

    #######################
    # Set up the encoder
    #######################
    # Download pretrained CNN encoder
    encoder = models.resnet50(pretrained=True)
    # Extract only the convolutional backbone of the model
    encoder = torch.nn.Sequential(*(list(encoder.children())[:-2]))
    encoder = encoder.to(device)
    # Freeze encoder layers
    for param in encoder.parameters():
        param.requires_grad = False
    encoder.eval()

    ######################
    # Set up the decoder
    ######################
    decoder = decoder.to(device)

    # Set up causal mask for transformer decoder
    causal_mask = set_up_causal_mask(config["max_len"], device)

    # Load training configuration
    train_config = config["train_config"]
    learning_rate = train_config["learning_rate"]

    # Prepare the model optimizer
    optimizer = torch.optim.AdamW(
        decoder.parameters(),
        lr=learning_rate,
        weight_decay=train_config["l2_penalty"]
    )
    # Default mode is "min": the scheduler must be stepped with a quantity
    # that should decrease (the training loss).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=1, threshold=0.01)
    # Loss function
    loss_fcn = nn.CrossEntropyLoss(label_smoothing=0.1)

    train_step = 0
    for epoch in range(train_config["num_of_epochs"]):
        print("Epoch:", epoch)
        decoder.train()

        total_loss = 0.0
        num_batches = 0
        for x_img, x_words, y, tgt_padding_mask in train_loader:
            optimizer.zero_grad()
            train_step += 1

            # Move the used tensors to defined device
            x_img, x_words = x_img.to(device), x_words.to(device)
            y = y.to(device)
            tgt_padding_mask = tgt_padding_mask.to(device)

            # Extract image features. The loader already yields a 4-D
            # (batch, channels, H, W) tensor, which is what the CNN expects,
            # so no extra dimension is inserted.
            with torch.no_grad():
                img_features = encoder(x_img)
                img_features = img_features.view(img_features.size(0), img_features.size(1), -1)
                img_features = img_features.permute(0, 2, 1)
                img_features = img_features.detach()

            # Get the prediction of the decoder. The causal mask stops every
            # position from attending to later input tokens (which are the
            # targets of earlier positions); the padding mask hides padding.
            y_pred = decoder(x_words, img_features, tgt_padding_mask, causal_mask)

            # Only real (non-padded) positions contribute to the loss
            valid_positions = torch.logical_not(tgt_padding_mask)
            y_pred = y_pred[valid_positions]
            y = y[valid_positions]

            # Calculate the loss
            loss = loss_fcn(y_pred, y.long())
            # Update model weights
            loss.backward()
            optimizer.step()

            writer.add_scalar("Train/Step-Loss", loss.item(), train_step)
            # Log the rate the optimizer actually uses; the scheduler may have
            # lowered it below the configured starting value.
            writer.add_scalar("Train/Learning-Rate", optimizer.param_groups[0]["lr"], train_step)
            print("Train/Step-Loss", loss.item())
            total_loss += loss.item()
            num_batches += 1

        # Evaluate model performance
        with torch.no_grad():
            encoder.eval()
            decoder.eval()

            # Evaluate model performance on subsets
            train_bleu = evaluate(train_split, encoder, decoder, config, device)
            test_bleu = evaluate(test_split, encoder, decoder, config, device)

            # Log the evaluated BLEU score
            for i, t_b in enumerate(train_bleu):
                writer.add_scalar(f"Train/BLEU-{i+1}", t_b, epoch)
            for i, t_b in enumerate(test_bleu):
                writer.add_scalar(f"Test/BLEU-{i+1}", t_b, epoch)

            decoder.train()

        # Step the plateau scheduler exactly once per epoch, with the mean
        # training loss (lower is better, matching the scheduler's mode).
        if num_batches > 0:
            scheduler.step(total_loss / num_batches)
        print("Learning rate", optimizer.param_groups[0]["lr"])
    torch.save(decoder.state_dict(), '/content/drive/MyDrive/Training/modelTest.pth')

In [None]:
from tensorboardX import SummaryWriter
import json

# Entry point of the notebook: creates the TensorBoard writer, picks the
# device, builds the decoder from the in-notebook `config` and starts training.
# `device` and `decoder` are module-level names that the inference cell further
# down reuses, so this cell has to run before it.
writer = SummaryWriter(log_dir='./logs')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Optional alternative: load the configuration from Drive instead of using the
# dictionary defined at the top of the notebook.
# with open('/content/drive/MyDrive/Training/config.json', 'r') as f:
    # config = json.load(f)
decoder = Decoder(config)
train(decoder, config, writer, device)

In [None]:
from torchvision.transforms import ToPILImage
from PIL import Image

# Inference demo: decodes the token sequence for one dataset image with the
# decoder trained above.

# Mapping from token id to word, derived from the vocabulary itself instead of
# a hard-coded size.
inv_vocab = dict(enumerate(vocabulary))

print(inv_vocab)

train_set = CustomImageDataset(img_dir='/content/drive/MyDrive/Training/img_dir', text_dir='/content/drive/MyDrive/Training/gui_dir', transform=torchvision.transforms.Resize((224, 224)))
train_loader = DataLoader(train_set, batch_size=config["batch_size"]["train"], shuffle=True, num_workers=1)

encoder = models.resnet50(pretrained=True)
# Extract only the convolutional backbone of the model
encoder = torch.nn.Sequential(*(list(encoder.children())[:-2]))
encoder = encoder.to(device)
# Freeze encoder layers
for param in encoder.parameters():
    param.requires_grad = False
encoder.eval()
decoder.eval()

# Pure inference: no autograd graph is needed for the CNN or for any of the
# decoding steps.
with torch.no_grad():
    # Sample 1 of the dataset; unsqueeze adds the batch dimension
    img_features = encoder(train_set[1][0].to(device).unsqueeze(0))
    # (batch, channels, H, W) -> (batch, H*W, channels)
    img_features = img_features.view(img_features.size(0), img_features.size(1), -1)
    img_features = img_features.permute(0, 2, 1)
    img_features = img_features.detach()

    y_pred = greedy_decoding(
        decoder,
        img_features,
        vocabulary.index("<START>"),
        vocabulary.index("<END>"),
        vocabulary.index("<PADDING>"),
        inv_vocab,
        config['max_len'],
        device,
    )

print(y_pred)

In [None]:
# Shows the raw sample used in the inference demo above: the tuple
# (image, input_tokens, tgt_tokens, tgt_padding_mask) returned by the dataset.
print(train_set[1])

In [None]:
!pip install pipreqs

In [None]:
!pipreqs --force .