# Training Coattention Model

In [2]:
from transformers import BertModel, BertTokenizer, ViTImageProcessor, ViTModel
import torch
from torchinfo import summary
from torch import nn
from torch.nn import Transformer, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder, TransformerEncoderLayer

2024-05-18 12:08:45.998640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 12:08:45.998709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 12:08:46.002718: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

In [4]:
device

device(type='cuda')

## Initialize Necessary Modules

In [5]:
class TextTokenizer(torch.nn.Module):
    def __init__(
        self,
        text_tokenizer=BertTokenizer,
        max_length=25  # Add a max_length parameter
    ):
        super().__init__()
        self.text_tokenizer = text_tokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length  # Store the max_length

    def forward(self, input_question, padding='max_length', truncation=True):
        tokens = self.text_tokenizer(input_question, return_tensors='pt', 
                                     padding=padding, truncation=truncation, 
                                     max_length=self.max_length).to(device)  # Use max_length

        return tokens

class ImageProcessor(torch.nn.Module):
    def __init__(
        self,
        image_model_processor=ViTImageProcessor
    ):

        super().__init__()
        self.image_model_processor = image_model_processor.from_pretrained('google/vit-base-patch16-224-in21k')

    def forward(self, image):
        image = self.image_model_processor(image, return_tensors='pt').to(device)

        return image

class TextEmbedding(torch.nn.Module):
    def __init__(
        self,
        text_model=BertModel,
    ):
        super().__init__()
        self.text_model = text_model.from_pretrained('bert-base-uncased').to(device)


    def forward(self, tokens):
        text_output = self.text_model(input_ids=tokens.input_ids, attention_mask=tokens.attention_mask)
        text_output = text_output.last_hidden_state     # CLS token from the last layer

        return text_output


class ImageEmbedding(torch.nn.Module):
    def __init__(
            self, 
            image_model=ViTModel
        ):
        
        super().__init__()
        self.image_model = image_model.from_pretrained('google/vit-base-patch16-224-in21k').to(device)


    def forward(self, image):
        image_output = self.image_model(pixel_values=image.pixel_values).last_hidden_state

        return image_output

## Load Dataset

In [11]:
from datasets import load_dataset

dataset = load_dataset("HuggingFaceM4/VQAv2")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


In [12]:
train_dataset = dataset['train']
test_dataset = dataset['test']
val_dataset = dataset['validation']

In [13]:
import pandas as pd

train_df = pd.read_csv('/kaggle/input/vqdata/vqa_train_dataset.csv')
val_df = pd.read_csv('/kaggle/input/vqdata/vqa_val_dataset.csv')

train_df = train_df[~train_df['answers'].isna()]
val_df = val_df[~val_df['answers'].isna()]

In [14]:
train_df[train_df['answers'].isna()]

Unnamed: 0,index,image_name,question,answers,question_type


In [15]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor

class VQADataset(Dataset):
    def __init__(self, dataframe, image_dataset):
        self.dataframe = dataframe
        self.image_dataset = image_dataset
        self.text_tokenizer = TextTokenizer()
        self.image_processor = ImageProcessor()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        ind = int(row['index'])
        image = self.image_dataset[ind]['image']
        question = row['question']
        answer = row['answers']
        
        # sanity check        
        assert self.image_dataset[ind]['question'] == question, "Mismatching training and Image data"

        
        image = image.convert('RGB')

        tokens = self.text_tokenizer(question, padding='max_length', truncation=True)
        tokens.input_ids = tokens.input_ids.squeeze()
        tokens.attention_mask = tokens.attention_mask.squeeze()
        
        answer_tokens = self.text_tokenizer(answer, padding='max_length', truncation=True)
        answer_tokens.input_ids = answer_tokens.input_ids.squeeze()
        answer_tokens.attention_mask = tokens.attention_mask.squeeze()
        
        
        image = self.image_processor(image)
        return {
            'image': image,
            'questions': tokens,
            'answer': answer,
            'answer_tokens' : answer_tokens
        }

batch_size = 64
    
# Assuming you have separate dataframes for training and validation
train_data = VQADataset(train_df, train_dataset)
val_data = VQADataset(val_df, train_dataset)

# DataLoader for training and validation
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

In [16]:
import pickle

with open('/kaggle/input/vqdata/answers_dictionaries.pkl', 'rb') as f:
    data = pickle.load(f)
    id_to_answer = data['id_to_answer']
    answer_to_id = data['answer_to_id']

# print("Dictionaries have been loaded from answers_dictionaries.pkl")
# print("ID to Answer Dictionary:", id_to_answer)
# print("Answer dede bhai: ", answer_to_id)

## Model Creation

In [17]:
class VQAModel(nn.Module):
    def __init__(
        self,
        dim_model = 768,      # image and text embeddings concatenated
        nhead = 12,                    # No. of Attention heads
        num_layers = 1,               # No. of encoder layers
        num_classes = 8000
    ):
        super().__init__()
        self.text_embedder = TextEmbedding()
        self.image_embedder = ImageEmbedding()
        
        self.transformer = Transformer(num_encoder_layers=1, num_decoder_layers=1, nhead=nhead, d_model=dim_model).to(device)


    def forward(self, questions, images, answers):
        question_embedding = self.text_embedder(questions)
        image_embedding = self.image_embedder(images)
        answer_embedding = self.text_embedder(answers)
        
        embeddings = torch.cat((question_embedding, image_embedding), dim=1)
        embeddings = embeddings.permute(1, 0, 2)  # (seq, batch, feature)
        answer_embedding = answer_embedding.permute(1, 0, 2)
#         print(embeddings.shape, answer_embedding.shape)
        output = self.transformer(embeddings, answer_embedding)

        return output[:answer_embedding.shape[0],:,:], answer_embedding

In [18]:
assert len(answer_to_id) == len(id_to_answer)
model = VQAModel(num_classes=len(answer_to_id))



In [19]:
train_df[train_df['answers'].isna()]

Unnamed: 0,index,image_name,question,answers,question_type


In [31]:
!pip install peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1


### Comment the next cell and uncomment the next of next cell to run Encoder-decoder VQA Model without LoRA

In [40]:
# Apply LoRA

from peft import LoraConfig, get_peft_model

# Define the LoRA configuration
LORA_R = 16
LORA_ALPHA = 512
LORA_DROPOUT = 0.05

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=[
        "transformerEncoder.layers.0.linear1",
        "transformerEncoder.layers.0.linear2",
        "image_embedder.image_model.encoder.layer.11.intermediate.dense",
        "image_embedder.image_model.encoder.layer.11.output.dense"
    ]
#     target_modules = linear_layers
)

# Initialize the VQALORAModel
# model = VQALORAModel()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print the trainable parameters
model.print_trainable_parameters()
model.train()
writer = SummaryWriter('runs/experiment_self_supervised_lora_16')
num_epochs = 3

trainable params: 122,880 || all params: 209,389,312 || trainable%: 0.0587


In [41]:
# from torch.utils.tensorboard import SummaryWriter

# # Create a SummaryWriter object
# writer = SummaryWriter('runs/experiment_self_supervised')
# num_epochs = 3
# model.train()

In [42]:
criterion = nn.CosineEmbeddingLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
# loss = criterion(out, tar)
# loss.backward()
# optimizer.step()

In [43]:
checkpoint_ref = 100

## Evaluation Metrics

In [44]:
from torchmetrics.classification import Precision, Recall, Accuracy, F1Score, AUROC

In [45]:
precision_metric = Precision(task="multiclass", num_classes=len(answer_to_id)).to(device)
recall_metric = Recall(task="multiclass", num_classes=len(answer_to_id)).to(device)
accuracy_metric = Accuracy(task="multiclass", num_classes=len(answer_to_id)).to(device)
f1_metric = F1Score(task="multiclass", num_classes=len(answer_to_id)).to(device)
# auroc_metric = AUROC(task="multiclass", num_classes=len(answer_to_id)).to(device)

def evaluate(preds, true):
    p = precision_metric(preds, true)
    r = recall_metric(preds, true)
    a = accuracy_metric(preds, true)
    f = f1_metric(preds, true)
#     am = auroc_metric(preds, true)
    
    return {
        "precision": p,
        "recall": r,
        "accuracy": a,
        "f1": f,
#         "auroc": auroc_metric
    }


## Training Loop

In [46]:
# !mkdir ./checkpoints

In [None]:
import torch.nn.functional as F

batch_no = 0
avg_accuracy = 0

for epoch in range(num_epochs):
    batch_no = 0
    avg_accuracy = 0
    for batch in train_dataloader:
        # Get the inputs and targets from the batch
        images = batch['image']
        questions = batch['questions']
        answers = batch['answer_tokens']

        questions.input_ids = questions.input_ids.squeeze()
        questions.attention_mask = questions.attention_mask.squeeze()
        images.pixel_values = images.pixel_values.squeeze()
        answers.input_ids = answers.input_ids.squeeze()
        answers.attention_mask = answers.attention_mask.squeeze()
    
        # Forward pass
        outputs, answers = model(questions, images, answers)
        
        target = torch.ones(outputs.shape[1] * outputs.shape[0]).to(device)
        
        outputs = outputs.reshape(-1, 768)
        answers = answers.reshape(-1, 768)
        
        loss = criterion(outputs, answers, target)
 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iter_val = epoch * len(train_dataloader) + batch_no
        
        
        
        writer.add_scalar('Training Loss', loss.item(), iter_val)
        
        
        if batch_no % checkpoint_ref == 0:
            torch.save(model.state_dict(), f"./checkpoints/latest.pth")
            
        batch_no += 1
        print(f"Batch -> {batch_no} done -> cosine Loss: {loss}\r", end="")
#         avg_accuracy += eval_met['accuracy']
        

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
# writer.close()

### Model Evaluation

In [48]:
def pearson_correlation(x, y):
    mean_x = torch.mean(x, dim=1, keepdim=True)
    mean_y = torch.mean(y, dim=1, keepdim=True)
    xm = x.sub(mean_x)
    ym = y.sub(mean_y)
    r_num = xm.mul(ym).sum(dim=1)
    r_den = torch.norm(xm, 2, dim=1) * torch.norm(ym, 2, dim=1)
    r_val = r_num / r_den
    return r_val.sum()

In [49]:
writer.close()

In [50]:
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import pearsonr
import numpy as np
import time
model.eval()
batch_no = 0


with torch.no_grad():
    total_loss = 0
    total_cosine = 0
    total_euc = 0
    total_manhattan = 0
    total_pearson = 0
    start_time = time.time()
    for batch in val_dataloader:
        # Get the inputs and targets from the batch
        images = batch['image']
        questions = batch['questions']
        answers = batch['answer_tokens']

        questions.input_ids = questions.input_ids.squeeze()
        questions.attention_mask = questions.attention_mask.squeeze()
        images.pixel_values = images.pixel_values.squeeze()
        answers.input_ids = answers.input_ids.squeeze()
        answers.attention_mask = answers.attention_mask.squeeze()
    
        # Forward pass
        outputs, answers = model(questions, images, answers)

        target = torch.ones(outputs.shape[1] * outputs.shape[0]).to(device)
        
                
        outputs = outputs.reshape(-1, 768)
        answers = answers.reshape(-1, 768)
        
        loss = criterion(outputs, answers, target)
        total_loss += loss.item()
        
        total_pearson += pearson_correlation(outputs, answers)
        total_cosine += torch.sum(F.cosine_similarity(outputs, answers))
        
        euclidean_distances = torch.sqrt(torch.sum((outputs - answers) ** 2, dim=1))
        total_euc = torch.sum(euclidean_distances)
        
        manhattan_distances = torch.sum(torch.abs(outputs - answers), dim=1)
        total_manhattan = torch.sum(manhattan_distances)

        
        print(f"Batch -> {batch_no} done -> Loss: {loss.item()}\r", end="")
        batch_no += 1
#         if batch_no == 1: break
        
    end_time = time.time()
    print(f'Loss: {total_loss}, Time taken: {end_time - start_time}')
# writer.close()

Loss: 216.48974961042404, Time taken: 658.4874546527863


In [51]:
print("Average Correlation: ", total_pearson / (25 * 64 * len(val_dataloader)))
print("Average Euclidean Distance: ", total_euc, (25 * 64 * len(val_dataloader)))
print("Average Manhattan Distance: ", total_manhattan, (25 * 64 * len(val_dataloader)))
print("Average cosine similarity: ", total_cosine / (25 * 64 * len(val_dataloader)))
print("Total time taken: ", end_time - start_time)

Average Correlation:  tensor(0.5385, device='cuda:0')
Average Euclidean Distance:  tensor(24556.4922, device='cuda:0') 750400
Average Manhattan Distance:  tensor(567850.6250, device='cuda:0') 750400
Average cosine similarity:  tensor(0.5380, device='cuda:0')
Total time taken:  658.4874546527863
