In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from transformers import ViTFeatureExtractor, BertModel, BertTokenizer
from sklearn.model_selection import train_test_split


import json
# import pandas as pd
from collections import Counter
import cv2
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import ViTModel

In [None]:
# Hugging Face checkpoint identifiers for the text and image encoders.
BERT_CHECKPOINT = 'bert-base-uncased'
VIT_CHECKPOINT = 'google/vit-base-patch16-224-in21k'

# Text side: BERT encoder and its matching tokenizer.
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Image side: ViT feature extractor (preprocessing) and the ViT encoder itself.
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_CHECKPOINT)
vit_model = ViTModel.from_pretrained(VIT_CHECKPOINT)

def extract_vit_features(images):
    """Preprocess `images` with the ViT feature extractor and encode them with ViT.

    Args:
        images: Image input accepted by `vit_feature_extractor`.

    Returns:
        The `last_hidden_state` of `vit_model` for the preprocessed images.
    """
    pixel_inputs = vit_feature_extractor(images, return_tensors="pt")
    return vit_model(**pixel_inputs).last_hidden_state

In [None]:
# Define your custom neural network architecture
class VQAModel(nn.Module):
    """Classification head that fuses an image embedding and a question embedding.

    The two embeddings are concatenated along the last dimension and passed
    through two Linear -> ReLU -> Dropout(0.3) stages followed by a linear
    output layer producing one logit per answer class.

    Args:
        num_classes: Number of output logits (answer classes).
        vit_hidden_size: Accepted for interface compatibility; not used by the
            layers below.
        bert_hidden_size: Accepted for interface compatibility; not used by the
            layers below.
        dim: Size of EACH input embedding. The first linear layer expects
            `dim * 2` input features, so both embeddings must have size `dim`.
    """

    def __init__(self, num_classes, vit_hidden_size, bert_hidden_size, dim):
        super(VQAModel, self).__init__()
        # Input is the concatenation of two `dim`-sized embeddings.
        self.layer1 = nn.Sequential(
            nn.Linear(in_features=dim*2, out_features=512),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        self.layer2 = nn.Sequential(
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        self.output_layer = nn.Linear(in_features=256, out_features=num_classes)

    def forward(self, vit_outputs, bert_outputs):
        """Return class logits for a batch of (image, question) embeddings.

        Args:
            vit_outputs: Image embeddings whose last dimension has size `dim`.
            bert_outputs: Question embeddings whose last dimension has size `dim`.

        Returns:
            Tensor of logits whose last dimension has size `num_classes`.
        """
        combined_repr = torch.cat((vit_outputs, bert_outputs), dim=-1)

        x = self.layer1(combined_repr)
        x = self.layer2(x)
        return self.output_layer(x)

In [None]:
class VQADataset(Dataset):
    """Thin `Dataset` wrapper around a sequence of (image, question, answer) triples."""

    def __init__(self, data):
        # `data` must support len() and integer indexing; each item unpacks into three values.
        self.data = data

    def __len__(self):
        """Number of samples in the wrapped sequence."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at `index` as an (image, question, answer) tuple."""
        img, text, label = self.data[index]
        return (img, text, label)

In [None]:
def getImageName(image_path, image_id):
    """Build the file path of a COCO train2014 image.

    Args:
        image_path: Directory prefix (string) the file name is appended to as-is.
        image_id: Image identifier; its string form is left-padded with zeros
            to 12 characters.

    Returns:
        `image_path` + "COCO_train2014_" + zero-padded id + ".jpg".
    """
    padded_id = str(image_id).rjust(12, "0")
    return image_path + "COCO_train2014_" + padded_id + ".jpg"

In [None]:
def filterMajoritySingleWord(answer_list):
    """Pick the most frequent answer, preferring single-word answers.

    Args:
        answer_list: Iterable of dicts, each with an "answer" string.

    Returns:
        The most common single-word answer. If no answer is a single word, the
        most common answer overall. Ties go to the answer seen first.

    Raises:
        ValueError: if `answer_list` is empty.
    """
    every_answer = [item["answer"] for item in answer_list]
    candidates = [text for text in every_answer if len(text.split()) == 1]
    if not candidates:
        # No single-word answers at all: fall back to the full list.
        candidates = every_answer

    tally = Counter(candidates)
    # max() returns the first key with the highest count (insertion order).
    return max(tally, key=tally.get)

def load_image(image_path):
    """Read an image from disk as a 224x224 RGB array in channel-first layout.

    Args:
        image_path: Path of the image file to read.

    Returns:
        Array of shape (3, 224, 224): the image converted from OpenCV's BGR
        order to RGB, resized to 224x224 and transposed from HxWxC to CxHxW.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (missing path or
            undecodable image).
    """
    # cv2.imread does not raise on failure; it returns None instead.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Image is missing or unreadable: {image_path}")
    # Convert BGR to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Resize image
    image = cv2.resize(image, (224, 224))
    # Change shape from HxWxC to CxHxW
    image = np.transpose(image, (2, 0, 1))
    return image

In [None]:
df = pd.read_csv("/kaggle/input/Final.csv")
image_path = "/kaggle/input/Subset_train2014/Subset_train2014/"

# One tuple per CSV row; downstream code unpacks each as (image_id, question, answer),
# so the CSV is assumed to have exactly those three columns in that order.
dummy_data = [tuple(row) for row in df.values]

# Load each distinct image only once, keyed by its id.
unique_ids = {image_id for image_id, _, _ in dummy_data}

images_dict = {image_id : load_image(getImageName(image_path, image_id)) for image_id in unique_ids}

# Sanity check: show the shape of one loaded image. Use whichever image is
# available instead of a hard-coded id, which fails if that id is not in the CSV.
if images_dict:
    print(next(iter(images_dict.values())).shape)

In [None]:
answer_vocabulary = {answer for _, _, answer in dummy_data}

# Assign labels in sorted order so the answer -> index mapping is identical on
# every run. Iterating the set directly gives an order that can change between
# kernel restarts, which would make saved model outputs impossible to decode.
# key=str keeps sorting well-defined even if the answers have mixed types.
answer_to_label = {answer: label for label, answer in enumerate(sorted(answer_vocabulary, key=str))}

# Replace each image id with its pixel array and each answer with its integer label.
dummy_data_new = [(images_dict[image_id], question, answer_to_label.get(answer, -1)) for (image_id, question, answer) in dummy_data]

# NOTE: this rebinds `dummy_data`, so this cell cannot be re-run without first
# re-running the cell that loads the raw rows.
dummy_data = dummy_data_new

In [None]:
# Fixed random_state so the train/validation split is the same on every run.
train_data, val_data = train_test_split(dummy_data, test_size=0.2, random_state=42)

# Define data loaders
train_loader = DataLoader(VQADataset(train_data), batch_size=32, shuffle=True)
# Validation is not shuffled: evaluation is capped to a fixed number of batches
# per epoch, so a stable order keeps every epoch scored on the same samples.
val_loader = DataLoader(VQADataset(val_data), batch_size=32, shuffle=False)

In [None]:
# Number of output classes = number of distinct answers in the vocabulary.
num_classes = len(answer_vocabulary)

In [None]:
vit_hidden_size = 768  # ViT hidden size
bert_hidden_size = 768  # BERT hidden size

# Classification head over the concatenated ViT and BERT embeddings.
model = VQAModel(num_classes, vit_hidden_size, bert_hidden_size, bert_hidden_size)
criterion = nn.CrossEntropyLoss()

# A single Adam optimizer with one parameter group per network, so the head and
# both pretrained encoders are updated together at the same learning rate.
optimizer = optim.Adam(
    [{'params': network.parameters()} for network in (model, vit_model, bert_model)],
    lr=1e-4,
)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

num_epochs = 10
# Upper bound on the number of batches processed per epoch (training and validation).
num_batches_per_epoch = 30


def _forward_batch(images, questions):
    """Encode one batch with ViT and BERT and return the classifier logits."""
    # First token of ViT's last hidden state serves as the image embedding.
    vit_outputs = extract_vit_features(images)[:, 0, :]

    inputs = tokenizer(list(questions), return_tensors='pt', padding=True, truncation=True)
    # Pass the full tokenizer output (including attention_mask) so BERT ignores
    # the padding tokens added by padding=True. Index 1 is the pooled output.
    bert_outputs = bert_model(**inputs)[1]

    return model(vit_outputs, bert_outputs)


# NOTE(review): vit_model and bert_model are never switched between train() and
# eval() here; only the classification head `model` changes mode.
for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    train_batches = 0

    all_predicted = []
    all_answers = []

    for batch_idx, (images, questions, answers) in enumerate(train_loader):
        if batch_idx >= num_batches_per_epoch:
            break

        optimizer.zero_grad()

        # Forward pass
        outputs = _forward_batch(images, questions)
        loss = criterion(outputs, answers)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Update metrics
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == answers).sum().item()
        total_samples += images.size(0)
        train_batches += 1

        all_predicted.extend(predicted.cpu().numpy())
        all_answers.extend(answers.cpu().numpy())

    # ---- Validation ----
    val_predicted = []
    val_answers = []

    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_samples = 0
    val_batches = 0

    with torch.no_grad():
        for batch_idx, (images, questions, answers) in enumerate(val_loader):

            if batch_idx >= num_batches_per_epoch:
                break

            outputs = _forward_batch(images, questions)

            # Compute loss
            loss = criterion(outputs, answers)

            # Update metrics
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == answers).sum().item()
            val_samples += images.size(0)
            val_batches += 1

            val_predicted.extend(predicted.cpu().numpy())
            val_answers.extend(answers.cpu().numpy())

    # Average over the batches actually processed: the loops above stop after
    # num_batches_per_epoch, so len(loader) would under-report the loss.
    # max(..., 1) avoids division by zero if a loader yields nothing.
    avg_loss = total_loss / max(train_batches, 1)
    accuracy = (total_correct / max(total_samples, 1)) * 100

    # Calculate precision, recall, and F1 score for training
    precision = precision_score(all_answers, all_predicted, average='weighted')
    recall = recall_score(all_answers, all_predicted, average='weighted')
    f1 = f1_score(all_answers, all_predicted, average='weighted')

    # Calculate precision, recall, and F1 score for validation
    val_precision = precision_score(val_answers, val_predicted, average='weighted')
    val_recall = recall_score(val_answers, val_predicted, average='weighted')
    val_f1 = f1_score(val_answers, val_predicted, average='weighted')

    val_avg_loss = val_loss / max(val_batches, 1)
    val_accuracy = (val_correct / max(val_samples, 1)) * 100

    # Print epoch-level metrics for both training and validation data
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss}, Train Accuracy: {accuracy}%, "
          f"Train Precision: {precision}, Train Recall: {recall}, Train F1: {f1}")
    print(f"Validation Loss: {val_avg_loss}, Validation Accuracy: {val_accuracy}%, "
          f"Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1: {val_f1}")

In [None]:
# Save the classification head (same file name and format as before).
torch.save(model.state_dict(), "Model.pth")
# The optimizer also updates the ViT and BERT encoders, so the head alone does
# not reproduce the trained system. Save the fine-tuned encoders alongside it.
torch.save(vit_model.state_dict(), "ViTEncoder.pth")
torch.save(bert_model.state_dict(), "BertEncoder.pth")