In [125]:
# Install required libraries
!pip install transformers datasets torch peft -q

import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, ViTModel
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
from torchvision.transforms import Compose, Resize, ToTensor, Normalize


from IPython.display import display
import random

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [126]:
# %%time
# Load the VQAv2 dataset from Hugging Face Hub
dataset = load_dataset("HuggingFaceM4/VQAv2")

Repo card metadata block was not found. Setting CardData to empty.


In [127]:
# Access train, validation, and test sets
train_dataset = dataset['train']
test_dataset = dataset['test']
val_dataset = dataset['validation']

print(train_dataset[0])
image=train_dataset[0]['image']
display(image)
answer=train_dataset[0]['answers']
print(answer)
# If you need to ensure the image is in RGB mode
image = image.convert("RGB")

def load_image(image):
    return image.convert("RGB")

def display_image(image):
    display(image)

In [128]:
# %%time
# Tokenizer and Image Processor setup
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

In [129]:
# %%time
# Custom Dataset class
class VQADataset(Dataset):
    def __init__(self, dataset, tokenizer, image_processor):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item['question']
        answers = item['answers']
        image = item['image'].convert("RGB")

        text_inputs = self.tokenizer(question, padding='max_length', truncation=True, return_tensors="pt")
        image_inputs = self.image_processor(images=[image], return_tensors="pt")

        text_inputs = {k: v.squeeze(0) for k, v in text_inputs.items()}
        image_inputs = {k: v.squeeze(0) for k, v in image_inputs.items()}
        
        answer_texts = [answer['answer'] for answer in answers]
        encoded_labels = self.label_encoder.transform(answer_texts)
        one_hot_labels = np.zeros((len(self.label_encoder.classes_)))
        for encoded_label in encoded_labels:
            one_hot_labels[encoded_label] += 1
        one_hot_labels /= len(encoded_labels)  # Average the one-hot vectors

        label = torch.tensor(one_hot_labels, dtype=torch.float)  # Convert to tensor

        return {'text_inputs': text_inputs, 'image_inputs': image_inputs, 'labels': label}

In [130]:
# %%time
# Data preparation
label_encoder = LabelEncoder()
all_answer_texts = [answer['answer'] for example in train_dataset for answer in example['answers']]
label_encoder.fit(all_answer_texts)
train_dataset = Subset(VQADataset(dataset['train'], bert_tokenizer, image_processor), range(len(dataset['train']) // 4))
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [131]:
# %%time
# Define and modify the VQA model using LoRA
class VQAModel(nn.Module):
    def __init__(self, text_model_name="bert-base-cased", image_model_name="google/vit-base-patch16-224", num_answers=1000):
        super(VQAModel, self).__init__()
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.image_model = ViTModel.from_pretrained(image_model_name)
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 512)
        self.image_fc = nn.Linear(self.image_model.config.hidden_size, 512)
        self.classifier = nn.Linear(1024, num_answers)

    def forward(self, text_inputs, image_inputs):
        text_outputs = self.text_model(**text_inputs).last_hidden_state[:, 0, :]  # CLS token
        image_outputs = self.image_model(**image_inputs).last_hidden_state[:, 0, :]  # CLS token
        text_features = self.text_fc(text_outputs)
        image_features = self.image_fc(image_outputs)
        combined_features = torch.cat((text_features, image_features), dim=1)
        logits = self.classifier(combined_features)
        return logits

In [132]:
# %%time
model = VQAModel(num_answers=162496)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [133]:
# %%time
# LoRA Configuration
lora_config = LoraConfig(r=16, lora_alpha=16, target_modules=["query", "value"], lora_dropout=0.1, bias="none")
lora_model = get_peft_model(model, lora_config)

In [134]:
# %%time
# Training function with LoRA
def train_lora(model, dataloader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        start_time = time.time()  # start time for epoch
        all_labels = []
        all_predictions = []
        for batch in dataloader:
            text_inputs = {k: v.to(device) for k, v in batch['text_inputs'].items()}
            image_inputs = {k: v.to(device) for k, v in batch['image_inputs'].items()}
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(text_inputs, image_inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            
        end_time = time.time()  # End time for epoch
        epoch_time = end_time - start_time  # Time taken for epoch
        accuracy = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions, average='weighted')
        precision = precision_score(all_labels, all_predictions, average='weighted')
        recall = recall_score(all_labels, all_predictions, average='weighted')
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}, Accuracy: {accuracy}, F1 Score: {f1}, Precision: {precision}, Recall: {recall}, Time Taken: {epoch_time:.2f} seconds')


In [135]:
# %%time
# Setup optimizer and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lora_model.parameters(), lr=1e-4)

In [1]:
# Start training
train_lora(lora_model, dataloader, criterion, optimizer, num_epochs=3)

Epoch 1/3, Loss: 4.076500, Accuracy: 0.162451, F1 Score: 0.003517, Precision: 0.309171, Recall: 0.198438, Time Taken: 4389.29 seconds
Epoch 2/3, Loss: 3.629100, Accuracy: 0.199854, F1 Score: 0.009496, Precision: 0.348322, Recall: 0.199768, Time Taken: 4245.11 seconds
Epoch 3/3, Loss: 3.323100, Accuracy: 0.212109, F1 Score: 0.013184, Precision: 0.363648, Recall: 0.200010, Time Taken: 4389.58 seconds


In [2]:
# %%time
# Save the trained model
lora_model_path = 'lora_vqa_model_25percent.pth'
torch.save(lora_model.state_dict(), lora_model_path)
print(f'Model saved to {lora_model_path}')

Model saved to lora_vqa_model_25percent.pth


In [None]:
# import shutil
# # Zip the model file
# shutil.make_archive('/kaggle/working/lora_vqa_model_25percent', 'zip', '/kaggle/working/', 'lora_vqa_model_25percent.pth')

In [138]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
download_file('/kaggle/working/lora_vqa_model_25percent.pth', 'out')

In [None]:
# !rm -rf /kaggle/working/out*