In [4]:
!pip install transformers datasets peft accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_c

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViltProcessor, ViltForQuestionAnswering, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
from PIL import Image
import os
from sklearn.preprocessing import LabelEncoder


2025-05-18 12:08:55.380894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747570135.593043      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747570135.653163      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [60]:
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"
CSV_PATH = "/kaggle/input/amazon-vqa-dataset/merged.csv"
IMAGE_ROOT = "/kaggle/input/amazon-vqa-images"
BATCH_SIZE = 4
NUM_EPOCHS = 10
MAX_LENGTH = 40
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [61]:
processor = ViltProcessor.from_pretrained(MODEL_NAME)
model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)


full_df = pd.read_csv(CSV_PATH)


full_df["answer"] = full_df["answer"].astype(str).str.lower()

df = full_df.iloc[:10000].reset_index(drop=True)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['answer'])
num_classes = len(label_encoder.classes_)


In [62]:
full_df.head

<bound method NDFrame.head of             image_path                           question        answer
0      ee/ee856bc8.jpg  What is the color of the dog bed?          blue
1      ee/ee856bc8.jpg      What type of product is this?  pet_supplies
2      ee/ee856bc8.jpg                 What is the brand?  amazonbasics
3      ee/ee6c137a.jpg        What is the loveseat color?       caramel
4      ee/ee6c137a.jpg     What is the loveseat material?       leather
...                ...                                ...           ...
63274  07/07309433.jpg                What is the design?         heart
63275  07/07309433.jpg              What is the material?       plastic
63276  07/0730a868.jpg      What kind of meat is visible?          beef
63277  07/0730a868.jpg          What is the product type?       grocery
63278  07/0730a868.jpg                 What is the brand?        amazon

[63279 rows x 3 columns]>

In [63]:
class VQADataset(Dataset):
    def __init__(self, dataframe, processor, image_root):
        self.data = dataframe
        self.processor = processor
        self.image_root = image_root
        self.resize = transforms.Resize((384, 384))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        image = Image.open(os.path.join(self.image_root, item["image_path"])).convert("RGB")
        image = self.resize(image)
        encoding = self.processor(
            images=image, 
            text=item["question"], 
            return_tensors="pt", 
            padding="max_length", 
            truncation=True, 
        )
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = torch.tensor(int(item["label"])).long()
        return encoding

In [64]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_dataset = VQADataset(train_df, processor, IMAGE_ROOT)
val_dataset = VQADataset(val_df, processor, IMAGE_ROOT)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [65]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query","value","key"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)


In [66]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)


In [67]:
train_dataset = VQADataset(train_df, processor, IMAGE_ROOT)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = VQADataset(val_df, processor, IMAGE_ROOT)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [68]:
model.train()
model.to(DEVICE)

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        try:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values
            )
            
            logits = outputs.logits
            
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            if batch_idx % 1000 == 0:
                print(f"[Batch {batch_idx}] Loss: {loss.item():.4f}")

        except Exception as e:
            print(f"[Error] Unexpected error in batch {batch_idx}: {e}")
            continue

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {total_loss:.4f}")


[Batch 0] Loss: 15.4034
[Batch 1000] Loss: 3.0413
[Batch 2000] Loss: 3.2571
Epoch 1/10 | Loss: 14185.2091
[Batch 0] Loss: 4.6892
[Batch 1000] Loss: 4.6589
[Batch 2000] Loss: 4.6884
Epoch 2/10 | Loss: 9436.2312
[Batch 0] Loss: 7.5842
[Batch 1000] Loss: 4.6465
[Batch 2000] Loss: 0.8339
Epoch 3/10 | Loss: 8070.0597
[Batch 0] Loss: 4.4270
[Batch 1000] Loss: 4.5453
[Batch 2000] Loss: 3.5460
Epoch 4/10 | Loss: 7125.4378
[Batch 0] Loss: 3.6547
[Batch 1000] Loss: 2.1266
[Batch 2000] Loss: 3.3765
Epoch 5/10 | Loss: 6359.2182
[Batch 0] Loss: 2.2158
[Batch 1000] Loss: 3.2377
[Batch 2000] Loss: 1.4842
Epoch 6/10 | Loss: 5682.6799
[Batch 0] Loss: 2.9502
[Batch 1000] Loss: 2.0086
[Batch 2000] Loss: 2.8269
Epoch 7/10 | Loss: 5087.7894
[Batch 0] Loss: 1.8917
[Batch 1000] Loss: 0.4355
[Batch 2000] Loss: 3.1696
Epoch 8/10 | Loss: 4527.1529
[Batch 0] Loss: 0.5471
[Batch 1000] Loss: 4.3848
[Batch 2000] Loss: 0.3052
Epoch 9/10 | Loss: 4001.1792
[Batch 0] Loss: 2.1932
[Batch 1000] Loss: 1.3908
[Batch 2000] 

In [59]:
import copy

old_model = copy.deepcopy(model)


In [69]:
from torch.utils.data import DataLoader
import torch

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy*100:.2f}%")


Validation Accuracy: 38.90%


In [78]:
!pip install bert-score 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/sileod/bart-score.git
  Cloning https://github.com/sileod/bart-score.git to /tmp/pip-req-build-wtw75_s5
  Running command git clone --filter=blob:none --quiet https://github.com/sileod/bart-score.git /tmp/pip-req-build-wtw75_s5
Username for 'https://github.com': ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

### Compute BERTScore

In [81]:
from bert_score import score as bertscore
import torch

id2label = model.config.id2label

model.eval()

generated_answers = []
ground_truth_answers = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["labels"]

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().tolist()

        pred_texts = [id2label[p] for p in preds]
        ref_texts = [id2label[int(l)] for l in labels]

        generated_answers.extend(pred_texts)
        ground_truth_answers.extend(ref_texts)

P, R, F1 = bertscore(
    generated_answers,
    ground_truth_answers,
    lang="en"
)

print(f"\nBERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore - Precision: 0.9259, Recall: 0.9294, F1: 0.9268


### Compute BARTScore

In [83]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn.functional as F
from tqdm import tqdm

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(DEVICE)
model.eval()

def compute_bart_score(references, candidates):
    scores = []
    for ref, cand in tqdm(zip(references, candidates), total=len(references)):
        input_ids = tokenizer(cand, return_tensors="pt").input_ids.to(DEVICE)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(ref, return_tensors="pt").input_ids.to(DEVICE)

        with torch.no_grad():
            output = model(input_ids=input_ids, labels=labels)
            loss = output.loss
            score = -loss.item()
            scores.append(score)

    return scores

approx_bart_scores = compute_bart_score(ground_truth_answers, generated_answers)
print(f"Mean approximate BARTScore: {sum(approx_bart_scores)/len(approx_bart_scores):.4f}")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

100%|██████████| 1000/1000 [00:19<00:00, 51.84it/s]

Mean approximate BARTScore: -5.2421





In [70]:
import os

LORA_ADAPTER_DIR = "/kaggle/working/vilt_vqa_lora_adapters_10epochs_key"

os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)


In [71]:
os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)
print(f"Ensured LoRA adapter directory exists: {LORA_ADAPTER_DIR}")

try:
    model.save_pretrained(LORA_ADAPTER_DIR)
    print(f"LoRA adapters (model weights and config) successfully saved to {LORA_ADAPTER_DIR}")

    processor.save_pretrained(LORA_ADAPTER_DIR)
    print(f"Processor configuration successfully saved to {LORA_ADAPTER_DIR}")

    print("\nContents of the adapter directory after saving:")
    for item in os.listdir(LORA_ADAPTER_DIR):
        print(f"  - {item}")

except Exception as e:
    print(f"An error occurred during saving LoRA adapters or processor: {e}")
    print("Please check permissions, disk space, or the model/processor state.")

Ensured LoRA adapter directory exists: /kaggle/working/vilt_vqa_lora_adapters_10epochs_key
LoRA adapters (model weights and config) successfully saved to /kaggle/working/vilt_vqa_lora_adapters_10epochs_key
Processor configuration successfully saved to /kaggle/working/vilt_vqa_lora_adapters_10epochs_key

Contents of the adapter directory after saving:
  - vocab.txt
  - tokenizer_config.json
  - README.md
  - adapter_model.safetensors
  - special_tokens_map.json
  - preprocessor_config.json
  - tokenizer.json
  - adapter_config.json


In [72]:
!zip -r /kaggle/working/vilt_vqa_lora_adapters_10epochs_key.zip /kaggle/working/vilt_vqa_lora_adapters_10epochs_key

  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/ (stored 0%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/vocab.txt (deflated 53%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/README.md (deflated 66%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/adapter_model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/preprocessor_config.json (deflated 51%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/tokenizer.json (deflated 71%)
  adding: kaggle/working/vilt_vqa_lora_adapters_10epochs_key/adapter_config.json (deflated 53%)
