In [4]:
!pip install transformers datasets peft accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_c

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViltProcessor, ViltForQuestionAnswering, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
from PIL import Image
import os
from sklearn.preprocessing import LabelEncoder


2025-05-18 12:08:55.380894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747570135.593043      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747570135.653163      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"
CSV_PATH = "/kaggle/input/amazon-vqa-dataset/merged.csv"
IMAGE_ROOT = "/kaggle/input/amazon-vqa-images"
BATCH_SIZE = 4
NUM_EPOCHS = 10
MAX_LENGTH = 40
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [7]:
processor = ViltProcessor.from_pretrained(MODEL_NAME)
model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)


full_df = pd.read_csv(CSV_PATH)

df = full_df.iloc[:10000].reset_index(drop=True)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['answer'])
num_classes = len(label_encoder.classes_)


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

In [10]:
class VQADataset(Dataset):
    def __init__(self, dataframe, processor, image_root):
        self.data = dataframe
        self.processor = processor
        self.image_root = image_root
        self.resize = transforms.Resize((384, 384))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        image = Image.open(os.path.join(self.image_root, item["image_path"])).convert("RGB")
        image = self.resize(image)
        encoding = self.processor(
            images=image, 
            text=item["question"], 
            return_tensors="pt", 
            padding="max_length", 
            truncation=True, 
        )
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = torch.tensor(int(item["label"])).long()
        return encoding

In [11]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_dataset = VQADataset(train_df, processor, IMAGE_ROOT)
val_dataset = VQADataset(val_df, processor, IMAGE_ROOT)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [12]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query","value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)


In [13]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)


In [14]:
train_dataset = VQADataset(train_df, processor, IMAGE_ROOT)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = VQADataset(val_df, processor, IMAGE_ROOT)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [15]:
model.train()
model.to(DEVICE)

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        try:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values
            )
            
            logits = outputs.logits
            
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f"[Batch {batch_idx}] Loss: {loss.item():.4f}")

        except Exception as e:
            print(f"[Error] Unexpected error in batch {batch_idx}: {e}")
            continue

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {total_loss:.4f}")


[Batch 0] Loss: 16.5075
[Batch 10] Loss: 12.9167
[Batch 20] Loss: 12.7982
[Batch 30] Loss: 13.0660
[Batch 40] Loss: 13.1376
[Batch 50] Loss: 12.8834
[Batch 60] Loss: 11.5431
[Batch 70] Loss: 15.0252
[Batch 80] Loss: 14.2385
[Batch 90] Loss: 12.3309
[Batch 100] Loss: 10.7889
[Batch 110] Loss: 10.6791
[Batch 120] Loss: 14.5354
[Batch 130] Loss: 13.4280
[Batch 140] Loss: 15.3726
[Batch 150] Loss: 12.2334
[Batch 160] Loss: 12.0656
[Batch 170] Loss: 11.7608
[Batch 180] Loss: 12.9401
[Batch 190] Loss: 12.0693
[Batch 200] Loss: 9.2784
[Batch 210] Loss: 10.1894
[Batch 220] Loss: 10.4256
[Batch 230] Loss: 8.2377
[Batch 240] Loss: 8.5830
[Batch 250] Loss: 6.5307
[Batch 260] Loss: 7.8323
[Batch 270] Loss: 9.8828
[Batch 280] Loss: 7.9106
[Batch 290] Loss: 8.8729
[Batch 300] Loss: 9.7937
[Batch 310] Loss: 9.1855
[Batch 320] Loss: 8.5487
[Batch 330] Loss: 7.2863
[Batch 340] Loss: 6.3611
[Batch 350] Loss: 10.3810
[Batch 360] Loss: 4.0772
[Batch 370] Loss: 7.5670
[Batch 380] Loss: 8.8568
[Batch 390] L

In [16]:
from torch.utils.data import DataLoader
import torch

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy*100:.2f}%")


Validation Accuracy: 38.60%


In [17]:
import os

LORA_ADAPTER_DIR = "/kaggle/working/vilt_vqa_lora_adapters"

os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)


In [18]:
LORA_ADAPTER_DIR = "/kaggle/working/vilt_vqa_lora_adapters" # (from Cell 2)
os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)
print(f"Ensured LoRA adapter directory exists: {LORA_ADAPTER_DIR}")

try:
    model.save_pretrained(LORA_ADAPTER_DIR)
    print(f"LoRA adapters (model weights and config) successfully saved to {LORA_ADAPTER_DIR}")

    processor.save_pretrained(LORA_ADAPTER_DIR)
    print(f"Processor configuration successfully saved to {LORA_ADAPTER_DIR}")

    print("\nContents of the adapter directory after saving:")
    for item in os.listdir(LORA_ADAPTER_DIR):
        print(f"  - {item}")

except Exception as e:
    print(f"An error occurred during saving LoRA adapters or processor: {e}")
    print("Please check permissions, disk space, or the model/processor state.")

Ensured LoRA adapter directory exists: /kaggle/working/vilt_vqa_lora_adapters
LoRA adapters (model weights and config) successfully saved to /kaggle/working/vilt_vqa_lora_adapters
Processor configuration successfully saved to /kaggle/working/vilt_vqa_lora_adapters

Contents of the adapter directory after saving:
  - vocab.txt
  - tokenizer_config.json
  - README.md
  - adapter_model.safetensors
  - special_tokens_map.json
  - preprocessor_config.json
  - tokenizer.json
  - adapter_config.json


In [19]:
!zip -r /kaggle/working/vilt_vqa_lora_adapters.zip /kaggle/working/vilt_vqa_lora_adapters

  adding: kaggle/working/vilt_vqa_lora_adapters/ (stored 0%)
  adding: kaggle/working/vilt_vqa_lora_adapters/vocab.txt (deflated 53%)
  adding: kaggle/working/vilt_vqa_lora_adapters/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/vilt_vqa_lora_adapters/README.md (deflated 66%)
  adding: kaggle/working/vilt_vqa_lora_adapters/adapter_model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: kaggle/working/vilt_vqa_lora_adapters/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/vilt_vqa_lora_adapters/preprocessor_config.json (deflated 51%)
  adding: kaggle/working/vilt_vqa_lora_adapters/tokenizer.json (deflated 71%)
  adding: kaggle/working/vilt_vqa_lora_adapters/adapter_config.json (deflated 53%)
