In [2]:
# Mount Google Drive at /content/drive; every dataset, helper-module and
# checkpoint path used later in this notebook lives underneath that mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
!cp /content/drive/MyDrive/Colab\ Notebooks/FYP/data/clevr_dataset_py.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/FYP/models/vilt_vqa_model_py.py /content


In [5]:
import os
import math
import random
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import torch
from torch.utils.data import DataLoader
from transformers import ViltProcessor, get_linear_schedule_with_warmup

# import clevr_dataset_py
# import vilt_vqa_model_py

from clevr_dataset_py import CLEVRCurriculumViltDataset, vilt_collate_fn, build_answer_vocab
from vilt_vqa_model_py import get_clevr_model




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# ------------------------------------------------------------------
# Notebook-wide configuration: dataset locations, ViLT processor,
# answer vocabulary and compute device.
# ------------------------------------------------------------------
QUESTIONS_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
IMAGES_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/images"

# Pretrained ViLT processor that is handed to every dataset built below.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# The answer vocabulary is built from the question files of all five
# training tiers (L1 .. L5), so every tier shares one label space.
tier_paths = [
    os.path.join(QUESTIONS_DIR, f"CLEVR_train_questions_L{level}.json")
    for level in range(1, 6)
]
answer2id = build_answer_vocab(tier_paths)
print("Answer vocab size:", len(answer2id))

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


def data_loader_for_tier(
    tier: int,
    split: str = "train",
    batch_size: int = 16,
    max_length: int = 32,
    shuffle: bool = True,
) -> DataLoader:
    """Build a DataLoader over a single CLEVR curriculum tier.

    The dataset is constructed from the notebook-level ``QUESTIONS_DIR``,
    ``IMAGES_DIR``, ``processor`` and ``answer2id`` values as they are bound
    at call time, and is restricted to ``tiers=[tier]``.

    Args:
        tier: Curriculum tier forwarded to the dataset as ``tiers=[tier]``.
        split: Dataset split name forwarded to the dataset. Shuffling is only
            ever enabled when this is ``"train"``.
        batch_size: Number of samples per batch.
        max_length: Forwarded to the dataset as ``max_length``.
        shuffle: Whether to shuffle; ignored (treated as False) for any split
            other than ``"train"``.

    Returns:
        A ``DataLoader`` that collates samples with ``vilt_collate_fn``.
    """
    tier_dataset = CLEVRCurriculumViltDataset(
        questions_dir=QUESTIONS_DIR,
        images_dir=IMAGES_DIR,
        processor=processor,
        split=split,
        tiers=[tier],
        answer2id=answer2id,
        max_length=max_length,
    )
    return DataLoader(
        tier_dataset,
        batch_size=batch_size,
        # Evaluation splits are always read in order, whatever `shuffle` says.
        shuffle=bool(shuffle) and split == "train",
        num_workers=0,  # keep 0 on Drive
        # Pinned host memory only helps host-to-GPU copies; on a CPU-only
        # runtime it does nothing and PyTorch warns each time an iterator
        # is created, so enable it only when CUDA is present.
        pin_memory=torch.cuda.is_available(),
        collate_fn=vilt_collate_fn,
    )


# Smoke test: draw one shuffled Tier-1 training batch and report the shape
# and dtype of every tensor the collate function produced.
train_loader_L1 = data_loader_for_tier(tier=1, split="train", batch_size=16, shuffle=True)
batch = next(iter(train_loader_L1))
for field_name, field_tensor in batch.items():
    print(field_name, field_tensor.shape, field_tensor.dtype)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Answer vocab size: 28
Device: cpu




pixel_values torch.Size([16, 3, 384, 576]) torch.float32
pixel_mask torch.Size([16, 384, 576]) torch.int64
input_ids torch.Size([16, 32]) torch.int64
token_type_ids torch.Size([16, 32]) torch.int64
attention_mask torch.Size([16, 32]) torch.int64
tier torch.Size([16]) torch.int64
question_id torch.Size([16]) torch.int64
labels torch.Size([16]) torch.int64


In [7]:
# Build the classifier for the CLEVR answer vocabulary and place it on the
# selected device.
model = get_clevr_model(answer2id=answer2id)
model.to(device)

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)

Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-mlm-itm and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import torch

def train_samples(model, train_loader, optimizer, device, epochs=1, max_batches=50):
    """Run a short supervised training loop over ``train_loader``.

    For every batch, the integer class ids stored under ``"labels"`` are
    expanded into a float one-hot matrix of width ``model.config.num_labels``
    before the forward pass. Gradients are clipped to a max norm of 1.0
    before each optimizer step.

    Args:
        model: Module called as ``model(**inputs)``; it must expose
            ``config.num_labels`` and return an object with a ``loss``
            attribute.
        train_loader: Iterable of dict-like batches of tensors. Keys outside
            the model-input whitelist below are dropped before the call.
        optimizer: Optimizer that updates the model's parameters.
        device: Device every kept batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.
        max_batches: Per-epoch cap on processed batches; ``None`` disables it.

    Returns:
        None. The loss is printed every 10th batch, and the average loss is
        printed at the end of each epoch.
    """
    # Only these entries are forwarded to the model; anything else in the
    # batch (for example bookkeeping fields) is discarded.
    model_input_names = frozenset(
        (
            "input_ids",
            "attention_mask",
            "token_type_ids",
            "pixel_values",
            "pixel_mask",
            "labels",
        )
    )

    model.train()

    for epoch_no in range(1, epochs + 1):
        loss_total, batches_done = 0.0, 0

        for batch_no, raw_batch in enumerate(train_loader, start=1):
            if max_batches is not None and batch_no > max_batches:
                break

            # Filter to the whitelisted inputs and move them to the device.
            inputs = {
                name: tensor.to(device)
                for name, tensor in raw_batch.items()
                if name in model_input_names
            }

            # Class ids [B] -> one-hot float targets [B, num_labels].
            class_ids = inputs["labels"]
            targets = torch.zeros(
                (class_ids.size(0), model.config.num_labels),
                device=class_ids.device,
            )
            inputs["labels"] = targets.scatter_(1, class_ids.unsqueeze(1), 1.0)

            loss = model(**inputs).loss

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            batch_loss = loss.item()
            loss_total += batch_loss
            batches_done += 1

            if batch_no % 10 == 0:
                print(f"Epoch {epoch_no} | Batch {batch_no} | Loss: {batch_loss:.4f}")

        # max(..., 1) keeps the average defined when no batch was processed.
        mean_loss = loss_total / max(batches_done, 1)
        print(f"âœ… Epoch {epoch_no} finished | Avg loss: {mean_loss:.4f}")


In [9]:

# Short smoke-training run: a single epoch over at most 50 Tier-1 batches.
train_samples(
    model=model,
    train_loader=train_loader_L1,
    optimizer=optimizer,
    device=device,
    epochs=1,
    max_batches=50,
)

Epoch 1 | Batch 10 | Loss: 12.5392
Epoch 1 | Batch 20 | Loss: 7.1218
Epoch 1 | Batch 30 | Loss: 4.8211
Epoch 1 | Batch 40 | Loss: 4.1678
Epoch 1 | Batch 50 | Loss: 3.5618
âœ… Epoch 1 finished | Avg loss: 7.7943


In [10]:
import os

# Export the in-memory model together with its processor to a checkpoint
# directory on Drive, creating the directory if it does not exist yet.
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/checkpoints/vilt_L1_from_memory"
os.makedirs(SAVE_DIR, exist_ok=True)

# Model first, then processor, both into the same directory.
for artifact in (model, processor):
    artifact.save_pretrained(SAVE_DIR)

print("âœ… Model exported from memory to:", SAVE_DIR)


âœ… Model exported from memory to: /content/drive/MyDrive/Colab Notebooks/FYP/checkpoints/vilt_L1_from_memory


In [18]:
import torch
from transformers import ViltForQuestionAnswering, ViltProcessor

# Reload the exported checkpoint from Drive and run one single-example
# prediction as a round-trip check of the save/load path.
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/checkpoints/vilt_L1_from_memory"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE: this rebinds the notebook-level `model` and `processor` names.
model = ViltForQuestionAnswering.from_pretrained(SAVE_DIR).to(device)
processor = ViltProcessor.from_pretrained(SAVE_DIR)

# Invert the answer vocabulary: class id -> answer string.
id2answer = dict((class_id, answer) for answer, class_id in answer2id.items())

# Index the Dataset held by the DataLoader to fetch one un-batched sample.
ds = train_loader_L1.dataset
sample = ds[3]

print("Sample keys:", list(sample.keys()))

# Keep only the tensors the model takes as input, add a leading batch
# dimension of 1 and move them to the device.
KEEP = {"input_ids", "attention_mask", "token_type_ids", "pixel_values", "pixel_mask"}
sample_inputs = {}
for input_name, input_tensor in sample.items():
    if input_name in KEEP:
        sample_inputs[input_name] = input_tensor.unsqueeze(0).to(device)

# Forward pass without gradient tracking; the arg-max logit is the prediction.
model.eval()
with torch.no_grad():
    logits = model(**sample_inputs).logits
    pred_id = int(logits.argmax(dim=-1).item())

print("âœ… Predicted answer:", id2answer[pred_id])

# When the sample carries its label, show the ground truth for comparison.
if "labels" in sample:
    true_id = int(sample["labels"])
    print("ðŸŽ¯ True answer:", id2answer[true_id])


Sample keys: ['pixel_values', 'pixel_mask', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'tier', 'question_id']
âœ… Predicted answer: yes
ðŸŽ¯ True answer: cube
