<a href="https://colab.research.google.com/github/SenaliWij/competence-aware-curriculum-framework-VQA/blob/master/clevr_dataset_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%%time

# Core dependencies
# transformers provides ViltProcessor, torch provides tensors and the
# Dataset/DataLoader classes, and pillow provides PIL.Image (all imported in a
# later cell). -q keeps pip's output quiet.
!pip install -q transformers torch torchvision pillow

# Optional but recommended
!pip install -q tqdm  # For progress bars during training
!pip install -q wandb  # For experiment tracking (optional)

CPU times: user 2.95 s, sys: 424 ms, total: 3.37 s
Wall time: 10.3 s


In [5]:
# Mount Google Drive at /content/drive so files stored there can be read from
# and written to by this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### **1. ENCODING IMAGES AND QUESTIONS USING ViLT PROCESSOR**

In [6]:
import os
import json
from typing import Dict, List, Optional, Any

import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import ViltProcessor

Utility functions


In [7]:
def load_questions(path: str) -> List[Dict[str, Any]]:
    """Return the ``"questions"`` list stored in a JSON question file.

    Args:
        path: Path to a JSON file whose top-level object has a
            ``"questions"`` key.

    Returns:
        The value stored under the top-level ``"questions"`` key.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the top-level object has no ``"questions"`` key.
    """
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)["questions"]


def build_answer_vocab(paths: List[str]) -> Dict[str, int]:
    """Build an answer-to-index mapping from one or more question files.

    Every question that has an ``"answer"`` key contributes its answer,
    normalised by converting to ``str``, stripping surrounding whitespace and
    lower-casing. Indices follow the sorted order of the distinct normalised
    answers, so the mapping is deterministic for a given set of files.

    Args:
        paths: Question-file paths, each readable by ``load_questions``.

    Returns:
        Mapping from normalised answer string to a contiguous index
        starting at 0.
    """
    unique_answers = {
        str(entry["answer"]).strip().lower()
        for question_path in paths
        for entry in load_questions(question_path)
        if "answer" in entry
    }
    vocab: Dict[str, int] = {}
    for index, answer in enumerate(sorted(unique_answers)):
        vocab[answer] = index
    return vocab

Dataset class (loads image + question and encodes using ViLT)

In [8]:
class CLEVRCurriculumViltDataset(Dataset):
    """CLEVR question/image pairs, encoded on the fly with a ViLT processor.

    Questions are read at construction time, either from per-tier files named
    ``CLEVR_{split}_questions_L{tier}.json`` or from the single untiered file
    ``CLEVR_{split}_questions.json``. Images are opened lazily in
    ``__getitem__`` from ``<images_dir>/<split>/``.

    Args:
        questions_dir: Directory holding the question JSON files.
        images_dir: Directory holding one sub-folder of images per split.
        processor: Processor called with ``images=`` and ``text=`` to encode
            each (image, question) pair.
        split: One of ``"train"``, ``"val"`` or ``"test"``.
        answer2id: Mapping from normalised (stripped, lower-cased) answer
            string to class index. When ``None``, no ``labels`` are emitted.
        tiers: Tier numbers to load for the ``"train"``/``"val"`` splits.
            When ``None`` -- or when ``split`` is ``"test"``, for which this
            argument is ignored -- the untiered question file is used.
        max_length: Length questions are padded/truncated to by the processor.

    Raises:
        ValueError: If ``split`` is not one of the supported split names.
        FileNotFoundError: If the split's image folder or a required
            question file does not exist.
    """

    _SPLITS = frozenset({"train", "val", "test"})
    # Only these splits have per-tier question files.
    _TIERED_SPLITS = frozenset({"train", "val"})

    def __init__(
        self,
        questions_dir: str,
        images_dir: str,
        processor: ViltProcessor,
        split: str,
        answer2id: Optional[Dict[str, int]] = None,
        tiers: Optional[List[int]] = None,     # only used for train/val
        max_length: int = 32,
    ):
        # Raise explicitly instead of `assert`, which is stripped under -O.
        if split not in self._SPLITS:
            raise ValueError(
                f"split must be one of {sorted(self._SPLITS)}, got {split!r}"
            )

        self.questions_dir = questions_dir
        self.images_dir = images_dir
        self.processor = processor
        self.split = split
        self.answer2id = answer2id
        self.max_length = max_length

        # sanity: ensure split image folder exists
        self.split_img_dir = os.path.join(images_dir, split)
        if not os.path.isdir(self.split_img_dir):
            raise FileNotFoundError(f"Image folder not found in Colab FS: {self.split_img_dir}")

        self.samples: List[Dict[str, Any]] = []

        if split in self._TIERED_SPLITS and tiers is not None:
            for t in tiers:
                qpath = os.path.join(questions_dir, f"CLEVR_{split}_questions_L{t}.json")
                self._append_samples(qpath, t, f"Tier file not found: {qpath}")
        else:
            qpath = os.path.join(questions_dir, f"CLEVR_{split}_questions.json")
            # Untiered samples are tagged with tier -1.
            self._append_samples(qpath, -1, f"{split} questions file not found: {qpath}")

    def _append_samples(self, qpath: str, tier: int, missing_message: str) -> None:
        """Append every question in ``qpath`` to ``self.samples`` with ``tier``."""
        if not os.path.exists(qpath):
            raise FileNotFoundError(missing_message)

        self.samples.extend(
            {
                "question": q["question"],
                "answer": q.get("answer"),  # may be absent (e.g. test split)
                "image_filename": q["image_filename"],
                "question_index": q.get("question_index", -1),
                "tier": tier,
            }
            for q in load_questions(qpath)
        )

    def __len__(self) -> int:
        """Return the number of loaded question samples."""
        return len(self.samples)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Encode sample ``idx`` and return it as a dict of tensors.

        The result holds every entry produced by the processor with its
        leading dimension squeezed away, plus ``tier`` and ``question_id``.
        ``labels`` is included only when the sample has an answer and
        ``answer2id`` was provided.

        Raises:
            FileNotFoundError: If the sample's image file does not exist.
            KeyError: If the sample's answer is not present in ``answer2id``.
        """
        s = self.samples[idx]
        img_path = os.path.join(self.split_img_dir, s["image_filename"])

        # Open directly rather than checking existence first: saves one
        # filesystem round-trip per item. The context manager closes the file
        # handle deterministically; convert() returns an independent image.
        try:
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Image not found: {img_path}") from err

        enc = self.processor(
            images=image,
            text=s["question"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # The processor is called on a single pair; drop the leading
        # dimension so the collate function can stack items itself.
        item = {k: v.squeeze(0) for k, v in enc.items()}

        # labels if available
        answer = s.get("answer")
        if answer is not None and self.answer2id is not None:
            # Same normalisation as build_answer_vocab.
            key = str(answer).strip().lower()
            if key not in self.answer2id:
                raise KeyError(
                    f"Answer {key!r} (question_index={s['question_index']}) "
                    "is not in answer2id"
                )
            item["labels"] = torch.tensor(self.answer2id[key], dtype=torch.long)

        item["tier"] = torch.tensor(s["tier"], dtype=torch.long)
        item["question_id"] = torch.tensor(s["question_index"], dtype=torch.long)
        return item


def vilt_collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
    """Stack a list of per-sample tensor dicts into one batched dict.

    Every key of the first sample except ``"labels"`` is stacked along a new
    leading dimension. ``"labels"`` is added (last) only when every sample in
    the batch carries it, so a partially labelled batch yields no labels.

    Args:
        batch: Non-empty list of samples as returned by the dataset.

    Returns:
        Mapping from key to the stacked tensor for that key.
    """
    first_sample = batch[0]
    collated: Dict[str, torch.Tensor] = {
        name: torch.stack([sample[name] for sample in batch])
        for name in first_sample
        if name != "labels"
    }

    any_unlabelled = any("labels" not in sample for sample in batch)
    if not any_unlabelled:
        collated["labels"] = torch.stack([sample["labels"] for sample in batch])

    return collated

Encoding process (build the answer vocab, create the dataset and dataloader, and save an encoded batch as a .pt file)

In [9]:

# # Configuration (paths + processor)
# QUESTIONS_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
# IMAGES_DIR    = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/images"

# processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# # Build answer vocab (from all train tiers)
# tier_paths = [os.path.join(QUESTIONS_DIR, f"CLEVR_train_questions_L{i}.json") for i in [1,2,3,4,5]]
# answer2id = build_answer_vocab(tier_paths)
# print("Answer vocab size:", len(answer2id))

# # Create dataset + dataloader (Tier 1 only)

# train_ds_L1 = CLEVRCurriculumViltDataset(
#     questions_dir=QUESTIONS_DIR,
#     images_dir=IMAGES_DIR,
#     processor=processor,
#     split="train",
#     tiers=[1],
#     answer2id=answer2id,
#     max_length=32,
# )

# train_loader_L1 = DataLoader(
#     train_ds_L1,
#     batch_size=16,
#     shuffle=True,
#     num_workers=0,  # keep 0 while using Google Drive
#     collate_fn=vilt_collate_fn,
# )


# # Sanity check (get one batch + print shapes)

# batch = next(iter(train_loader_L1))
# for k, v in batch.items():
#     print(k, v.shape, v.dtype)

# # Save encoded batch (.pt)

# OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/encode_output"
# os.makedirs(OUT_DIR, exist_ok=True)

# batch_path = os.path.join(OUT_DIR, "sanity_batch_L1.pt")
# torch.save(batch, batch_path)
# print("Saved:", batch_path)
