### Phase-0: - Setting up the Notebook

#### Step-1: - Importing Necessary Libraries

In [2]:

from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import random
import json
import numpy as np
from tqdm import tqdm

import torch
from datasets import load_dataset, DatasetDict
from PIL import Image
import requests
from io import BytesIO
from multiprocessing.dummy import Pool as ThreadPool  # threads (Jupyter-friendly)
from multiprocessing import cpu_count


#### Step-2:- Setting the Seed and paths

In [3]:
# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

@dataclass
class Config:
    # HF dataset
    dataset_name: str = "allenai/pixmo-cap"
    train_subset_size: int = 1000   # you can change later
    val_subset_size: int = 100

    # Vision encoder
    vision_model_name: str = "facebook/dinov2-base"  # or whatever you use

    # Paths
    root_dir: Path = Path("./data/pixmo")
    features_dir: Path = Path("./data/pixmo/features")

    # Device
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

cfg = Config()
cfg.root_dir.mkdir(parents=True, exist_ok=True)
cfg.features_dir.mkdir(parents=True, exist_ok=True)

print(cfg)


Config(dataset_name='allenai/pixmo-cap', train_subset_size=1000, val_subset_size=100, vision_model_name='facebook/dinov2-base', root_dir=PosixPath('data/pixmo'), features_dir=PosixPath('data/pixmo/features'), device='cpu')


### Load Vision Encoder

In [4]:
from transformers import AutoImageProcessor, AutoModel

print("Loading vision encoder:", cfg.vision_model_name)

vision_processor = AutoImageProcessor.from_pretrained(cfg.vision_model_name)
vision_model = AutoModel.from_pretrained(cfg.vision_model_name, output_hidden_states=True)
vision_model.eval().to(cfg.device)

vision_hidden_size = vision_model.config.hidden_size
perceiver_dim = 2 * vision_hidden_size
print("Vision hidden size:", vision_hidden_size)
print("Perceiver dim (2 * hidden_size):", perceiver_dim)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading vision encoder: facebook/dinov2-base
Vision hidden size: 768
Perceiver dim (2 * hidden_size): 1536


### Cell 3 – Load, Filter, and Split PixMo-Cap

In [5]:
print("Using dataset:", cfg.dataset_name)
pixmo_raw = load_dataset(cfg.dataset_name, split="train")
print(pixmo_raw)
print("Columns:", pixmo_raw.column_names)

Using dataset: allenai/pixmo-cap
Dataset({
    features: ['image_url', 'caption', 'transcripts'],
    num_rows: 717042
})
Columns: ['image_url', 'caption', 'transcripts']


In [6]:
def has_image_and_caption(ex):
    url = ex.get("image_url", None)
    cap = ex.get("caption", None)
    return (url is not None) and (len(str(url)) > 0) and (cap is not None) and (len(cap.strip()) > 0)

pixmo_filtered = pixmo_raw.filter(has_image_and_caption)

print("Original size:", len(pixmo_raw))
print("Filtered size:", len(pixmo_filtered))


Filter: 100%|██████████| 717042/717042 [00:02<00:00, 348291.37 examples/s]

Original size: 717042
Filtered size: 717042





In [7]:
# Train/val split with subset sizes
total = len(pixmo_filtered)
if total == 0:
    raise ValueError("No valid examples left after filtering – check dataset or filter.")

train_size = min(cfg.train_subset_size, total - cfg.val_subset_size)
val_size = min(cfg.val_subset_size, total - train_size)

if train_size <= 0 or val_size <= 0:
    # fallback: simple 90/10 split if config is too aggressive
    print("⚠️ Falling back to 90/10 split due to small dataset.")
    split = pixmo_filtered.train_test_split(test_size=0.1, seed=SEED)
    pixmo_train, pixmo_val = split["train"], split["test"]
else:
    pixmo_train = pixmo_filtered.select(range(train_size))
    pixmo_val   = pixmo_filtered.select(range(train_size, train_size + val_size))

print("Train subset size:", len(pixmo_train))
print("Val subset size:", len(pixmo_val))


Train subset size: 1000
Val subset size: 100


### Image Fetch & Encoder Wrapper

In [8]:
def fetch_image(url: str) -> Image.Image:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    img = Image.open(BytesIO(resp.content)).convert("RGB")
    return img


In [9]:
def encode_image_to_patches(img: Image.Image) -> torch.Tensor:
    """
    img: PIL.Image
    returns: torch.Tensor of shape (num_patches, feat_dim_concat) on CPU
    """
    inputs = vision_processor(images=img, return_tensors="pt").to(cfg.device)

    with torch.no_grad():
        outputs = vision_model(**inputs)
        hidden_states = outputs.hidden_states  # tuple of length (num_layers + 1)

    num_layers_plus_embed = len(hidden_states)
    if num_layers_plus_embed < 4:
        raise ValueError(f"Not enough layers ({num_layers_plus_embed}) to take 2nd and 2nd-to-last.")

    h_low  = hidden_states[1]    # (B, seq_len, D)
    h_high = hidden_states[-2]   # (B, seq_len, D)

    h_low_patches  = h_low[:, 1:, :]    # drop CLS at position 0
    h_high_patches = h_high[:, 1:, :]

    feats = torch.cat([h_low_patches, h_high_patches], dim=-1)  # (B, num_patches, 2D)
    feats = feats[0].to(torch.float16).cpu()  # (num_patches, 2D)

    return feats


### Per-Example Processing & Saving

In [10]:
def process_and_save_example(ex, idx: int, split_name: str):
    """
    ex: example from pixmo_train or pixmo_val
    idx: index in that split
    split_name: "train" or "val"
    returns: metadata dict if success, None if failure
    """
    url = ex["image_url"]
    caption = ex["caption"].strip()

    try:
        img = fetch_image(url)
        feats = encode_image_to_patches(img)  # (num_patches, feat_dim)
    except Exception as e:
        # ❌ No noisy print here
        # ✅ Quietly append to a log file
        log_path = cfg.root_dir / f"{split_name}_skipped.log"
        with open(log_path, "a") as f:
            f.write(
                f"idx={idx}\turl={url}\terror={repr(e)}\n"
            )
        return None

    fname = f"{split_name}_feat_{idx}.pt"
    fpath = cfg.features_dir / fname

    torch.save(
        {
            "features": feats,      # (num_patches, feat_dim), float16
            "caption": caption,     # raw text
            "image_url": url,
            "orig_idx": idx,
        },
        fpath,
    )

    return {
        "orig_idx": idx,
        "file": str(fpath),
        "num_patches": feats.shape[0],
        "feat_dim": feats.shape[1],
    }


### Threaded Builders for Train/Val

In [11]:
def _process_example(args):
    i, ex, split_name = args
    return process_and_save_example(ex, idx=i, split_name=split_name)





In [12]:
def build_split_index_threaded(dataset, split_name: str, workers: Optional[int] = None, chunksize: int = 32):
    """
    Extract & cache features for a given split using threads.
    Creates:
      - <cfg.root_dir>/<split_name>_index.json
      - .pt feature files under cfg.features_dir
    """
    if workers is None:
        workers = min(8, cpu_count())

    try:
        total = len(dataset)
    except TypeError:
        total = None

    tasks = ((i, ex, split_name) for i, ex in enumerate(dataset))
    split_index = []

    with ThreadPool(processes=workers) as pool:
        for meta in tqdm(
            pool.imap_unordered(_process_example, tasks, chunksize=chunksize),
            total=total,
            desc=f"Extracting {split_name} features (threads)",
        ):
            if meta is not None:
                split_index.append(meta)

    index_path = cfg.root_dir / f"{split_name}_index.json"
    with open(index_path, "w") as f:
        json.dump(split_index, f, indent=2)

    print(f"Saved {split_name} index with {len(split_index)} items to {index_path}")
    return split_index


### Run the Builders

In [13]:
# This is the main "build" step for this notebook

train_index = build_split_index_threaded(pixmo_train, "train", workers=4)
val_index   = build_split_index_threaded(pixmo_val,   "val",   workers=4)


Extracting train features (threads): 100%|██████████| 1000/1000 [02:49<00:00,  5.89it/s]


Saved train index with 873 items to data/pixmo/train_index.json


Extracting val features (threads): 100%|██████████| 100/100 [00:14<00:00,  6.71it/s]

Saved val index with 89 items to data/pixmo/val_index.json





### Sanity Check a Saved Feature File

In [14]:
index_path = cfg.root_dir / "train_index.json"

with open(index_path, "r") as f:
    train_index = json.load(f)

print("Loaded", len(train_index), "train items")

if len(train_index) > 0:
    first_meta = train_index[0]
    print("First meta:", first_meta)

    sample_path = first_meta["file"]
    blob = torch.load(sample_path)
    feats = blob["features"]
    caption = blob["caption"]
    print("Loaded features shape:", feats.shape)
    print("Caption snippet:", caption[:200], "...")
else:
    print("No train examples cached – check earlier logs for errors.")


Loaded 873 train items
First meta: {'orig_idx': 64, 'file': 'data/pixmo/features/train_feat_64.pt', 'num_patches': 256, 'feat_dim': 1536}
Loaded features shape: torch.Size([256, 1536])
Caption snippet: In this meme, a close-up photograph of a bald-headed Black man is prominently featured, capturing a raw moment of emotion with tears streaming down his face and lips pursed. The man, with a dark compl ...
