# NoCaps Validation — OpenCLIP CoCa End-to-End (Option A)

This notebook:
1. Installs dependencies (OpenCLIP CoCa + COCO caption eval).
2. Loads NoCaps **validation** annotations (expects 10 captions per image).
3. Loads a **pretrained CoCa** from OpenCLIP.
4. Generates captions (beam search) for all images.
5. Evaluates with **BLEU, METEOR, ROUGE_L, CIDEr** (SPICE optional).

> **Paths to set:** `ANN_PATH` and `IMG_DIR` near the top.  
> **Expected files:**  
>  - `data/nocap_val_4500_captions.json`  
>  - `data/validation/<image files>`

In [1]:
# --- Install dependencies (internet required) ---
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
%pip install --upgrade pip
# Distributions behind the imports used below: open_clip, PIL, tqdm, torchvision, pycocotools.
%pip install open_clip_torch pillow tqdm torchvision pycocotools
# Caption metrics (pycocoevalcap) are installed straight from the GitHub repository.
%pip install git+https://github.com/salaniz/pycocoevalcap
# Optional for SPICE (Java required):
# !apt-get update && apt-get install -y default-jre

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-l9sqn96x
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-l9sqn96x
[31mERROR: Operation cancelled by user[0m[31m
[0m^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os, json, random
from pathlib import Path
from collections import defaultdict

import torch
from PIL import Image
from tqdm import tqdm

import open_clip
from torchvision import transforms

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# ---- Set your paths here ----
ANN_PATH = "data/nocap_val_4500_captions.json"   # NoCaps validation annotations
IMG_DIR  = "data/validation"                      # folder containing validation images

# Run on the GPU whenever PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Stop right away when either input location is absent.
assert Path(ANN_PATH).exists(), f"Annotation file not found: {ANN_PATH}"
assert Path(IMG_DIR).exists(), f"Image folder not found: {IMG_DIR}"

# Fixed seeds for Python's RNG and PyTorch (plus every CUDA device when on GPU).
random.seed(0)
torch.manual_seed(0)
if device == "cuda":
    torch.cuda.manual_seed_all(0)

In [None]:
# --- Load annotations + verify 10 refs per image ---
EXPECTED_REFS = 10  # this notebook expects 10 reference captions per image

# Explicit UTF-8: the platform default encoding varies and captions may hold non-ASCII text.
with open(ANN_PATH, "r", encoding="utf-8") as f:
    ann = json.load(f)

# image id -> file name, and image id -> list of reference captions
id2file = {img["id"]: img["file_name"] for img in ann["images"]}
caps_by_id = defaultdict(list)
for a in ann["annotations"]:
    caps_by_id[a["image_id"]].append(a["caption"])

num_images = len(ann["images"])
# Guard the statistics below (min/max/mean) against an empty image list.
assert num_images > 0, f"No images listed in annotation file: {ANN_PATH}"
lens = [len(caps_by_id[i["id"]]) for i in ann["images"]]

print(f"# images: {num_images}")
print(f"min refs: {min(lens)}, max refs: {max(lens)}, mean refs: {sum(lens)/len(lens):.2f}")

# (image id, file name, #refs) for every image that deviates from the expected count.
bad = [
    (i["id"], id2file[i["id"]], n_refs)
    for i, n_refs in zip(ann["images"], lens)
    if n_refs != EXPECTED_REFS
]
print("non-10 reference counts:", len(bad))
if bad:
    # Surface a few offenders so the mismatch can be investigated.
    print("First offending images (id, file, #refs):", bad[:5])
else:
    first = ann["images"][0]
    print("Example image:", first["file_name"])
    print("Refs:", caps_by_id[first["id"]])

In [None]:
import open_clip

# Show only the CoCa (architecture, pretrained tag) pairs; the full registry is long
# and everything else is irrelevant to this notebook.
[(arch, tag) for arch, tag in open_clip.list_pretrained() if arch.lower().startswith("coca")]

In [None]:
# --- Load OpenCLIP CoCa ---

# CoCa (architecture, pretrained tag) pairs noted for reference:
#   ('coca_ViT-B-32', 'laion2b_s13b_b90k'),
#   ('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
#   ('coca_ViT-L-14', 'laion2b_s13b_b90k'),
#   ('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'),

model_name = "coca_ViT-L-14"
pretrained_tag = "mscoco_finetuned_laion2b_s13b_b90k"

# The middle return value is not used by this notebook.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name,
    pretrained=pretrained_tag,
)
tokenizer = open_clip.get_tokenizer(model_name)

# Move the weights to the selected device and switch to inference mode.
model = model.to(device)
model.eval()

print("Loaded:", model_name, "/", pretrained_tag)

In [None]:
import torch
from pathlib import Path
from PIL import Image

try:
    from open_clip import tokenizer as openclip_tok_mod
    _have_openclip_decoder = hasattr(openclip_tok_mod, "decode")
except Exception:
    openclip_tok_mod = None
    _have_openclip_decoder = False

@torch.no_grad()
def generate_caption_openclip(pil_img, max_len=30, temperature=1.0, top_k=None, top_p=None):
    img = preprocess(pil_img).unsqueeze(0).to(device)

    tried = []
    out = None
    for kwargs in (
        dict(seq_len=max_len, temperature=temperature, top_k=top_k, top_p=top_p),
        dict(seq_len=max_len, temperature=temperature),
        dict(max_len=max_len, temperature=temperature),
        dict(seq_len=max_len),
        dict(max_len=max_len),
        dict(),
    ):
        try:
            out = model.generate(img, **{k: v for k, v in kwargs.items() if v is not None})
            break
        except TypeError as e:
            tried.append(str(e))
            out = None

    if out is None:
        raise RuntimeError("open_clip CoCa.generate() signature not recognized. Tried:\n" + "\n".join(tried))

    # --- decode handling ---
    if isinstance(out, list):
        if len(out) and isinstance(out[0], str):
            return out[0]
        if len(out) and torch.is_tensor(out[0]):
            ids = out[0]
        elif len(out) and isinstance(out[0], (list, tuple)):
            ids = torch.tensor(out[0])
        else:
            raise RuntimeError(f"Unexpected list return type from model.generate(): {type(out[0])}")
    elif torch.is_tensor(out):
        ids = out[0]
    else:
        return str(out)

    if not torch.is_tensor(ids):
        ids = torch.tensor(ids)
    if _have_openclip_decoder:
        return openclip_tok_mod.decode(ids)
    if hasattr(model, "tokenizer") and hasattr(model.tokenizer, "decode"):
        return model.tokenizer.decode(ids.tolist())

    raise RuntimeError("model.generate returned token IDs but no decoder is available.")

In [None]:
# Smoke test: caption the first image listed in the annotation file.
test_path = Path(IMG_DIR).joinpath(id2file[ann["images"][0]["id"]])
print(
    "Test caption:",
    generate_caption_openclip(Image.open(test_path).convert("RGB")),
)

In [None]:
# Debug probe: look at what model.generate returns for the test image.
with torch.no_grad():
    raw = model.generate(
        preprocess(Image.open(test_path).convert("RGB")).unsqueeze(0).to(device),
        seq_len=30,
    )

print(type(raw), isinstance(raw, list), torch.is_tensor(raw))
# Lists are previewed by their first element, anything else by its shape.
if isinstance(raw, list):
    print(raw[:1])
else:
    print(raw.shape)

In [None]:
# --- Generate predictions for all images ---
preds = []
missing = []
for record in tqdm(ann["images"], desc="Captioning"):
    image_id = record["id"]
    file_name = record["file_name"]
    image_path = Path(IMG_DIR) / file_name
    if image_path.exists():
        rgb_image = Image.open(image_path).convert("RGB")
        preds.append({"image_id": image_id, "caption": generate_caption_openclip(rgb_image)})
    else:
        # No file on disk for this annotation entry: remember it and move on.
        missing.append(file_name)

print("Generated:", len(preds), "/", num_images)
print("Missing images:", len(missing))

In [None]:
# --- Save predictions ---
OUT_JSON = "preds_nocaps_val_openclip.json"
# Serialise the list of {"image_id", "caption"} records first, then write it in one go.
with open(OUT_JSON, "w") as f:
    f.write(json.dumps(preds))
print("Saved:", OUT_JSON)

In [None]:
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# Load GT and predictions
coco = COCO(ANN_PATH)
cocoRes = coco.loadRes(OUT_JSON)

evaluator = COCOEvalCap(coco, cocoRes)
# Score only images that actually received a caption. The generation step skips
# files missing from IMG_DIR, and the default id list (every ground-truth image)
# would hand the scorers images without a hypothesis.
evaluator.params["image_id"] = cocoRes.getImgIds()

# COCOEvalCap.evaluate() builds its scorer list internally (SPICE included), so
# assigning `evaluator.scorers` has no effect. Run the same steps by hand with
# exactly the scorers we want.
scorers = [
    (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    (Meteor(), "METEOR"),
    (Rouge(), "ROUGE_L"),
    (Cider(), "CIDEr"),
]

# Collect references and hypotheses per image id, then tokenize both the same way.
img_ids = evaluator.params["image_id"]
gts = {img_id: coco.imgToAnns[img_id] for img_id in img_ids}
res = {img_id: cocoRes.imgToAnns[img_id] for img_id in img_ids}

ptb_tokenizer = PTBTokenizer()
gts = ptb_tokenizer.tokenize(gts)
res = ptb_tokenizer.tokenize(res)

# Run evaluation; corpus-level scores are stored on `evaluator.eval`.
for scorer, method in scorers:
    score, per_image_scores = scorer.compute_score(gts, res)
    if isinstance(method, list):
        # BLEU returns one score per n-gram order.
        for metric_name, metric_value in zip(method, score):
            evaluator.setEval(metric_value, metric_name)
    else:
        evaluator.setEval(score, method)

print("\n=== NoCaps-val (overall, no SPICE) ===")
for k, v in evaluator.eval.items():
    print(f"{k:10s}: {v:.4f}")


In [None]:
# --- Show some qualitative examples ---
import random
from IPython.display import display

def show_examples(n=3):
    """Display up to `n` randomly chosen validation images with their captions.

    For each sampled image this shows the picture resized to 384x384, prints the
    generated caption found in `preds` (None when there is none) and up to three
    reference captions from `caps_by_id`. Images whose file is not present under
    `IMG_DIR` are reported and skipped.

    Args:
        n: Number of examples requested. Clamped to [0, number of images] so an
           oversized or negative request no longer makes `random.sample` raise.
    """
    images = ann["images"]
    sample_size = max(0, min(n, len(images)))
    for im in random.sample(images, k=sample_size):
        iid = im["id"]
        f = id2file[iid]
        p = Path(IMG_DIR) / f
        if not p.exists():
            print("Missing:", f)
            continue
        # Fixed-size thumbnail for display only (aspect ratio is not preserved).
        img = Image.open(p).convert("RGB").resize((384, 384))
        display(img)
        ref_caps = caps_by_id[iid][:3]
        # First prediction recorded for this image, if any.
        gen = next((x["caption"] for x in preds if x["image_id"] == iid), None)
        print("Generated:", iid,gen)
        print("Refs:")
        for rc in ref_caps:
            print("  -", rc)
        print("-"*80)

# Show three randomly sampled validation images with generated and reference captions.
show_examples(3)

## Notes
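- Beam size 3–5 is usually a good choice for CIDEr; this notebook does not set a beam size, so generation uses OpenCLIP's default.
- Max caption length: ~20–30 tokens (the notebook uses `max_len=30`).
- SPICE requires Java; in `pycocoevalcap` the PTB tokenizer and the METEOR scorer also run Java.
- Leaderboard results will differ from these numbers (use the online eval server for the test split).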