
# NoCaps Validation — CoCa Generation + CLIP-Guided Reranking (Option A)

This notebook:
1. Loads **NoCaps validation** annotations.
2. Loads **OpenCLIP CoCa** for caption generation (one or many candidates).
3. Loads **OpenAI CLIP (ViT-B/32)** for scoring (CLIPScore = cosine similarity).
4. Generates **N candidates per image** (via repeated generation calls with temperature).
5. **Reranks** the N candidates by CLIPScore and keeps the best per image.
6. Evaluates with **BLEU, METEOR, ROUGE_L, CIDEr** *(SPICE skipped to avoid Java)*.
7. Shows qualitative examples.

> Set your paths in the **Config** cell below.


In [None]:
# --- Install dependencies (internet required) ---
%pip install --upgrade pip
%pip install open_clip_torch pillow tqdm torchvision pycocotools
%pip install git+https://github.com/salaniz/pycocoevalcap
# SPICE is skipped by design; if you later want SPICE, install Java 11 and do not skip it.

In [1]:
# --- Imports & Config ---
import os, json, random
from pathlib import Path
from collections import defaultdict

import torch
from PIL import Image
from tqdm import tqdm

import open_clip

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

# ====== Paths: change these to your local files ======
ANN_PATH = "data/nocap_val_4500_captions.json"   # NoCaps validation annotations
IMG_DIR  = "data/validation"                      # Folder with validation images

# Generation & reranking hyperparams
N_CANDIDATES = 5       # N candidates per image (increase for stronger reranking)
SEQ_LEN      = 28      # caption length
TEMP         = 1.1     # >1.0 adds diversity; if outputs repeat, try 1.2~1.3

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Basic checks
assert Path(ANN_PATH).exists(), f"Annotation file not found: {ANN_PATH}"
assert Path(IMG_DIR).exists(), f"Image folder not found: {IMG_DIR}"

# Reproducibility for any deterministic parts
random.seed(0)
torch.manual_seed(0)
if device == "cuda":
    torch.cuda.manual_seed_all(0)

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [2]:
# --- Load annotations + verify 10 refs per image ---
with open(ANN_PATH, "r") as f:
    ann = json.load(f)

id2file = {img["id"]: img["file_name"] for img in ann["images"]}
caps_by_id = defaultdict(list)
for a in ann["annotations"]:
    caps_by_id[a["image_id"]].append(a["caption"])

num_images = len(ann["images"])
lens = [len(caps_by_id[i["id"]]) for i in ann["images"]]

print(f"# images: {num_images}")
print(f"min refs: {min(lens)}, max refs: {max(lens)}, mean refs: {sum(lens)/len(lens):.2f}")

bad = [(i["id"], id2file[i["id"]], len(caps_by_id[i["id"]])) for i in ann["images"] if len(caps_by_id[i["id"]]) != 10]
print("non-10 reference counts:", len(bad))
if not bad:
    ex = ann["images"][0]
    print("Example image:", ex["file_name"])
    print("Refs:", caps_by_id[ex["id"]][:3], "...")

# images: 4500
min refs: 10, max refs: 10, mean refs: 10.00
non-10 reference counts: 0
Example image: 0013ea2087020901.jpg
Refs: ['A baby is standing in front of a house.', 'A little girl in a white jacket and sandals.', 'A young child stands in front of a house.'] ...


In [3]:
# --- Load CoCa (for generation) ---
coca_name = "coca_ViT-L-14"
coca_tag  = "mscoco_finetuned_laion2b_s13b_b90k"  # strong for captioning

coca_model, _, coca_preprocess = open_clip.create_model_and_transforms(coca_name, pretrained=coca_tag)
coca_model = coca_model.to(device).eval()
print("Loaded CoCa:", coca_name, "/", coca_tag)

# --- Load CLIP (for scoring) ---
clip_name = "ViT-B-32"
clip_tag  = "openai"

clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(clip_name, pretrained=clip_tag)
clip_tokenizer = open_clip.get_tokenizer(clip_name)
clip_model = clip_model.to(device).eval()
print("Loaded CLIP:", clip_name, "/", clip_tag)


Loaded CoCa: coca_ViT-L-14 / mscoco_finetuned_laion2b_s13b_b90k




Loaded CLIP: ViT-B-32 / openai


In [4]:
# --- Robust CoCa.generate wrapper ---
try:
    from open_clip import tokenizer as openclip_tok_mod
    _have_openclip_decoder = hasattr(openclip_tok_mod, "decode")
except Exception:
    openclip_tok_mod = None
    _have_openclip_decoder = False

@torch.no_grad()
def generate_caption_openclip(pil_img, max_len=30, temperature=1.0, top_k=None, top_p=None):
    img = coca_preprocess(pil_img).unsqueeze(0).to(device)

    tried = []
    out = None
    for kwargs in (
        dict(seq_len=max_len, temperature=temperature, top_k=top_k, top_p=top_p),
        dict(seq_len=max_len, temperature=temperature),
        dict(max_len=max_len, temperature=temperature),
        dict(seq_len=max_len),
        dict(max_len=max_len),
        dict(),
    ):
        try:
            clean = {k: v for k, v in kwargs.items() if v is not None}
            out = coca_model.generate(img, **clean)
            break
        except TypeError as e:
            tried.append(str(e))
            out = None

    if out is None:
        raise RuntimeError("CoCa.generate() signature not recognized. Tried:\\n" + "\\n".join(tried))

    if isinstance(out, list):
        if len(out) and isinstance(out[0], str):
            return out[0]
        if len(out) and torch.is_tensor(out[0]):
            ids = out[0]
        elif len(out) and isinstance(out[0], (list, tuple)):
            ids = torch.tensor(out[0])
        else:
            return str(out)
    elif torch.is_tensor(out):
        ids = out[0]
    else:
        return str(out)

    if not torch.is_tensor(ids):
        ids = torch.tensor(ids)

    if _have_openclip_decoder:
        return openclip_tok_mod.decode(ids)
    if hasattr(coca_model, "tokenizer") and hasattr(coca_model.tokenizer, "decode"):
        return coca_model.tokenizer.decode(ids.tolist())

    return " ".join(map(str, ids.tolist()))

In [5]:
# --- Generate N candidates (repeat with temperature for diversity) ---
@torch.no_grad()
def generate_n_candidates(pil_img, N=5, seq_len=28, temperature=1.1):
    caps = []
    for _ in range(N):
        cap = generate_caption_openclip(pil_img, max_len=seq_len, temperature=temperature)
        caps.append(cap)
    # Deduplicate while preserving order
    seen = set(); uniq = []
    for c in caps:
        if c not in seen:
            seen.add(c); uniq.append(c)
    return uniq

In [6]:
# --- CLIPScore computation and reranking ---
@torch.no_grad()
def clipscore_rerank(pil_img, candidates):
    if not candidates:
        return None, []

    # Encode image once
    img = clip_preprocess(pil_img).unsqueeze(0).to(device)
    img_feat = clip_model.encode_image(img)
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)

    # Encode all candidate captions
    toks = clip_tokenizer(candidates).to(device)
    txt_feat = clip_model.encode_text(toks)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

    sims = (img_feat @ txt_feat.T).squeeze(0).detach().float().cpu().tolist()
    ranked = sorted(zip(candidates, sims), key=lambda x: x[1], reverse=True)
    best_cap = ranked[0][0]
    return best_cap, ranked

In [8]:

# --- Quick test on one image ---
test_path = Path(IMG_DIR) / id2file[ann["images"][8]["id"]]
if test_path.exists():
    pil = Image.open(test_path).convert("RGB")
    cands = generate_n_candidates(pil, N=N_CANDIDATES, seq_len=SEQ_LEN, temperature=TEMP)
    best, ranked = clipscore_rerank(pil, cands)
    print("Candidates:")
    for c in cands:
        print(" -", c)
    print("\nTop-1 by CLIPScore:\n", best)
else:
    print("Test image missing:", test_path)

Candidates:
 - <start_of_text>a garage that has been torn down on the side of the street . <end_of_text>

Top-1 by CLIPScore:
 <start_of_text>a garage that has been torn down on the side of the street . <end_of_text>


In [None]:
# --- Full dataset: generate N candidates + CLIP rerank ---
preds = []
missing = []
all_candidates_debug = []   # optional: save ranked lists per image

for img_info in tqdm(ann["images"], desc="Captioning + Reranking"):
    iid, fname = img_info["id"], img_info["file_name"]
    fpath = Path(IMG_DIR) / fname
    if not fpath.exists():
        missing.append(fname)
        continue

    pil = Image.open(fpath).convert("RGB")
    cands = generate_n_candidates(pil, N=N_CANDIDATES, seq_len=SEQ_LEN, temperature=TEMP)
    if not cands:
        cands = [generate_caption_openclip(pil, max_len=SEQ_LEN, temperature=TEMP)]

    best, ranked = clipscore_rerank(pil, cands)
    preds.append({"image_id": iid, "caption": best})
    all_candidates_debug.append({
        "image_id": iid,
        "file_name": fname,
        "ranked": [{"caption": c, "clipscore": s} for c, s in ranked]
    })

print("Generated:", len(preds), "/", len(ann["images"]))
print("Missing images:", len(missing))


In [None]:

# --- Save outputs ---
OUT_JSON = "preds_nocaps_val_openclip_cliprank.json"
with open(OUT_JSON, "w") as f:
    json.dump(preds, f)
print("Saved predictions:", OUT_JSON)

DEBUG_JSON = "nocaps_candidates_cliprank.json"
with open(DEBUG_JSON, "w") as f:
    json.dump(all_candidates_debug, f, indent=2)
print("Saved candidates (debug):", DEBUG_JSON)


In [None]:

# --- COCO caption metrics (BLEU/METEOR/ROUGE_L/CIDEr), SPICE skipped ---
coco = COCO(ANN_PATH)
cocoRes = coco.loadRes(OUT_JSON)

evaluator = COCOEvalCap(coco, cocoRes)
# Replace default scorers to avoid SPICE (Java)
evaluator.scorers = [
    (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    (Meteor(), "METEOR"),
    (Rouge(), "ROUGE_L"),
    (Cider(), "CIDEr"),
]
evaluator.evaluate()

print("\n=== NoCaps-val (overall, no SPICE) ===")
for k, v in evaluator.eval.items():
    print(f"{k:10s}: {v:.4f}")


In [None]:

# --- Qualitative examples: show a few images, top caption, and references ---
import random
from IPython.display import display

def show_examples(n=3):
    ids = [im["id"] for im in random.sample(ann["images"], k=n)]
    for iid in ids:
        fname = id2file[iid]
        p = Path(IMG_DIR) / fname
        if not p.exists():
            print("Missing:", fname); continue
        display(Image.open(p).convert("RGB"))
        gen = next((x["caption"] for x in preds if x["image_id"] == iid), None)
        print("Top-1 (CLIP rerank):", gen)
        print("Refs:")
        for rc in caps_by_id[iid][:3]:
            print("  -", rc)
        print("-"*80)

show_examples(3)



## Notes & Tips
- **Transforms**: `coca_preprocess` for CoCa; `clip_preprocess` for CLIP.
- **Diversity**: If candidates repeat, increase `TEMP` to 1.2–1.3. If still identical, your OpenCLIP build may be deterministic; upgrade OpenCLIP or implement a custom decoder.
- **N candidates**: 5–10 is a good range; higher N improves reranking but costs time.
- **SPICE**: Skipped here to avoid Java. If needed, install Java 11 and revert to default scorers.
- **Speed**: CLIP ViT-B/32 is fast; CoCa L-14 is heavy. For quick trials, try `coca_ViT-B-32`.
