# NoCaps Validation — OpenCLIP CoCa End-to-End (Option A)

This notebook:
1. Installs dependencies (OpenCLIP CoCa + COCO caption eval).
2. Loads NoCaps **validation** annotations (expects 10 captions per image).
3. Loads a **pretrained CoCa** from OpenCLIP.
4. Generates captions (beam search) for all images.
5. Evaluates with **BLEU, METEOR, ROUGE_L, CIDEr** (SPICE optional).

> **Paths to set:** `ANN_PATH` and `IMG_DIR` near the top.  
> **Expected files:**  
>  - `data/nocap_val_4500_captions.json`  
>  - `data/validation/<image files>`

In [1]:
# --- Install dependencies (internet required) ---
%pip install --upgrade pip
%pip install open_clip_torch pillow tqdm torchvision pycocotools
%pip install git+https://github.com/salaniz/pycocoevalcap
# Optional for SPICE (Java required):
# !apt-get update && apt-get install -y default-jre

Note: you may need to restart the kernel to use updated packages.
Collecting open_clip_torch
  Using cached open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pycocotools
  Using cached pycocotools-2.0.10-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting regex (from open_clip_torch)
  Using cached regex-2025.9.18-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting ftfy (from open_clip_torch)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting huggingface-hub (from open_clip_torch)
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting safetensors (from open_clip_torch)
  Using cached safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting timm>=1.0.17 (from open_clip_torch)
  Using cached timm-1.0.20-py3-non

In [2]:
import os, json, random
from pathlib import Path
from collections import defaultdict

import torch
from PIL import Image
from tqdm import tqdm

import open_clip
from torchvision import transforms

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# ---- Set your paths here ----
# NoCaps validation annotations (JSON) and the folder that holds the validation images.
ANN_PATH = "data/nocap_val_4500_captions.json"
IMG_DIR = "data/validation"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Stop right away if either input location is missing.
assert Path(ANN_PATH).exists(), f"Annotation file not found: {ANN_PATH}"
assert Path(IMG_DIR).exists(), f"Image folder not found: {IMG_DIR}"

# Fixed seeds for torch and Python's `random` (plus every CUDA device when on GPU).
torch.manual_seed(0)
random.seed(0)
if device == "cuda":
    torch.cuda.manual_seed_all(0)

Device: cuda


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Load annotations + verify 10 refs per image ---
# Read explicitly as UTF-8 so captions with non-ASCII characters load the same way on
# every platform instead of depending on the locale's default encoding.
with open(ANN_PATH, "r", encoding="utf-8") as f:
    ann = json.load(f)

# image id -> file name
id2file = {img["id"]: img["file_name"] for img in ann["images"]}

# image id -> reference captions, in annotation order
caps_by_id = defaultdict(list)
for a in ann["annotations"]:
    caps_by_id[a["image_id"]].append(a["caption"])

num_images = len(ann["images"])
if num_images == 0:
    # min()/max()/mean below are undefined for an empty split; fail with a clear message.
    raise ValueError(f"No images listed in annotation file: {ANN_PATH}")

# Reference count per image. `.get` keeps images without captions at 0 without
# inserting empty entries into the defaultdict as a side effect of the look-up.
lens = [len(caps_by_id.get(i["id"], ())) for i in ann["images"]]

print(f"# images: {num_images}")
print(f"min refs: {min(lens)}, max refs: {max(lens)}, mean refs: {sum(lens)/len(lens):.2f}")

# (image id, file name, reference count) for every image without exactly 10 references
bad = [(i["id"], id2file[i["id"]], n) for i, n in zip(ann["images"], lens) if n != 10]
print("non-10 reference counts:", len(bad))
if bad:
    # Show a few offenders so they can be located in the annotation file.
    print("First offending images (id, file_name, n_refs):", bad[:5])
else:
    first = ann["images"][0]
    print("Example image:", first["file_name"])
    print("Refs:", caps_by_id[first["id"]])

# images: 4500
min refs: 10, max refs: 10, mean refs: 10.00
non-10 reference counts: 0
Example image: 0013ea2087020901.jpg
Refs: ['A baby is standing in front of a house.', 'A little girl in a white jacket and sandals.', 'A young child stands in front of a house.', 'A child is wearing a white shirt and standing on a side walk. ', 'A little boy is standing in his diaper with a white shirt on.', 'A child wearing a diaper and shoes stands on the sidewalk.', 'A child is wearing a light-colored shirt during the daytime.', 'A little kid standing on the pavement in a shirt. ', 'Black and white photo of a little girl smiling.', 'a cute baby is standing alone with white shirt']


In [4]:
import open_clip
# Show every (architecture, pretrained tag) pair the installed open_clip knows about.
# The checkpoints used for captioning in this notebook are the `coca_*` entries.
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_s13m_b4k'),
 ('ViT-

In [5]:
# # --- Load OpenCLIP CoCa ---

# ''' 
# ('coca_ViT-B-32', 'laion2b_s13b_b90k'),
# ('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
# ('coca_ViT-L-14', 'laion2b_s13b_b90k'),
# ('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'),
# '''

# model_name = "coca_ViT-L-14"
# pretrained_tag = "mscoco_finetuned_laion2b_s13b_b90k"

# model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained_tag)
# tokenizer = open_clip.get_tokenizer(model_name)
# model = model.to(device).eval()

# print("Loaded:", model_name, "/", pretrained_tag)

In [6]:
# Fix CA certificate bundle issue for HTTPS downloads
# NOTE(review): pip's own output for this cell says a kernel restart may be needed before
# the reinstalled package is used — consider moving this line to the install cell.
%pip install --force-reinstall certifi

# A separate CLIP model for scoring (small & fast: ViT-B/32)
# The second value returned by create_model_and_transforms is discarded here; only the
# model and `clip_preprocess` are kept.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai"  # or "laion2b_s34b_b79k" if you prefer open LAION weights
)
# Tokenizer matching the scoring model's architecture name.
clip_tokenizer = open_clip.get_tokenizer("ViT-B-32")
# Move to the selected device and switch to eval mode (inference only).
clip_model = clip_model.to(device).eval()

@torch.no_grad()
def clipscore(pil_img, caption: str) -> float:
    """Score how well one caption matches one image with the scoring CLIP model.

    Args:
        pil_img: PIL image to score.
        caption: Caption text to compare against the image.

    Returns:
        The dot product of the L2-normalised image and text embeddings
        (their cosine similarity) as a Python float.
    """
    # The decorator already disables autograd for the whole call, so the body
    # needs no inner `torch.no_grad()` blocks.

    # Image side: preprocess, add a batch dimension, embed, normalise.
    pixels = clip_preprocess(pil_img).unsqueeze(0).to(device)
    image_emb = clip_model.encode_image(pixels)
    image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)

    # Text side: tokenize the single caption, embed, normalise.
    tokens = clip_tokenizer([caption]).to(device)
    text_emb = clip_model.encode_text(tokens)
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

    # One image against one caption leaves a single value to extract.
    return (image_emb @ text_emb.T).item()


Collecting certifi
  Using cached certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Using cached certifi-2025.10.5-py3-none-any.whl (163 kB)
Installing collected packages: certifi
  Attempting uninstall: certifi
    Found existing installation: certifi 2025.10.5
    Uninstalling certifi-2025.10.5:
      Successfully uninstalled certifi-2025.10.5
Successfully installed certifi-2025.10.5
Note: you may need to restart the kernel to use updated packages.




In [7]:
import torch
from pathlib import Path
from PIL import Image

# Probe for open_clip's tokenizer module and whether it offers a module-level `decode`.
# Any failure while importing leaves the module reference as None and the flag False.
openclip_tok_mod = None
try:
    from open_clip import tokenizer as openclip_tok_mod
except Exception:
    openclip_tok_mod = None
_have_openclip_decoder = openclip_tok_mod is not None and hasattr(openclip_tok_mod, "decode")

# (model, preprocess) pairs keyed by (model_name, pretrained_tag), so each checkpoint is
# built and moved to the device once per kernel session instead of once per caption.
_OPENCLIP_GENERATION_MODELS = {}


def _get_generation_model(model_name, pretrained_tag):
    """Return the cached (model, preprocess) pair for a checkpoint, loading it on first use."""
    key = (model_name, pretrained_tag)
    if key not in _OPENCLIP_GENERATION_MODELS:
        model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained_tag)
        _OPENCLIP_GENERATION_MODELS[key] = (model.to(device).eval(), preprocess)
    return _OPENCLIP_GENERATION_MODELS[key]


def _strip_special_tokens(text):
    """Drop the start marker and everything from the first end marker onwards."""
    return text.split("<end_of_text>")[0].replace("<start_of_text>", "").strip()


@torch.no_grad()
def generate_caption_openclip(pil_img, max_len=30, temperature=1.0, model_name=None, pretrained_tag=None, top_k=None, top_p=None):
    """Caption one image with an OpenCLIP CoCa checkpoint.

    Args:
        pil_img: PIL image to caption.
        max_len: Caption length limit, passed to ``model.generate`` as ``seq_len``
            (or ``max_len`` for builds that use that keyword).
        temperature: Passed to ``model.generate`` when the build accepts it.
        model_name: OpenCLIP architecture name, e.g. ``"coca_ViT-L-14"``. Required.
        pretrained_tag: Pretrained weights tag for ``model_name``.
        top_k, top_p: Passed to ``model.generate`` only when not None.

    Returns:
        The caption as a string, without ``<start_of_text>``/``<end_of_text>`` markers.

    Raises:
        ValueError: ``model_name`` was not given.
        RuntimeError: no ``generate()`` call signature was accepted, or token ids came
            back and no decoder is available.
    """
    if not model_name:
        raise ValueError("model_name is required, e.g. model_name='coca_ViT-L-14'")

    # Reuse the already-loaded checkpoint; building it on every call is the dominant
    # cost when captioning thousands of images.
    model, preprocess = _get_generation_model(model_name, pretrained_tag)
    img = preprocess(pil_img).unsqueeze(0).to(device)

    # generate() keyword names differ between open_clip versions: go from the richest
    # call to the barest, dropping None values and skipping repeated combinations.
    attempts = []
    for candidate in (
        dict(seq_len=max_len, temperature=temperature, top_k=top_k, top_p=top_p),
        dict(seq_len=max_len, temperature=temperature),
        dict(max_len=max_len, temperature=temperature),
        dict(seq_len=max_len),
        dict(max_len=max_len),
        dict(),
    ):
        candidate = {k: v for k, v in candidate.items() if v is not None}
        if candidate not in attempts:
            attempts.append(candidate)

    tried = []
    out = None
    for kwargs in attempts:
        try:
            out = model.generate(img, **kwargs)
            break
        except TypeError as e:
            # Treated as "this keyword set is not supported"; try the next one.
            tried.append(str(e))
            out = None

    if out is None:
        raise RuntimeError("open_clip CoCa.generate() signature not recognized. Tried:\n" + "\n".join(tried))

    # --- decode handling ---
    if isinstance(out, list):
        if len(out) and isinstance(out[0], str):
            return _strip_special_tokens(out[0])
        if len(out) and torch.is_tensor(out[0]):
            ids = out[0]
        elif len(out) and isinstance(out[0], (list, tuple)):
            ids = torch.tensor(out[0])
        else:
            detail = type(out[0]) if len(out) else "empty list"
            raise RuntimeError(f"Unexpected list return type from model.generate(): {detail}")
    elif torch.is_tensor(out):
        ids = out[0]  # first (only) item of the batch
    else:
        return str(out)

    if not torch.is_tensor(ids):
        ids = torch.tensor(ids)

    if _have_openclip_decoder:
        text = openclip_tok_mod.decode(ids)
    elif hasattr(model, "tokenizer") and hasattr(model.tokenizer, "decode"):
        text = model.tokenizer.decode(ids.tolist())
    else:
        raise RuntimeError("model.generate returned token IDs but no decoder is available.")

    # open_clip's decoder keeps the start/end markers in the text (its README strips
    # them the same way); left in, they would pollute the caption metrics.
    return _strip_special_tokens(text)


In [8]:
import random

@torch.no_grad()
def generate_n_candidates(pil_img, seq_len=28, temperature=0.9, N=4):
    """
    Caption one image with several CoCa checkpoints and return the distinct captions.

    Iterations 0, 1 and 2 each use their own checkpoint; iteration 3 and every later
    one use the COCO-finetuned ViT-L-14 checkpoint. Each iteration prints its slot
    label and the caption it produced. Repeated captions are dropped (the first
    occurrence is kept), so fewer than N strings may be returned.
    """
    # (printed label, model name, pretrained tag) per iteration; the last row is
    # reused for every iteration past the end of the table.
    slots = (
        ("1", "coca_ViT-B-32", "laion2b_s13b_b90k"),
        ("2", "coca_ViT-B-32", "mscoco_finetuned_laion2b_s13b_b90k"),
        ("3", "coca_ViT-L-14", "laion2b_s13b_b90k"),
        ("4", "coca_ViT-L-14", "mscoco_finetuned_laion2b_s13b_b90k"),
    )
    last_slot = len(slots) - 1

    captions = []
    for idx in range(N):
        label, model_name, pretrained_tag = slots[min(idx, last_slot)]
        print(label)
        caption = generate_caption_openclip(
            pil_img,
            max_len=seq_len,
            temperature=temperature,
            model_name=model_name,
            pretrained_tag=pretrained_tag,
        )
        captions.append(caption)
        print(f"Candidate {idx}: {caption}")

    # dict keys keep insertion order, so this removes repeats without reordering.
    return list(dict.fromkeys(captions))

In [9]:
@torch.no_grad()
def clip_rerank(pil_img, candidates):
    """Rank candidate captions for one image by CLIP image-text cosine similarity.

    Args:
        pil_img: PIL image the captions describe.
        candidates: List of caption strings.

    Returns:
        ``(best_caption, ranked)`` where ``ranked`` is a list of
        ``(caption, similarity)`` pairs sorted from highest to lowest similarity.
        With no candidates the result is ``(None, [])``.
    """
    # Nothing to rank: return before spending a forward pass on the image encoder.
    if not candidates:
        return None, []

    # Embed the image once; it is compared against every candidate below.
    img = clip_preprocess(pil_img).unsqueeze(0).to(device)
    img_feat = clip_model.encode_image(img)
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)

    # Embed all captions in a single batch.
    toks = clip_tokenizer(candidates).to(device)
    txt_feat = clip_model.encode_text(toks)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

    # One row (the image) against every caption; drop the row axis -> (num_cands,)
    sims = (img_feat @ txt_feat.T).squeeze(0)
    sims = sims.detach().float().cpu().tolist()

    # On ties the earliest candidate wins.
    best_idx = max(range(len(candidates)), key=lambda i: sims[i])
    best_caption = candidates[best_idx]
    ranked = sorted(zip(candidates, sims), key=lambda x: x[1], reverse=True)
    return best_caption, ranked


In [10]:
N = 4           # number of candidates per image
SEQ_LEN = 28     # caption length
TEMP = 0.9       # diversity

# Checkpoint used only when generate_n_candidates() returns nothing (e.g. N == 0).
FALLBACK_MODEL = ("coca_ViT-L-14", "mscoco_finetuned_laion2b_s13b_b90k")

preds = []
missing = []
all_candidates_debug = []   # optional: keep all N + scores per image for analysis

for img_info in tqdm(ann["images"], desc="Captioning + CLIP rerank"):
    image_id = img_info["id"]
    fpath = Path(IMG_DIR) / img_info["file_name"]
    if not fpath.exists():
        missing.append(img_info["file_name"])
        continue

    # Open inside a context manager so the file handle is released for every image;
    # convert() returns an independent in-memory copy that outlives the `with`.
    with Image.open(fpath) as raw_img:
        pil = raw_img.convert("RGB")

    # 1) generate N candidates with CoCa
    cands = generate_n_candidates(pil, seq_len=SEQ_LEN, temperature=TEMP, N=N)
    print("Final candidates:", cands)
    if not cands:
        # generate_caption_openclip() has no `N` parameter and needs an explicit
        # checkpoint, so the fallback names one instead of forwarding N.
        cands = [generate_caption_openclip(
            pil,
            max_len=SEQ_LEN,
            temperature=TEMP,
            model_name=FALLBACK_MODEL[0],
            pretrained_tag=FALLBACK_MODEL[1],
        )]

    # 2) CLIP rerank
    # best_cap, ranked = clip_rerank(pil, cands)

    # The value is a list of candidates, so it is stored under the plural key
    # "captions", the same key the later captioning loop in this notebook writes.
    preds.append({"file_name": img_info["file_name"], "image_id": image_id, "captions": cands})
    all_candidates_debug.append({
        "file_name": img_info["file_name"],
        "image_id": image_id,
        # "ranked": [{"caption": c, "clipscore": s} for c, s in ranked]
    })

len(preds), len(all_candidates_debug), len(missing)

Captioning + CLIP rerank:   0%|          | 0/4500 [00:00<?, ?it/s]

1


Captioning + CLIP rerank:   0%|          | 0/4500 [00:02<?, ?it/s]


AssertionError: Please install transformers for generate functionality. `pip install transformers`.

In [11]:
# In a new cell (with internet)
# Installs the package named in the "Please install transformers" AssertionError raised
# by the captioning loop. pip notes that a kernel restart may be needed before an
# already-running kernel uses the new packages.
%pip install -U "transformers>=4.42" "tokenizers>=0.15" accelerate


Collecting transformers>=4.42
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting tokenizers>=0.15
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting accelerate
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.57.0-py3-none-any.whl (12.0 MB)
Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
Using cached accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: tokenizers, transformers, accelerate
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [accelerate]3[0m [transformers]
[1A[2KSuccessfully installed accelerate-1.10.1 tokenizers-0.22.1 transformers-4.57.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Kernel patch intended to make the freshly installed `transformers` visible without a
# restart: open_clip.coca_model is re-executed so its availability flag is recomputed.
# NOTE(review): a later recorded run in this notebook still shows the
# "Please install transformers" AssertionError — confirm that this reload is sufficient,
# otherwise restart the kernel after installing.
import importlib
import open_clip
import open_clip.coca_model as coca_model

import transformers  # just to ensure it's importable now
importlib.reload(coca_model)  # refreshes _has_transformers = True
importlib.reload(open_clip)   # (optional but safe)


<module 'open_clip' from '/home/cse_g3/CoCa-pytorch/.venv/lib/python3.12/site-packages/open_clip/__init__.py'>

In [13]:
# --- Install dependencies (internet required) ---
# Same packages as the first install cell of this notebook, plus the
# transformers/tokenizers/accelerate line that was added after captioning failed.
%pip install --upgrade pip
%pip install open_clip_torch pillow tqdm torchvision pycocotools
%pip install git+https://github.com/salaniz/pycocoevalcap
%pip install -U "transformers>=4.42" "tokenizers>=0.15" accelerate
# Optional for SPICE (Java required):
# !apt-get update && apt-get install -y default-jre


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-5ykyulhm
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-5ykyulhm
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [14]:
# ==== CACHED MODEL LOADER ====
from functools import lru_cache

# (architecture name, pretrained tag) pairs used for captioning: one caption is generated
# per entry, in this order, and the list is cycled when more captions are requested
# than there are entries.
MODEL_SPECS = [
    ("coca_ViT-B-32", "laion2b_s13b_b90k"),
    ("coca_ViT-B-32", "mscoco_finetuned_laion2b_s13b_b90k"),
    ("coca_ViT-L-14", "laion2b_s13b_b90k"),
    ("coca_ViT-L-14", "mscoco_finetuned_laion2b_s13b_b90k"),
]

@lru_cache(maxsize=None)
def _load_openclip_model(model_name: str, pretrained_tag: str):
    """Build one OpenCLIP checkpoint and return ``(model, tokenizer, preprocess)``.

    ``lru_cache`` memoises the result per ``(model_name, pretrained_tag)``, so repeated
    calls with the same arguments hand back the same objects instead of loading again.
    The model is moved to ``device`` and put in eval mode before it is returned.
    """
    net, _unused_transform, image_transform = open_clip.create_model_and_transforms(
        model_name,
        pretrained=pretrained_tag,
    )
    text_tokenizer = open_clip.get_tokenizer(model_name)
    return net.to(device).eval(), text_tokenizer, image_transform

@torch.no_grad()
def generate_caption_openclip_cached(pil_img, model_name: str, pretrained_tag: str,
                                     max_len=30, temperature=1.0, top_k=None, top_p=None):
    """Caption one image with a cached OpenCLIP CoCa checkpoint.

    Args:
        pil_img: PIL image to caption.
        model_name: OpenCLIP architecture name, e.g. ``"coca_ViT-L-14"``.
        pretrained_tag: Pretrained weights tag for ``model_name``.
        max_len: Caption length limit, passed to ``model.generate`` as ``seq_len``
            (or ``max_len`` for builds that use that keyword).
        temperature: Passed to ``model.generate`` when the build accepts it.
        top_k, top_p: Passed to ``model.generate`` only when not None.

    Returns:
        The caption as a string, without ``<start_of_text>``/``<end_of_text>`` markers.

    Raises:
        RuntimeError: no ``generate()`` call signature was accepted.
    """
    def _clean(text):
        # open_clip's decoder keeps the start/end markers in the text (its README
        # strips them the same way); left in, they would pollute the caption metrics.
        return text.split("<end_of_text>")[0].replace("<start_of_text>", "").strip()

    model, tokenizer, preprocess = _load_openclip_model(model_name, pretrained_tag)
    img = preprocess(pil_img).unsqueeze(0).to(device)

    # Try several argument names (OpenCLIP/Coca signatures vary by version), from the
    # richest call to the barest, dropping None values and skipping repeated combinations.
    attempts = []
    for candidate in (
        dict(seq_len=max_len, temperature=temperature, top_k=top_k, top_p=top_p),
        dict(seq_len=max_len, temperature=temperature),
        dict(max_len=max_len, temperature=temperature),
        dict(seq_len=max_len),
        dict(max_len=max_len),
        dict(),
    ):
        candidate = {k: v for k, v in candidate.items() if v is not None}
        if candidate not in attempts:
            attempts.append(candidate)

    tried = []
    out = None
    for kwargs in attempts:
        try:
            out = model.generate(img, **kwargs)
            break
        except TypeError as e:
            # Treated as "this keyword set is not supported"; try the next one.
            tried.append(str(e))
            out = None

    if out is None:
        raise RuntimeError("open_clip CoCa.generate() signature not recognized. Tried:\n" + "\n".join(tried))

    # Normalize outputs to a string
    if isinstance(out, list):
        if len(out) and isinstance(out[0], str):
            return _clean(out[0])
        if len(out) and torch.is_tensor(out[0]):
            ids = out[0]
        elif len(out) and isinstance(out[0], (list, tuple)):
            ids = torch.tensor(out[0])
        else:
            return str(out)
    elif torch.is_tensor(out):
        ids = out[0]  # first (only) item of the batch
    else:
        return str(out)

    # Decode token ids
    if not torch.is_tensor(ids):
        ids = torch.tensor(ids)

    # Decoders in order of preference: open_clip's module-level `decode`, the tokenizer
    # loaded alongside the model, then a tokenizer attached to the model. Looking the
    # decoder up on `open_clip` directly avoids depending on names from another cell.
    openclip_decode = getattr(open_clip, "decode", None)
    if callable(openclip_decode):
        return _clean(openclip_decode(ids))
    if hasattr(tokenizer, "decode"):
        return _clean(tokenizer.decode(ids.tolist()))
    if hasattr(model, "tokenizer") and hasattr(model.tokenizer, "decode"):
        return _clean(model.tokenizer.decode(ids.tolist()))

    # Fallback: join token IDs as string (shouldn't happen in practice)
    return " ".join(map(str, ids.tolist()))

@torch.no_grad()
def generate_n_candidates(pil_img, seq_len=28, temperature=0.9, N=4):
    """
    Return a list of N captions for one image, one per checkpoint in MODEL_SPECS.

    Checkpoints are taken in list order and the list wraps around when N exceeds its
    length. Captions are returned in generation order and repeats are kept.
    """
    n_specs = len(MODEL_SPECS)
    # Lazily yields the (model name, pretrained tag) pair for each of the N slots.
    chosen_specs = (MODEL_SPECS[slot % n_specs] for slot in range(N))
    return [
        generate_caption_openclip_cached(
            pil_img,
            model_name=spec_name,
            pretrained_tag=spec_tag,
            max_len=seq_len,
            temperature=temperature,
        )
        for spec_name, spec_tag in chosen_specs
    ]

# ======= MAIN LOOP (always 4 captions per image) =======
N = 4         # number of captions per image
SEQ_LEN = 28  # caption length
TEMP = 0.9    # sampling temperature

preds = []                  # one entry per captioned image
missing = []                # file names listed in the annotations but absent on disk
all_candidates_debug = []   # placeholder for per-image rerank details

for img_info in tqdm(ann["images"], desc="Captioning (+optional CLIP rerank)"):
    image_id = img_info["id"]
    fpath = Path(IMG_DIR) / img_info["file_name"]
    if not fpath.exists():
        missing.append(img_info["file_name"])
        continue

    # Manage the *opened file* with the context manager. `Image.open(...).convert(...)`
    # inside `with` would only manage the converted copy, not the original handle.
    # convert() returns an independent in-memory image that is safe to use afterwards.
    with Image.open(fpath) as raw_img:
        pil = raw_img.convert("RGB")

    # 1) generate N candidates (no dedupe)
    cands = generate_n_candidates(pil, seq_len=SEQ_LEN, temperature=TEMP, N=N)

    # 2) (Optional) CLIP rerank just for debugging/inspection, not changing the saved list
    # best_cap, ranked = clip_rerank(pil, cands)

    # Save ALL 4 captions for this image
    preds.append({
        "file_name": img_info["file_name"],
        "image_id": image_id,
        "captions": cands,   # <-- list of 4 strings
    })

    # Optional debug block if you later re-enable reranking
    all_candidates_debug.append({
        "file_name": img_info["file_name"],
        "image_id": image_id,
        # "ranked": [{"caption": c, "clipscore": s} for c, s in ranked],
    })

print(f"Images processed: {len(preds)}; missing files: {len(missing)}")

# --- Save predictions ---
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is
# pinned to UTF-8 instead of being left to the platform default.
OUT_JSON = "preds_nocaps_val_openclip.json"
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(preds, f, ensure_ascii=False, indent=2)
print("Saved:", OUT_JSON)


Captioning (+optional CLIP rerank):   0%|          | 0/4500 [00:01<?, ?it/s]


AssertionError: Please install transformers for generate functionality. `pip install transformers`.

In [None]:
# --- Save predictions ---
# Writes `preds` as compact (non-indented) JSON, replacing any existing file of that name.
OUT_JSON = "preds_nocaps_val_openclip.json"
with open(OUT_JSON, "w") as out_file:
    out_file.write(json.dumps(preds))
print("Saved:", OUT_JSON)

In [None]:
# # %pip install clip-by-openai

# import clip
# import torch
# from PIL import Image

# device = "cuda" if torch.cuda.is_available() else "cpu"
# model, preprocess = clip.load("ViT-B/32", device=device)

# image = preprocess(Image.open("data/validation/0013ea2087020901.png")).unsqueeze(0).to(device)
# text = clip.tokenize(["this photo is one of the first photos i have of my great - great - great great great great great great great great great great great ",
#             "a little boy that is standing up with a bat",
#             "1 9 5 0 - 0 4 - 0 1 - baby - in - front - of - house - 0 1 . jpg",
#             "an old black and white photo of a little boy"]).to(device)

# with torch.no_grad():
#     image_features = model.encode_image(image)
#     text_features = model.encode_text(text)
    
#     logits_per_image, logits_per_text = model(image, text)
#     probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

In [None]:
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# The caption metrics expect exactly one generated caption per image. When a saved
# prediction carries several candidates, this index selects the one that is scored.
EVAL_CAPTION_INDEX = 0


def _to_coco_results(entries, caption_index=0):
    """Turn saved predictions into COCO caption results (one string caption per image).

    Accepts entries whose caption is a plain string under "caption", or a list of
    candidates under "captions"/"caption". For lists, the candidate at
    ``caption_index`` is used (clamped to the last one). Entries with no caption
    are skipped.
    """
    results = []
    for entry in entries:
        caption = entry.get("captions", entry.get("caption"))
        if isinstance(caption, (list, tuple)):
            if not caption:
                continue
            caption = caption[min(caption_index, len(caption) - 1)]
        if caption is None:
            continue
        results.append({"image_id": entry["image_id"], "caption": str(caption)})
    return results


# Load GT and predictions
coco = COCO(ANN_PATH)
with open(OUT_JSON, "r", encoding="utf-8") as f:
    saved_preds = json.load(f)
coco_results = _to_coco_results(saved_preds, EVAL_CAPTION_INDEX)
if not coco_results:
    raise ValueError(f"No usable captions found in {OUT_JSON}")
cocoRes = coco.loadRes(coco_results)

evaluator = COCOEvalCap(coco, cocoRes)
# Score only the images that actually have a prediction.
evaluator.params["image_id"] = cocoRes.getImgIds()

# COCOEvalCap.evaluate() builds its scorer list (including SPICE) internally, so assigning
# `evaluator.scorers` has no effect. Run tokenization and the chosen scorers directly.
img_ids = evaluator.params["image_id"]
gts = {img_id: coco.imgToAnns[img_id] for img_id in img_ids}
res = {img_id: cocoRes.imgToAnns[img_id] for img_id in img_ids}

ptb_tokenizer = PTBTokenizer()
gts = ptb_tokenizer.tokenize(gts)
res = ptb_tokenizer.tokenize(res)

scorers = [
    (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    (Meteor(), "METEOR"),
    (Rouge(), "ROUGE_L"),
    (Cider(), "CIDEr"),
]

# Run evaluation; results are stored on `evaluator.eval` as evaluate() would do.
for scorer, method in scorers:
    score, per_image_scores = scorer.compute_score(gts, res)
    if isinstance(method, list):
        # BLEU returns one score per n-gram order.
        for metric_name, metric_value in zip(method, score):
            evaluator.setEval(metric_value, metric_name)
    else:
        evaluator.setEval(score, method)

print("\n=== NoCaps-val (overall, no SPICE) ===")
for k, v in evaluator.eval.items():
    print(f"{k:10s}: {v:.4f}")


## Notes
- Beam size 3–5 is good for CIDEr.
- Max caption length ~20–30 tokens.
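- SPICE requires Java; in `pycocoevalcap` the PTB tokenizer and the METEOR scorer are Java programs as well, so a JRE is needed even when SPICE is skipped.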
- Leaderboard results differ (use online eval server for test split).