# Inference + Evaluation Script (BLEU-1..4, METEOR, ROUGE-L & CIDEr)

In [1]:
# Clone the pycocoevalcap repository; the Cider/Bleu/Meteor/Rouge scorers imported
# in the next cell come from this package.
# NOTE(review): re-running this cell in the same session will likely make
# `git clone` fail because the target directory already exists — confirm.
!git clone https://github.com/salaniz/pycocoevalcap.git
# Install the package from the local checkout into the current environment.
%cd pycocoevalcap
!pip install .
# Go back to the directory the notebook started in.
%cd ..

Cloning into 'pycocoevalcap'...
remote: Enumerating objects: 821, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 821 (delta 4), reused 3 (delta 3), pack-reused 809 (from 2)[K
Receiving objects: 100% (821/821), 130.06 MiB | 35.79 MiB/s, done.
Resolving deltas: 100% (424/424), done.
/kaggle/working/pycocoevalcap
Processing /kaggle/working/pycocoevalcap
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycocoevalcap
  Building wheel for pycocoevalcap (setup.py) ... [?25l[?25hdone
  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312245 sha256=c27fa6535fa655a9ab2e41ffd67af4472f914f383249936b01970766d63c966b
  Stored in directory: /tmp/pip-ephem-wheel-cache-3_8h83br/wheels/e1/95/5b/9a3357937c812a0ff04bc78701371bb96f914719385ff3183f
Successfully built pycocoevalcap
Installing collected packages: pycocoevalcap
Successfully

In [2]:
import os, csv, numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge

import sentencepiece as spm

# === CONFIG ===
# Input locations: model weights, SentencePiece model, tokenized captions and the
# per-video feature directories.
CHECKPOINT_PATH = "/kaggle/input/mixcap-final-best-model/MixCap_model_only.pth"
SPM_PATH = "/kaggle/input/mrsvtt-features-final-full-dataset/tokenizer/spm.model"
# NOTE(review): the name says VAL but the path points at test_captions.npy, and the
# feature directories below are under test_set/ — this notebook evaluates the test split.
VAL_CAPTION_PATH = "/kaggle/input/mrsvtt-features-final-full-dataset/tokenizer/tokenized_captions/test_captions.npy"
VIDEO_DIR = "/kaggle/input/mrsvtt-features-final-full-dataset/test_set/video"
AUDIO_DIR = "/kaggle/input/mrsvtt-features-final-full-dataset/test_set/audio"
# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
sp_model = spm.SentencePieceProcessor()
sp_model.load(SPM_PATH)

# Special token ids and vocabulary size used by the decoder and the greedy decoding loop.
# NOTE(review): these values are hard-coded; presumably they match the ids used when the
# captions were tokenized and the model was trained — confirm against the SentencePiece model.
PAD_ID = 4
SOS_ID = 5
EOS_ID = 6
VOCAB_SIZE = 8000

In [3]:
class EvalDataset(Dataset):
    """Dataset of pre-extracted video/audio feature arrays, one item per video id.

    Args:
        video_dir: Directory containing ``{vid}_video.npy`` files.
        audio_dir: Directory containing ``{vid}_audio.npy`` files.
        caption_dict: Mapping keyed by video id; only the keys are used here to
            decide which videos are served.
        audio_dim: Width of the all-zero placeholder returned when a video's
            audio feature file cannot be read. Defaults to 1024.
    """

    def __init__(self, video_dir, audio_dir, caption_dict, audio_dim=1024):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.caption_dict = caption_dict
        self.audio_dim = audio_dim
        self.vids = list(caption_dict.keys())

    def __len__(self):
        """Return the number of video ids in the caption mapping."""
        return len(self.vids)

    def __getitem__(self, idx):
        """Return ``(vid, video_np, audio_np)`` for the video at position ``idx``.

        A missing or unreadable audio file is replaced by a single all-zero
        float32 row of shape ``(1, audio_dim)``. A missing video file still raises.
        """
        vid = self.vids[idx]
        video_np = np.load(os.path.join(self.video_dir, f"{vid}_video.npy"))
        try:
            audio_np = np.load(os.path.join(self.audio_dir, f"{vid}_audio.npy"))
        except (OSError, ValueError, EOFError):
            # Only file-level failures trigger the silent-audio fallback; a bare
            # `except:` would also swallow KeyboardInterrupt and programming errors.
            audio_np = np.zeros((1, self.audio_dim), dtype=np.float32)
        return vid, video_np, audio_np

In [4]:
@torch.no_grad()
def generate_caption(model, video_np, audio_np, max_len=30):
    """Greedily decode a caption's token ids for one video.

    Args:
        model: Callable module invoked as
            ``model(video, audio, tgt, v_mask, a_mask, tgt_pad_mask)`` and returning
            logits indexed as ``[batch, position, vocab]``.
        video_np: NumPy array of video features for a single clip (no batch axis).
        audio_np: NumPy array of audio features for a single clip (no batch axis).
        max_len: Maximum number of tokens to generate after the start token.

    Returns:
        A list of token ids that starts with ``SOS_ID`` and, if it was produced
        within ``max_len`` steps, ends with ``EOS_ID``.
    """
    model.eval()
    video = torch.from_numpy(video_np).unsqueeze(0).float().to(DEVICE)
    audio = torch.from_numpy(audio_np).unsqueeze(0).float().to(DEVICE)
    # A single unpadded sample: no position is masked out.
    v_mask = torch.zeros(1, video.size(1), dtype=torch.bool, device=DEVICE)
    a_mask = torch.zeros(1, audio.size(1), dtype=torch.bool, device=DEVICE)

    tgt = torch.tensor([[SOS_ID]], dtype=torch.long, device=DEVICE)
    for _ in range(max_len):
        logits = model(video, audio, tgt, v_mask, a_mask, tgt.eq(PAD_ID))
        # Greedy choice: most likely token at the last position.
        next_tok = logits[:, -1].argmax(-1, keepdim=True)
        tgt = torch.cat([tgt, next_tok], dim=1)
        if next_tok.item() == EOS_ID:
            break
    # Drop only the batch axis. A bare squeeze() would collapse a one-token
    # sequence to a scalar, and tolist() would then return an int, not a list.
    return tgt.squeeze(0).tolist()

In [5]:
# Model definition copied from the training script.
# It must match the trained architecture exactly so that the checkpoint's state dict loads.
class PositionalEmbedding(torch.nn.Module):
    """Learned positional embedding that is added to the input sequence.

    Args:
        max_len: Number of positions in the embedding table.
        dim: Embedding width; must equal the last dimension of the input.
    """

    def __init__(self, max_len:int, dim:int):
        super().__init__()
        self.embed = torch.nn.Embedding(max_len, dim)

    def forward(self, x):
        """Return ``x`` plus the embeddings of positions ``0 .. x.size(1) - 1``."""
        positions = torch.arange(x.size(1), device=x.device)
        # Add a leading axis so the same position vectors apply to every batch item.
        pos_vectors = self.embed(positions).unsqueeze(0)
        return x + pos_vectors

class CrossAttentionBlock(torch.nn.Module):
    """Cross-attention followed by a feed-forward layer, each with residual + LayerNorm.

    Args:
        dim: Feature width of queries, keys and values.
        heads: Number of attention heads.
        dropout: Dropout probability used in the attention and after the feed-forward.
    """

    def __init__(self, dim:int, heads:int, dropout:float=0.1):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(
            embed_dim=dim, num_heads=heads, dropout=dropout, batch_first=True
        )
        hidden = dim * 4
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(dim, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, dim),
            torch.nn.Dropout(dropout),
        )
        self.norm1 = torch.nn.LayerNorm(dim)
        self.norm2 = torch.nn.LayerNorm(dim)

    def forward(self, q, kv, kv_mask=None):
        """Attend from ``q`` to ``kv`` (``kv_mask`` is the key padding mask) and return the result."""
        # MultiheadAttention returns (output, weights); only the output is used.
        attended = self.attn(q, kv, kv, key_padding_mask=kv_mask)[0]
        residual = self.norm1(q + attended)
        ff_out = self.ff(residual)
        return self.norm2(residual + ff_out)


class MixcapEncoder(torch.nn.Module):
    """Two-stream encoder in which video and audio features cross-attend to each other.

    Args:
        v_dim: Width of the incoming video features.
        a_dim: Width of the incoming audio features.
        f_dim: Shared width both streams are projected to.
        layers: Number of cross-attention rounds.
        heads: Attention heads per cross-attention block.
        dropout: Dropout probability.
    """

    def __init__(self, v_dim=1408, a_dim=1024, f_dim=768, layers=4, heads=8, dropout=0.1):
        super().__init__()
        self.v_proj = torch.nn.Linear(v_dim, f_dim)
        self.a_proj = torch.nn.Linear(a_dim, f_dim)
        # One positional table shared by both streams.
        self.pe = PositionalEmbedding(320, f_dim)
        self.drop = torch.nn.Dropout(dropout)
        self.v2a = torch.nn.ModuleList(
            CrossAttentionBlock(f_dim, heads, dropout) for _ in range(layers)
        )
        self.a2v = torch.nn.ModuleList(
            CrossAttentionBlock(f_dim, heads, dropout) for _ in range(layers)
        )

    def forward(self, v, a, v_mask=None, a_mask=None):
        """Return the encoded ``(video, audio)`` sequences.

        ``v_mask`` / ``a_mask`` are passed as key padding masks when the
        respective stream is used as keys/values.
        """
        vid_feats = self.drop(self.pe(self.v_proj(v)))
        aud_feats = self.drop(self.pe(self.a_proj(a)))
        for video_block, audio_block in zip(self.v2a, self.a2v):
            # Video queries audio first; audio then queries the already-updated video.
            vid_feats = video_block(vid_feats, aud_feats, a_mask)
            aud_feats = audio_block(aud_feats, vid_feats, v_mask)
        return vid_feats, aud_feats


class CaptionDecoder(torch.nn.Module):
    """Transformer decoder mapping target token ids plus encoder memory to vocabulary logits.

    Args:
        f_dim: Model width.
        vocab: Vocabulary size (embedding rows and output logits).
        layers: Number of decoder layers.
        heads: Attention heads per layer.
        ff: Feed-forward hidden width inside each decoder layer.
        dropout: Dropout probability.
    """

    def __init__(self, f_dim=768, vocab=VOCAB_SIZE, layers=4, heads=8, ff=2048, dropout=0.1):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab, f_dim, padding_idx=PAD_ID)
        self.pe = PositionalEmbedding(320, f_dim)
        self.trans = torch.nn.TransformerDecoder(
            torch.nn.TransformerDecoderLayer(
                d_model=f_dim,
                nhead=heads,
                dim_feedforward=ff,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=layers,
        )
        self.out = torch.nn.Linear(f_dim, vocab)

    def _causal_mask(self, T, device):
        """Return a ``T x T`` boolean matrix that is True strictly above the diagonal."""
        full = torch.ones(T, T, dtype=torch.bool, device=device)
        return full.triu(diagonal=1)

    def forward(self, tgt, memory, tgt_pad_mask=None, mem_pad_mask=None):
        """Decode ``tgt`` token ids against ``memory`` and return per-position logits."""
        causal = self._causal_mask(tgt.size(1), tgt.device)
        # NOTE(review): masked_fill on a boolean tensor keeps it boolean, so this
        # looks like a no-op that yields the same mask as `causal` — confirm.
        attn_mask = causal.masked_fill(causal, float('-inf'))
        embedded = self.pe(self.embed(tgt))
        decoded = self.trans(
            embedded,
            memory,
            tgt_mask=attn_mask,
            tgt_key_padding_mask=tgt_pad_mask,
            memory_key_padding_mask=mem_pad_mask,
        )
        return self.out(decoded)

class MixcapModel(torch.nn.Module):
    """Full captioning model: audio-visual encoder followed by a caption decoder."""

    def __init__(self):
        super().__init__()
        self.enc = MixcapEncoder()
        self.dec = CaptionDecoder()

    @staticmethod
    def _memory_mask(v_enc, a_enc, v_mask, a_mask):
        """Build the padding mask for the concatenated ``[video, audio]`` memory.

        Returns None when neither mask is given. When only one is given, the
        other stream gets an all-zero ("nothing padded") mask, so the available
        mask is neither dropped nor concatenated with ``None``.
        """
        if v_mask is None and a_mask is None:
            return None
        if v_mask is None:
            v_mask = torch.zeros(v_enc.shape[:2], dtype=a_mask.dtype, device=a_mask.device)
        if a_mask is None:
            a_mask = torch.zeros(a_enc.shape[:2], dtype=v_mask.dtype, device=v_mask.device)
        return torch.cat([v_mask, a_mask], dim=1)

    def forward(self, v, a, tgt, v_mask=None, a_mask=None, tgt_pad_mask=None):
        """Encode both streams, concatenate them along the sequence axis and decode ``tgt``.

        Args:
            v, a: Video and audio feature batches.
            tgt: Target token ids fed to the decoder.
            v_mask, a_mask: Optional key padding masks for each stream.
            tgt_pad_mask: Optional key padding mask for ``tgt``.

        Returns:
            Whatever the decoder returns for ``tgt`` (vocabulary logits).
        """
        v_enc, a_enc = self.enc(v, a, v_mask, a_mask)
        mem = torch.cat([v_enc, a_enc], dim=1)
        mem_mask = self._memory_mask(v_enc, a_enc, v_mask, a_mask)
        return self.dec(tgt, mem, tgt_pad_mask, mem_mask)

In [6]:
# Load model and checkpoint
# Build the architecture defined above and move it to the selected device.
model = MixcapModel().to(DEVICE)
# NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted
# source (or consider the weights_only option if the installed torch supports it).
ckpt = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
# The checkpoint is passed straight to load_state_dict, i.e. it is used as a state dict.
model.load_state_dict(ckpt)
# Switch to inference mode; as the last expression this also displays the module structure.
model.eval()

MixcapModel(
  (enc): MixcapEncoder(
    (v_proj): Linear(in_features=1408, out_features=768, bias=True)
    (a_proj): Linear(in_features=1024, out_features=768, bias=True)
    (pe): PositionalEmbedding(
      (embed): Embedding(320, 768)
    )
    (drop): Dropout(p=0.1, inplace=False)
    (v2a): ModuleList(
      (0-3): 4 x CrossAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ff): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (a2v): ModuleList(
      (0-3): 4 x CrossAttentionBlock(
        (attn): MultiheadAttention(
          (o

In [7]:
# Evaluate the model: generate a caption per video, score it against the
# reference captions, print a few examples and save everything to CSV.
import random

import pandas as pd

NUM_EXAMPLES = 6        # how many random predictions to print
MAX_REFS_SHOWN = 10     # references printed per example
EXAMPLE_SEED = 0        # fixed seed so the printed examples are reproducible

# Load captions and create dataset
# NOTE(review): allow_pickle=True unpickles the file; only load caption files from a trusted source.
val_caps = np.load(VAL_CAPTION_PATH, allow_pickle=True).item()
val_dataset = EvalDataset(VIDEO_DIR, AUDIO_DIR, val_caps)

# Generate and decode: gts maps vid -> reference strings, res maps vid -> [hypothesis string]
gts, res = {}, {}
print("Generating predictions and decoding...")

for vid, video_np, audio_np in tqdm(val_dataset):
    # NOTE(review): hyp_ids still contains SOS_ID / EOS_ID; this assumes the tokenizer
    # decodes those ids to empty text — confirm against the SentencePiece model.
    hyp_ids = generate_caption(model, video_np, audio_np)
    hyp_text = sp_model.decode_ids(hyp_ids)
    ref_texts = [sp_model.decode_ids(ref) for ref in val_caps[vid]]

    gts[vid] = ref_texts
    res[vid] = [hyp_text]

# Compute metrics
bleu = Bleu(4)
cider = Cider()
meteor = Meteor()
rouge = Rouge()

# Each scorer returns (corpus_score, per_item_scores); only the corpus score is kept.
bleu_score, _ = bleu.compute_score(gts, res)
cider_score, _ = cider.compute_score(gts, res)
meteor_score, _ = meteor.compute_score(gts, res)
rouge_score, _  = rouge.compute_score(gts, res)

print(f"""
BLEU-1:  {bleu_score[0]:.2f}
BLEU-2:  {bleu_score[1]:.2f}
BLEU-3:  {bleu_score[2]:.2f}
BLEU-4:  {bleu_score[3]:.2f}
METEOR:  {meteor_score:.2f}
ROUGE-L: {rouge_score:.2f}
CIDEr:   {cider_score:.2f}
""")

# === Print up to NUM_EXAMPLES examples ===
print("\nExample Predictions:\n")

# Cap the sample size so a small evaluation set does not make random.sample raise,
# and use a private seeded RNG so the global random state is left untouched.
example_rng = random.Random(EXAMPLE_SEED)
example_vids = example_rng.sample(list(gts.keys()), min(NUM_EXAMPLES, len(gts)))
for i, vid in enumerate(example_vids, 1):
    print(f"--- Example {i} ---")
    print(f"Video ID: {vid}")
    print(f"Prediction: {res[vid][0]}")
    print("References:")
    for ref in gts[vid][:MAX_REFS_SHOWN]:
        print(f"  - {ref}")
    print()

# Persist every prediction with its references (joined by " ||| ") for later inspection.
records = [{"video_id": vid, "prediction": res[vid][0], "references": " ||| ".join(gts[vid])} for vid in gts]
df = pd.DataFrame(records)
df.to_csv("evaluation_results.csv", index=False)

Generating predictions and decoding...


100%|██████████| 2990/2990 [05:10<00:00,  9.62it/s]


{'testlen': 22428, 'reflen': 22938, 'guess': [22428, 19438, 16448, 13458], 'correct': [19247, 11886, 6153, 2734]}
ratio: 0.9777661522364208

BLEU-1:  0.84
BLEU-2:  0.71
BLEU-3:  0.57
BLEU-4:  0.44
METEOR:  0.30
ROUGE-L: 0.63
CIDEr:   0.55


Example Predictions:

--- Example 1 ---
Video ID: video9274
Prediction: a cartoon character is playing with a ball
References:
  - a cartoon character speaks to the camera
  - a cartoon si shown
  - a clip from the tv show spongebob squarepants
  - a recording of a tv showing spongebob square pants
  - a scene from spongebob squarepants is shown
  - a scene from spongebob squarepants
  - an animated cartoon is looking at something and laughing
  - animated cartoon scene spongebob
  - cartoon character laugh at squid word then leave the restaurant
  - cartoon character playing a clarinet

--- Example 2 ---
Video ID: video9251
Prediction: a man is talking about a movie
References:
  - a ballet of sleeping beauty
  - a group of people are dancing and a