In [None]:
import pandas as pd
import numpy as np
from collections import Counter

GOLD_PATH = "final_gold_combined.csv"
HUM_PATH = "human_aggregated.csv"

gold = pd.read_csv(GOLD_PATH)
hum = pd.read_csv(HUM_PATH)

# --- Canonical doc ids ---
# The filename column is the document id; both files must list the same
# 432 rows' worth of filenames before anything else is computed.
assert len(gold) == 432
assert len(hum) == 432
assert set(gold["filename"]) == set(hum["filename"])

# --- Label set (queries) ---
# Every distinct value found in the three gold label columns becomes a query.
LABEL_COLS = ["final_top1", "final_top2", "final_top3"]
gold_label_values = gold[LABEL_COLS].values.ravel()
labels = sorted(pd.unique(gold_label_values))
print("Num labels (queries):", len(labels))
print("Labels:", labels)

# --- Build graded qrels: qrels[label][docid] = 3/2/1/0 ---
# The gold column a label appears in decides its grade (top1 > top2 > top3).
GRADE = {"final_top1": 3, "final_top2": 2, "final_top3": 1}

qrels = {lab: {} for lab in labels}
for _, row in gold.iterrows():
    docid = row["filename"]
    for col, grade in GRADE.items():
        judged = qrels[row[col]]
        # Keep the highest grade if this (label, doc) pair was already seen.
        if grade > judged.get(docid, 0):
            judged[docid] = grade

# Per-query relevant doc counts (for reporting)
per_query_rels = {lab: len(docs) for lab, docs in qrels.items()}
fewest_rel = min(per_query_rels.values())
most_rel = max(per_query_rels.values())
print("Relevant docs per query (min/max):", fewest_rel, most_rel)


In [2]:
import math

def dcg_at_k(rels, k):
    """Discounted cumulative gain over the first ``k`` entries of ``rels``.

    rels: relevance grades listed in ranked order (best-ranked document first).
    k:    cutoff; only ``rels[:k]`` contributes.

    Each grade contributes an exponential gain ``2**grade - 1`` discounted by
    ``log2(rank + 1)``, with ranks starting at 1. Returns a float (0.0 when
    nothing falls inside the cutoff).
    """
    total = 0.0
    rank = 0
    for grade in rels[:k]:
        rank += 1
        total += (2**grade - 1) / math.log2(rank + 1)   # graded gain
    return total

def ndcg_at_k(rels, ideal_rels, k):
    """Normalised DCG at cutoff ``k``.

    rels:       relevance grades of the evaluated ranking, in ranked order.
    ideal_rels: relevance grades used for the normaliser; they are passed to
                ``dcg_at_k`` as given, so the caller supplies them in the
                order it considers ideal.
    Returns DCG(rels) / DCG(ideal_rels), or 0.0 when the normaliser is zero.
    """
    ideal_dcg = dcg_at_k(ideal_rels, k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(rels, k) / ideal_dcg

def ap_at_k(binary_rels, k, total_relevant=None):
    """Average precision over the first ``k`` entries of a binary relevance list.

    binary_rels:    truthy/falsy relevance flags in ranked order.
    k:              cutoff; only ``binary_rels[:k]`` is inspected.
    total_relevant: optional number of relevant documents that exist for the
                    query. When given, the sum of precisions at each hit is
                    divided by ``min(total_relevant, k)`` so that relevant
                    documents missing from the top k lower the score.
                    When omitted (the default, kept for backward
                    compatibility) the sum is divided by the number of hits
                    found in the top k, which ignores missed documents: a run
                    that finds one relevant document at rank 1 scores 1.0 no
                    matter how many others it missed.

    Returns 0.0 when there is no hit in the top k or the normaliser is not
    positive.
    """
    hits = 0
    precision_sum = 0.0
    for rank, is_rel in enumerate(binary_rels[:k], start=1):
        if is_rel:
            hits += 1
            precision_sum += hits / rank

    if hits == 0:
        return 0.0
    if total_relevant is None:
        # Legacy normalisation: average only over the hits that were retrieved.
        return precision_sum / hits

    # At most k relevant documents can appear in the top k, so cap the
    # normaliser there; a perfect top-k then scores exactly 1.0.
    denom = total_relevant if k is None else min(total_relevant, k)
    return precision_sum / denom if denom > 0 else 0.0

def recall_at_k(binary_rels, total_relevant, k):
    """Fraction of the query's relevant documents that appear in the top ``k``.

    binary_rels:    relevance flags in ranked order.
    total_relevant: number of relevant documents that exist for the query.
    Returns 0.0 when ``total_relevant`` is zero.
    """
    if total_relevant != 0:
        found = sum(binary_rels[:k])
        return found / total_relevant
    return 0.0


In [None]:
# Build CLAP-derived scores: for each query label, score doc by CLAP rank (3/2/1/0)
CLAP_COLS = ["clap_top1", "clap_top2", "clap_top3"]
CLAP_GRADE = {"clap_top1": 3, "clap_top2": 2, "clap_top3": 1}

# Only labels from the gold label set are queries, so clap_scores is keyed by
# `labels`. A CLAP prediction outside that set has no query to score against;
# it is skipped (and reported) instead of raising KeyError on the lookup.
clap_scores = {lab: {} for lab in labels}
clap_labels_not_in_gold = set()

for docid, *predicted in hum[["filename", *CLAP_COLS]].itertuples(index=False, name=None):
    for col, lab in zip(CLAP_COLS, predicted):
        doc_scores = clap_scores.get(lab)
        if doc_scores is None:
            clap_labels_not_in_gold.add(lab)
            continue
        # Keep the best score if CLAP lists the same label twice for a doc.
        doc_scores[docid] = max(doc_scores.get(docid, 0), CLAP_GRADE[col])

if clap_labels_not_in_gold:
    print("CLAP labels ignored (not in the gold label set):",
          sorted(map(str, clap_labels_not_in_gold)))

def rank_docs_for_label(score_dict, all_docids):
    """Order every docid in ``all_docids`` from highest to lowest score.

    score_dict: docid -> score; docids missing from it count as score 0.
    Documents with equal scores are ordered by ascending docid, so the result
    is deterministic. Returns a new list.
    """
    def sort_key(docid):
        # Negated score puts higher scores first; docid breaks ties.
        return (-score_dict.get(docid, 0), docid)

    return sorted(all_docids, key=sort_key)

# Candidate pool ranked for every query: all gold filenames in sorted order.
all_docids = gold["filename"].tolist()
all_docids.sort()

def evaluate_run(ranked, qrel_for_query, k=50):
    """Score one ranked list against one query's relevance judgements.

    ranked:         docids in ranked order.
    qrel_for_query: docid -> relevance grade; docids not present count as 0.
    k:              cutoff applied to all three metrics.

    Returns ``(ndcg, ap, recall)``. The nDCG normaliser is built from the
    judged grades sorted in descending order. AP and recall treat any grade
    above 0 as relevant, and recall uses ``len(qrel_for_query)`` as the number
    of relevant documents.
    NOTE(review): ap_at_k is called without a relevant-document total, so AP is
    averaged over the hits found inside the top k only.
    """
    graded = [qrel_for_query.get(docid, 0) for docid in ranked]
    best_possible = sorted(qrel_for_query.values(), reverse=True)
    hit_flags = [grade > 0 for grade in graded]

    return (
        ndcg_at_k(graded, best_possible, k),
        ap_at_k(hit_flags, k),
        recall_at_k(hit_flags, total_relevant=len(qrel_for_query), k=k),
    )

# Score the CLAP-derived ranking for every label query at one shared cutoff.
K = 50
metric_names = [f"nDCG@{K}", f"MAP@{K}", f"Recall@{K}"]

rows = []
for q in labels:
    clap_ranking = rank_docs_for_label(clap_scores[q], all_docids)
    query_scores = evaluate_run(clap_ranking, qrels[q], k=K)
    # One row per query: label, the three metrics, and its relevant-doc count.
    rows.append((q, *query_scores, len(qrels[q])))

res = pd.DataFrame(rows, columns=["query_label", *metric_names, "num_rel_docs"])

best_first = res.sort_values(metric_names[0], ascending=False)
print(best_first.to_string(index=False))
print("\nMacro-avg (over queries):")
print(res[metric_names].mean())


In [None]:
import numpy as np

def random_run(all_docids, seed=0):
    """Return a shuffled copy of ``all_docids``; the input is left untouched.

    The permutation comes from a NumPy generator created from ``seed``, so the
    same seed and input always give the same ordering.
    """
    shuffled = all_docids.copy()
    np.random.default_rng(seed).shuffle(shuffled)
    return shuffled

# Random baseline: for each seed, shuffle the candidate pool once and score
# that single ranking against every query, then macro-average over queries.
K = 50
seeds = [0, 1, 2, 3, 4]
rand_rows = []

for seed in seeds:
    # The shuffle depends only on the seed, not on the query, so build it once
    # per seed instead of re-shuffling identically inside the query loop.
    ranked = random_run(all_docids, seed=seed)
    per_q = np.array([evaluate_run(ranked, qrels[q], k=K) for q in labels])  # shape (num_queries, 3)
    rand_rows.append((seed, per_q[:, 0].mean(), per_q[:, 1].mean(), per_q[:, 2].mean()))

rand_df = pd.DataFrame(rand_rows, columns=["seed", f"nDCG@{K}", f"MAP@{K}", f"Recall@{K}"])
print(rand_df.to_string(index=False))
print("\nRandom baseline mean across seeds:")
print(rand_df[[f"nDCG@{K}", f"MAP@{K}", f"Recall@{K}"]].mean())


In [None]:
def macro_avg_metrics(df, ndcg_col, map_col, rec_col):
    """Column means of the three named metric columns of ``df``.

    Returns a length-3 NumPy array ordered as (ndcg_col, map_col, rec_col).
    """
    wanted = (ndcg_col, map_col, rec_col)
    return np.array([df[name].mean() for name in wanted])

def bootstrap_ci(per_query_values, n_boot=10000, alpha=0.05, seed=0):
    """
    Percentile bootstrap interval for the mean of ``per_query_values``.

    per_query_values: array shape (num_queries,) for one metric.
    n_boot:           number of bootstrap resamples.
    alpha:            the interval spans the alpha/2 and 1 - alpha/2 quantiles
                      of the resampled means.
    seed:             seed for the NumPy generator that draws the resamples.

    Each resample draws ``len(per_query_values)`` values with replacement.
    Returns ``(lo, hi)``.
    """
    rng = np.random.default_rng(seed)
    n_queries = len(per_query_values)

    boot_means = np.array([
        np.mean(rng.choice(per_query_values, size=n_queries, replace=True))
        for _ in range(n_boot)
    ])

    lower = np.quantile(boot_means, alpha / 2)
    upper = np.quantile(boot_means, 1 - alpha / 2)
    return lower, upper

# Bootstrap CI for the CLAP baseline's macro nDCG@K (queries are resampled).
ndcg_col = f"nDCG@{K}"
lo, hi = bootstrap_ci(res[ndcg_col].values, n_boot=10000, seed=42)
print(f"CLAP macro {ndcg_col}: {res[ndcg_col].mean():.4f}  CI95 [{lo:.4f}, {hi:.4f}]")

# Paired comparison over queries between CLAP and the random baseline.
# Only ONE random seed is used here (not the mean over seeds); extend to an
# average over seeds if a less noisy reference is needed.
seed = 0
# The shuffle depends only on the seed, so build the ranking once and reuse it
# for every query instead of re-shuffling identically inside the loop.
ranked = random_run(all_docids, seed=seed)
rand_perq = pd.DataFrame(
    [(q, *evaluate_run(ranked, qrels[q], k=K)) for q in labels],
    columns=["query_label", f"nDCG@{K}", f"MAP@{K}", f"Recall@{K}"],
)

# Sort both frames by query label so the differences are paired per query.
diff = res.sort_values("query_label")[ndcg_col].values - rand_perq.sort_values("query_label")[ndcg_col].values
print("Per-query nDCG diffs (CLAP - Random):", diff.round(4))

# Simple paired t-test over queries (ok as a quick check; with this few
# queries the statistic should be interpreted cautiously).
from math import copysign, sqrt
n_pairs = len(diff)
mean_diff = diff.mean()
# The sample std needs at least two pairs; otherwise it is undefined.
std_diff = diff.std(ddof=1) if n_pairs > 1 else float("nan")
if std_diff > 0:
    t = mean_diff / (std_diff / sqrt(n_pairs))
elif std_diff == 0 and mean_diff != 0:
    # Every query differs by the same non-zero amount: infinite statistic
    # carrying the sign of the difference.
    t = copysign(float("inf"), mean_diff)
else:
    # No variation and no difference, or too few pairs: statistic undefined.
    t = float("nan")
print(f"Paired t statistic over queries (nDCG@{K}): t={t:.3f} (df={len(diff)-1})")
