In [None]:
import pandas as pd
import numpy as np
import re, math
from collections import Counter
import itertools

# ----------------------------
# 0) Paths
# ----------------------------
GOLD_PATH = "final_gold_combined.csv"
DOCS_PATH = "clap_combined.csv"   # contains file_name + gpt_description

# Number of rows the gold set is expected to have; both frames are checked
# against this single constant at the end of the cell.
EXPECTED_NUM_DOCS = 432

# ----------------------------
# 1) Load data
# ----------------------------
gold = pd.read_csv(GOLD_PATH)
docs = pd.read_csv(DOCS_PATH)

# Keep only the two needed columns
docs = docs[["file_name", "gpt_description"]].copy()

# The id column is named differently in the two files
# (gold uses `filename`; docs uses `file_name`), so compare both as strings.
gold_docids = set(gold["filename"].astype(str))
docs["file_name"] = docs["file_name"].astype(str)

# Sanity: every gold docid must be present in the docs file
docs_docids = set(docs["file_name"])
missing_in_docs = gold_docids - docs_docids
missing_in_gold = docs_docids - gold_docids
if missing_in_docs:
    # sorted() rather than list(): iteration order of a set of strings is not
    # stable between runs, so the reported examples would otherwise change.
    raise ValueError(f"{len(missing_in_docs)} gold docids missing from docs file. Example: {sorted(missing_in_docs)[:3]}")
if missing_in_gold:
    # not necessarily fatal, but should usually be zero
    print(f"Warning: {len(missing_in_gold)} docids in docs file not present in gold. Example: {sorted(missing_in_gold)[:3]}")

# Restrict docs to the gold docids, keep one row per docid, and order the rows
# by file_name. (This is sorted order, not the row order of `gold`; it only
# has to be deterministic.)
docs = docs[docs["file_name"].isin(gold_docids)].drop_duplicates("file_name").copy()
docs = docs.sort_values("file_name").reset_index(drop=True)

assert docs.shape[0] == EXPECTED_NUM_DOCS, f"Expected {EXPECTED_NUM_DOCS} docs aligned to gold, got {docs.shape[0]}"
assert gold.shape[0] == EXPECTED_NUM_DOCS, f"Expected {EXPECTED_NUM_DOCS} rows in gold, got {gold.shape[0]}"



In [None]:
# ----------------------------
# 2) Build qrels (graded relevance from final_top1/2/3)
# ----------------------------
LABEL_COLS = ["final_top1", "final_top2", "final_top3"]
GRADE = {"final_top1": 3, "final_top2": 2, "final_top3": 1}

# The query set is every distinct label found in any of the three columns.
# Missing cells (NaN/None) are skipped: they are not labels, and a float NaN
# mixed with string labels would make sorted() raise TypeError.
labels = sorted(lab for lab in pd.unique(gold[LABEL_COLS].values.ravel()) if pd.notna(lab))

# qrels[label][docid] -> relevance grade of that doc for that label
qrels = {lab: {} for lab in labels}

for _, r in gold.iterrows():
    docid = str(r["filename"])
    for col, rel in GRADE.items():
        lab = r[col]
        if pd.isna(lab):
            # this row has no label in this slot
            continue
        # If the same label appears in several slots of one row, keep the
        # highest grade.
        qrels[lab][docid] = max(qrels[lab].get(docid, 0), rel)

per_query_rels = {lab: len(qrels[lab]) for lab in labels}
print("Num labels (queries):", len(labels))
print("Labels:", labels)
# default=0 keeps the summary line printable when there are no labels at all
print("Relevant docs per query (min/max):", min(per_query_rels.values(), default=0), max(per_query_rels.values(), default=0))



In [None]:
# ----------------------------
# 3) Metrics (nDCG graded; MAP/Recall binarized rel>0)
# ----------------------------
def dcg_at_k(rels, k):
    """Discounted cumulative gain of the first ``k`` entries of ``rels``.

    The grade ``g`` at 1-based rank ``i`` contributes
    ``(2**g - 1) / log2(i + 1)``.

    Args:
        rels: relevance grades in ranked order (any sliceable sequence).
        k: cutoff; only ``rels[:k]`` is scored.

    Returns:
        The accumulated gain; 0.0 when nothing is scored.
    """
    total = 0.0
    # `log_arg` is rank + 1, i.e. 2 for the top-ranked item.
    for log_arg, grade in zip(itertools.count(2), rels[:k]):
        total += (2**grade - 1) / math.log2(log_arg)
    return total

def ndcg_at_k(rels, ideal_rels, k):
    """Normalised DCG at cutoff ``k``.

    Args:
        rels: relevance grades of the ranking being scored, best rank first.
        ideal_rels: relevance grades of the reference ranking used for
            normalisation.
        k: cutoff passed to ``dcg_at_k`` for both lists.

    Returns:
        ``dcg_at_k(rels, k) / dcg_at_k(ideal_rels, k)``, or 0.0 when the
        reference DCG is zero.
    """
    ideal_dcg = dcg_at_k(ideal_rels, k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(rels, k) / ideal_dcg

def ap_at_k(binary_rels, k, total_relevant=None):
    """Average precision over the first ``k`` ranks.

    Precision is taken at every rank that holds a relevant item and the
    values are summed; the sum is then normalised.

    Args:
        binary_rels: truthy/falsy relevance flags in ranked order.
        k: cutoff; only ``binary_rels[:k]`` is scored.
        total_relevant: optional number of relevant documents that exist for
            the query. When given, the sum is divided by
            ``min(total_relevant, k)`` (the most hits the top ``k`` could
            hold), so relevant documents that were not retrieved lower the
            score. When ``None`` (the default), the sum is divided by the
            number of hits actually found in the top ``k``; this variant
            ignores unretrieved relevant documents, e.g. a single hit at
            rank 1 scores 1.0 however many relevant documents exist.

    Returns:
        The average precision, or 0.0 when the top ``k`` holds no hit.
    """
    hits = 0
    precision_sum = 0.0
    for rank, is_rel in enumerate(binary_rels[:k], start=1):
        if is_rel:
            hits += 1
            precision_sum += hits / rank
    if hits == 0:
        return 0.0
    if total_relevant is None:
        return precision_sum / hits
    # A cutoff of None means "no cutoff" for the slice above, so there is
    # nothing to cap the denominator with in that case.
    cap = total_relevant if k is None else min(total_relevant, k)
    # Never divide by less than the hits observed; this keeps the result <= 1
    # even if the caller passes a total smaller than what was found.
    return precision_sum / max(cap, hits)

def recall_at_k(binary_rels, total_relevant, k):
    """Fraction of the relevant documents that appear in the first ``k`` ranks.

    Args:
        binary_rels: relevance flags in ranked order.
        total_relevant: number of relevant documents for the query.
        k: cutoff; only ``binary_rels[:k]`` is counted.

    Returns:
        Hits in the top ``k`` divided by ``total_relevant``; 0.0 when
        ``total_relevant`` is zero.
    """
    if total_relevant != 0:
        found = sum(binary_rels[:k])
        return found / total_relevant
    return 0.0

def evaluate_run(ranked_docids, qrel_for_query, k=50):
    """Score one ranked list against one query's graded judgments.

    Args:
        ranked_docids: document ids, best first.
        qrel_for_query: mapping docid -> relevance grade for this query;
            documents missing from the mapping count as grade 0.
        k: cutoff applied to all three metrics.

    Returns:
        ``(ndcg, ap, recall)`` at cutoff ``k``. nDCG uses the graded values;
        AP and recall treat any grade > 0 as relevant. AP is normalised by
        the hits found in the top ``k`` (see ``ap_at_k``).
    """
    rels = [qrel_for_query.get(d, 0) for d in ranked_docids]
    ideal = sorted(qrel_for_query.values(), reverse=True)

    ndcg = ndcg_at_k(rels, ideal, k)
    binary = [r > 0 for r in rels]
    ap = ap_at_k(binary, k)
    # Count only judgments with a positive grade so the recall denominator
    # uses the same "grade > 0" rule as `binary`; an explicit grade-0 entry
    # in the qrels must not count as a relevant document.
    total_relevant = sum(1 for grade in qrel_for_query.values() if grade > 0)
    rec = recall_at_k(binary, total_relevant=total_relevant, k=k)
    return ndcg, ap, rec

# ----------------------------
# 4) BM25 implementation
# ----------------------------
def tokenize(text):
    """Lower-case ``text`` and return its runs of ASCII letters and digits.

    ``text`` is converted with ``str()`` first, so non-string input is
    tokenized by its string form (``None`` becomes ``["none"]``). Every
    character outside ``a-z`` / ``0-9`` acts as a separator.
    """
    lowered = str(text).lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

class SimpleBM25:
    """In-memory BM25 scorer over a pre-tokenized corpus.

    Attributes set by the constructor:
        k1, b: the BM25 parameters passed in.
        corpus: the token lists as given.
        N: number of documents.
        doc_len: float array with the token count of each document.
        avgdl: mean of ``doc_len`` (0.0 for an empty corpus).
        df: Counter mapping term -> number of documents containing it.
        idf: dict mapping term -> ``log(1 + (N - df + 0.5) / (df + 0.5))``.
        tf: one Counter of term counts per document.
    """

    def __init__(self, corpus_tokens, k1=1.2, b=0.75):
        """Index ``corpus_tokens``, a list with one token list per document."""
        self.k1 = k1
        self.b = b
        self.corpus = corpus_tokens
        self.N = len(corpus_tokens)
        self.doc_len = np.array(list(map(len, corpus_tokens)), dtype=float)
        self.avgdl = float(self.doc_len.mean()) if self.N > 0 else 0.0

        # document frequency: each document counts a term at most once
        self.df = Counter(term for doc in corpus_tokens for term in set(doc))

        # idf with a "+1" inside the log; the log argument is then always > 1,
        # so no term gets a negative weight
        self.idf = {
            term: math.log(1 + (self.N - n_docs + 0.5) / (n_docs + 0.5))
            for term, n_docs in self.df.items()
        }

        # term frequencies per doc
        self.tf = list(map(Counter, corpus_tokens))

    def score(self, query_tokens):
        """Return an array of length ``N`` with the BM25 score of every doc.

        Query tokens that are not in the index are ignored; a token that
        occurs several times in ``query_tokens`` contributes once per
        occurrence. Documents containing none of the query tokens score 0.0.
        """
        tf_boost = self.k1 + 1
        result = np.zeros(self.N, dtype=float)
        for doc_idx, (counts, length) in enumerate(zip(self.tf, self.doc_len)):
            # length-normalisation part of the denominator, constant per doc
            length_ratio = length / self.avgdl if self.avgdl > 0 else 0.0
            norm = self.k1 * (1 - self.b + self.b * length_ratio)
            total = 0.0
            for term in query_tokens:
                weight = self.idf.get(term)
                freq = counts.get(term, 0)
                if weight is None or freq == 0:
                    continue
                total += weight * (freq * tf_boost) / (freq + norm)
            result[doc_idx] = total
        return result

# ----------------------------
# 5) Run BM25 benchmark: label-as-query over GPT descriptions
# ----------------------------
docids = docs["file_name"].to_list()
# One token list per description; a missing description becomes an empty list
corpus_tokens = list(map(tokenize, docs["gpt_description"].fillna("")))

# Build the BM25 index over the tokenized descriptions
bm25 = SimpleBM25(corpus_tokens, k1=1.2, b=0.75)

# Query text for a label is the label itself with hyphens turned into spaces,
# e.g. "Sensory-Calming" -> "Sensory Calming", which tokenize() then splits
# into ["sensory", "calming"].
def label_to_query_text(label):
    """Return ``label`` with every "-" replaced by a single space."""
    return " ".join(label.split("-"))

K = 50
metric_cols = [f"nDCG@{K}", f"MAP@{K}", f"Recall@{K}"]
# Tie-break key for the ranking; it is the same for every query.
docid_keys = np.array(docids)

rows = []
for q in labels:
    q_tokens = tokenize(label_to_query_text(q))
    scores = bm25.score(q_tokens)

    # np.lexsort sorts by its LAST key first: descending score, then
    # ascending docid so that ties are broken deterministically.
    # NOTE(review): every document is ranked, including those whose score is
    # 0 (no query term matched); such documents are ordered by docid only and
    # can still occupy top-K slots.
    ranked_idx = np.lexsort((docid_keys, -scores))
    ranked_docids = [docids[i] for i in ranked_idx]

    ndcg, ap, rec = evaluate_run(ranked_docids, qrels[q], k=K)
    rows.append((q, ndcg, ap, rec, len(qrels[q])))

bm25_res = pd.DataFrame(rows, columns=["query_label", *metric_cols, "num_rel_docs"])
print("\nBM25 over GPT descriptions (label-as-query):")
print(bm25_res.sort_values(metric_cols[0], ascending=False).to_string(index=False))
print("\nMacro-avg (over queries):")
print(bm25_res[metric_cols].mean())



In [None]:
# ----------------------------
# 6) Bootstrap CI over queries for BM25 nDCG@K (optional but recommended)
# ----------------------------
def bootstrap_ci(per_query_values, n_boot=10000, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for the mean of the inputs.

    Draws ``n_boot`` resamples of ``per_query_values`` (with replacement,
    same size as the input), takes the mean of each, and returns the
    ``alpha/2`` and ``1 - alpha/2`` quantiles of those means.

    Args:
        per_query_values: one metric value per query.
        n_boot: number of bootstrap resamples.
        alpha: 1 minus the confidence level (0.05 gives a 95% interval).
        seed: seed for ``np.random.default_rng``; the same seed and inputs
            give the same interval.

    Returns:
        ``(lo, hi)``, the lower and upper bounds of the interval.
    """
    rng = np.random.default_rng(seed)
    n_queries = len(per_query_values)
    boot_means = np.empty(n_boot, dtype=float)
    # One rng.choice call per resample, in order, so the random stream (and
    # therefore the interval) is fixed by `seed`.
    for b in range(n_boot):
        resample = rng.choice(per_query_values, size=n_queries, replace=True)
        boot_means[b] = np.mean(resample)
    lower, upper = (np.quantile(boot_means, q) for q in (alpha / 2, 1 - alpha / 2))
    return lower, upper

# 95% bootstrap interval (alpha defaults to 0.05) around the macro-averaged nDCG
ndcg_per_query = bm25_res[f"nDCG@{K}"]
lo, hi = bootstrap_ci(ndcg_per_query.values, n_boot=10000, seed=42)
print(f"\nBM25 macro nDCG@{K}: {bm25_res[f'nDCG@{K}'].mean():.4f}  CI95 [{lo:.4f}, {hi:.4f}]")

# ----------------------------
# 7) (Optional) Compare BM25 vs CLAP baseline if you want both in same run
# ----------------------------
# If you still want to compare against CLAP label-based baseline using human_aggregated.csv:
# - Load /mnt/data/human_aggregated.csv
# - Build CLAP scores like before and evaluate with evaluate_run()
