In [None]:
import pandas as pd
import itertools
from collections import Counter

# Paths (as mounted in this environment)
gold_path = "final_gold_combined.csv"
hum_path  = "human_aggregated.csv"  # not required for these stats, but loaded if needed

gold = pd.read_csv(gold_path)
hum  = pd.read_csv(hum_path)

# --- Basic sanity checks ---
# Columns holding the rank-1/2/3 gold label of each track, and the number of
# tracks (rows) the gold file is expected to contain.
rank_cols = ["final_top1", "final_top2", "final_top3"]
expected_tracks = 432

assert gold.shape[0] == expected_tracks, (
    f"Expected {expected_tracks} tracks, got {gold.shape[0]}"
)

# Name the offending columns so a schema mismatch is obvious from the message.
missing_cols = [col for col in rank_cols if col not in gold.columns]
assert not missing_cols, f"Gold file is missing columns: {missing_cols}"

# A missing label would otherwise surface as an opaque TypeError inside
# sorted() below (NaN is a float and cannot be ordered against str labels).
missing_per_rank = gold[rank_cols].isna().sum()
assert not missing_per_rank.any(), (
    f"Missing labels per rank column: {missing_per_rank.to_dict()}"
)

# --- Label set ---
# Distinct labels that occur at any of the three ranks, in sorted order.
labels = sorted(pd.unique(gold[rank_cols].values.ravel()))
print("Labels:", labels)



In [None]:
# --- Rank-wise distributions ---
# One Counter per rank column: label -> number of tracks with that label at that rank.
top1_counts, top2_counts, top3_counts = (
    Counter(gold[col]) for col in ("final_top1", "final_top2", "final_top3")
)

# Pooled prevalence: every (track, rank) assignment counted once, regardless of rank.
all_assignments = gold[["final_top1", "final_top2", "final_top3"]].values.ravel()
overall_counts = Counter(all_assignments)
print("\nTotal graded assignments:", sum(overall_counts.values()))  # should be 432*3 = 1296

# Print the four count tables with identical formatting, one row per label.
count_tables = (
    ("\nOverall label counts (any rank):", overall_counts),
    ("\nTop1 label counts:", top1_counts),
    ("\nTop2 label counts:", top2_counts),
    ("\nTop3 label counts:", top3_counts),
)
for heading, table in count_tables:
    print(heading)
    for name in labels:
        print(f"{name:18s} {table[name]}")

# --- SIGIR-style qrels statistics: rel=3/2/1 for top1/top2/top3 ---
# Each label plays the role of a query; a row reports how many tracks carry
# that label at each relevance grade, plus the total over all grades.
qrel_stats = [
    {
        "label_query": name,
        "rel=3 (top1)": top1_counts[name],
        "rel=2 (top2)": top2_counts[name],
        "rel=1 (top3)": top3_counts[name],
        "total relevant docs": overall_counts[name],
    }
    for name in labels
]

# Most frequently assigned labels first.
qrel_df = pd.DataFrame(qrel_stats).sort_values("total relevant docs", ascending=False)
print("\nPer-label relevant documents by grade (qrels stats):")
print(qrel_df.to_string(index=False))



In [None]:
# --- Co-occurrence: unordered pairs/triplets within each track's top-3 set ---
# pair_counts:    (label_a, label_b) -> number of tracks whose top-3 contains both
# triplet_counts: (label_a, label_b, label_c) -> number of tracks with exactly that top-3
# Keys are sorted tuples, so the rank order within a track is ignored.
pair_counts = Counter()
triplet_counts = Counter()

# Walk the three rank columns in lockstep instead of DataFrame.iterrows(),
# which builds a Series for every row.
for top3 in zip(gold["final_top1"], gold["final_top2"], gold["final_top3"]):
    ordered = sorted(top3)

    # Unordered pairs (3 per track)
    pair_counts.update(itertools.combinations(ordered, 2))

    # Unordered triplet
    triplet_counts[tuple(ordered)] += 1

print("\nTop 10 co-occurring label pairs (within top-3):")
for (a, b), c in pair_counts.most_common(10):
    # \u2013 is an en dash; written as an escape so the separator cannot be
    # corrupted again if the file is re-saved with the wrong encoding.
    print(f"{a} \u2013 {b}: {c}")

print("\nTop 10 most common unordered triplets (within top-3):")
for trip, c in triplet_counts.most_common(10):
    print(f"{trip}: {c}")