# In [2]:
# plots_dataset_characterization.py
# Requires: pandas, matplotlib, numpy
# Usage: python plots_dataset_characterization.py
# Input CSV columns expected: filename, final_top1, final_top2, final_top3, source

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations

# Path of the input CSV passed to pd.read_csv below; a relative path is
# resolved against the current working directory.
CSV_PATH = "final_gold_combined.csv"   # adjust if needed

def save_tight(fig, fname, *, dpi=300, close=True):
    """Apply a tight layout to *fig*, write it to *fname*, and optionally close it.

    Args:
        fig: Figure to save; it must provide ``tight_layout()`` and
            ``savefig()``.
        fname: Output path handed straight to ``fig.savefig``.
        dpi: Resolution passed to ``fig.savefig``. Defaults to 300.
        close: When true (the default), close the figure with ``plt.close``
            after saving. Pass ``close=False`` to keep the figure open, for
            example so that a later ``plt.show()`` can still display it.
    """
    fig.tight_layout()
    fig.savefig(fname, dpi=dpi, bbox_inches="tight")
    if close:
        plt.close(fig)

# Load the input table.
df = pd.read_csv(CSV_PATH)

# ---- Overall label frequencies (all ranks) ----
# Stack the three rank columns into one Series so every label is tallied
# regardless of the rank at which it appeared.
rank_columns = [df["final_top1"], df["final_top2"], df["final_top3"]]
all_labels = pd.concat(rank_columns, ignore_index=True)
label_counts_all = all_labels.value_counts()

# One bar per label, in the order produced by value_counts().
x = np.arange(len(label_counts_all))
fig = plt.figure(figsize=(10, 8))
ax = fig.gca()
ax.bar(x, label_counts_all.values)
ax.set_xticks(x)
ax.set_xticklabels(label_counts_all.index.tolist(), rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_title("Overall Frequency of Labels (All Ranks)")
save_tight(fig, "fig_label_frequency_all.png")
# NOTE(review): save_tight() is called just above; if it closes the figure,
# this show() has nothing left to display - confirm which is intended.
plt.show()

# ---- Rank-specific distributions ----
# Tally each label separately for every rank column.
top1_counts, top2_counts, top3_counts = (
    df[column].value_counts()
    for column in ("final_top1", "final_top2", "final_top3")
)

# Every label that occurs at any rank, in sorted order, defines the x axis.
all_union = sorted(
    set(top1_counts.index).union(top2_counts.index, top3_counts.index)
)

idx = np.arange(len(all_union))
w = 0.25
fig = plt.figure(figsize=(10, 8))

# Draw three side-by-side bars per label: Top-1 shifted left, Top-2 centred,
# Top-3 shifted right. Labels absent at a rank get a zero-height bar.
for rank_offset, rank_counts, rank_label in (
    (-w, top1_counts, "Top-1"),
    (0, top2_counts, "Top-2"),
    (w, top3_counts, "Top-3"),
):
    bar_heights = [rank_counts.get(lbl, 0) for lbl in all_union]
    plt.bar(idx + rank_offset, bar_heights, width=w, label=rank_label)

plt.xticks(idx, all_union, rotation=45, ha="right")
plt.ylabel("Count")
plt.title("Label Distribution by Rank (Top-1 vs Top-2 vs Top-3)")
plt.legend()
save_tight(fig, "fig_label_distribution_by_rank.png")
# NOTE(review): save_tight() is called just above; if it closes the figure,
# this show() has nothing left to display - confirm which is intended.
plt.show()

# ---- Co-occurrence heatmap ----
# Build a symmetric label-by-label matrix: entry (i, j) counts the rows in
# which labels i and j both appear among the three rank columns.
labels_sorted = sorted(all_labels.dropna().unique())
L = len(labels_sorted)
co_mat = np.zeros((L, L), dtype=int)

# Map each label to its matrix position once, instead of scanning the list
# with list.index() for every pair of every row.
label_pos = {lbl: pos for pos, lbl in enumerate(labels_sorted)}

# Walk the three rank columns in lockstep; this avoids constructing a Series
# per row as DataFrame.iterrows() does.
for ranked in zip(df["final_top1"], df["final_top2"], df["final_top3"]):
    # Keep only string entries (skips non-string values such as NaN), and use
    # a set so a repeated label is never paired with itself.
    present = sorted({v for v in ranked if isinstance(v, str)})
    for a, b in combinations(present, 2):
        ia, ib = label_pos[a], label_pos[b]
        # Count the pair in both directions to keep the matrix symmetric.
        co_mat[ia, ib] += 1
        co_mat[ib, ia] += 1

fig = plt.figure(figsize=(10,8))
plt.imshow(co_mat, aspect="auto")
plt.colorbar(label="Co-occurrence Count")
plt.xticks(np.arange(L), labels_sorted, rotation=45, ha="right")
plt.yticks(np.arange(L), labels_sorted)
plt.title("Label Co-occurrence Heatmap")
save_tight(fig, "fig_label_cooccurrence_heatmap.png")
# NOTE(review): save_tight() is called just above; if it closes the figure,
# this show() has nothing left to display - confirm which is intended.
plt.show()

# ---- Most frequent label combinations (sorted triplets) ----
def sorted_triplet(row):
    """Return the row's string labels from the three rank columns as a sorted tuple.

    Non-string entries (for example NaN or None) are skipped, so the result
    may hold fewer than three items. Repeated labels are kept.
    """
    ranked = (row["final_top1"], row["final_top2"], row["final_top3"])
    return tuple(sorted(label for label in ranked if isinstance(label, str)))

# Reduce every row to its order-independent label combination, then keep the
# 15 combinations that occur most often.
combos = df.apply(sorted_triplet, axis=1)
combo_counts = combos.value_counts().head(15)

# One horizontal bar per combination, labelled with its comma-joined labels.
y = np.arange(len(combo_counts))
labels_combo = [", ".join(combo) for combo in combo_counts.index]

fig = plt.figure(figsize=(10, 8))
ax = fig.gca()
ax.barh(y, combo_counts.values)
ax.set_yticks(y)
ax.set_yticklabels(labels_combo)
ax.set_xlabel("Count")
ax.set_title("Most Frequent Label Combinations (Top 15)")
save_tight(fig, "fig_label_combo_top15.png")
# NOTE(review): save_tight() is called just above; if it closes the figure,
# this show() has nothing left to display - confirm which is intended.
plt.show()
