# Lab 5 · Final Evaluation

Evaluate the final CSE472 Blanket submissions: download predictions, validate them against the hidden ground truth, compute RMSE/Jaccard/combined score, and export figures to `fig/`.

- pulls submission links from `team.jsonl`
- caches CSVs in `data/`
- validates schema and array lengths
- reports a leaderboard plus diagnostics plots


## Team roster & submission links

The links below are read from `team.jsonl` and used for downloading each submission.

| Group | Members | Repo | Submission File |
|-------|---------|------|-----------------|
| 1 | Sameera, Tanmayi | [sameerashahh/CSE472-blanket-challenge](https://github.com/sameerashahh/CSE472-blanket-challenge/tree/project2_implementation) | [`final_project_implementation/submission.csv`](https://raw.githubusercontent.com/sameerashahh/CSE472-blanket-challenge/486287884d7099c273a570a16aec4d387f60320e/final_project_implementation/submission.csv) |
| 7 | Dhruv, Sahajpreet | [dhruvb26/CSE472-blanket-challenge](https://github.com/dhruvb26/CSE472-blanket-challenge/tree/main) | [`solution/runs/20251124_194116/submission.csv`](https://raw.githubusercontent.com/dhruvb26/CSE472-blanket-challenge/b8c8ca86d3f6d0aa97ba79cd1f6d73ca57018c50/solution/runs/20251124_194116/submission.csv) |
| 8 | Fredo, Anton | [saan-volta/CSE472-blanket-challenge-submission](https://github.com/saan-volta/CSE472-blanket-challenge-submission/tree/main) | [`submission/submission.csv`](https://raw.githubusercontent.com/saan-volta/CSE472-blanket-challenge-submission/942d6b74e42ec51330fdfc8505c992db356819cf/submission/submission.csv) |
| 10 | Ang, Muhammed | [muhammedhunaid/CSE472-blanket-challenge](https://github.com/muhammedhunaid/CSE472-blanket-challenge/tree/muhammed/final-submission) | [`submission.csv`](https://raw.githubusercontent.com/muhammedhunaid/CSE472-blanket-challenge/refs/heads/muhammed/final-submission/submission.csv) |


## Setup

Figures are written to `fig/`, cached submissions live in `data/`, and the hidden ground truth is loaded from the sibling `blanket` repo. Run the notebook top-to-bottom after ensuring the `blanket` environment is active.


In [None]:
# Load the IPython extensions used by this notebook. `watermark` is only
# loaded here; no `%watermark` call is visible in this section.
%load_ext watermark
%load_ext autoreload
# autoreload mode 2: re-import all modules before each cell runs, so edits to
# the local `blanket` sources are picked up without restarting the kernel.
%autoreload 2


In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import httpx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset, load_from_disk
from dotenv import load_dotenv

# Put the sibling `blanket` checkout (three directory levels above the lab
# folder) on sys.path so that `blanket.metrics` can be imported below.
LAB_ROOT = Path.cwd().resolve()
BLANKET_REPO = LAB_ROOT.parents[2] / "blanket"
PROJECT_SRC = BLANKET_REPO / "src"
sys.path.append(str(PROJECT_SRC))
from blanket.metrics import jaccard_score, rmse

load_dotenv()

# Plot theme and compact DataFrame display for the whole notebook.
sns.set_theme(style="whitegrid", context="talk")
for display_option, limit in (("display.max_rows", 20), ("display.max_columns", 8)):
    pd.set_option(display_option, limit)

# Working directories: cached submissions, exported figures, hidden labels.
DATA_DIR = LAB_ROOT / "data"
FIG_DIR = LAB_ROOT / "fig"
GROUND_TRUTH_DIR = BLANKET_REPO / "data" / "datasets" / "final-ground-truth"

# Only the output directories are created; the ground truth must already exist.
for path in (DATA_DIR, FIG_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f"Lab root: {LAB_ROOT}")
print(f"Ground truth dir: {GROUND_TRUTH_DIR}")
print(f"Figures will be saved to: {FIG_DIR}")


## Load ground-truth dataset

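We load the published Hugging Face `develop` and `submit` configs only to confirm that their `data_id`s match the local ground truth, then convert the held-out test split to a Pandas frame with `y_test` and the Markov blanket mask (renamed from `feature_mask`).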


In [None]:
HF_DATASET = "CSE472-blanket-challenge/final-dataset"

# Published configs (each exposes a single "train" split) plus the local,
# hidden ground truth stored on disk.
develop = load_dataset(HF_DATASET, "develop", split="train")
submit = load_dataset(HF_DATASET, "submit", split="train")
ground_truth_data = load_from_disk(GROUND_TRUTH_DIR)

# Sanity check: the published configs and the local splits must cover exactly
# the same task ids (develop <-> train, submit <-> test).
for published, local_split in ((develop, "train"), (submit, "test")):
    assert set(published["data_id"]) == set(ground_truth_data[local_split]["data_id"])

# Keep only the columns needed for scoring and expose the feature mask under
# the name used by the rest of the notebook.
test_split = ground_truth_data["test"].select_columns(
    ["data_id", "y_test", "feature_mask"]
)
ground_truth = test_split.rename_column("feature_mask", "markov_blanket").to_pandas()

ground_truth.head()


## Download & cache submissions

URLs come from `team.jsonl`. We skip downloads when the file already exists to keep runs fast and reproducible.


In [None]:
def download_submission(url: str, save_path: Path, overwrite: bool = False) -> Path:
    """Download ``url`` to ``save_path`` unless a cached copy already exists.

    Args:
        url: HTTP(S) location of the submission file.
        save_path: Local path the file is cached at.
        overwrite: When true, download again even if ``save_path`` exists.

    Returns:
        ``save_path``, whether the file was freshly downloaded or already cached.

    Raises:
        httpx.HTTPStatusError: If the final response has a non-success status.
        httpx.RequestError: If the request itself fails (e.g. timeout).
    """
    if save_path.exists() and not overwrite:
        return save_path

    # httpx does not follow redirects unless asked to.
    response = httpx.get(url, timeout=30.0, follow_redirects=True)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file that later counts as a cache hit.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = save_path.with_name(save_path.name + ".part")
    partial_path.write_bytes(response.content)
    partial_path.replace(save_path)
    return save_path


# Team roster: one JSON object per line, carrying at least the `group` and
# `submission_url` fields read below.
teams = load_dataset("json", data_files=str(LAB_ROOT / "team.jsonl"), split="train")

# Fetch (or reuse the cached copy of) every team's submission.
for team in teams:
    download_submission(
        team["submission_url"],
        DATA_DIR / f"team_{team['group']}_submission.csv",
    )

sorted(DATA_DIR.iterdir())


## Load submissions into data frames

Parse the list-like columns into NumPy arrays and drop stray index columns if present.


In [None]:
def to_array(value, dtype=float) -> np.ndarray:
    """Convert a submission cell into a 1-D NumPy array of ``dtype``.

    Accepts an existing array, a list/tuple, or the text form a CSV round-trip
    produces. Text may be wrapped in ``[...]`` and may separate items with
    commas and/or whitespace, so both ``"[1, 0, 1]"`` and ``"[1 0 1]"`` parse.
    ``True``/``False`` tokens are read as 1/0.

    Args:
        value: Array, list, tuple, or string representation of a flat sequence.
        dtype: Target element type (``float`` by default).

    Returns:
        A 1-D array; an empty array when the text holds no items.

    Raises:
        ValueError: If a token is not numeric, or if a non-finite value would
            have to be stored in an integer/boolean array.
    """
    if isinstance(value, np.ndarray):
        return value.astype(dtype)
    if isinstance(value, (list, tuple)):
        return np.asarray(value, dtype=dtype)

    text = str(value).strip()
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1]

    # Tokenise explicitly instead of relying on np.fromstring, which stops at
    # the first token it cannot read and returns a silently shortened array.
    bool_literals = {"true": 1.0, "false": 0.0}
    tokens = text.replace(",", " ").split()
    try:
        numbers = [
            bool_literals[token.lower()]
            if token.lower() in bool_literals
            else float(token)
            for token in tokens
        ]
    except ValueError as err:
        raise ValueError(f"cannot parse array literal {value!r}") from err

    # Parse as float first so integer masks written as "1.0" still convert.
    parsed = np.asarray(numbers, dtype=float)
    target = np.dtype(dtype)
    if target.kind in "biu" and not np.isfinite(parsed).all():
        raise ValueError(
            f"non-finite value in {value!r} cannot be stored as {target.name}"
        )
    return parsed.astype(target)


# Parsed submissions keyed by team group number.
submissions: dict[int, pd.DataFrame] = {}

for team in teams:
    group = team["group"]
    df = pd.read_csv(DATA_DIR / f"team_{group}_submission.csv")

    # Remove index columns that pandas names "Unnamed: N" on read.
    unnamed_cols = [col for col in df.columns if col.startswith("Unnamed:")]
    df = df.drop(columns=unnamed_cols)

    # Predictions are floats; the blanket mask is stored as integers.
    for column, cell_dtype in (("y_pred", float), ("markov_blanket_pred", int)):
        df[column] = df[column].apply(
            lambda cell, kind=cell_dtype: to_array(cell, dtype=kind)
        )

    submissions[group] = df

submissions[next(iter(submissions))].head()


## Validate submission format

Quick schema and shape checks to surface any mismatches before scoring.


In [None]:
def validate_dataset(
    submission: pd.DataFrame, ground_truth: pd.DataFrame, *, name: str
) -> None:
    """Check that a submission has the expected schema, ids and array lengths.

    The checks run in order: exact column set, same set of ``data_id`` values
    as the ground truth, same row count, then per-task length agreement
    between ``y_pred``/``y_test`` and ``markov_blanket_pred``/``markov_blanket``.
    A confirmation line is printed when everything passes.

    Args:
        submission: Parsed submission with array-valued prediction columns.
        ground_truth: Frame with ``data_id``, ``y_test`` and ``markov_blanket``.
        name: Label used as a prefix in messages (e.g. ``"Team 7"``).

    Raises:
        AssertionError: On the first failed check. It is raised explicitly
            rather than through ``assert`` statements, so the validation still
            runs when Python is started with ``-O``.
    """
    expected_columns = {"data_id", "y_pred", "markov_blanket_pred"}
    if set(submission.columns) != expected_columns:
        raise AssertionError(
            f"{name}: columns {submission.columns} do not match {expected_columns}"
        )

    if set(submission["data_id"]) != set(ground_truth["data_id"]):
        raise AssertionError(f"{name}: data_id values differ from ground truth")

    if submission.shape[0] != ground_truth.shape[0]:
        raise AssertionError(
            f"{name}: row count {submission.shape[0]} != ground truth {ground_truth.shape[0]}"
        )

    # Each prediction column is compared with its ground-truth counterpart.
    length_checks = (("y_pred", "y_test"), ("markov_blanket_pred", "markov_blanket"))
    merged = submission.merge(ground_truth, on="data_id", how="inner")
    for _, row in merged.iterrows():
        for pred_col, truth_col in length_checks:
            if len(row[pred_col]) != len(row[truth_col]):
                raise AssertionError(
                    f"{name}: {pred_col} length {len(row[pred_col])} does not match "
                    f"{truth_col} length {len(row[truth_col])} for data_id {row['data_id']}"
                )

    print(f"✅ {name}: schema and lengths look good")


# Fail fast: stop at the first team whose submission is malformed.
for team_id, team_submission in submissions.items():
    validate_dataset(team_submission, ground_truth, name=f"Team {team_id}")


## Evaluate metrics

Score each task with RMSE and Jaccard, then combine them via `score = rmse × (1 - jaccard)`. Lower is better: a perfect Markov blanket prediction (Jaccard = 1) drives the score to 0.


In [None]:
def evaluate_single_task(
    y_query_true: np.ndarray,
    y_query_pred: np.ndarray,
    mb_true: np.ndarray,
    mb_pred: np.ndarray,
) -> dict:
    """Score one task from its regression and Markov-blanket predictions.

    Args:
        y_query_true: Ground-truth targets, passed to ``rmse`` first.
        y_query_pred: Predicted targets, passed to ``rmse`` second.
        mb_true: Ground-truth blanket mask, passed to ``jaccard_score`` first.
        mb_pred: Predicted blanket mask, passed to ``jaccard_score`` second.

    Returns:
        A dict with ``rmse``, ``jaccard`` and ``score``, where
        ``score = rmse * (1 - jaccard)``.
    """
    regression_error = rmse(y_query_true, y_query_pred)
    blanket_overlap = jaccard_score(mb_true, mb_pred)
    return {
        "rmse": regression_error,
        "jaccard": blanket_overlap,
        "score": regression_error * (1.0 - blanket_overlap),
    }


def evaluate_submission(
    submission: pd.DataFrame, ground_truth: pd.DataFrame
) -> pd.DataFrame:
    """Score every task of a submission against the ground truth.

    The two frames are inner-joined on ``data_id``, so only tasks present in
    both are scored.

    Args:
        submission: Frame with ``data_id``, ``y_pred``, ``markov_blanket_pred``.
        ground_truth: Frame with ``data_id``, ``y_test``, ``markov_blanket``.

    Returns:
        One row per joined task with columns ``rmse``, ``jaccard``, ``score``
        and ``data_id``.
    """
    joined = submission.merge(ground_truth, on="data_id", how="inner")

    def score_row(row) -> dict:
        # Metrics first, id last, which fixes the output column order.
        task_metrics = evaluate_single_task(
            y_query_true=row["y_test"],
            y_query_pred=row["y_pred"],
            mb_true=row["markov_blanket"],
            mb_pred=row["markov_blanket_pred"],
        )
        task_metrics["data_id"] = row["data_id"]
        return task_metrics

    return pd.DataFrame([score_row(row) for _, row in joined.iterrows()])


# Per-task metric frames keyed by team group number.
eval_results = {}
for group, df in submissions.items():
    eval_results[group] = evaluate_submission(df, ground_truth)

eval_results[next(iter(eval_results))].head()


## Aggregate leaderboard

Compute per-team means and standard deviations for each metric, then sort ascending by mean score, so rank 1 is the team with the lowest (best) score.


In [None]:
# Roster with the member list flattened to a single display string.
team_info = teams.to_pandas()
team_info["members"] = team_info["members"].apply(", ".join)

# One summary row per team: mean and standard deviation of each task metric.
metric_names = ("rmse", "jaccard", "score")
agg_rows = []
for group, df in eval_results.items():
    summary = {"group": group}
    for metric in metric_names:
        summary[f"{metric}_mean"] = df[metric].mean()
        summary[f"{metric}_std"] = df[metric].std()
    agg_rows.append(summary)

# Attach member names, then order by ascending mean score.
leaderboard = pd.DataFrame(agg_rows).merge(
    team_info[["group", "members"]], on="group", how="left"
)
agg_results_df = leaderboard.sort_values(by="score_mean").reset_index(drop=True)
agg_results_df.insert(0, "rank", range(1, len(agg_results_df) + 1))
agg_results_df


## Additional diagnostics

Build a long-form frame for task-level analysis and capture quick stats.


In [None]:
# Stack every team's per-task metrics into one frame tagged with the team id.
per_team_frames = []
for group, df in eval_results.items():
    per_team_frames.append(df.assign(group=group))
score_df = pd.concat(per_team_frames, ignore_index=True)

# Correlation between the two task-level metrics, pooled over all teams.
corr_rmse_jaccard = score_df["rmse"].corr(score_df["jaccard"])
print(f"RMSE vs Jaccard correlation across tasks: {corr_rmse_jaccard:.3f}")
score_df.head()


## Metric summary plot

Bar plots of team-level means with standard deviation error bars. Figures are saved under `fig/`.


In [None]:
# Leaderboard copy with display labels; rows stay in rank order.
plot_df = agg_results_df.copy()
plot_df["team_label"] = plot_df["group"].apply(lambda x: f"Team {x}")

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# (mean column, std column, panel title, seaborn palette name)
metrics = [
    ("rmse_mean", "rmse_std", "RMSE", "Blues"),
    ("jaccard_mean", "jaccard_std", "Jaccard", "Greens"),
    ("score_mean", "score_std", "Score = RMSE × (1 - Jaccard)", "Oranges"),
]

for ax, (mean_col, std_col, title, palette) in zip(axes, metrics):
    # Draw with matplotlib directly: the values are already aggregated, so the
    # error bars are explicitly paired with their bars, and no `palette` is
    # passed to seaborn without `hue` (deprecated in seaborn 0.13).
    bars = ax.bar(
        plot_df["team_label"],
        plot_df[mean_col],
        # std is NaN for a team with a single task; draw no error bar then.
        yerr=plot_df[std_col].fillna(0.0),
        color=sns.color_palette(palette, n_colors=len(plot_df)),
        edgecolor="black",
        alpha=0.9,
        capsize=4,
    )
    # Label only the bar container; ax.containers also holds the error-bar
    # container, which bar_label does not accept.
    ax.bar_label(bars, fmt="%.3f", padding=2)
    ax.xaxis.grid(False)  # no vertical grid lines on the categorical axis
    ax.set_title(title)
    ax.set_xlabel("")
    ax.set_ylabel(title)
    ax.tick_params(axis="x", rotation=30)

plt.tight_layout()
fig.suptitle("Team-level metrics", y=1.04, fontsize=16, fontweight="bold")
fig.savefig(FIG_DIR / "team_metric_summary.png", dpi=300, bbox_inches="tight")
plt.show()


## Trade-off & score distributions

- RMSE vs. Jaccard scatter shows the regression/feature-selection trade-off.
- Box plot shows score spread per team across tasks.


In [None]:
# Teams in leaderboard (rank) order; used to order the box plot.
scatter_order = agg_results_df["group"].tolist()

# RMSE vs Jaccard scatter (one point per team, labelled in place)
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(
    data=agg_results_df,
    x="rmse_mean",
    y="jaccard_mean",
    hue="group",
    palette="crest",
    s=180,
    ax=ax,
)
for _, row in agg_results_df.iterrows():
    # Nudge the label slightly above its marker.
    ax.text(
        row["rmse_mean"],
        row["jaccard_mean"] + 0.01,
        f"Team {row['group']}",
        ha="center",
    )
ax.set_title("RMSE vs Jaccard (team means)")
ax.grid(True, alpha=0.3)
fig.savefig(FIG_DIR / "rmse_vs_jaccard.png", dpi=300, bbox_inches="tight")
plt.show()

# Score distribution per team
fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Map hue to
# the same variable as x and disable dodging, so each team keeps one
# full-width, individually coloured box.
sns.boxplot(
    data=score_df,
    x="group",
    y="score",
    order=scatter_order,
    hue="group",
    hue_order=scatter_order,
    dodge=False,
    palette="pastel",
    ax=ax,
)
# The hue legend only repeats the x tick labels; drop it if seaborn drew one.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
ax.set_title("Score distribution across tasks")
ax.set_xlabel("Team")
ax.set_ylabel("Score")
fig.savefig(FIG_DIR / "score_distribution_by_team.png", dpi=300, bbox_inches="tight")
plt.show()


## Task-wise wins

Count how many tasks each team wins (lowest score per task).


In [None]:
# Winner of a task = the row with the lowest score for that data_id. A stable
# sort plus drop_duplicates keeps whole rows together and resolves ties
# deterministically in favour of the team that appears first in score_df.
task_winners = (
    score_df.sort_values("score", kind="stable")
    .drop_duplicates("data_id", keep="first")
    .sort_values("data_id", kind="stable")
    .reset_index(drop=True)
)

# Name the columns explicitly instead of renaming reset_index() output:
# pandas < 2.0 yields "index"/"group" there, pandas >= 2.0 "group"/"count".
# Reindexing over every team keeps teams with zero wins in the table.
all_groups = sorted(score_df["group"].unique())
win_counts = (
    task_winners["group"]
    .value_counts()
    .reindex(all_groups, fill_value=0)
    .rename_axis("group")
    .reset_index(name="wins")
)

fig, ax = plt.subplots(figsize=(7, 4))
# Plain matplotlib bars: the counts are already aggregated, and this avoids
# passing `palette` to seaborn without `hue` (deprecated in seaborn 0.13).
ax.bar(
    win_counts["group"].astype(str),
    win_counts["wins"],
    color=sns.color_palette("coolwarm", n_colors=len(win_counts)),
)
ax.xaxis.grid(False)  # no vertical grid lines on the categorical axis
ax.set_title("Task wins per team")
ax.set_xlabel("Team")
ax.set_ylabel("# Wins (lowest score)")
fig.savefig(FIG_DIR / "task_wins_by_team.png", dpi=300, bbox_inches="tight")
plt.show()

win_counts


## Quick takeaway

Highlight the current leader and a headline stat.


In [None]:
# The first leaderboard row is the team with the lowest mean score.
leader = agg_results_df.iloc[0]
headline = (
    f"Leader: Team {leader['group']} ({leader['members']}) — score {leader['score_mean']:.4f}, "
    f"RMSE {leader['rmse_mean']:.3f}, Jaccard {leader['jaccard_mean']:.3f}."
)
print(headline)
print(f"RMSE/Jaccard correlation across tasks: {corr_rmse_jaccard:.3f}")
