# Simple Dashboard

In [13]:
from utils import plotting
from pathlib import Path

# Folder that holds the metrics files read by this dashboard.
metrics_dir = Path("metrics")

# Load every file matching "*_last.json" from that folder and hand the result
# to the grid plotter (presumably the last-epoch snapshots -- confirm).
metrics = plotting.load_all_metrics(
    metrics_dir,
    pattern="*_last.json",
)
plotting.plot_metrics_grid(metrics)


Quick observations:
- Run 1 – overfits: training and validation performance diverge.
- Runs 2–4 – stable training: both training and validation performance improve.

# Best Model Comparison

In [19]:
import polars as pl

def _first_or_none(run_metrics, key):
    """Return the first element of ``run_metrics[key]``, or ``None``.

    ``None`` is returned when the key is missing or its value is falsy
    (for example ``None`` or an empty list).
    """
    values = run_metrics.get(key)
    return values[0] if values else None


# Load the metrics stored under the "*_best.json" pattern.
best_pattern = "*_best.json"
best_metrics = plotting.load_all_metrics(metrics_dir, pattern=best_pattern)

# Flatten each loaded metrics dict into one table row.
# NOTE(review): assumes the loss/accuracy entries are sequences whose first
# element is the value of interest -- confirm against the JSON schema.
list_metric_keys = ("train_loss", "val_loss", "train_accuracy", "val_accuracy")
rows = [
    {
        "run_name": m["run_name"],
        **{key: _first_or_none(m, key) for key in list_metric_keys},
        "train_time": m.get("train_time"),
        # Accept either spelling of the epoch key.
        "epochs": m.get("epoch", m.get("epochs")),
    }
    for m in best_metrics
]

# Fail with a clear message instead of a column-not-found error further down.
if not rows:
    raise ValueError(
        f"No metrics loaded from {metrics_dir} with pattern {best_pattern!r}"
    )

# Create Polars DataFrame (full precision).
df = pl.DataFrame(rows)

# Rounding is for display only; the expression is reused for both views below.
round_floats = pl.col(pl.Float64).exclude("epochs").round(2)

# Unsorted, rounded view (kept so any later cell that references it still works).
df_rounded = df.with_columns([round_floats])

# Rank by validation accuracy (descending) on the *unrounded* values, so runs
# that differ by less than the rounding step are still ordered correctly, and
# only then round for display. nulls_last=True guarantees that a run with a
# missing val_accuracy can never end up in row 0 and be reported as the best.
df_ranked = df.sort(
    "val_accuracy", descending=True, nulls_last=True
).with_columns([round_floats])
print(df_ranked)

# print the best run name
print(f"Best run: {df_ranked[0, 'run_name']}")

# Optionally, save
# df_ranked.write_csv("ranked_best_metrics.csv")


shape: (4, 7)
┌────────────────────┬────────────┬──────────┬────────────────┬──────────────┬────────────┬────────┐
│ run_name           ┆ train_loss ┆ val_loss ┆ train_accuracy ┆ val_accuracy ┆ train_time ┆ epochs │
│ ---                ┆ ---        ┆ ---      ┆ ---            ┆ ---          ┆ ---        ┆ ---    │
│ str                ┆ f64        ┆ f64      ┆ f64            ┆ f64          ┆ f64        ┆ i64    │
╞════════════════════╪════════════╪══════════╪════════════════╪══════════════╪════════════╪════════╡
│ TinyVGG_run_3_0929 ┆ 0.45       ┆ 0.57     ┆ 0.84           ┆ 0.81         ┆ 671.11     ┆ 20     │
│ 0127_best          ┆            ┆          ┆                ┆              ┆            ┆        │
│ TinyVGG_run_4_0929 ┆ 0.71       ┆ 0.6      ┆ 0.75           ┆ 0.79         ┆ 670.73     ┆ 19     │
│ 0140_best          ┆            ┆          ┆                ┆              ┆            ┆        │
│ TinyVGG_run_2_0929 ┆ 0.6        ┆ 0.66     ┆ 0.79           ┆ 0.77         

Runs 3 and 4 show the most promise:
- Run 3: Best performance, stable training, early stopping not triggered. Likely to improve further with continued training.
- Run 4: Second-best performance, stable training, early stopping triggered (just). Has the most regularization, so relaxing early stopping and training longer might yield good results. However, small models can struggle with heavy regularization due to their limited capacity, so this configuration might still not beat Run 3.

We will evaluate the best model from Run 3 on the test set in `final_results.ipynb`.