# nr-learn Dashboard

このNotebookは `train -> predict -> backtest` の成果物をまとめて確認するためのダッシュボードです。

## 前提ファイル
- `artifacts/reports/train_metrics.json`
- `artifacts/predictions/predictions_*.csv`
- `artifacts/reports/backtest_*.json`

最初に学習・予測・バックテストを実行してから開いてください。`predictions_*.csv` と `backtest_*.json` が複数ある場合は、ファイル名の昇順で最後に並ぶものが読み込まれます（更新日時順ではありません）。

In [None]:
from pathlib import Path
import json
import pandas as pd
import matplotlib.pyplot as plt

# Resolve the project root: when the notebook is launched from a directory
# named "notebooks" the root is that directory's (resolved) parent; otherwise
# the current working directory is used as-is.
_cwd = Path.cwd()
ROOT = _cwd.resolve().parent if _cwd.name == "notebooks" else _cwd

# Artifact locations read by the cells below.
ARTIFACTS = ROOT.joinpath("artifacts")
PRED_DIR = ARTIFACTS.joinpath("predictions")
REPORT_DIR = ARTIFACTS.joinpath("reports")

def latest_file(path: Path, pattern: str) -> Path:
    """Return the entry under ``path`` matching ``pattern`` that sorts last.

    Candidates are compared as ``Path`` objects, so "latest" means last in
    path sort order, not most recently modified.

    Args:
        path: Directory to search (non-recursive unless ``pattern`` says so).
        pattern: Glob pattern passed to ``Path.glob``.

    Returns:
        The greatest matching path.

    Raises:
        FileNotFoundError: If nothing under ``path`` matches ``pattern``.
    """
    newest = max(path.glob(pattern), default=None)
    if newest is None:
        raise FileNotFoundError(f"No files matched: {path}/{pattern}")
    return newest

# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("ROOT:", ROOT),
    ("PRED_DIR:", PRED_DIR),
    ("REPORT_DIR:", REPORT_DIR),
):
    print(label, location)

In [None]:
# Load the training metrics report; fail early with the expected location
# when the file is absent.
train_metrics_path = REPORT_DIR.joinpath("train_metrics.json")
if not train_metrics_path.exists():
    raise FileNotFoundError(f"Missing: {train_metrics_path}")

train_metrics = json.loads(train_metrics_path.read_text(encoding="utf-8"))

# Transpose the single-row frame so each metric becomes a row under a
# "value" column. This stays the last expression so the notebook renders it.
pd.DataFrame([train_metrics]).transpose().rename(columns={0: "value"})

In [None]:
# Pick the prediction CSV and the backtest JSON that sort last by path.
# NOTE(review): the two files are chosen independently — confirm that they
# always belong to the same run.
pred_path = latest_file(PRED_DIR, "predictions_*.csv")
backtest_path = latest_file(REPORT_DIR, "backtest_*.json")

pred_df = pd.read_csv(pred_path)
backtest = json.loads(backtest_path.read_text(encoding="utf-8"))

# One-row overview: size of the prediction file plus the headline backtest
# metrics (missing metrics show up as None).
overview = {
    "prediction_file": pred_path.name,
    "rows": len(pred_df),
    "races": None,
}
if "race_id" in pred_df.columns:
    overview["races"] = pred_df["race_id"].nunique()
for metric in (
    "top1_hit_rate",
    "top3_hit_rate",
    "top5_hit_rate",
    "simple_top1_win_roi",
):
    overview[metric] = backtest.get(metric)

summary = pd.DataFrame([overview])

summary

In [None]:
# Left panel: histogram of every prediction score.
# Right panel: boxplot of the highest score within each race, or a
# placeholder message when the predictions carry no race_id column.
fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))

axes[0].hist(pred_df["score"], bins=25, color="#3b82f6", alpha=0.85)
axes[0].set_title("Prediction Score Distribution")
axes[0].set_xlabel("score")
axes[0].set_ylabel("count")

if "race_id" in pred_df.columns:
    race_top = pred_df.groupby("race_id", as_index=False)["score"].max()
    # Vertical is boxplot's default orientation. The explicit `vert` keyword
    # is deprecated since Matplotlib 3.10, so it is omitted.
    axes[1].boxplot(race_top["score"])
    axes[1].set_title("Per-race Top Score Boxplot")
    axes[1].set_ylabel("top score")
else:
    axes[1].text(0.5, 0.5, "race_id not found", ha="center", va="center")

plt.tight_layout()
plt.show()

In [None]:
# Calibration-style check: bin rows into 10 equal-width score bins and
# compare each bin's mean score with the observed share of rank == 1 rows.
if "rank" in pred_df.columns:
    calib = pred_df.copy()
    # NOTE(review): rows whose rank is missing compare unequal to 1 and are
    # therefore counted as non-wins — confirm the file only holds finished races.
    calib["is_win"] = (calib["rank"] == 1).astype(int)
    calib["score_bin"] = pd.cut(calib["score"], bins=10, include_lowest=True)
    calib_summary = (
        calib.groupby("score_bin", observed=False)
        .agg(
            mean_score=("score", "mean"),
            win_rate=("is_win", "mean"),
            count=("is_win", "size"),
        )
        .reset_index()
    )

    # Empty bins have NaN means, which would break the plotted line into
    # disconnected segments; plot only bins that contain rows. The table
    # below still lists every bin.
    plotted = calib_summary[calib_summary["count"] > 0]

    plt.figure(figsize=(6, 5))
    plt.plot(plotted["mean_score"], plotted["win_rate"], marker="o")
    # Diagonal reference line for the unit interval.
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.title("Calibration-style Plot")
    plt.xlabel("mean predicted score")
    plt.ylabel("actual win rate")
    plt.grid(alpha=0.25)
    plt.show()
else:
    print("rank 列がないため、Calibrationプロットはスキップ")
    calib_summary = None

# A notebook cell only renders its last *top-level* expression; a bare name
# inside the `if` body is silently discarded. Keeping it here makes the table
# appear (nothing is rendered when it is None).
calib_summary

In [None]:
# Show the 20 highest-scored rows, restricted to whichever of the
# preferred columns the prediction file actually provides.
preferred = ("race_id", "horse_id", "horse_name", "score", "pred_rank", "rank")
columns = [name for name in preferred if name in pred_df.columns]

top_scored = pred_df[columns].sort_values("score", ascending=False).head(20)
top_scored.reset_index(drop=True)

## Next Steps

- `run_predict` を複数日ループで回して、`predictions_*.csv` を日次で蓄積する
- `run_backtest` で日次JSONを作り、時系列のTop1/Top3推移を比較する
- 特徴量を追加（騎手・調教師の期間別成績、コース×距離相性）して再学習する
- モデル比較（LightGBM vs RandomForest）を同じ分割条件で行う