# In [None]:  (notebook cell marker left over from export)
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import re, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- CONFIG ---
# Directories searched for result CSV files (relative paths).
DIR_ADP = Path("results_adp")
DIR_SDDP = Path("results_sddp")
DIR_EEVWS = Path("results_eev_ws")
DIR_ADPUC = Path("results_adp_unit")

# When True, the discovery/reading helpers print what they matched.
DEBUG_SCAN = True

# Candidate column names for per-stage CSVs. Each list is searched in order
# and the first name that exists in the file is used.
COL_TIME = [
    "stage", "Stage", "t", "time", "hour", "k", "step",
]
COL_SAMPLE = [
    "path", "Path", "path_id", "sample", "id", "sample_id",
    "scenario", "seed", "traj", "trajectory", "run",
]
# Per-stage (incremental) value columns; preferred over cumulative ones.
COL_VALUE_INCREMENTAL = [
    "obj_stage", "rev_stage", "reward_stage", "value_stage",
    "stage_value", "stage_reward", "stage_obj",
]
# Running-total value columns; used only when no incremental column exists.
COL_VALUE_CUMULATIVE = [
    "obj_total", "total_obj", "cum_value", "cum_abs_delta", "cum_profit",
    "cum_reward", "objective", "obj", "profit", "value",
]

def _pick(df, cands):
    """Return the first name in `cands` that is a column of `df`, or None."""
    available = df.columns
    return next((name for name in cands if name in available), None)

def _coerce_time_inplace(df, col="t"):
    """
    Return a copy of `df` whose `col` is an integer column.

    Values in `col` that cannot be parsed as numbers are dropped together with
    their rows; the remaining values are cast to int. Despite the historical
    name, the caller's frame is left untouched: the previous version wrote the
    coerced column back into the argument before copying.
    """
    numeric = pd.to_numeric(df[col], errors="coerce")
    keep = numeric.notna()
    out = df.loc[keep].copy()
    # Assign by position so a non-unique index cannot trigger a reindex.
    out[col] = numeric[keep].astype(int).to_numpy()
    return out

# -------- discovery ----------
# Glob pattern used to find per-stage training CSVs inside a results folder.
_pat = "*train_perstage*N*.csv"
# Captures <digits> from a file name ending in "_N<digits>.csv" (any letter case).
_rxN = re.compile(r"_N(\d+)\.csv$", flags=re.IGNORECASE)

def discover_perstage_files(root: Path) -> Dict[int, Path]:
    """
    Map each sample count N to one per-stage CSV found directly in `root`.

    Files are matched with the module-level glob pattern and N is parsed from
    a trailing '_N<digits>.csv'; files without that suffix are ignored. When
    several files share the same N, the first one in sorted path order wins.
    The returned dict is ordered by increasing N.

    With DEBUG_SCAN enabled, the header of every matched file is read and
    printed (a read failure is reported in the printout, not raised).
    """
    candidates = sorted(Path(root).glob(_pat))
    if DEBUG_SCAN:
        print(f"[SCAN] {root}/'{_pat}' -> {len(candidates)} files")

    by_n: Dict[int, Path] = {}
    for path in candidates:
        match = _rxN.search(path.name)
        if match is None:
            continue
        n_samples = int(match.group(1))
        by_n.setdefault(n_samples, path)  # keep the first file seen for this N
        if not DEBUG_SCAN:
            continue
        try:
            header = list(pd.read_csv(path, nrows=1).columns)
        except Exception as exc:
            header = [f"ERROR: {exc}"]
        print(f"       N={n_samples:>4} -> {path.name}  cols[:10]={header[:10]}")

    return {n: by_n[n] for n in sorted(by_n)}

# -------- readers ----------
def read_perstage_file(f: Path) -> Optional[pd.DataFrame]:
    """
    Load a per-stage CSV into a tidy frame with columns t, sample, value.

    Column names are resolved from the COL_* candidate lists. An incremental
    value column is preferred over a cumulative one, and the boolean column
    'is_incremental' records which kind was used. Rows whose time cannot be
    parsed as a number are dropped. Returns None when the file lacks a time,
    sample or value column.
    """
    raw = pd.read_csv(f)
    time_col = _pick(raw, COL_TIME)
    sample_col = _pick(raw, COL_SAMPLE)
    inc_col = _pick(raw, COL_VALUE_INCREMENTAL)
    cum_col = _pick(raw, COL_VALUE_CUMULATIVE)
    if DEBUG_SCAN:
        print(f"[PICK] {f.name}: time={time_col}, sample={sample_col}, inc={inc_col}, cum={cum_col}")

    value_col = inc_col or cum_col
    if not time_col or not sample_col or not value_col:
        return None

    renames = {time_col: "t", sample_col: "sample", value_col: "value"}
    tidy = raw[[time_col, sample_col, value_col]].rename(columns=renames).copy()
    tidy = _coerce_time_inplace(tidy, "t")
    tidy["is_incremental"] = inc_col is not None
    return tidy

def cum_from_tidy(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df`, sorted by (sample, t), with an added 'cum' column.

    If the frame carries an 'is_incremental' column whose first row (after
    sorting) is truthy, 'cum' is the running sum of 'value' within each
    sample; otherwise 'value' is taken to be cumulative already and is copied
    unchanged. An empty frame is returned with an empty 'cum' column rather
    than raising IndexError on the first-row lookup.
    """
    out = df.sort_values(["sample", "t"]).copy()
    incremental = (
        "is_incremental" in out.columns
        and not out.empty  # .iloc[0] on an empty frame would raise
        and bool(out["is_incremental"].iloc[0])
    )
    if incremental:
        out["cum"] = out.groupby("sample")["value"].cumsum()
    else:
        out["cum"] = out["value"]
    return out

# -------- figures ----------
def std_last5_over_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Per-time sample standard deviation (ddof=1) of the cumulative value across
    the "last five" samples.

    "Last five" means the five largest sample identifiers in sorted order.
    The result has columns 't' and 'std_last5' and is indexed by t.
    """
    ordered = df.sort_values(["sample", "t"])
    sample_ids = sorted(ordered["sample"].unique())
    tail_ids = sample_ids[-5:]
    subset = ordered[ordered["sample"].isin(tail_ids)].copy()
    subset = cum_from_tidy(subset)
    # One column per sample, one row per time step.
    wide = subset.pivot_table(index="t", columns="sample", values="cum", aggfunc="last").sort_index()
    spread = wide.std(axis=1, ddof=1)
    return pd.DataFrame({"t": wide.index, "std_last5": spread})

def plot_std_time(series_by_N: Dict[int, pd.DataFrame], title: str, out_png: Path, out_pdf: Path):
    """
    Draw one "std of last five samples" curve per N and save it as PNG and PDF.

    `series_by_N` maps N to a frame with columns 't' and 'std_last5'. `title`
    is only used in the warning printed when there is nothing to plot; it is
    not drawn on the figure. The parent directory of `out_png` is created if
    it does not exist.
    """
    if not series_by_N:
        print(f"[WARN] Nothing to plot for {title}.")
        return

    plt.figure(figsize=(8, 5.2))
    for n_samples in sorted(series_by_N):
        curve = series_by_N[n_samples]
        plt.plot(curve["t"], curve["std_last5"], label=f"{n_samples} samples")
    plt.xlabel("Time (h)")
    plt.ylabel("Standard deviation of last five samples ($)")
    plt.legend()
    plt.tight_layout()

    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png, dpi=300)
    plt.savefig(out_pdf)
    plt.close()
    print(f"[OK] wrote {out_png} (+pdf)")

def build_std_figures_for(root: Path, label: str):
    """
    Build the "std of last five samples" figure for one method.

    Every per-stage file discovered in `root` contributes one curve; files
    that cannot be read into a non-empty tidy frame are skipped with a
    warning. Returns the {N: path} mapping from discovery so the caller can
    reuse it, or None when no per-stage file was found.
    """
    files_by_n = discover_perstage_files(root)
    if not files_by_n:
        print(f"[WARN] No per-stage files found in {root}")
        return None

    curves = {}
    for n_samples, csv_path in files_by_n.items():
        tidy = read_perstage_file(csv_path)
        if tidy is not None and not tidy.empty:
            curves[n_samples] = std_last5_over_time(tidy)
        else:
            print(f"[WARN] {label}: cannot read time-series from {csv_path.name}")

    png_target = Path(f"figures/{label.lower()}_std_last5.png")
    pdf_target = Path(f"figures/{label.lower()}_std_last5.pdf")
    plot_std_time(curves, f"{label}: Std of last five samples", png_target, pdf_target)
    return files_by_n

def build_two_panel_for(root: Path, label: str):
    """
    Plot the cumulative value paths of the last five samples for the smallest
    and the largest N found in `root`, side by side with a shared y-axis.

    "Last five" means the five largest sample identifiers in sorted order.
    The figure is written to figures/<label>_last5_two_panel.{png,pdf}.
    Nothing is drawn (a warning is printed instead) when fewer than two Ns
    are available or when either file yields no usable rows.
    """
    found = discover_perstage_files(root)
    if len(found) < 2:
        print(f"[WARN] Need at least two Ns for {label} panel; found: {list(found.keys())}")
        return
    N_small, N_large = min(found), max(found)
    df_small = read_perstage_file(found[N_small])
    df_large = read_perstage_file(found[N_large])
    # Treat an empty frame like an unreadable one (same rule as
    # build_std_figures_for); an empty frame would otherwise produce a blank
    # panel or fail while computing the cumulative column.
    if df_small is None or df_large is None or df_small.empty or df_large.empty:
        print(f"[WARN] Could not read panel data for {label}")
        return
    df_small = cum_from_tidy(df_small)
    df_large = cum_from_tidy(df_large)

    fig, axes = plt.subplots(1, 2, figsize=(10, 4.2), sharey=True)
    for ax, (df, N) in zip(axes, [(df_small, N_small), (df_large, N_large)]):
        last5 = sorted(df["sample"].unique())[-5:]
        for sid in last5:
            sub = df[df["sample"] == sid]
            ax.plot(sub["t"], sub["cum"])
        ax.set_xlabel("Time (h)")
        ax.set_title(f"Last five samples\nout of N={N}")
        ax.grid(True, linestyle=":", alpha=0.6)
    axes[0].set_ylabel("Estimate of post-decision value ($)")
    fig.suptitle(f"{label}: Value estimates for last five samples", y=1.02)
    fig.tight_layout()

    out_png = Path(f"figures/{label.lower()}_last5_two_panel.png")
    out_pdf = Path(f"figures/{label.lower()}_last5_two_panel.pdf")
    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png, dpi=300, bbox_inches="tight")
    fig.savefig(out_pdf, bbox_inches="tight")
    plt.close(fig)
    print(f"[OK] wrote {out_png} (+pdf)")
    
# ===================== RUNTIME vs N (ADP & SDDP) =====================
from glob import glob

# Candidate column names in "*summary_allN*.csv" files, searched in order.
RUNTIME_COLS = [
    "runtime_sec",
    "elapsed",
    "wall_time",
    "time_sec",
    "seconds",
    "duration",
]
N_COLS = [
    "N",
    "n",
    "samples",
    "num_samples",
]

def _read_summary_allN(root: Path) -> Optional[pd.DataFrame]:
    """
    Read the first '*summary_allN*.csv' (in sorted order) found in `root`.

    Returns a frame with numeric columns 'N' and 'runtime', sorted by N, with
    rows dropped where either value is missing or non-numeric. Returns None
    when no file matches or the file has no recognizable N/runtime column.
    """
    matches = sorted(glob(str(root / "*summary_allN*.csv")))
    if len(matches) == 0:
        return None

    table = pd.read_csv(matches[0])
    n_col = _pick(table, N_COLS)
    runtime_col = _pick(table, RUNTIME_COLS)
    if not n_col or not runtime_col:
        return None

    tidy = table[[n_col, runtime_col]].rename(columns={n_col: "N", runtime_col: "runtime"}).copy()
    for name in ("N", "runtime"):
        tidy[name] = pd.to_numeric(tidy[name], errors="coerce")
    return tidy.dropna().sort_values("N").reset_index(drop=True)

def build_runtime_vs_samples():
    """
    Plot N against runtime for ADP, SDDP and ADP-UC on one figure.

    Runtime is on the x-axis and the number of samples on the y-axis. Methods
    without a usable summary are left out; if none is usable a warning is
    printed and no figure is written. Output goes to
    figures/runtime_vs_samples.{png,pdf}.
    """
    curves = [
        (_read_summary_allN(DIR_ADP), "o", "ADP"),
        (_read_summary_allN(DIR_SDDP), "s", "SDDP"),
        (_read_summary_allN(DIR_ADPUC), "^", "ADP-UC"),
    ]
    usable = [
        (table, marker, name)
        for table, marker, name in curves
        if table is not None and not table.empty
    ]
    if not usable:
        print("[WARN] No *summary_allN* files found; skipping runtime plot.")
        return

    plt.figure(figsize=(7.2, 4.6))
    for table, marker, name in usable:
        plt.plot(table["runtime"], table["N"], marker=marker, label=name)

    plt.xlabel("Runtime")
    plt.ylabel("Number of samples (N)")
    plt.grid(True, linestyle=":", alpha=0.6)
    plt.legend()
    plt.tight_layout()

    out_png = Path("figures/runtime_vs_samples.png")
    out_pdf = Path("figures/runtime_vs_samples.pdf")
    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png, dpi=300)
    plt.savefig(out_pdf)
    plt.close()
    print(f"[OK] Wrote {out_png} (+pdf)")



# ============ ACCUMULATED DIFF IN PROFIT BY ITER (N=1000) ====================
# Candidate column names for the per-iteration profit helpers, searched in order.
ITER_COLS = [
    "iter",
    "iteration",
    "k",
]
PATH_COLS = [
    "path_id",
    "path",
    "sample",
    "id",
    "sample_id",
]
VAL_COLS = [
    "obj",
    "objective",
    "profit",
    "value",
    "obj_total",
]

def _per_iter_profit_from_perpath(root: Path, N: int) -> Optional[pd.DataFrame]:
    """
    Mean profit per iteration from a per-path evaluation file for sample count N.

    '*train_eval_perpath*' files are tried before '*test_perpath*' files, each
    group in sorted order. The first file that has both an iteration column
    and a value column decides the result: a frame with columns 'iter' and
    'profit' (mean over rows sharing an iteration), sorted by iter. Returns
    None when no file qualifies.
    """
    patterns = (f"*train_eval_perpath*N{N}.csv", f"*test_perpath*N{N}.csv")
    for pattern in patterns:
        for csv_path in sorted(root.glob(pattern)):
            table = pd.read_csv(csv_path)
            iter_col = _pick(table, ITER_COLS)
            value_col = _pick(table, VAL_COLS)
            if not iter_col or not value_col:
                continue
            tidy = (
                table[[iter_col, value_col]]
                .rename(columns={iter_col: "iter", value_col: "profit"})
                .assign(
                    iter=lambda d: pd.to_numeric(d["iter"], errors="coerce"),
                    profit=lambda d: pd.to_numeric(d["profit"], errors="coerce"),
                )
            )
            means = tidy.dropna().groupby("iter", as_index=False)["profit"].mean()
            return means.sort_values("iter").reset_index(drop=True)
    return None

def _per_iter_profit_from_perstage(root: Path, N: int) -> Optional[pd.DataFrame]:
    """
    Fallback: mean profit per iteration derived from a per-stage file.

    Reads the first '*train_perstage*N{N}.csv' in sorted order, keeps the
    value at the last stage of every (iteration, path) pair and averages it
    across paths for each iteration. Returns a frame with columns 'iter' and
    'profit', or None when no file exists or a required column is missing.

    NOTE(review): the value column may resolve to 'obj_stage' as a last
    resort, in which case the last-stage value is presumably a per-stage
    amount rather than a cumulative total -- confirm against the data.
    """
    matches = sorted(root.glob(f"*train_perstage*N{N}.csv"))
    if not matches:
        return None

    table = pd.read_csv(matches[0])
    iter_col = _pick(table, ITER_COLS)
    path_col = _pick(table, PATH_COLS)
    time_col = _pick(table, ["t", "stage", "Stage", "time", "hour", "k"])
    value_col = _pick(table, VAL_COLS + ["obj_stage"])
    if not iter_col or not path_col or not time_col or not value_col:
        return None

    table[time_col] = pd.to_numeric(table[time_col], errors="coerce")
    table = table.dropna(subset=[time_col]).copy()

    keys = [iter_col, path_col, time_col]
    # Last stage reached by each (iteration, path) pair ...
    final_stage = table.groupby([iter_col, path_col], as_index=False)[time_col].max()
    # ... joined back to fetch the value recorded at that stage.
    at_final = final_stage.merge(table[keys + [value_col]], on=keys, how="left")

    means = at_final.groupby(iter_col, as_index=False)[value_col].mean()
    means = means.rename(columns={iter_col: "iter", value_col: "profit"})
    means["iter"] = pd.to_numeric(means["iter"], errors="coerce")
    return means.dropna().sort_values("iter").reset_index(drop=True)

def _per_iter_profit(root: Path, N: int) -> Optional[pd.DataFrame]:
    """
    Mean profit per iteration for sample count N.

    Uses the per-path evaluation files when they yield any rows and falls
    back to the per-stage file otherwise. Returns a frame with columns
    'iter' and 'profit', or None when neither source is usable.
    """
    # `perpath or perstage` cannot be used here: bool(DataFrame) raises
    # ValueError, so the test for "missing" has to be explicit.
    perpath = _per_iter_profit_from_perpath(root, N)
    if perpath is not None and not perpath.empty:
        return perpath
    return _per_iter_profit_from_perstage(root, N)

def _load_cum_abs_delta_series(root: Path, N: int) -> Optional[pd.Series]:
    """
    Load the 'cum_abs_delta' column of the first '*history*N{N}.csv' in `root`.

    The series is indexed 1..len by row position (one row per iteration is
    assumed) and non-numeric entries are dropped after that numbering, so
    gaps in the index mark discarded rows. Returns None when no file matches
    or the column is absent.
    """
    matches = sorted(root.glob(f"*history*N{N}.csv"))  # e.g. adp_uc_history_N1000.csv
    if len(matches) == 0:
        return None

    table = pd.read_csv(matches[0])
    if "cum_abs_delta" not in table.columns:
        return None

    values = pd.to_numeric(table["cum_abs_delta"], errors="coerce")
    numbered = values.set_axis(np.arange(1, len(values) + 1))
    return numbered.dropna()

def plot_cum_abs_delta_from_history(N: int = 1000):
    """
    Plot the 'cum_abs_delta' history of ADP, SDDP and ADP-UC for sample count N.

    Methods without a history series are left out; when none has one, a
    warning is printed and no figure is written. Output goes to
    figures/cum_abs_delta_hist_N{N}.{png,pdf}.
    """
    histories = [
        (_load_cum_abs_delta_series(DIR_ADP, N), "ADP: cum_abs_delta"),
        (_load_cum_abs_delta_series(DIR_SDDP, N), "SDDP: cum_abs_delta"),
        (_load_cum_abs_delta_series(DIR_ADPUC, N), "ADP-UC: cum_abs_delta"),
    ]
    if all(series is None for series, _ in histories):
        print(f"[WARN] No history cum_abs_delta for N={N}; skipping.")
        return

    plt.figure(figsize=(8, 4.8))
    for series, legend_text in histories:
        if series is None:
            continue
        plt.plot(series.index, series.values, label=legend_text)
    plt.xlabel("Iteration")
    plt.ylabel("Cumulative |Δ profit| ($)")
    plt.title(f"Cumulative absolute profit change (N={N})")
    plt.grid(True, linestyle=":", alpha=0.6)
    plt.legend()
    plt.tight_layout()

    out_png = Path(f"figures/cum_abs_delta_hist_N{N}.png")
    out_pdf = Path(f"figures/cum_abs_delta_hist_N{N}.pdf")
    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png, dpi=300)
    plt.savefig(out_pdf)
    plt.close()
    print(f"[OK] Wrote {out_png} (+pdf)")


# ---------- Read EEV/WS by N from a single summary.csv and plot Δ to methods -----

# Candidate column names in summary.csv, searched in order. Out-of-sample
# columns are preferred; the in-sample lists are the fallback.
EEV_COL_CANDS_OUT = [
    "EEV_out",
    "EEV_out_of_sample",
    "eev_out",
    "eev_oos",
]
EEV_COL_CANDS_IN = [
    "EEV_in",
    "EEV_in_sample",
    "eev_in",
    "eev_is",
    "EEV",
]
WS_COL_CANDS_OUT = [
    "WS_out",
    "WS_out_of_sample",
    "ws_out",
    "ws_oos",
]
WS_COL_CANDS_IN = [
    "WS_in",
    "WS_in_sample",
    "ws_in",
    "ws_is",
    "WS",
]
N_COL_CANDS = [
    "N",
    "n",
    "samples",
    "num_samples",
]

def _load_summary_eev_ws() -> Optional[pd.DataFrame]:
    """
    Load EEV/WS values by N from summary.csv.

    DIR_EEVWS is searched first, then the current directory. A file is
    skipped (with a warning) when it has no N column or neither an EEV nor a
    WS column. For each of EEV and WS the out-of-sample column is used when
    present, otherwise the in-sample one. The result has a numeric 'N' column
    plus 'EEV' and/or 'WS', sorted by N with unparseable N rows removed.
    Returns None when no usable file is found.
    """
    for folder in (DIR_EEVWS, Path(".")):
        summary_path = folder / "summary.csv"
        if not summary_path.exists():
            continue

        table = pd.read_csv(summary_path)
        n_col = _pick(table, N_COL_CANDS)
        if not n_col:
            print(f"[WARN] {summary_path} has no N column. Found: {list(table.columns)}")
            continue

        eev_col = _pick(table, EEV_COL_CANDS_OUT) or _pick(table, EEV_COL_CANDS_IN)
        ws_col = _pick(table, WS_COL_CANDS_OUT) or _pick(table, WS_COL_CANDS_IN)
        if not (eev_col or ws_col):
            print(f"[WARN] {summary_path} has neither EEV nor WS columns I recognize.")
            continue

        present = [name for name in (eev_col, ws_col) if name]
        renames = {n_col: "N", (eev_col or "EEV"): "EEV", (ws_col or "WS"): "WS"}
        tidy = table[[n_col] + present].rename(columns=renames).copy()
        tidy["N"] = pd.to_numeric(tidy["N"], errors="coerce")
        for name in ("EEV", "WS"):
            if name in tidy:
                tidy[name] = pd.to_numeric(tidy[name], errors="coerce")
        tidy = tidy.dropna(subset=["N"]).sort_values("N").reset_index(drop=True)
        print(f"[OK] Loaded EEV/WS summary from {summary_path}")
        return tidy

    print("[WARN] Could not find summary.csv for EEV/WS.")
    return None

def _profit_at_last_iteration(root: Path, N: int) -> Optional[float]:
    """
    Mean profit at the highest iteration number for sample count N.

    Per-path evaluation files ('*train_eval_perpath*', then '*test_perpath*')
    are tried first; the first one that yields any rows decides the result.
    Otherwise the first '*train_perstage*' file is used: the value at the last
    stage of each (iteration, path) pair is averaged per iteration. Returns
    None when no source provides data.
    """

    def last_iter_mean(frame: pd.DataFrame) -> Optional[float]:
        # `frame` already has 'iter' and 'profit' columns; rows with a missing
        # value in any column are discarded before averaging.
        numeric = frame.assign(
            iter=lambda d: pd.to_numeric(d["iter"], errors="coerce"),
            profit=lambda d: pd.to_numeric(d["profit"], errors="coerce"),
        )
        by_iter = numeric.dropna().groupby("iter")["profit"].mean().sort_index()
        if by_iter.empty:
            return None
        return float(by_iter.iloc[-1])

    # Preferred source: per-path evaluation files.
    for pattern in (f"*train_eval_perpath*N{N}.csv", f"*test_perpath*N{N}.csv"):
        for csv_path in sorted(root.glob(pattern)):
            table = pd.read_csv(csv_path)
            iter_col = _pick(table, ["iter", "iteration", "k"])
            value_col = _pick(table, ["obj", "objective", "profit", "value", "obj_total"])
            if not iter_col or not value_col:
                continue
            renamed = table[[iter_col, value_col]].rename(
                columns={iter_col: "iter", value_col: "profit"}
            )
            answer = last_iter_mean(renamed)
            if answer is not None:
                return answer

    # Fallback source: per-stage file, value taken at each path's last stage.
    perstage = sorted(root.glob(f"*train_perstage*N{N}.csv"))
    if not perstage:
        return None
    table = pd.read_csv(perstage[0])
    iter_col = _pick(table, ["iter", "iteration", "k"])
    path_col = _pick(table, ["path_id", "path", "sample", "id", "sample_id"])
    time_col = _pick(table, ["t", "stage", "Stage", "time", "hour", "k"])
    value_col = _pick(table, ["obj", "objective", "profit", "value", "obj_total", "obj_stage"])
    if not iter_col or not path_col or not time_col or not value_col:
        return None

    table[time_col] = pd.to_numeric(table[time_col], errors="coerce")
    table = table.dropna(subset=[time_col])
    keys = [iter_col, path_col, time_col]
    final_stage = table.groupby([iter_col, path_col], as_index=False)[time_col].max()
    joined = pd.merge(final_stage, table[keys + [value_col]], on=keys, how="left")
    return last_iter_mean(joined.rename(columns={iter_col: "iter", value_col: "profit"}))

def _first_valid_value(summ: pd.DataFrame, N: int, col: str) -> Optional[float]:
    """Return the first non-NaN `col` value among rows with summ['N'] == N, or None."""
    if col not in summ.columns:
        return None
    vals = summ.loc[summ["N"] == N, col].dropna()
    return None if vals.empty else float(vals.iloc[0])


def build_delta_vs_N_from_summary(plot_ws: bool=False):
    """
    Plot (EEV - method profit) against N for ADP, SDDP and ADP-UC.

    EEV/WS come from summary.csv; each method's profit is its mean profit at
    the last iteration for the same N. With `plot_ws=True` the (WS - method)
    differences are added as dashed lines and the output file name changes
    from delta_to_eev_vs_N to delta_to_eev_ws_vs_N. A difference is left as
    NaN when either side is unavailable for that N, including when the
    summary cell itself is blank; the previous version raised IndexError on
    a blank cell.
    """
    summ = _load_summary_eev_ws()
    if summ is None or summ.empty or "EEV" not in summ:
        print("[WARN] No usable EEV data in summary.csv; skipping Δ vs N plot.")
        return

    # (legend name, results directory, marker) -- plotted in this order.
    methods = [("ADP", DIR_ADP, "o"), ("SDDP", DIR_SDDP, "s"), ("ADP-UC", DIR_ADPUC, "^")]

    Ns = list(map(int, sorted(summ["N"].dropna().unique())))
    rows = []
    for N in Ns:
        eev = _first_valid_value(summ, N, "EEV")
        ws = _first_valid_value(summ, N, "WS")
        row = {"N": N}
        for name, root, _marker in methods:
            profit = _profit_at_last_iteration(root, N)
            # NaN (not None) keeps every delta column float-typed for plotting.
            row[f"Δ(EEV−{name})"] = np.nan if (eev is None or profit is None) else eev - profit
            row[f"Δ(WS−{name})"] = np.nan if (ws is None or profit is None) else ws - profit
        rows.append(row)
    df = pd.DataFrame(rows).dropna(subset=["N"]).sort_values("N")
    if df.empty:
        print("[WARN] No overlapping N between summary.csv and method profits.")
        return

    plt.figure(figsize=(8, 4.8))
    for name, _root, marker in methods:
        col = f"Δ(EEV−{name})"
        if df[col].notna().any():
            plt.plot(df["N"], df[col], marker=marker, label=f"EEV − {name}")

    if plot_ws:
        for name, _root, marker in methods:
            col = f"Δ(WS−{name})"
            if df[col].notna().any():
                plt.plot(df["N"], df[col], linestyle="--", marker=marker, label=f"WS − {name}")

    plt.axhline(0.0, color="k", linewidth=0.8)
    plt.xlabel("Number of samples (N)")
    plt.ylabel("Profit difference ($)")
    plt.title("Difference to EEV by N" + (" (WS included)" if plot_ws else ""))
    plt.grid(True, linestyle=":", alpha=0.6)
    plt.legend()
    plt.tight_layout()
    out_png = Path("figures/delta_to_eev_vs_N.png" if not plot_ws else "figures/delta_to_eev_ws_vs_N.png")
    out_pdf = out_png.with_suffix(".pdf")
    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png, dpi=300)
    plt.savefig(out_pdf)
    plt.close()
    print(f"[OK] Wrote {out_png} (+pdf)")


# -------- drivers ----------
def build_all():
    """Build every figure: two-panel plots, std plots, runtime, history and Δ-vs-N."""
    methods = (
        (DIR_ADP, "ADP"),
        (DIR_SDDP, "SDDP"),
        (DIR_ADPUC, "ADP-UC"),
    )

    # last-five panels
    for root, label in methods:
        build_two_panel_for(root, label)

    # std of last five over time
    for root, label in methods:
        build_std_figures_for(root, label)

    # runtime & convergence-style plots
    build_runtime_vs_samples()
    plot_cum_abs_delta_from_history(1000)     # ADP, SDDP, ADP-UC
    build_delta_vs_N_from_summary(plot_ws=False)

# Script entry point: build all figures, then tell the user where they were written.
if __name__ == "__main__":
    build_all()
    print("Done. See the 'figures/' folder.")

