# Systematic Retraining Notebook (PeMS D3 2024)

This notebook is a **clean, repeatable pipeline** to:

1. Load the already-preprocessed graph dataset (`.npz`)
2. Train multiple deep models with **identical splits + metrics**
3. Run controlled hyperparameter sweeps
4. Export a **single summary table + plots**

**Goal:** re-train and *improve* deep learning models (especially **GraphWaveNet–GRU–LSTM**) in a fair, leakage-free way.

---

## What you must set
- `DATASET_NPZ` path (e.g., `artifacts/pems_graph_dataset_strict.npz`)
- `OUT_DIR` for checkpoints + logs

> Tip: Start by running the *Repro* configs to match your previous results. Once reproducible, expand to the *Improve* configs.


In [1]:
# ============================================================
# 0) Imports + device
# ============================================================
import os, json, math, random, time
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

print("PyTorch:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


PyTorch: 2.1.1+cu121
Device: cuda


In [2]:
# ============================================================
# 1) Reproducibility utilities
# ============================================================
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic flags (can slow down; set False if needed for speed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)


In [3]:
# ============================================================
# 2) Paths + global settings (EDIT THIS)
# ============================================================
DATASET_NPZ = Path("artifacts/pems_graph_dataset_strict.npz")

OUT_DIR = Path("retrain_runs_v1")
OUT_DIR.mkdir(parents=True, exist_ok=True)

EVAL_HORIZONS = [12, 24, 48, 72]  # hours

DEFAULTS = dict(
    seed=42,
    batch_size=16,
    num_workers=0,
    lr=1e-3,
    weight_decay=1e-4,
    max_epochs=50,
    patience=8,
    grad_clip=1.0,
    loss="huber",       # "huber" or "mse"
    huber_delta=1.0,
    use_amp=True,
)

print("DATASET_NPZ:", DATASET_NPZ)
print("OUT_DIR:", OUT_DIR.resolve())


DATASET_NPZ: artifacts/pems_graph_dataset_strict.npz
OUT_DIR: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/retrain_runs_v1


In [4]:
# ============================================================
# 3) Load dataset (.npz) and build scaled arrays (matches your prior notebooks)
# ============================================================
assert DATASET_NPZ.exists(), f"Dataset file not found: {DATASET_NPZ}"

data = np.load(DATASET_NPZ, allow_pickle=True)

X = data["X"].astype(np.float32)          # (T, N, F)
Y = data["Y"].astype(np.float32)          # (T, N) or (T, N, 1)
A = data["A"].astype(np.float32)          # (N, N)
stations = data["stations"]
timestamps = pd.to_datetime(data["timestamps"])

train_starts = data["train_starts"].astype(np.int64)
val_starts   = data["val_starts"].astype(np.int64)
test_starts  = data["test_starts"].astype(np.int64)

IN_LEN = int(data["in_len"][0])
OUT_LEN = int(data["out_len"][0])

flow_mean = data["flow_mean"].astype(np.float32)    # (N,)
flow_std  = data["flow_std"].astype(np.float32)     # (N,)
speed_mean = data["speed_mean"].astype(np.float32)  # (N,)
speed_std  = data["speed_std"].astype(np.float32)   # (N,)

print("X:", X.shape, "Y:", Y.shape)
print("A:", A.shape, "stations:", stations.shape)
print("IN_LEN:", IN_LEN, "OUT_LEN:", OUT_LEN)
print("starts:", len(train_starts), len(val_starts), len(test_starts))

# Convert Y to (T,N)
if Y.ndim == 3 and Y.shape[-1] == 1:
    Y = Y[..., 0]
assert Y.ndim == 2, f"Unexpected Y shape: {Y.shape}"

# Scale targets (this is what your earlier notebook assumes when it 'unscales' for metrics)
Y_scaled = (Y - flow_mean[None, :]) / (flow_std[None, :] + 1e-6)

# Scale input features per-node: channel 0 = flow, channel 1 = speed
X_scaled = X.copy()
X_scaled[..., 0] = (X_scaled[..., 0] - flow_mean[None, :]) / (flow_std[None, :] + 1e-6)
X_scaled[..., 1] = (X_scaled[..., 1] - speed_mean[None, :]) / (speed_std[None, :] + 1e-6)

# Transpose for fast slicing: (T,N,F) -> (F,N,T)
X_fnt = np.transpose(X_scaled, (2, 1, 0)).copy()  # (F, N, T)
Y_nt  = np.transpose(Y_scaled, (1, 0)).copy()     # (N, T)

T_total = X.shape[0]
print("X_fnt:", X_fnt.shape, "Y_nt:", Y_nt.shape, "T_total:", T_total)

# Precompute time features for every timestamp
hour = timestamps.hour.values
dow  = timestamps.dayofweek.values
tf_all = np.stack([
    np.sin(2*np.pi*hour/24.0),
    np.cos(2*np.pi*hour/24.0),
    np.sin(2*np.pi*dow/7.0),
    np.cos(2*np.pi*dow/7.0),
], axis=1).astype(np.float32)  # (T,4)

# Horizon indices (0-based inside the OUT_LEN window)
HORIZON_IDXS = [h - 1 for h in EVAL_HORIZONS]
assert max(HORIZON_IDXS) < OUT_LEN, f"OUT_LEN={OUT_LEN} is too small for horizons {EVAL_HORIZONS}"

class PemsWindowDataset(Dataset):
    def __init__(self, starts: np.ndarray):
        self.starts = starts

    def __len__(self):
        return len(self.starts)

    def __getitem__(self, idx):
        t0 = int(self.starts[idx])

        # x: (F,N,IN)
        x = X_fnt[:, :, t0 : t0 + IN_LEN]

        # y: (OUT, N) from Y_nt (N,T)
        y = Y_nt[:, t0 + IN_LEN : t0 + IN_LEN + OUT_LEN].T

        # tf: (OUT, 4)
        tf = tf_all[t0 + IN_LEN : t0 + IN_LEN + OUT_LEN]

        return (
            torch.from_numpy(x),             # (F,N,IN)
            torch.from_numpy(tf),            # (OUT,4)
            torch.from_numpy(y),             # (OUT,N)
        )

def make_loader(starts, batch_size, shuffle):
    ds = PemsWindowDataset(starts)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle,
                      num_workers=DEFAULTS["num_workers"], pin_memory=True)

train_loader = make_loader(train_starts, DEFAULTS["batch_size"], shuffle=True)
val_loader   = make_loader(val_starts,   DEFAULTS["batch_size"], shuffle=False)
test_loader  = make_loader(test_starts,  DEFAULTS["batch_size"], shuffle=False)

print("Ready: loaders built.")


X: (2208, 1821, 6) Y: (2208, 1821)
A: (1821, 1821) stations: (1821,)
IN_LEN: 24 OUT_LEN: 72
starts: 1009 289 673
X_fnt: (6, 1821, 2208) Y_nt: (1821, 2208) T_total: 2208
Ready: loaders built.


In [5]:
# ============================================================
# 4) Metrics (reported in ORIGINAL units: vehicles/hour)
# ============================================================
FLOW_MEAN_T = torch.from_numpy(flow_mean).float().view(1, 1, -1).to(device)  # (1,1,N)
FLOW_STD_T  = torch.from_numpy(flow_std).float().view(1, 1, -1).to(device)   # (1,1,N)

def unscale_flow(y_scaled: torch.Tensor) -> torch.Tensor:
    # y_scaled: (B, OUT, N)
    return y_scaled * (FLOW_STD_T + 1e-6) + FLOW_MEAN_T

@torch.no_grad()
def compute_metrics(y_true_scaled: torch.Tensor, y_pred_scaled: torch.Tensor, horizon_idxs):
    '''
    Inputs are SCALED (normalized) flow.
    Output metrics are in ORIGINAL units (vehicles/hour).
    '''
    y_true = unscale_flow(y_true_scaled)
    y_pred = unscale_flow(y_pred_scaled)

    out = {}
    for h, hi in zip(EVAL_HORIZONS, horizon_idxs):
        err = y_pred[:, hi, :] - y_true[:, hi, :]
        mae = err.abs().mean().item()
        rmse = (err.pow(2).mean().sqrt()).item()
        out[f"MAE@{h}"] = mae
        out[f"RMSE@{h}"] = rmse
    out["MAE_avg"] = float(np.mean([out[f"MAE@{h}"] for h in EVAL_HORIZONS]))
    out["RMSE_avg"] = float(np.mean([out[f"RMSE@{h}"] for h in EVAL_HORIZONS]))
    return out


## 5) Model zoo

All models follow:

- `forward(x, tf) -> y_hat_scaled`  
  where `x` is `(B, F, N, IN_LEN)`, `tf` is `(B, OUT_LEN, 4)`, and `y_hat_scaled` is `(B, OUT_LEN, N)`.

Targets are **scaled** in training; we unscale only for reporting MAE/RMSE.


In [6]:
# ============================================================
# 5A) LSTM baseline (time-aware head)
# ============================================================
class LSTM_Baseline(nn.Module):
    def __init__(self, num_nodes: int, in_dim: int, in_len: int, out_len: int,
                 hidden_size: int = 128, num_layers: int = 1, dropout: float = 0.2, time_dim: int = 32):
        super().__init__()
        self.num_nodes = num_nodes
        self.in_dim = in_dim
        self.in_len = in_len
        self.out_len = out_len

        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.time_mlp = nn.Sequential(
            nn.Linear(4, time_dim), nn.ReLU(), nn.Linear(time_dim, hidden_size)
        )
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, x, tf):
        # x: (B,F,N,IN)
        B, F, N, T = x.shape
        x_seq = x.permute(0, 2, 3, 1).reshape(B * N, T, F)  # (B*N, IN, F)

        h_seq, _ = self.lstm(x_seq)         # (B*N, IN, H)
        h_last = h_seq[:, -1, :]            # (B*N, H)

        tf_flat = tf.reshape(B * self.out_len, 4)                # (B*OUT, 4)
        te = self.time_mlp(tf_flat).view(B, self.out_len, -1)    # (B, OUT, H)

        h_last = h_last.view(B, N, -1)                           # (B, N, H)
        h_rep = h_last.unsqueeze(1).expand(-1, self.out_len, -1, -1)  # (B, OUT, N, H)

        z = h_rep + te.unsqueeze(2)                              # (B, OUT, N, H)
        y_hat = self.out(z).squeeze(-1)                          # (B, OUT, N)  (scaled)
        return y_hat


In [7]:
# ============================================================
# 5B) GraphWaveNet encoder + optional RNN refinement
#     (supports sparse + optional adaptive dense adjacency)
# ============================================================
class NConvMixed(nn.Module):
    # Graph multiplication supporting sparse OR dense adjacency.
    def forward(self, x, A):
        # x: (B,C,N,T), A: sparse (N,N) or dense (N,N)
        if A.is_sparse:
            B, C, N, T = x.shape
            x_r = x.permute(2, 0, 1, 3).reshape(N, -1)      # (N, B*C*T)
            x_r = torch.sparse.mm(A, x_r)                   # (N, B*C*T)
            return x_r.reshape(N, B, C, T).permute(1, 2, 0, 3)
        else:
            return torch.einsum("bcjt,ij->bcit", x, A)

class DiffusionGraphConvMixed(nn.Module):
    def __init__(self, c_in, c_out, supports, order=1, dropout=0.0):
        super().__init__()
        self.nconv = NConvMixed()
        self.supports = supports
        self.order = order
        self.dropout = dropout
        c_total = c_in * (1 + len(supports) * order)
        self.mlp = nn.Conv2d(c_total, c_out, kernel_size=(1, 1))

    def forward(self, x):
        out = [x]
        for A in self.supports:
            x1 = self.nconv(x, A)
            out.append(x1)
            for _ in range(2, self.order + 1):
                x1 = self.nconv(x1, A)
                out.append(x1)
        h = torch.cat(out, dim=1)
        h = self.mlp(h)
        return F.dropout(h, p=self.dropout, training=self.training)

class GWNLayer(nn.Module):
    def __init__(self, residual_channels, dilation_channels, skip_channels,
                 kernel_size, dilation, supports, dropout=0.0, order=1):
        super().__init__()
        self.filter_conv = nn.Conv2d(residual_channels, dilation_channels, (1, kernel_size),
                                     dilation=(1, dilation))
        self.gate_conv   = nn.Conv2d(residual_channels, dilation_channels, (1, kernel_size),
                                     dilation=(1, dilation))
        self.dropout = dropout
        self.gconv = DiffusionGraphConvMixed(dilation_channels, residual_channels, supports, order=order, dropout=dropout)
        self.res_conv  = nn.Conv2d(residual_channels, residual_channels, kernel_size=(1,1))
        self.skip_conv = nn.Conv2d(residual_channels, skip_channels,   kernel_size=(1,1))
        self.bn = nn.BatchNorm2d(residual_channels)

    def forward(self, x):
        filt = torch.tanh(self.filter_conv(x))
        gate = torch.sigmoid(self.gate_conv(x))
        x_t  = filt * gate

        skip = self.skip_conv(x_t)
        x_g  = self.gconv(x_t)
        x_r  = self.res_conv(x_g)

        x = x_r + x[..., -x_r.shape[-1]:]
        x = self.bn(x)
        return x, skip

class GraphWaveNetEncoder(nn.Module):
    def __init__(self, num_nodes, in_dim, residual_channels=32, dilation_channels=32,
                 skip_channels=128, end_channels=256, kernel_size=2,
                 blocks=2, layers=4, dropout=0.1, order=1,
                 supports=None, adaptive_adj=False, adp_dim=10):
        super().__init__()
        self.num_nodes = num_nodes
        self.in_dim = in_dim
        self.residual_channels = residual_channels
        self.kernel_size = kernel_size
        self.blocks = blocks
        self.layers = layers
        self.order = order

        self.supports_static = supports or []
        self.adaptive_adj = adaptive_adj
        if adaptive_adj:
            self.nodevec1 = nn.Parameter(torch.randn(num_nodes, adp_dim))
            self.nodevec2 = nn.Parameter(torch.randn(adp_dim, num_nodes))

        self.start_conv = nn.Conv2d(in_dim, residual_channels, kernel_size=(1,1))
        self.gwn_layers = nn.ModuleList()

        receptive_field = 1
        for _ in range(blocks):
            dilation = 1
            for _ in range(layers):
                self.gwn_layers.append(
                    GWNLayer(residual_channels, dilation_channels, skip_channels,
                             kernel_size, dilation, supports=[], dropout=dropout, order=order)
                )
                receptive_field += (kernel_size - 1) * dilation
                dilation *= 2
        self.receptive_field = receptive_field

        self.end_conv_1 = nn.Conv2d(skip_channels, end_channels, kernel_size=(1,1))
        self.end_conv_2 = nn.Conv2d(end_channels, residual_channels, kernel_size=(1,1))

    def _build_supports(self):
        sups = list(self.supports_static)
        if self.adaptive_adj:
            adp = F.softmax(F.relu(self.nodevec1 @ self.nodevec2), dim=1)  # (N,N)
            sups.append(adp)
        return sups

    def forward(self, x):
        B, F, N, T = x.shape
        if T < self.receptive_field:
            x = F.pad(x, (self.receptive_field - T, 0, 0, 0))

        x = self.start_conv(x)

        supports = self._build_supports()
        skip_sum = 0.0

        layer_idx = 0
        for _ in range(self.blocks):
            dilation = 1
            for _ in range(self.layers):
                self.gwn_layers[layer_idx].gconv.supports = supports
                pad = (self.kernel_size - 1) * dilation
                x_in = F.pad(x, (pad, 0, 0, 0))
                x, skip = self.gwn_layers[layer_idx](x_in)
                skip_sum = skip_sum + skip
                layer_idx += 1
                dilation *= 2

        x = F.relu(skip_sum)
        x = F.relu(self.end_conv_1(x))
        x = self.end_conv_2(x)
        return x

class GraphWaveNetRNN(nn.Module):
    # GraphWaveNet encoder -> optional GRU -> optional LSTM -> time-aware horizon head.
    def __init__(self, num_nodes, in_dim, in_len, out_len, supports,
                 residual_channels=32, dilation_channels=32, skip_channels=128, end_channels=256,
                 kernel_size=2, blocks=2, layers=4, dropout=0.1, order=1,
                 adaptive_adj=False, adp_dim=10,
                 rnn_mode="gru_lstm", rnn_hidden=128, time_dim=32):
        super().__init__()
        self.num_nodes = num_nodes
        self.in_len = in_len
        self.out_len = out_len
        self.rnn_mode = rnn_mode

        self.encoder = GraphWaveNetEncoder(
            num_nodes=num_nodes, in_dim=in_dim,
            residual_channels=residual_channels, dilation_channels=dilation_channels,
            skip_channels=skip_channels, end_channels=end_channels,
            kernel_size=kernel_size, blocks=blocks, layers=layers,
            dropout=dropout, order=order, supports=supports,
            adaptive_adj=adaptive_adj, adp_dim=adp_dim
        )

        enc_dim = residual_channels
        if rnn_mode in ("gru", "gru_lstm"):
            self.gru = nn.GRU(enc_dim, rnn_hidden, batch_first=True)
            enc_dim = rnn_hidden
        else:
            self.gru = None

        if rnn_mode in ("lstm", "gru_lstm"):
            self.lstm = nn.LSTM(enc_dim, rnn_hidden, batch_first=True)
            enc_dim = rnn_hidden
        else:
            self.lstm = None

        self.time_mlp = nn.Sequential(
            nn.Linear(4, time_dim), nn.ReLU(), nn.Linear(time_dim, enc_dim)
        )
        self.out = nn.Linear(enc_dim, 1)

    def forward(self, x, tf):
        E = self.encoder(x)                              # (B, C, N, Tenc)
        B, C, N, T = E.shape
        E_seq = E.permute(0, 2, 3, 1).contiguous()        # (B, N, T, C)
        if T > self.in_len:
            E_seq = E_seq[:, :, -self.in_len:, :]

        seq = E_seq.reshape(B * N, E_seq.shape[2], E_seq.shape[3])

        if self.gru is not None:
            seq, _ = self.gru(seq)
        if self.lstm is not None:
            seq, _ = self.lstm(seq)

        h_last = seq[:, -1, :].view(B, N, -1)

        tf_flat = tf.reshape(B * self.out_len, 4)
        te = self.time_mlp(tf_flat).view(B, self.out_len, -1)

        h_rep = h_last.unsqueeze(1).expand(-1, self.out_len, -1, -1)
        z = h_rep + te.unsqueeze(2)
        y_hat = self.out(z).squeeze(-1)                  # scaled
        return y_hat


In [8]:
# ============================================================
# 5C) STGCN baseline (Chebyshev graph conv + temporal GLU)
# ============================================================
def nconv_sparse(x: torch.Tensor, A_sp: torch.Tensor) -> torch.Tensor:
    B, C, N, T = x.shape
    x_r = x.permute(2, 0, 1, 3).reshape(N, -1)
    x_r = torch.sparse.mm(A_sp, x_r)
    return x_r.reshape(N, B, C, T).permute(1, 2, 0, 3)

class TemporalConvGLU(nn.Module):
    def __init__(self, c_in: int, c_out: int, kt: int, dropout: float):
        super().__init__()
        self.kt = kt
        self.conv = nn.Conv2d(c_in, 2*c_out, kernel_size=(1, kt))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.pad(x, (self.kt - 1, 0, 0, 0))
        z = self.conv(x)
        a, b = z.chunk(2, dim=1)
        return self.dropout(a * torch.sigmoid(b))

class ChebGraphConv(nn.Module):
    def __init__(self, c_in: int, c_out: int, Ks: int, L_sp: torch.Tensor):
        super().__init__()
        assert Ks >= 1
        self.Ks = Ks
        self.L_sp = L_sp
        self.theta = nn.Conv2d(Ks * c_in, c_out, kernel_size=(1, 1))

    def forward(self, x):
        out = [x]
        if self.Ks > 1:
            x1 = nconv_sparse(x, self.L_sp)
            out.append(x1)
            for _ in range(2, self.Ks):
                x2 = 2 * nconv_sparse(out[-1], self.L_sp) - out[-2]
                out.append(x2)
        return self.theta(torch.cat(out, dim=1))

class STConvBlock(nn.Module):
    def __init__(self, c_in, c_t, c_s, c_out, kt, Ks, L_sp, dropout):
        super().__init__()
        self.temp1 = TemporalConvGLU(c_in,  c_t,  kt=kt, dropout=dropout)
        self.gconv = ChebGraphConv(c_t, c_s, Ks=Ks, L_sp=L_sp)
        self.temp2 = TemporalConvGLU(c_s,  c_out, kt=kt, dropout=dropout)
        self.res = nn.Conv2d(c_in, c_out, kernel_size=(1,1)) if c_in != c_out else None
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x_in = x
        x = self.temp1(x)
        x = F.relu(self.gconv(x))
        x = self.temp2(x)
        if self.res is not None:
            x_in = self.res(x_in)
        return self.dropout(F.relu(x + x_in))

def scaled_laplacian_sparse(A: np.ndarray) -> torch.Tensor:
    N = A.shape[0]
    A = A.astype(np.float32)
    D = np.diag(A.sum(axis=1) + 1e-6)
    L = D - A
    v = np.random.randn(N).astype(np.float32)
    for _ in range(50):
        v = L @ v
        v = v / (np.linalg.norm(v) + 1e-6)
    lam = float(v @ (L @ v))
    L_tilde = (2.0 / (lam + 1e-6)) * L - np.eye(N, dtype=np.float32)
    return torch.from_numpy(L_tilde).to_sparse()

class STGCN_MultiHorizon(nn.Module):
    def __init__(self, num_nodes, in_dim, out_len, L_sp,
                 kt=3, Ks=3, dropout=0.1, c_t=64, c_s=16, c_out=64, blocks=2):
        super().__init__()
        layers = []
        c_in = in_dim
        for _ in range(blocks):
            layers.append(STConvBlock(c_in=c_in, c_t=c_t, c_s=c_s, c_out=c_out,
                                      kt=kt, Ks=Ks, L_sp=L_sp, dropout=dropout))
            c_in = c_out
        self.blocks = nn.ModuleList(layers)
        self.head = nn.Conv1d(c_out, out_len, kernel_size=1)

    def forward(self, x, tf=None):
        h = x
        for blk in self.blocks:
            h = blk(h)
        h_last = h[:, :, :, -1]
        return self.head(h_last)  # (B, OUT, N) scaled


## 6) Training / validation / test runner (early stopping + checkpointing)

- Loss is computed in **scaled space** (stable training).
- Metrics are reported in **vehicles/hour** (unscaled).


In [9]:
# ============================================================
# 6) Training utilities
# ============================================================
def make_loss(loss_name: str, huber_delta: float = 1.0):
    if loss_name.lower() == "mse":
        return nn.MSELoss()
    if loss_name.lower() == "huber":
        return nn.HuberLoss(delta=huber_delta)
    raise ValueError(f"Unknown loss: {loss_name}")

def save_json(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)

@torch.no_grad()
def eval_epoch(model, loader, loss_fn, amp=False):
    model.eval()
    losses = []
    all_metrics = []
    for x, tf, y in loader:
        x = x.to(device, non_blocking=True).float()
        tf = tf.to(device, non_blocking=True).float()
        y = y.to(device, non_blocking=True).float()

        with torch.cuda.amp.autocast(enabled=(amp and device.type == "cuda")):
            y_hat = model(x, tf)              # (B, OUT, N) scaled
            loss = loss_fn(y_hat, y)
        losses.append(loss.item())
        all_metrics.append(compute_metrics(y, y_hat, HORIZON_IDXS))

    out = {k: float(np.mean([m[k] for m in all_metrics])) for k in all_metrics[0].keys()}
    out["loss"] = float(np.mean(losses))
    return out

def train_one_experiment(name: str, model: nn.Module, cfg: dict):
    run_dir = OUT_DIR / name
    run_dir.mkdir(parents=True, exist_ok=True)

    seed_everything(cfg["seed"])
    model = model.to(device)

    loss_fn = make_loss(cfg["loss"], cfg.get("huber_delta", 1.0))
    optim = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode="min", factor=0.5, patience=3)

    scaler = torch.cuda.amp.GradScaler(enabled=(cfg["use_amp"] and device.type == "cuda"))

    best_val = float("inf")
    best_path = run_dir / "best.pt"
    history = []
    patience_left = cfg["patience"]

    for epoch in range(1, cfg["max_epochs"] + 1):
        t0 = time.time()
        model.train()
        train_losses = []

        for x, tf, y in train_loader:
            x = x.to(device, non_blocking=True).float()
            tf = tf.to(device, non_blocking=True).float()
            y = y.to(device, non_blocking=True).float()

            optim.zero_grad(set_to_none=True)

            with torch.cuda.amp.autocast(enabled=(cfg["use_amp"] and device.type == "cuda")):
                y_hat = model(x, tf)
                loss = loss_fn(y_hat, y)

            scaler.scale(loss).backward()
            if cfg["grad_clip"] is not None:
                scaler.unscale_(optim)
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["grad_clip"])
            scaler.step(optim)
            scaler.update()

            train_losses.append(loss.item())

        val_out = eval_epoch(model, val_loader, loss_fn, amp=cfg["use_amp"])
        val_mae_avg = val_out["MAE_avg"]
        scheduler.step(val_mae_avg)

        epoch_out = dict(
            epoch=epoch,
            train_loss=float(np.mean(train_losses)),
            lr=float(optim.param_groups[0]["lr"]),
            **{f"val_{k}": v for k, v in val_out.items()}
        )
        history.append(epoch_out)

        dt = time.time() - t0
        print(f"[{name}] epoch {epoch:03d} | train_loss={epoch_out['train_loss']:.4f} | "
              f"val_MAE_avg={val_mae_avg:.2f} veh/h | lr={epoch_out['lr']:.2e} | {dt:.1f}s")

        if val_mae_avg < best_val - 1e-6:
            best_val = val_mae_avg
            patience_left = cfg["patience"]
            torch.save({"model": model.state_dict(), "cfg": cfg}, best_path)
        else:
            patience_left -= 1
            if patience_left <= 0:
                print(f"[{name}] Early stopping at epoch {epoch}. Best val MAE_avg={best_val:.2f}")
                break

    ckpt = torch.load(best_path, map_location=device)
    model.load_state_dict(ckpt["model"])
    test_out = eval_epoch(model, test_loader, loss_fn, amp=cfg["use_amp"])

    save_json({"cfg": cfg, "history": history, "best_val_MAE_avg": best_val, "test": test_out},
              run_dir / "results.json")

    return {"name": name, "best_val_MAE_avg": best_val, **{f"test_{k}": v for k, v in test_out.items()}}


## 7) Experiment definitions

This is the **one place** you edit to control what gets trained.

To strengthen the proposed model:
- `adaptive_adj=True` (learned adjacency like GraphWaveNet)
- bigger channels (residual 64, skip 256)
- lower LR (e.g., `5e-4`) + longer training
- if you change `IN_LEN`, rebuild the `.npz` dataset first


In [10]:
# ============================================================
# 7) Define experiments
# ============================================================
N = X.shape[1]
F_in = X.shape[2]

# Static supports (forward/backward random-walk)
D = np.diag(A.sum(axis=1) + 1e-6)
A_rw = np.linalg.solve(D, A)
A_rw_T = np.linalg.solve(D, A.T)

A_rw_sp   = torch.from_numpy(A_rw).to_sparse().to(device)
A_rw_T_sp = torch.from_numpy(A_rw_T).to_sparse().to(device)
supports_static = [A_rw_sp, A_rw_T_sp]

L_sp = scaled_laplacian_sparse(A).to(device)

EXPERIMENTS = [
    # -------------------------
    # Reproduce (sanity checks)
    # -------------------------
    dict(
        name="LSTM_repro",
        model=lambda: LSTM_Baseline(num_nodes=N, in_dim=2, in_len=IN_LEN, out_len=OUT_LEN,
                                    hidden_size=128, num_layers=1, dropout=0.2, time_dim=32),
        cfg=dict(DEFAULTS, lr=1e-3, max_epochs=30, patience=6),
    ),
    dict(
        name="STGCN_repro",
        model=lambda: STGCN_MultiHorizon(num_nodes=N, in_dim=2, out_len=OUT_LEN, L_sp=L_sp,
                                         kt=3, Ks=3, dropout=0.1, c_t=64, c_s=16, c_out=64, blocks=2),
        cfg=dict(DEFAULTS, lr=1e-3, max_epochs=30, patience=6),
    ),
    dict(
        name="GWN_repro_noRNN",
        model=lambda: GraphWaveNetRNN(num_nodes=N, in_dim=2, in_len=IN_LEN, out_len=OUT_LEN,
                                      supports=supports_static,
                                      residual_channels=32, dilation_channels=32, skip_channels=128, end_channels=256,
                                      blocks=2, layers=4, dropout=0.1, order=1,
                                      adaptive_adj=False, rnn_mode="none", rnn_hidden=128),
        cfg=dict(DEFAULTS, lr=1e-3, max_epochs=30, patience=6),
    ),

    # -------------------------
    # Improve (stronger settings)
    # -------------------------
    dict(
        name="GWN_GRU_LSTM_adaptive",
        model=lambda: GraphWaveNetRNN(num_nodes=N, in_dim=2, in_len=IN_LEN, out_len=OUT_LEN,
                                      supports=supports_static,
                                      residual_channels=64, dilation_channels=64, skip_channels=256, end_channels=512,
                                      blocks=3, layers=4, dropout=0.2, order=1,
                                      adaptive_adj=True, adp_dim=20,
                                      rnn_mode="gru_lstm", rnn_hidden=256),
        cfg=dict(DEFAULTS, lr=5e-4, max_epochs=60, patience=10, grad_clip=1.0, weight_decay=1e-4),
    ),
    dict(
        name="GWN_LSTM_adaptive",
        model=lambda: GraphWaveNetRNN(num_nodes=N, in_dim=2, in_len=IN_LEN, out_len=OUT_LEN,
                                      supports=supports_static,
                                      residual_channels=64, dilation_channels=64, skip_channels=256, end_channels=512,
                                      blocks=3, layers=4, dropout=0.2, order=1,
                                      adaptive_adj=True, adp_dim=20,
                                      rnn_mode="lstm", rnn_hidden=256),
        cfg=dict(DEFAULTS, lr=5e-4, max_epochs=60, patience=10),
    ),
]

print("Experiments:", [e["name"] for e in EXPERIMENTS])


Experiments: ['LSTM_repro', 'STGCN_repro', 'GWN_repro_noRNN', 'GWN_GRU_LSTM_adaptive', 'GWN_LSTM_adaptive']


In [11]:
# ============================================================
# 8) Run all experiments and create a summary table
# ============================================================
results = []
for exp in EXPERIMENTS:
    name = exp["name"]
    cfg = exp["cfg"]
    model = exp["model"]()
    print("\n" + "="*80)
    print("Running:", name)
    out = train_one_experiment(name, model, cfg)
    results.append(out)

df_results = pd.DataFrame(results).sort_values("test_MAE_avg")
display(df_results)

df_results.to_csv(OUT_DIR / "summary.csv", index=False)
print("Saved summary:", (OUT_DIR / "summary.csv").resolve())



Running: LSTM_repro


RuntimeError: shape '[3072, 1]' is invalid for input of size 1024

In [None]:
# ============================================================
# 9) Plot MAE/RMSE vs horizon for the best runs
# ============================================================
import matplotlib.pyplot as plt

topk = min(5, len(df_results))
df_top = df_results.head(topk).copy()

x = np.array(EVAL_HORIZONS)
for metric in ["MAE", "RMSE"]:
    plt.figure()
    for _, row in df_top.iterrows():
        y = [row[f"test_{metric}@{h}"] for h in EVAL_HORIZONS]
        plt.plot(x, y, marker="o", label=row["name"])
    plt.xlabel("Horizon (hours)")
    plt.ylabel(metric + " (vehicles/hour)")
    plt.title(f"Top-{topk} Test {metric} vs Horizon")
    plt.legend()
    plt.show()


## 10) To beat Random Forest (and keep the comparison fair)

If your Random Forest uses engineered covariates (lags/rolling stats/neighbors) but deep models only see (flow,speed),
RF can win because it has more information.

To give deep models a fair shot, rebuild the `.npz` so `X[..., :]` includes the engineered features too, then rerun.
