In [1]:
import os
# IMPORTANT: limit CPU thread explosions (helps stop Paperspace kernels from dying)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn


In [2]:
!pip -q install scikit-learn joblib

[0m

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.ensemble import RandomForestRegressor
from joblib import Parallel, delayed


In [4]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

DATA_PATH = Path("artifacts/pems_graph_dataset_strict.npz")
assert DATA_PATH.exists(), f"Missing: {DATA_PATH}"

data = np.load(DATA_PATH, allow_pickle=True)

X = data["X"].astype(np.float32)         # (T, N, F)
Y = data["Y"].astype(np.float32)         # (T, N)   raw flow
A = data["A"].astype(np.float32)         # (N, N)
stations = data["stations"]
timestamps = data["timestamps"]

train_starts = data["train_starts"].astype(np.int64)
val_starts   = data["val_starts"].astype(np.int64)
test_starts  = data["test_starts"].astype(np.int64)

IN_LEN  = int(np.array(data["in_len"]).item())
OUT_LEN = int(np.array(data["out_len"]).item())

flow_mean  = data["flow_mean"].astype(np.float32)   # (N,)
flow_std   = data["flow_std"].astype(np.float32)    # (N,)
speed_mean = data["speed_mean"].astype(np.float32)  # (N,)
speed_std  = data["speed_std"].astype(np.float32)   # (N,)

T, N, Fdim = X.shape
print("X:", X.shape, "(T,N,F)")
print("Y:", Y.shape, "(T,N)")
print("IN_LEN/OUT_LEN:", IN_LEN, OUT_LEN)
print("N stations:", N)
print("train/val/test starts:", len(train_starts), len(val_starts), len(test_starts))


Device: cuda
GPU: Quadro P5000
X: (2208, 1821, 6) (T,N,F)
Y: (2208, 1821) (T,N)
IN_LEN/OUT_LEN: 24 72
N stations: 1821
train/val/test starts: 1009 289 673


In [5]:
def time_encoding(dt_index: pd.DatetimeIndex) -> np.ndarray:
    hours = dt_index.hour.values
    dow   = dt_index.dayofweek.values
    hour_sin = np.sin(2*np.pi*hours/24.0)
    hour_cos = np.cos(2*np.pi*hours/24.0)
    dow_sin  = np.sin(2*np.pi*dow/7.0)
    dow_cos  = np.cos(2*np.pi*dow/7.0)
    return np.stack([hour_sin, hour_cos, dow_sin, dow_cos], axis=1).astype(np.float32)

TF_all = time_encoding(pd.to_datetime(timestamps))  # (T,4)
print("TF_all:", TF_all.shape)


TF_all: (2208, 4)


In [6]:
# Avoid divide-by-zero
flow_std  = np.maximum(flow_std,  1e-6).astype(np.float32)
speed_std = np.maximum(speed_std, 1e-6).astype(np.float32)

X_scaled = X.copy()
# assume channel0=flow, channel1=speed (your pipeline)
X_scaled[:, :, 0] = (X_scaled[:, :, 0] - flow_mean[None, :])  / flow_std[None, :]
X_scaled[:, :, 1] = (X_scaled[:, :, 1] - speed_mean[None, :]) / speed_std[None, :]

Y_scaled = (Y - flow_mean[None, :]) / flow_std[None, :]

print("Sanity (scaled) flow mean/std ~ 0/1 on TRAIN-ish slice:")
print("Y_scaled mean/std:", float(Y_scaled.mean()), float(Y_scaled.std()))


Sanity (scaled) flow mean/std ~ 0/1 on TRAIN-ish slice:
Y_scaled mean/std: -781.403564453125 30704.76953125


In [7]:
X_fnt = np.transpose(X_scaled, (2, 1, 0)).copy()  # (F, N, T)

class FastPemsWindowDataset(torch.utils.data.Dataset):
    def __init__(self, X_fnt, Y_scaled, TF_all, starts, in_len, out_len):
        self.X_fnt = X_fnt
        self.Y = Y_scaled
        self.TF = TF_all
        self.starts = np.asarray(starts, dtype=np.int64)
        self.in_len = int(in_len)
        self.out_len = int(out_len)

    def __len__(self):
        return len(self.starts)

    def __getitem__(self, i):
        s = int(self.starts[i])
        x = self.X_fnt[:, :, s:s+self.in_len]                    # (F,N,IN)
        y = self.Y[s+self.in_len:s+self.in_len+self.out_len, :]  # (OUT,N)
        tf = self.TF[s+self.in_len:s+self.in_len+self.out_len]   # (OUT,4)
        return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(tf)

train_ds = FastPemsWindowDataset(X_fnt, Y_scaled, TF_all, train_starts, IN_LEN, OUT_LEN)
val_ds   = FastPemsWindowDataset(X_fnt, Y_scaled, TF_all, val_starts,   IN_LEN, OUT_LEN)
test_ds  = FastPemsWindowDataset(X_fnt, Y_scaled, TF_all, test_starts,  IN_LEN, OUT_LEN)

BATCH_SIZE = 8
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0, pin_memory=False)
val_loader   = torch.utils.data.DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=False)
test_loader  = torch.utils.data.DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=False)

xb, yb, tfb = next(iter(train_loader))
print("Batch x:", xb.shape, "Batch y:", yb.shape, "Batch tf:", tfb.shape)


Batch x: torch.Size([8, 6, 1821, 24]) Batch y: torch.Size([8, 72, 1821]) Batch tf: torch.Size([8, 72, 4])


In [8]:
EVAL_HORIZONS = [12, 24, 48, 72]
H = len(EVAL_HORIZONS)

flow_mean_t = torch.tensor(flow_mean, dtype=torch.float32, device=DEVICE).view(1, 1, -1)
flow_std_t  = torch.tensor(flow_std,  dtype=torch.float32, device=DEVICE).view(1, 1, -1)

def print_metrics(title, metrics):
    print("\n" + title)
    for h in sorted(metrics.keys()):
        print(f"  {h:>3}h  MAE={metrics[h]['MAE']:.3f}  RMSE={metrics[h]['RMSE']:.3f}")

def avg_mae(metrics):
    return float(np.mean([metrics[h]["MAE"] for h in metrics]))

@torch.inference_mode()
def eval_horizons_fast(model, loader):
    model.eval()
    acc = {h: {"abs": 0.0, "sq": 0.0, "count": 0} for h in EVAL_HORIZONS}

    for xb, yb, tfb in tqdm(loader, desc="Eval", leave=False):
        xb  = xb.to(DEVICE, non_blocking=True)
        yb  = yb.to(DEVICE, non_blocking=True)
        tfb = tfb.to(DEVICE, non_blocking=True)

        pred = model(xb, tfb)  # scaled (B,OUT,N)

        pred_u = pred * flow_std_t + flow_mean_t
        true_u = yb   * flow_std_t + flow_mean_t

        for h in EVAL_HORIZONS:
            idx = h - 1
            err = pred_u[:, idx, :] - true_u[:, idx, :]
            acc[h]["abs"]   += float(err.abs().sum())
            acc[h]["sq"]    += float((err**2).sum())
            acc[h]["count"] += err.numel()

    metrics = {}
    for h in EVAL_HORIZONS:
        mae = acc[h]["abs"] / acc[h]["count"]
        rmse = (acc[h]["sq"] / acc[h]["count"]) ** 0.5
        metrics[h] = {"MAE": float(mae), "RMSE": float(rmse)}
    return metrics

def make_run_dir(model_name: str) -> Path:
    ts = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    run_dir = Path("artifacts/runs") / f"{ts}_{model_name}"
    run_dir.mkdir(parents=True, exist_ok=False)
    return run_dir

def save_metrics_files(run_dir: Path, split_name: str, metrics: dict):
    (run_dir / f"{split_name}_metrics.json").write_text(json.dumps(metrics, indent=2))
    rows = []
    for h in sorted(metrics.keys()):
        rows.append({"horizon": h, "MAE": metrics[h]["MAE"], "RMSE": metrics[h]["RMSE"]})
    pd.DataFrame(rows).to_csv(run_dir / f"{split_name}_metrics.csv", index=False)

def append_results_summary(model_name: str, run_dir: Path, test_metrics: dict):
    summary_path = Path("artifacts/results_summary.csv")
    row = {
        "timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model_name": model_name,
        "run_dir": str(run_dir),
    }
    for h in EVAL_HORIZONS:
        row[f"test_MAE_{h}h"] = test_metrics[h]["MAE"]
        row[f"test_RMSE_{h}h"] = test_metrics[h]["RMSE"]

    df_new = pd.DataFrame([row])
    if summary_path.exists():
        df_old = pd.read_csv(summary_path)
        df = pd.concat([df_old, df_new], ignore_index=True)
    else:
        df = df_new
    df.to_csv(summary_path, index=False)
    return summary_path

def save_preds_npz_and_csv_subset(
    run_dir: Path,
    pred_u: np.ndarray,   # (S,N,H)
    true_u: np.ndarray,   # (S,N,H)
    starts: np.ndarray,
    max_stations_csv: int = 300,
):
    # NPZ (full)
    np.savez_compressed(
        run_dir / "test_pred_true_selected_horizons.npz",
        pred=pred_u.astype(np.float32),
        true=true_u.astype(np.float32),
        horizons=np.array(EVAL_HORIZONS, dtype=np.int64),
        starts=starts.astype(np.int64),
        stations=stations,
        timestamps=timestamps
    )

    # CSV subset (manageable)
    K = min(max_stations_csv, N)
    keep = np.arange(K)

    frames = []
    for j, h in enumerate(EVAL_HORIZONS):
        idx = starts + IN_LEN + (h - 1)
        ts_h = pd.to_datetime(timestamps[idx])

        df_h = pd.DataFrame({
            "start_idx": np.repeat(starts, K),
            "timestamp": np.repeat(ts_h, K),
            "station": np.tile(np.array(stations)[keep], len(starts)),
            "horizon_h": h,
            "y_true": true_u[:, keep, j].reshape(-1),
            "y_pred": pred_u[:, keep, j].reshape(-1),
        })
        frames.append(df_h)

    df_out = pd.concat(frames, ignore_index=True)
    df_out.to_csv(run_dir / "test_pred_true_selected_horizons.csv", index=False)
    return run_dir / "test_pred_true_selected_horizons.csv"


## LSTM MODEL 

In [9]:
class LSTM_Baseline(nn.Module):
    def __init__(self, in_dim: int, out_len: int, hidden: int = 64, layers: int = 1, dropout: float = 0.1, tf_dim: int = 4):
        super().__init__()
        self.out_len = out_len
        self.hidden = hidden
        self.lstm = nn.LSTM(input_size=in_dim, hidden_size=hidden, num_layers=layers, dropout=(dropout if layers > 1 else 0.0), batch_first=True)
        self.head = nn.Linear(hidden + tf_dim, 1)

    def forward(self, x, tf):
        # x: (B,F,N,IN) -> (B,N,IN,F) -> (B*N, IN, F)
        B, F, Nn, INL = x.shape
        x_seq = x.permute(0, 2, 3, 1).contiguous().view(B * Nn, INL, F)

        out, (h, c) = self.lstm(x_seq)
        h_last = h[-1]  # (B*N, hidden)

        # tf: (B, OUT, 4) -> repeat per node -> (B*N, OUT, 4)
        tf_rep = tf.unsqueeze(1).expand(B, Nn, self.out_len, tf.shape[-1]).contiguous().view(B * Nn, self.out_len, tf.shape[-1])

        h_rep = h_last.unsqueeze(1).expand(B * Nn, self.out_len, self.hidden)
        z = torch.cat([h_rep, tf_rep], dim=-1)           # (B*N, OUT, hidden+4)
        y = self.head(z).squeeze(-1)                    # (B*N, OUT)
        y = y.view(B, Nn, self.out_len).permute(0, 2, 1) # (B, OUT, N)
        return y

def train_torch_and_save(model_name: str, model: nn.Module, epochs=40, lr=1e-3, weight_decay=1e-4, clip=5.0, patience=6, eval_every=2):
    run_dir = make_run_dir(model_name)
    print("Run dir:", run_dir)

    model = model.to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.SmoothL1Loss(beta=1.0)

    best = float("inf")
    bad = 0
    best_state = None
    history = []

    for epoch in range(1, epochs + 1):
        model.train()
        running = 0.0

        for xb, yb, tfb in tqdm(train_loader, desc=f"Train {epoch}/{epochs}", leave=False):
            xb = xb.to(DEVICE, non_blocking=True)
            yb = yb.to(DEVICE, non_blocking=True)
            tfb = tfb.to(DEVICE, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            pred = model(xb, tfb)
            loss = loss_fn(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
            running += float(loss.item())

        if epoch % eval_every == 0:
            val_m = eval_horizons_fast(model, val_loader)
            score = avg_mae(val_m)
            print(f"\nEpoch {epoch}: train_loss={running/len(train_loader):.6f}  val_avg_MAE={score:.3f}")
            print_metrics("VAL", val_m)

            history.append({"epoch": epoch, "train_loss": running/len(train_loader), "val_avg_MAE": score, **{f"val_MAE_{h}h": val_m[h]["MAE"] for h in EVAL_HORIZONS}})

            if score < best:
                best = score
                bad = 0
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            else:
                bad += 1
                if bad >= patience:
                    print(f"\nEarly stopping. Best val_avg_MAE={best:.3f}")
                    break

    # save history
    if len(history) > 0:
        pd.DataFrame(history).to_csv(run_dir / "history.csv", index=False)

    # load best + save checkpoint
    if best_state is not None:
        model.load_state_dict(best_state)
    torch.save(model.state_dict(), run_dir / "best.pt")

    # TEST
    print("\nEvaluating on TEST set...")
    test_m = eval_horizons_fast(model, test_loader)
    print_metrics(f"{model_name} â€” TEST", test_m)

    save_metrics_files(run_dir, "test", test_m)

    # Collect & save preds (selected horizons only)
    S = len(test_starts)
    pred_u = np.zeros((S, N, len(EVAL_HORIZONS)), dtype=np.float32)
    true_u = np.zeros((S, N, len(EVAL_HORIZONS)), dtype=np.float32)

    model.eval()
    pos = 0
    with torch.inference_mode():
        for xb, yb, tfb in tqdm(test_loader, desc="Collect preds", leave=False):
            bsz = xb.shape[0]
            xb  = xb.to(DEVICE)
            yb  = yb.to(DEVICE)
            tfb = tfb.to(DEVICE)

            pred = model(xb, tfb)  # scaled (B,OUT,N)
            pred_u_b = (pred * flow_std_t + flow_mean_t)  # (B,OUT,N)
            true_u_b = (yb   * flow_std_t + flow_mean_t)

            for j, h in enumerate(EVAL_HORIZONS):
                idx = h - 1
                pred_u[pos:pos+bsz, :, j] = pred_u_b[:, idx, :].detach().cpu().numpy()
                true_u[pos:pos+bsz, :, j] = true_u_b[:, idx, :].detach().cpu().numpy()

            pos += bsz

    csv_path = save_preds_npz_and_csv_subset(run_dir, pred_u, true_u, test_starts, max_stations_csv=300)
    summary_path = append_results_summary(model_name, run_dir, test_m)

    print("\nSaved run outputs to:", run_dir)
    print(" - best checkpoint:", run_dir / "best.pt")
    print(" - history:", run_dir / "history.csv")
    print(" - test metrics:", run_dir / "test_metrics.json")
    print(" - preds npz:", run_dir / "test_pred_true_selected_horizons.npz")
    print(" - preds csv:", csv_path)
    print(" - master summary:", summary_path)

    return run_dir


In [11]:
lstm_base = LSTM_Baseline(in_dim=Fdim, out_len=OUT_LEN, hidden=64, layers=1, dropout=0.1).to(DEVICE)
run_dir_lstm = train_torch_and_save("LSTM", lstm_base, epochs=40, patience=6, eval_every=2)
run_dir_lstm

Run dir: artifacts/runs/20260210_191832_LSTM


                                                   

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.69 GiB. GPU 0 has a total capacty of 15.88 GiB of which 341.69 MiB is free. Process 2300183 has 13.59 GiB memory in use. Process 2696991 has 1.96 GiB memory in use. Of the allocated memory 157.09 MiB is allocated by PyTorch, and 1.68 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
!nvidia-smi


Tue Feb 10 19:34:08 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro P5000                   On  |   00000000:00:05.0 Off |                  Off |
| 26%   33C    P8              6W /  180W |   15925MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                