# Simulation Study: Controlled Spatio-Temporal Data Generation (PeMS-like)

## Goal
We generate synthetic spatio-temporal data with **known ground-truth dynamics** so we can evaluate
whether different forecasting models exploit:
- **Temporal dependence** (autoregressive structure + seasonality)
- **Spatial dependence** (graph diffusion / neighbor influence)

We keep the *graph topology* and *timestamps* from the PeMS-derived dataset to preserve realism
(number of stations, time indexing, and sampling frequency), but we replace the observed flow/speed
with simulated signals.

## Key idea
We introduce a spatial coupling coefficient **α ∈ [0, 1]**:
- α = 0: stations evolve independently (no spatial structure)
- α > 0: stations interact via the road-network adjacency matrix (graph diffusion)

A good graph-based model should therefore gain accuracy relative to purely temporal (non-graph) baselines as α increases.

## Outputs
This notebook produces `.npz` files compatible with our training pipeline:
- X: (T, N, 6) with [flow, speed, hour_sin, hour_cos, dow_sin, dow_cos]
- Y: (T, N) flow target
- A, stations, timestamps, train/val/test starts, in_len, out_len
- flow_mean/std and speed_mean/std: per-station statistics computed **using training indices only** (no leakage)


In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ----------------------------
# Project layout: every path is derived from the current working directory
# ----------------------------
BASE = Path(".").resolve()
ART = BASE.joinpath("artifacts")
DATA_PATH_REAL = ART.joinpath("pems_graph_dataset_strict.npz")  # existing dataset file (input)
SIM_DIR = ART.joinpath("sim_datasets")                          # simulated datasets (output)
SIM_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious immediately.
for _label, _value in (
    ("BASE:", BASE),
    ("DATA_PATH_REAL exists:", DATA_PATH_REAL.exists()),
    ("SIM_DIR:", SIM_DIR),
):
    print(_label, _value)

# ----------------------------
# Seeding
# ----------------------------
GLOBAL_SEED = 42
rng_global = np.random.default_rng(GLOBAL_SEED)


BASE: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience
DATA_PATH_REAL exists: True
SIM_DIR: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/artifacts/sim_datasets


In [2]:
assert DATA_PATH_REAL.exists(), f"Missing file: {DATA_PATH_REAL}"

# np.load on an .npz archive returns an NpzFile that keeps the file open and reads
# members on access; the context manager closes it once everything needed is copied out.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can execute
# arbitrary code. Only load trusted files; drop the flag if no member needs it.
with np.load(DATA_PATH_REAL, allow_pickle=True) as ds:
    A = ds["A"].astype(np.float32)              # (N,N)
    stations = ds["stations"]                   # (N,)
    timestamps = ds["timestamps"].astype("datetime64[ns]")  # (T,)

    # Window start positions for each split, reused unchanged for the simulated data.
    train_starts = ds["train_starts"].astype(np.int64)
    val_starts   = ds["val_starts"].astype(np.int64)
    test_starts  = ds["test_starts"].astype(np.int64)

    # Stored as arrays in the archive; unwrap to plain Python ints.
    IN_LEN  = int(np.array(ds["in_len"]).item())
    OUT_LEN = int(np.array(ds["out_len"]).item())

T = len(timestamps)
N = A.shape[0]

# Fail fast if the archive does not match the shapes the rest of the notebook assumes.
assert A.ndim == 2 and A.shape[0] == A.shape[1], f"A must be square, got {A.shape}"
assert len(stations) == N, f"stations has {len(stations)} entries but A has {N} rows"

print("A:", A.shape)
print("Timestamps:", T, "| range:", timestamps.min(), "->", timestamps.max())
print("Stations:", N)
print("IN_LEN:", IN_LEN, "OUT_LEN:", OUT_LEN)
print("train/val/test starts:", len(train_starts), len(val_starts), len(test_starts))


A: (1821, 1821)
Timestamps: 2208 | range: 2024-10-01T00:00:00.000000000 -> 2024-12-31T23:00:00.000000000
Stations: 1821
IN_LEN: 24 OUT_LEN: 72
train/val/test starts: 1009 289 673


In [3]:
def time_encoding(dt_index: pd.DatetimeIndex) -> np.ndarray:
    """Encode hour-of-day and day-of-week as sine/cosine pairs.

    Returns a float32 array of shape (T,4) whose columns are
    [hour_sin, hour_cos, dow_sin, dow_cos], with T = len(dt_index).
    """
    two_pi = 2 * np.pi
    hour_angle = two_pi * dt_index.hour.values / 24.0      # period: 24 hours
    dow_angle = two_pi * dt_index.dayofweek.values / 7.0   # period: 7 days

    columns = [
        np.sin(hour_angle),
        np.cos(hour_angle),
        np.sin(dow_angle),
        np.cos(dow_angle),
    ]
    return np.stack(columns, axis=1).astype(np.float32)

# Calendar features are computed once from the dataset timestamps and shared by all stations.
dt_idx = pd.to_datetime(timestamps)
TF = time_encoding(dt_idx)  # (T,4)

# Explicit invariants instead of relying on reading the printed min/max:
# one row per timestamp, four columns, and no NaN/inf (a NaT timestamp would yield NaN).
assert TF.shape == (len(timestamps), 4), f"unexpected TF shape {TF.shape}"
assert np.isfinite(TF).all(), "TF contains non-finite values; check timestamps for NaT"

print("TF:", TF.shape, "min/max:", float(TF.min()), float(TF.max()))


TF: (2208, 4) min/max: -1.0 1.0


In [4]:
# NOTE(review): this cell repeats the previous cell line for line. Running it only
# rebinds time_encoding, dt_idx and TF; it is a candidate for deletion.
def time_encoding(dt_index: pd.DatetimeIndex) -> np.ndarray:
    """Return (T,4): [hour_sin, hour_cos, dow_sin, dow_cos].

    Hour-of-day (period 24) and day-of-week (period 7) are each mapped to a
    sine/cosine pair; the stacked result is cast to float32.
    """
    hours = dt_index.hour.values
    dow   = dt_index.dayofweek.values
    hour_sin = np.sin(2*np.pi*hours/24.0)
    hour_cos = np.cos(2*np.pi*hours/24.0)
    dow_sin  = np.sin(2*np.pi*dow/7.0)
    dow_cos  = np.cos(2*np.pi*dow/7.0)
    # axis=1 puts the four features in columns, one row per timestamp
    return np.stack([hour_sin, hour_cos, dow_sin, dow_cos], axis=1).astype(np.float32)

# Build the features from the dataset timestamps loaded earlier in the notebook.
dt_idx = pd.to_datetime(timestamps)
TF = time_encoding(dt_idx)  # (T,4)

print("TF:", TF.shape, "min/max:", float(TF.min()), float(TF.max()))


TF: (2208, 4) min/max: -1.0 1.0


In [5]:
def make_random_walk_matrix(A: np.ndarray, add_self_loops: bool = True) -> np.ndarray:
    """
    Convert an adjacency matrix into a row-normalized (random-walk) matrix for diffusion.

    Steps: cast to float32, symmetrize with the element-wise maximum of A and A.T,
    optionally add 1 to every diagonal entry, then divide each row by its sum.
    A row whose sum is exactly zero is divided by 1 instead and therefore stays all-zero;
    every other row sums to 1 (approximately).

    Returns a new float32 matrix P; the input array is not modified.
    """
    adj = A.astype(np.float32)
    adj = np.maximum(adj, adj.T)  # undirected: keep the larger of the two directions

    if add_self_loops:
        adj = adj + np.eye(adj.shape[0], dtype=np.float32)

    degree = adj.sum(axis=1)
    safe_degree = np.where(degree == 0, 1.0, degree)  # avoid 0/0 for isolated nodes
    transition = adj / safe_degree[:, None]           # divide row i by its own sum
    return transition.astype(np.float32)

P = make_random_walk_matrix(A, add_self_loops=True)

# Check every row, not just the first five printed below: with self-loops added,
# each row is expected to sum to 1 up to float32 rounding.
row_sums = P.sum(axis=1)
assert P.shape == A.shape, f"P has shape {P.shape}, expected {A.shape}"
assert np.allclose(row_sums, 1.0, atol=1e-4), "P is not row-stochastic; inspect A for NaN or negative weights"

print("P:", P.shape)
print("Row sum check (first 5):", row_sums[:5])


P: (1821, 1821)
Row sum check (first 5): [1. 1. 1. 1. 1.]


In [7]:
def simulate_latent_process(
    P: np.ndarray,
    TF: np.ndarray,
    rho: float,
    alpha: float,
    noise_sigma: float,
    seed: int
) -> np.ndarray:
    """
    Roll out a latent state z_t ∈ R^N driven by persistence, graph diffusion,
    seasonality and Gaussian noise:

        z_{t+1} = tanh( rho * z_t + alpha * (P z_t) + s(t) + eps_t )

    s(t) mixes a daily and a weekly wave built from the four TF columns
    [hour_sin, hour_cos, dow_sin, dow_cos], each scaled by a random per-node amplitude.
    eps_t is N(0, noise_sigma^2) per node. The initial state z_0 is drawn from N(0, 1)
    and is not passed through tanh.

    Parameters
    ----------
    P : (N,N) diffusion matrix applied to the current state
    TF : (T,4) time features; T sets the number of simulated steps
    rho : weight on the node's own previous state
    alpha : weight on the diffused (neighbor) state
    noise_sigma : standard deviation of the per-step noise
    seed : seed for the local random generator

    Returns
    -------
    z : (T,N) float32
    """
    gen = np.random.default_rng(seed)
    n_steps, n_nodes = TF.shape[0], P.shape[0]

    # Draw order fixes reproducibility: daily amplitudes, weekly amplitudes,
    # initial state, then one noise vector per step.
    daily_amp = gen.uniform(0.6, 1.6, size=n_nodes).astype(np.float32)
    weekly_amp = gen.uniform(0.2, 0.9, size=n_nodes).astype(np.float32)

    # Unpacking requires TF to have exactly four columns.
    h_sin, h_cos, d_sin, d_cos = TF.T
    daily_wave = (h_sin + 0.5 * h_cos).astype(np.float32)
    weekly_wave = (d_sin + 0.5 * d_cos).astype(np.float32)

    states = np.zeros((n_steps, n_nodes), dtype=np.float32)
    states[0] = gen.normal(0, 1.0, size=n_nodes).astype(np.float32)

    for nxt in range(1, n_steps):
        cur = nxt - 1
        prev = states[cur]
        neighbor_pull = P @ prev  # (N,)
        drive = 0.6 * daily_wave[cur] * daily_amp + 0.4 * weekly_wave[cur] * weekly_amp
        shock = gen.normal(0, noise_sigma, size=n_nodes).astype(np.float32)

        # tanh keeps every subsequent state inside [-1, 1]
        states[nxt] = np.tanh(rho * prev + alpha * neighbor_pull + drive + shock)

    return states

def latent_to_flow_speed(z: np.ndarray, seed: int):
    """
    Turn a latent trajectory z of shape (T,N) into observable flow and speed series.

    flow  = max(base + gain * z, 0) with random per-node base and gain.
    speed = 70 - slope * flow + N(0, 2^2) noise, clipped to [0, 70], with a random
            per-node slope, so speed falls as flow rises.

    The generator is seeded with seed + 10_000. Returns (flow, speed), both (T,N) float32.
    """
    gen = np.random.default_rng(seed + 10_000)
    n_steps, n_nodes = z.shape

    # Per-node demand level and sensitivity to the latent state (drawn in this order).
    base = gen.uniform(120, 420, size=n_nodes).astype(np.float32)
    gain = gen.uniform(60, 160, size=n_nodes).astype(np.float32)
    flow = np.clip(base + gain * z, 0.0, None).astype(np.float32)  # (N,) broadcasts over T

    v_free = 70.0
    slope = gen.uniform(0.03, 0.08, size=n_nodes).astype(np.float32)  # speed lost per unit of flow
    jitter = gen.normal(0, 2.0, size=(n_steps, n_nodes)).astype(np.float32)
    speed = np.clip(v_free - flow * slope + jitter, 0.0, v_free).astype(np.float32)

    return flow, speed


In [8]:
# ----------------------------
# Single scenario to start with
# ----------------------------
SIM_CONFIG = {
    "name": "sim_pems_like",
    "seed": 123,
    "rho": 0.70,          # weight on each station's own previous state
    "alpha": 0.30,        # weight on the diffused neighbor state (the experimental knob)
    "noise_sigma": 0.05,  # std of the latent per-step noise
}

# Forward exactly the simulator's keyword arguments from the config.
_dynamics_keys = ("rho", "alpha", "noise_sigma", "seed")
z = simulate_latent_process(P=P, TF=TF, **{key: SIM_CONFIG[key] for key in _dynamics_keys})

flow, speed = latent_to_flow_speed(z, seed=SIM_CONFIG["seed"])

print("z:", z.shape, "flow:", flow.shape, "speed:", speed.shape)
for _label, _series in (("flow stats:", flow), ("speed stats:", speed)):
    print(
        _label,
        float(_series.mean()),
        float(_series.std()),
        "min/max:",
        float(_series.min()),
        float(_series.max()),
    )


z: (2208, 1821) flow: (2208, 1821) speed: (2208, 1821)
flow stats: 268.8896179199219 121.799072265625 min/max: 0.0 759.8511352539062
speed stats: 55.09693908691406 8.136998176574707 min/max: 16.539337158203125 70.0


In [9]:
# Feature tensor X: (T,N,6), channels [flow, speed, hour_sin, hour_cos, dow_sin, dow_cos]
X = np.empty((T, N, 6), dtype=np.float32)  # every channel is filled below
X[..., 0] = flow
X[..., 1] = speed

# The four time features are identical for all stations:
# (T,4) -> (T,1,4), broadcast across the N axis into channels 2..5.
X[..., 2:] = TF[:, None, :]

# Target Y: flow, shape (T,N), stored as an independent float32 copy
Y = flow.astype(np.float32)

print("X:", X.shape, "Y:", Y.shape)


X: (2208, 1821, 6) Y: (2208, 1821)


In [10]:
def compute_train_indices(train_starts, in_len, out_len, n_steps=None):
    """
    Build the sorted set of time indices touched by training samples (inputs + outputs),
    so scaling statistics can be computed from training data only.

    Each start s contributes the input window [s, s + in_len) and the output window
    [s + in_len, s + in_len + out_len). Indices outside [0, n_steps) are dropped.

    Parameters
    ----------
    train_starts : 1-D sequence of window start positions
    in_len : number of input steps per sample
    out_len : number of output steps per sample
    n_steps : total number of time steps in the series. If omitted, the notebook-level
        variable T is used, which keeps existing three-argument calls working;
        pass it explicitly to avoid depending on notebook state.

    Returns
    -------
    idx : 1-D int array of unique, ascending time indices
    """
    if n_steps is None:
        # Backward-compatible fallback to the notebook global (the original behavior).
        n_steps = T

    train_starts = np.asarray(train_starts, dtype=np.int64)

    # (num_samples, in_len) and (num_samples, out_len) grids of absolute time indices
    idx_in = train_starts[:, None] + np.arange(in_len)[None, :]
    idx_out = train_starts[:, None] + in_len + np.arange(out_len)[None, :]

    # Overlapping windows share many indices; np.unique removes duplicates and sorts.
    idx = np.unique(np.concatenate([idx_in.ravel(), idx_out.ravel()]))
    return idx[(idx >= 0) & (idx < n_steps)]

train_idx = compute_train_indices(train_starts, IN_LEN, OUT_LEN)
print("Unique training time indices:", len(train_idx), "of T=", T)

# Keep only the time steps that training windows touch.
flow_train = flow[train_idx]    # (T_train,N)
speed_train = speed[train_idx]


def _per_station_stats(series_train):
    """Return (mean, std) over the time axis as float32, with std floored at 1e-3."""
    mean = series_train.mean(axis=0).astype(np.float32)
    std = series_train.std(axis=0).astype(np.float32)
    return mean, np.maximum(std, 1e-3)  # the floor prevents division by a near-zero std


flow_mean, flow_std = _per_station_stats(flow_train)
speed_mean, speed_std = _per_station_stats(speed_train)

print("flow_mean/std:", flow_mean.shape, flow_std.shape)
print("speed_mean/std:", speed_mean.shape, speed_std.shape)


Unique training time indices: 1104 of T= 2208
flow_mean/std: (1821,) (1821,)
speed_mean/std: (1821,) (1821,)


In [12]:
# Output file stem encodes the scenario name, alpha (two decimals) and the seed.
sim_name = f"{SIM_CONFIG['name']}_alpha{SIM_CONFIG['alpha']:.2f}_seed{SIM_CONFIG['seed']}"
out_npz = SIM_DIR / f"{sim_name}.npz"
out_json = SIM_DIR / f"{sim_name}.json"

# Arrays written to the archive, in storage order. Graph, stations, timestamps and
# split definitions are the ones loaded from the real dataset; X, Y and the scaling
# statistics come from the simulation.
npz_payload = {
    "X": X,
    "Y": Y,
    "A": A,
    "stations": stations,
    "timestamps": timestamps,
    "train_starts": train_starts,
    "val_starts": val_starts,
    "test_starts": test_starts,
    "in_len": np.array(IN_LEN),
    "out_len": np.array(OUT_LEN),
    "flow_mean": flow_mean,
    "flow_std": flow_std,
    "speed_mean": speed_mean,
    "speed_std": speed_std,
}
np.savez_compressed(out_npz, **npz_payload)

# Store the generating parameters next to the data.
out_json.write_text(json.dumps(SIM_CONFIG, indent=2), encoding="utf-8")

print("Saved:", out_npz)
print("Saved config:", out_json)


Saved: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/artifacts/sim_datasets/sim_pems_like_alpha0.30_seed123.npz
Saved config: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/artifacts/sim_datasets/sim_pems_like_alpha0.30_seed123.json


In [13]:
# NOTE(review): this cell repeats the previous save cell line for line, so running it
# rewrites the same .npz and .json files with the same content. It is a candidate for deletion.
# Output file stem encodes the scenario name, alpha (two decimals) and the seed.
sim_name = f"{SIM_CONFIG['name']}_alpha{SIM_CONFIG['alpha']:.2f}_seed{SIM_CONFIG['seed']}"
out_npz = SIM_DIR / f"{sim_name}.npz"
out_json = SIM_DIR / f"{sim_name}.json"

# Write the simulated features/targets together with the graph, station ids, timestamps,
# split starts, window lengths and training-only scaling statistics.
np.savez_compressed(
    out_npz,
    X=X,
    Y=Y,
    A=A,
    stations=stations,
    timestamps=timestamps,
    train_starts=train_starts,
    val_starts=val_starts,
    test_starts=test_starts,
    in_len=np.array(IN_LEN),
    out_len=np.array(OUT_LEN),
    flow_mean=flow_mean,
    flow_std=flow_std,
    speed_mean=speed_mean,
    speed_std=speed_std,
)

# Store the generating parameters next to the data.
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(SIM_CONFIG, f, indent=2)

print("Saved:", out_npz)
print("Saved config:", out_json)


Saved: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/artifacts/sim_datasets/sim_pems_like_alpha0.30_seed123.npz
Saved config: /notebooks/Spatio-Temporal-Prediction-and-Coordination-of-EV-Charging-Demand-for-Power-System-Resilience/artifacts/sim_datasets/sim_pems_like_alpha0.30_seed123.json
