#  Spatio-Temporal Prediction and Coordination of EV Charging Demand for Power System Resilience

## Research Objectives

Recent studies have explored electric vehicles (EVs) from different perspectives, ranging from estimating vehicle range based on battery capacity, model specifications, and internal components (Ahmed et al., 2022) to forecasting charging behavior using machine learning methods such as Random Forest and SVM with factors like previous payment data, weather, and traffic (Shahriar et al., 2020). In parallel, research on smart cities has focused on managing traffic flow efficiently to reduce congestion and energy consumption (Dymora, Mazurek, & Jucha, 2024).

Building on these insights, this study links traffic dynamics with EV energy consumption to better predict when and where charging demand will arise. By integrating spatio-temporal traffic features with deep learning models, the goal is to anticipate EV charging needs in real time and enable coordinated charging strategies that support overall power system resilience.


## Load Required Libraries 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from scipy import stats
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import random
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

## Load and Clean the Data 

In [2]:
df1 = pd.read_csv("cleaned_traffic_data.csv")

## How the data looks directly from PEMS

In [3]:
df1.head()

Unnamed: 0,Timestamp,Station,District,Route,Direction of Travel,Lane Type,Station Length,Samples,% Observed,Total Flow,...,Lane 5 Avg Speed,Lane 6 Flow,Lane 6 Avg Occ,Lane 6 Avg Speed,Lane 7 Flow,Lane 7 Avg Occ,Lane 7 Avg Speed,Lane 8 Flow,Lane 8 Avg Occ,Lane 8 Avg Speed
0,10/01/2024 00:00:00,308512,3,50,W,ML,3.995,197,0,497.0,...,,,,,,,,,,
1,10/01/2024 00:00:00,311831,3,5,S,OR,,101,92,27.0,...,,,,,,,,,,
2,10/01/2024 00:00:00,311832,3,5,S,FR,,101,92,78.0,...,,,,,,,,,,
3,10/01/2024 00:00:00,311844,3,5,N,OR,,202,92,43.0,...,,,,,,,,,,
4,10/01/2024 00:00:00,311847,3,5,N,OR,,303,92,73.0,...,,,,,,,,,,


### We ignore and remove features that contain only NAN values, and maintain the other features.

In [4]:
# Define the final selected columns
selected_columns = [
    "Timestamp", "Station", "Route", "Direction of Travel",
    "Total Flow", "Avg Speed", "% Observed","Samples","Lane Type"
]

# Keep only the selected columns
df1 = df1[selected_columns]

In [5]:
df1

Unnamed: 0,Timestamp,Station,Route,Direction of Travel,Total Flow,Avg Speed,% Observed,Samples,Lane Type
0,10/01/2024 00:00:00,308512,50,W,497.0,64.1,0,197,ML
1,10/01/2024 00:00:00,311831,5,S,27.0,,92,101,OR
2,10/01/2024 00:00:00,311832,5,S,78.0,,92,101,FR
3,10/01/2024 00:00:00,311844,5,N,43.0,,92,202,OR
4,10/01/2024 00:00:00,311847,5,N,73.0,,92,303,OR
...,...,...,...,...,...,...,...,...,...
4114675,12/31/2024 23:00:00,3423094,99,S,68.0,64.8,96,118,ML
4114676,12/31/2024 23:00:00,3900021,50,E,803.0,66.5,67,292,ML
4114677,12/31/2024 23:00:00,3900022,50,E,509.0,68.0,0,0,HV
4114678,12/31/2024 23:00:00,3900023,50,W,881.0,67.4,67,289,ML


In [6]:
meta_data = pd.read_excel("pems_output.xlsx")

In [7]:
meta_data.head()

Unnamed: 0,Fwy,District,County,City,CA PM,Abs PM,Length,ID,Name,Lanes,Type,Sensor Type,HOV,MS ID,IRM
0,I5-N,3,Sacramento,,1.919,497.212,4.312,3413014,5NB at Twin Cities Rd,2,Mainline,,No,1,
1,I5-N,3,Sacramento,,2.026,497.319,,3413016,5NB to Twin Cities Rd,4,Off Ramp,,No,1,
2,I5-N,3,Sacramento,,9.498,504.791,3.291,317802,Hood Franklin Rd,2,Mainline,radars,No,1,
3,I5-N,3,Sacramento,,10.942,506.235,3.115,3013091,5NB at Elk Grove HOV,1,HOV,,24H,1,
4,I5-N,3,Sacramento,,11.08,506.373,,311844,Elk Grove Blvd 5NB Slip,2,On Ramp,others,No,1,


In [8]:
# nicer display for debugging
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)


In [9]:
def clean_columns(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
        .str.replace("%", "pct")
        .str.lower()
    )
    return df

df1 = clean_columns(df1)
df1.head()


Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type
0,10/01/2024 00:00:00,308512,50,W,497.0,64.1,0,197,ML
1,10/01/2024 00:00:00,311831,5,S,27.0,,92,101,OR
2,10/01/2024 00:00:00,311832,5,S,78.0,,92,101,FR
3,10/01/2024 00:00:00,311844,5,N,43.0,,92,202,OR
4,10/01/2024 00:00:00,311847,5,N,73.0,,92,303,OR


In [10]:
df1["timestamp"] = pd.to_datetime(df1["timestamp"], errors="coerce")
df1 = df1.sort_values(by=["station", "timestamp"]).reset_index(drop=True)


In [11]:
cols_to_drop = ["id", "name", "sensor_type", "irm", "ms_id"]

df1 = df1.drop(columns=[c for c in cols_to_drop if c in df1.columns])


In [12]:
useful_metadata = [
    "station", "fwy", "lane_type", "type", "lanes", 
    "hov", "abs_pm", "direction_of_travel", "route"
]

df1 = df1[[col for col in df1.columns if col not in ["id", "name", "irm", "sensor_type", "ms_id"]]]


In [13]:
# numeric
for col in ["fwy", "lanes", "abs_pm", "route"]:
    if col in df1.columns:
        df1[col] = pd.to_numeric(df1[col], errors="coerce")

# categorical
for col in ["lane_type", "type", "hov", "direction_of_travel"]:
    if col in df1.columns:
        df1[col] = df1[col].astype("category")


In [14]:
print(meta_data.columns)
meta_data.head()


Index(['Fwy', 'District', 'County', 'City', 'CA PM', 'Abs PM', 'Length', 'ID', 'Name', 'Lanes', 'Type', 'Sensor Type', 'HOV', 'MS ID', 'IRM'], dtype='object')


Unnamed: 0,Fwy,District,County,City,CA PM,Abs PM,Length,ID,Name,Lanes,Type,Sensor Type,HOV,MS ID,IRM
0,I5-N,3,Sacramento,,1.919,497.212,4.312,3413014,5NB at Twin Cities Rd,2,Mainline,,No,1,
1,I5-N,3,Sacramento,,2.026,497.319,,3413016,5NB to Twin Cities Rd,4,Off Ramp,,No,1,
2,I5-N,3,Sacramento,,9.498,504.791,3.291,317802,Hood Franklin Rd,2,Mainline,radars,No,1,
3,I5-N,3,Sacramento,,10.942,506.235,3.115,3013091,5NB at Elk Grove HOV,1,HOV,,24H,1,
4,I5-N,3,Sacramento,,11.08,506.373,,311844,Elk Grove Blvd 5NB Slip,2,On Ramp,others,No,1,


In [15]:
df1.head()

Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type
0,2024-10-01 01:00:00,308511,50,E,12.0,67.5,100,202,ML
1,2024-10-01 02:00:00,308511,50,E,12.0,67.0,100,197,ML
2,2024-10-01 03:00:00,308511,50,E,20.0,66.3,92,197,ML
3,2024-10-01 04:00:00,308511,50,E,55.0,67.4,100,197,ML
4,2024-10-01 05:00:00,308511,50,E,228.0,66.1,83,168,ML


In [16]:
print(df1['station'].nunique(), "unique station IDs in traffic")
print(meta_data['ID'].nunique(), "unique IDs in metadata")

shared = set(df1['station']).intersection(set(meta_data['ID']))
print("Shared IDs:", len(shared))


1896 unique station IDs in traffic
1861 unique IDs in metadata
Shared IDs: 1861


In [17]:
drop_cols = ['name', 'sensor_type', 'irm', 'ms_id', 'city', 'ca_pm', 'length']
meta_data = meta_data.drop(columns=drop_cols, errors="ignore")


In [18]:
df = df1.merge(
    meta_data,
    left_on='station',
    right_on='ID',
    how='left',
    validate='m:1'
)


In [19]:
def clean_columns(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
        .str.replace("%", "pct")
        .str.lower()
    )
    return df

df = clean_columns(df)
df.head()


Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type,fwy,district,county,city,ca_pm,abs_pm,length,id,name,lanes,type,sensor_type,hov,ms_id,irm
0,2024-10-01 01:00:00,308511,50,E,12.0,67.5,100,202,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
1,2024-10-01 02:00:00,308511,50,E,12.0,67.0,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
2,2024-10-01 03:00:00,308511,50,E,20.0,66.3,92,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
3,2024-10-01 04:00:00,308511,50,E,55.0,67.4,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
4,2024-10-01 05:00:00,308511,50,E,228.0,66.1,83,168,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,


In [20]:
df.head()

Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type,fwy,district,county,city,ca_pm,abs_pm,length,id,name,lanes,type,sensor_type,hov,ms_id,irm
0,2024-10-01 01:00:00,308511,50,E,12.0,67.5,100,202,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
1,2024-10-01 02:00:00,308511,50,E,12.0,67.0,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
2,2024-10-01 03:00:00,308511,50,E,20.0,66.3,92,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
3,2024-10-01 04:00:00,308511,50,E,55.0,67.4,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
4,2024-10-01 05:00:00,308511,50,E,228.0,66.1,83,168,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,


In [21]:
metadata_cols = ['fwy', 'district', 'county', 'abs_pm', 'lanes', 'type', 'hov']
df[metadata_cols].isna().mean() * 100


fwy         1.532537
district    1.532537
county      1.532537
abs_pm      1.532537
lanes       1.532537
type        1.532537
hov         1.532537
dtype: float64

In [22]:
required = ['fwy', 'abs_pm', 'lanes', 'type']
df = df.dropna(subset=required)

In [23]:
df.head()

Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type,fwy,district,county,city,ca_pm,abs_pm,length,id,name,lanes,type,sensor_type,hov,ms_id,irm
0,2024-10-01 01:00:00,308511,50,E,12.0,67.5,100,202,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
1,2024-10-01 02:00:00,308511,50,E,12.0,67.0,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
2,2024-10-01 03:00:00,308511,50,E,20.0,66.3,92,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
3,2024-10-01 04:00:00,308511,50,E,55.0,67.4,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,
4,2024-10-01 05:00:00,308511,50,E,228.0,66.1,83,168,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,


In [24]:
df = df.dropna(subset=['total_flow']).copy()

In [25]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour


In [26]:
train_end = pd.Timestamp("2024-11-15 23:59:59")
val_end = pd.Timestamp("2024-11-30 23:59:59")

df_train = df[df['timestamp'] <= train_end].copy()
df_val   = df[(df['timestamp'] > train_end) & (df['timestamp'] <= val_end)].copy()
df_test  = df[df['timestamp'] > val_end].copy()


In [27]:
print(df_train.shape, df_val.shape, df_test.shape)


(1876911, 25) (615305, 25) (1265940, 25)


In [28]:
df['station'].nunique()

1772

In [29]:
speed_lookup = (
    df_train
    .dropna(subset=['avg_speed'])
    .groupby(['lane_type', 'type', 'hour', 'fwy', 'district'])['avg_speed']
    .mean()
)


  .groupby(['lane_type', 'type', 'hour', 'fwy', 'district'])['avg_speed']


In [30]:
global_speed_mean = df_train['avg_speed'].mean()


In [31]:
def impute_speed_no_leakage(df, lookup, global_mean):
    df = df.sort_values(['station', 'timestamp']).copy()
    
    # 1. forward fill within each station
    df['avg_speed'] = df.groupby('station')['avg_speed'].ffill()
    
    # 2. group-average fill
    missing_mask = df['avg_speed'].isna()
    if missing_mask.any():
        df.loc[missing_mask, 'avg_speed'] = df[missing_mask].apply(
            lambda row: lookup.get(
                (row['lane_type'], row['type'], row['hour'], row['fwy'], row['district']),
                np.nan
            ),
            axis=1
        )
    
    # final fallback: global mean
    df['avg_speed'] = df['avg_speed'].fillna(global_mean)
    
    return df


In [32]:
df_train = impute_speed_no_leakage(df_train, speed_lookup, global_speed_mean)
df_val   = impute_speed_no_leakage(df_val, speed_lookup, global_speed_mean)
df_test  = impute_speed_no_leakage(df_test, speed_lookup, global_speed_mean)


In [33]:
df_train.isna().sum()

timestamp                    0
station                      0
route                        0
direction_of_travel          0
total_flow                   0
avg_speed                    0
pct_observed                 0
samples                      0
lane_type                    0
fwy                          0
district                     0
county                       0
city                   1087940
ca_pm                        0
abs_pm                       0
length                  613284
id                           0
name                         0
lanes                        0
type                         0
sensor_type            1060791
hov                          0
ms_id                        0
irm                    1876911
hour                         0
dtype: int64

In [34]:
df_test.isna().sum()

timestamp                    0
station                      0
route                        0
direction_of_travel          0
total_flow                   0
avg_speed                    0
pct_observed                 0
samples                      0
lane_type                    0
fwy                          0
district                     0
county                       0
city                    732485
ca_pm                        0
abs_pm                       0
length                  421501
id                           0
name                         0
lanes                        0
type                         0
sensor_type             709952
hov                          0
ms_id                        0
irm                    1265940
hour                         0
dtype: int64

In [35]:
df_val.isna().sum()

timestamp                   0
station                     0
route                       0
direction_of_travel         0
total_flow                  0
avg_speed                   0
pct_observed                0
samples                     0
lane_type                   0
fwy                         0
district                    0
county                      0
city                   356413
ca_pm                       0
abs_pm                      0
length                 206705
id                          0
name                        0
lanes                       0
type                        0
sensor_type            344101
hov                         0
ms_id                       0
irm                    615305
hour                        0
dtype: int64

In [36]:
def clean_columns(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
        .str.replace("%", "pct")
        .str.lower()
    )
    return df

df = clean_columns(df)
df.head()


Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type,fwy,district,county,city,ca_pm,abs_pm,length,id,name,lanes,type,sensor_type,hov,ms_id,irm,hour
0,2024-10-01 01:00:00,308511,50,E,12.0,67.5,100,202,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,1
1,2024-10-01 02:00:00,308511,50,E,12.0,67.0,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,2
2,2024-10-01 03:00:00,308511,50,E,20.0,66.3,92,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,3
3,2024-10-01 04:00:00,308511,50,E,55.0,67.4,100,197,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,4
4,2024-10-01 05:00:00,308511,50,E,228.0,66.1,83,168,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,5


In [37]:
meta_cols = ["fwy", "abs_pm", "lanes", "type", "lane_type"]
df[meta_cols].isna().mean() * 100


fwy          0.0
abs_pm       0.0
lanes        0.0
type         0.0
lane_type    0.0
dtype: float64

## Feature Engineering 

## Temporal Features 

In [38]:
def create_lags(df, target_col, lags):
    df = df.sort_values(['station', 'timestamp']).copy()
    
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = (
            df.groupby('station')[target_col].shift(lag)
        )
        
    return df


In [39]:
lags = [1, 2, 3, 6, 12, 24]

df_train = create_lags(df_train, 'total_flow', lags)
df_val   = create_lags(df_val,   'total_flow', lags)
df_test  = create_lags(df_test,  'total_flow', lags)


In [40]:
df_train = df_train.dropna(subset=[f'total_flow_lag_{l}' for l in lags])
df_val   = df_val.dropna(subset=[f'total_flow_lag_{l}' for l in lags])
df_test  = df_test.dropna(subset=[f'total_flow_lag_{l}' for l in lags])


In [41]:
def create_rolling_features(df):
    df = df.sort_values(['station', 'timestamp']).copy()

    # rolling over 24 hours = 24 time steps (hourly data)
    window = 24
    
    df['rolling_mean_24h'] = (
        df.groupby('station')['total_flow']
          .transform(lambda x: x.rolling(window, min_periods=12).mean())
    )
    
    df['rolling_std_24h'] = (
        df.groupby('station')['total_flow']
          .transform(lambda x: x.rolling(window, min_periods=12).std())
    )

    df['rolling_min_24h'] = (
        df.groupby('station')['total_flow']
          .transform(lambda x: x.rolling(window, min_periods=12).min())
    )
    
    df['rolling_max_24h'] = (
        df.groupby('station')['total_flow']
          .transform(lambda x: x.rolling(window, min_periods=12).max())
    )

    # coefficient of variation
    df['rolling_cv_24h'] = df['rolling_std_24h'] / (df['rolling_mean_24h'] + 1e-4)

    return df


In [42]:
df_train = create_rolling_features(df_train)
df_val   = create_rolling_features(df_val)
df_test  = create_rolling_features(df_test)


In [43]:
rolling_cols = [
    'rolling_mean_24h', 'rolling_std_24h', 'rolling_min_24h',
    'rolling_max_24h', 'rolling_cv_24h'
]

df_train = df_train.dropna(subset=rolling_cols)
df_val   = df_val.dropna(subset=rolling_cols)
df_test  = df_test.dropna(subset=rolling_cols)


In [44]:
def add_time_features(df):
    df = df.copy()
    
    df['hour'] = df['timestamp'].dt.hour
    df['dayofweek'] = df['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
    
    return df


In [45]:
df_train = add_time_features(df_train)
df_val   = add_time_features(df_val)
df_test  = add_time_features(df_test)


In [46]:
def add_cyclical_time(df):
    df = df.copy()
    
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    
    return df


In [47]:
df_train = add_cyclical_time(df_train)
df_val   = add_cyclical_time(df_val)
df_test  = add_cyclical_time(df_test)


In [48]:
def add_peak_flags(df):
    df = df.copy()
    
    df['is_peak_hour'] = df['hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    
    return df


In [49]:
df_train = add_peak_flags(df_train)
df_val   = add_peak_flags(df_val)
df_test  = add_peak_flags(df_test)


In [50]:
df_train.columns

Index(['timestamp', 'station', 'route', 'direction_of_travel', 'total_flow', 'avg_speed', 'pct_observed', 'samples', 'lane_type', 'fwy', 'district', 'county', 'city', 'ca_pm', 'abs_pm', 'length',
       'id', 'name', 'lanes', 'type', 'sensor_type', 'hov', 'ms_id', 'irm', 'hour', 'total_flow_lag_1', 'total_flow_lag_2', 'total_flow_lag_3', 'total_flow_lag_6', 'total_flow_lag_12',
       'total_flow_lag_24', 'rolling_mean_24h', 'rolling_std_24h', 'rolling_min_24h', 'rolling_max_24h', 'rolling_cv_24h', 'dayofweek', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'is_peak_hour',
       'is_weekend'],
      dtype='object')

## Spatial Features 

In [51]:
def add_station_order(df):
    df = df.copy()
    df['station_order'] = (
        df.sort_values(['fwy', 'abs_pm'])
          .groupby('fwy')
          .cumcount()
    )
    return df


In [52]:
df_train = add_station_order(df_train)
df_val   = add_station_order(df_val)
df_test  = add_station_order(df_test)


In [53]:
stations_sorted = (
    df_train[['station', 'fwy', 'abs_pm']]
    .drop_duplicates()
    .sort_values(['fwy', 'abs_pm'])
)

stations_sorted['upstream_station'] = stations_sorted.groupby('fwy')['station'].shift(1)
stations_sorted['downstream_station'] = stations_sorted.groupby('fwy')['station'].shift(-1)


In [54]:
df_train = df_train.merge(stations_sorted[['station','upstream_station','downstream_station']],
                          on='station', how='left')

df_val   = df_val.merge(stations_sorted[['station','upstream_station','downstream_station']],
                        on='station', how='left')

df_test  = df_test.merge(stations_sorted[['station','upstream_station','downstream_station']],
                         on='station', how='left')


In [55]:
def add_neighbor_lags(df, neighbor_col, lag):
    df = df.sort_values(['station', 'timestamp']).copy()
    
    # Map from station→lagged flow
    mapping = (
        df[['station', 'timestamp', f'total_flow_lag_{lag}']]
        .rename(columns={'station': 'neighbor_station',
                         f'total_flow_lag_{lag}': f'{neighbor_col}_flow_lag_{lag}'})
    )
    
    df = df.merge(mapping, 
                  left_on=['timestamp', neighbor_col], 
                  right_on=['timestamp', 'neighbor_station'], 
                  how='left')
    
    df = df.drop(columns=['neighbor_station'])
    
    return df


In [56]:
df_train = add_neighbor_lags(df_train, 'upstream_station', 1)
df_train = add_neighbor_lags(df_train, 'downstream_station', 1)

df_val   = add_neighbor_lags(df_val, 'upstream_station', 1)
df_val   = add_neighbor_lags(df_val, 'downstream_station', 1)

df_test  = add_neighbor_lags(df_test, 'upstream_station', 1)
df_test  = add_neighbor_lags(df_test, 'downstream_station', 1)


In [57]:
def normalize_pm(df):
    df = df.copy()
    df['abs_pm_norm'] = (
        df.groupby('fwy')['abs_pm']
          .transform(lambda x: (x - x.min()) / (x.max() - x.min() + 1e-6))
    )
    return df


In [58]:
df_train = normalize_pm(df_train)
df_val   = normalize_pm(df_val)
df_test  = normalize_pm(df_test)


In [59]:
df_train.head()

Unnamed: 0,timestamp,station,route,direction_of_travel,total_flow,avg_speed,pct_observed,samples,lane_type,fwy,district,county,city,ca_pm,abs_pm,length,id,name,lanes,type,sensor_type,hov,ms_id,irm,hour,total_flow_lag_1,total_flow_lag_2,total_flow_lag_3,total_flow_lag_6,total_flow_lag_12,total_flow_lag_24,rolling_mean_24h,rolling_std_24h,rolling_min_24h,rolling_max_24h,rolling_cv_24h,dayofweek,hour_sin,hour_cos,dow_sin,dow_cos,is_peak_hour,is_weekend,station_order,upstream_station,downstream_station,upstream_station_flow_lag_1,downstream_station_flow_lag_1,abs_pm_norm
0,2024-10-02 12:00:00,308511,50,E,572.0,65.5,46,204,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,12,628.0,596.0,531.0,395.0,183.0,308.0,399.0,173.193008,168.0,628.0,0.434068,2,1.224647e-16,-1.0,0.974928,-0.222521,0,0,217131,3086071.0,3086081.0,504.0,334.0,0.554877
1,2024-10-02 13:00:00,308511,50,E,580.0,65.5,46,191,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,13,572.0,628.0,596.0,486.0,181.0,320.0,412.923077,173.251966,168.0,628.0,0.419574,2,-0.258819,-0.965926,0.974928,-0.222521,0,0,217132,3086071.0,3086081.0,470.0,267.0,0.554877
2,2024-10-02 14:00:00,308511,50,E,572.0,65.4,50,209,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,14,580.0,572.0,628.0,461.0,168.0,357.0,424.285714,171.798822,168.0,628.0,0.404913,2,-0.5,-0.866025,0.974928,-0.222521,0,0,217133,3086071.0,3086081.0,454.0,332.0,0.554877
3,2024-10-02 15:00:00,308511,50,E,605.0,65.7,46,192,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,15,572.0,580.0,572.0,531.0,178.0,244.0,436.333333,171.999446,168.0,628.0,0.394193,2,-0.7071068,-0.707107,0.974928,-0.222521,0,0,217134,3086071.0,3086081.0,497.0,248.0,0.554877
4,2024-10-02 16:00:00,308511,50,E,539.0,65.8,50,194,ML,US50-E,3.0,El Dorado,,31.627,60.162,3.134,308511.0,Sly Park Rd,2.0,Mainline,,No,1.0,,16,605.0,572.0,580.0,596.0,229.0,196.0,442.75,168.13784,168.0,628.0,0.379758,2,-0.8660254,-0.5,0.974928,-0.222521,1,0,217135,3086071.0,3086081.0,657.0,251.0,0.554877


In [60]:
target_col = "total_flow"


In [61]:
feature_cols = [
    # Temporal numeric
    'avg_speed', 'hour', 'dayofweek',
    'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
    'is_peak_hour', 'is_weekend',

    # Lags
    'total_flow_lag_1', 'total_flow_lag_2', 'total_flow_lag_3',
    'total_flow_lag_6', 'total_flow_lag_12', 'total_flow_lag_24',

    # Rolling stats
    'rolling_mean_24h', 'rolling_std_24h',
    'rolling_min_24h', 'rolling_max_24h', 'rolling_cv_24h',

    # Spatial features
    'station_order', 'abs_pm_norm',

    # Spatial neighbor lag features
    'upstream_station_flow_lag_1',
    'downstream_station_flow_lag_1',

    # Metadata
    'route', 'lanes',
]


In [62]:
cat_cols = [
    'lane_type', 'type', 'direction_of_travel', 
    'hov', 'county', 'fwy', 'district'
]


In [63]:
df_train_encoded = pd.get_dummies(df_train, columns=cat_cols, drop_first=False)
df_val_encoded   = pd.get_dummies(df_val,   columns=cat_cols, drop_first=False)
df_test_encoded  = pd.get_dummies(df_test,  columns=cat_cols, drop_first=False)


In [64]:
df_val_encoded = df_val_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)
df_test_encoded = df_test_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)


In [65]:
drop_cols = [
    'timestamp', 'station', 'id',
    'upstream_station', 'downstream_station'
]

df_train_encoded = df_train_encoded.drop(columns=drop_cols, errors='ignore')
df_val_encoded   = df_val_encoded.drop(columns=drop_cols, errors='ignore')
df_test_encoded  = df_test_encoded.drop(columns=drop_cols, errors='ignore')


In [66]:
X_train = df_train_encoded.drop(columns=[target_col])
y_train = df_train_encoded[target_col]


In [67]:
X_val = df_val_encoded.drop(columns=[target_col])
y_val = df_val_encoded[target_col]


In [68]:
X_test = df_test_encoded.drop(columns=[target_col])
y_test = df_test_encoded[target_col]


In [69]:
print("X_train:", X_train.shape)
print("X_val:", X_val.shape)
print("X_test:", X_test.shape)

print("y_train:", y_train.shape)
print("y_val:", y_val.shape)
print("y_test:", y_test.shape)


X_train: (1815062, 106)
X_val: (555000, 106)
X_test: (1205541, 106)
y_train: (1815062,)
y_val: (555000,)
y_test: (1205541,)


In [70]:
# Drop the columns you don't want
cols_to_drop = ["ca_pm", "city", "name"]
X_train = X_train.drop(columns=cols_to_drop)
X_val   = X_val.drop(columns=cols_to_drop)
X_test  = X_test.drop(columns=cols_to_drop)

# Keep only numeric columns for scaling
numeric_cols = X_train.select_dtypes(include=["number"]).columns

X_train_num = X_train[numeric_cols]
X_val_num   = X_val[numeric_cols]
X_test_num  = X_test[numeric_cols]

print("Numeric columns used for scaling:")
print(numeric_cols)

#Scale only numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_val_scaled   = scaler.transform(X_val_num)
X_test_scaled  = scaler.transform(X_test_num)


Numeric columns used for scaling:
Index(['route', 'avg_speed', 'pct_observed', 'samples', 'abs_pm', 'length', 'lanes', 'ms_id', 'irm', 'hour', 'total_flow_lag_1', 'total_flow_lag_2', 'total_flow_lag_3', 'total_flow_lag_6',
       'total_flow_lag_12', 'total_flow_lag_24', 'rolling_mean_24h', 'rolling_std_24h', 'rolling_min_24h', 'rolling_max_24h', 'rolling_cv_24h', 'dayofweek', 'hour_sin', 'hour_cos', 'dow_sin',
       'dow_cos', 'is_peak_hour', 'is_weekend', 'station_order', 'upstream_station_flow_lag_1', 'downstream_station_flow_lag_1', 'abs_pm_norm'],
      dtype='object')


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


## Linear Regression Model (Elastic Net MIMO baseline)

In [104]:
import numpy as np
import pandas as pd

HORIZONS = [12, 24, 48, 72]
TARGET_COLS = [f"y_{h}" for h in HORIZONS]

def add_mimo_targets(df, target="total_flow"):
    df = df.sort_values(["station", "timestamp"]).copy()
    for h in HORIZONS:
        df[f"y_{h}"] = df.groupby("station")[target].shift(-h)
    return df

df_train_m = add_mimo_targets(df_train)
df_val_m   = add_mimo_targets(df_val)
df_test_m  = add_mimo_targets(df_test)

# Drop rows where any horizon is missing
df_train_m = df_train_m.dropna(subset=TARGET_COLS)
df_val_m   = df_val_m.dropna(subset=TARGET_COLS)
df_test_m  = df_test_m.dropna(subset=TARGET_COLS)


In [96]:
cat_cols = [
    "lane_type", "type", "direction_of_travel",
    "hov", "county", "fwy", "district"
]

df_train_enc = pd.get_dummies(df_train_m, columns=cat_cols, drop_first=False)
df_val_enc   = pd.get_dummies(df_val_m,   columns=cat_cols, drop_first=False)
df_test_enc  = pd.get_dummies(df_test_m,  columns=cat_cols, drop_first=False)

# Align columns: val/test → same columns as train
df_val_enc  = df_val_enc.reindex(columns=df_train_enc.columns, fill_value=0)
df_test_enc = df_test_enc.reindex(columns=df_train_enc.columns, fill_value=0)


In [97]:
# Columns we do NOT want as features
NON_FEATURE_COLS = [
    "timestamp", "station", "id",
    "upstream_station", "downstream_station",
    "total_flow",        # current target, not a predictor
] + TARGET_COLS         # these are our y's, not X

NON_FEATURE_COLS = [c for c in NON_FEATURE_COLS if c in df_train_enc.columns]

X_train = df_train_enc.drop(columns=NON_FEATURE_COLS)
X_val   = df_val_enc.drop(columns=NON_FEATURE_COLS)
X_test  = df_test_enc.drop(columns=NON_FEATURE_COLS)

y_train = df_train_enc[TARGET_COLS].values   # shape (n_samples, 4)
y_val   = df_val_enc[TARGET_COLS].values
y_test  = df_test_enc[TARGET_COLS].values

print("X_train:", X_train.shape)
print("X_val:  ", X_val.shape)
print("X_test: ", X_test.shape)
print("y_train:", y_train.shape)
print("y_val:  ", y_val.shape)
print("y_test: ", y_test.shape)


X_train: (1688253, 99)
X_val:   (430974, 99)
X_test:  (1081400, 99)
y_train: (1688253, 4)
y_val:   (430974, 4)
y_test:  (1081400, 4)


In [100]:

nan_cols = [c for c in X_train.columns if X_train[c].isna().any()]
print("Columns with NaNs in X_train:", nan_cols)

# Build masks for rows that are clean (no NaNs in those columns)
train_mask = ~X_train[nan_cols].isna().any(axis=1)
val_mask   = ~X_val[nan_cols].isna().any(axis=1)
test_mask  = ~X_test[nan_cols].isna().any(axis=1)

print("Dropping from train:", (~train_mask).sum())
print("Dropping from val:  ", (~val_mask).sum())
print("Dropping from test: ", (~test_mask).sum())

# Keep only rows with complete features
X_train = X_train[train_mask]
y_train = y_train[train_mask]

X_val   = X_val[val_mask]
y_val   = y_val[val_mask]

X_test  = X_test[test_mask]
y_test  = y_test[test_mask]


Columns with NaNs in X_train: ['upstream_station_flow_lag_1', 'downstream_station_flow_lag_1']
Dropping from train: 151190
Dropping from val:   31190
Dropping from test:  87582


In [101]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)


In [102]:
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error

alphas = [0.01, 0.1, 1.0]       # small but reasonable grid
l1_ratios = [0.1, 0.5, 0.9]

best_cfg = None
best_val_rmse_mean = np.inf
results = []

for alpha in alphas:
    for l1 in l1_ratios:
        model = MultiTaskElasticNet(
            alpha=alpha,
            l1_ratio=l1,
            max_iter=3000,   # keep reasonable to avoid days of training
            random_state=42
        )
        model.fit(X_train_s, y_train)

        y_val_pred = model.predict(X_val_s)

        mae_list = []
        rmse_list = []
        for i, h in enumerate(HORIZONS):
            mae = mean_absolute_error(y_val[:, i], y_val_pred[:, i])
            rmse = mean_squared_error(y_val[:, i], y_val_pred[:, i], squared=False)
            mae_list.append(mae)
            rmse_list.append(rmse)

        rmse_mean = np.mean(rmse_list)

        results.append({
            "alpha": alpha,
            "l1_ratio": l1,
            "val_rmse_mean": rmse_mean,
            "val_mae_12": mae_list[0],
            "val_mae_24": mae_list[1],
            "val_mae_48": mae_list[2],
            "val_mae_72": mae_list[3],
            "val_rmse_12": rmse_list[0],
            "val_rmse_24": rmse_list[1],
            "val_rmse_48": rmse_list[2],
            "val_rmse_72": rmse_list[3],
        })

        if rmse_mean < best_val_rmse_mean:
            best_val_rmse_mean = rmse_mean
            best_cfg = (alpha, l1)

print("Best config (alpha, l1_ratio):", best_cfg)

results_df = pd.DataFrame(results)
display(results_df.sort_values("val_rmse_mean"))


Best config (alpha, l1_ratio): (0.01, 0.9)


Unnamed: 0,alpha,l1_ratio,val_rmse_mean,val_mae_12,val_mae_24,val_mae_48,val_mae_72,val_rmse_12,val_rmse_24,val_rmse_48,val_rmse_72
2,0.01,0.9,375.962396,167.283193,196.127142,237.520304,241.767428,344.265177,353.783271,406.179786,399.62135
1,0.01,0.5,376.413373,169.091868,197.100274,238.219815,242.0274,343.874304,355.537014,406.745124,399.497051
0,0.01,0.1,376.938354,170.999468,198.249672,238.853579,242.390376,343.840061,357.252835,407.148724,399.511797
5,0.1,0.9,377.087988,171.423809,198.586852,238.990496,242.45082,343.864822,357.731468,407.237263,399.518396
4,0.1,0.5,381.772471,190.210511,208.482754,244.177641,246.459388,350.665178,367.482027,408.831227,400.111451
3,0.1,0.1,387.078708,205.757763,216.929586,249.170637,250.857471,361.155021,373.984071,411.049947,402.125792
8,1.0,0.9,388.42251,208.985002,218.812639,250.008252,251.593134,364.004852,375.538217,411.563087,402.583886
7,1.0,0.5,439.791949,294.045399,265.142417,283.645773,282.382751,460.442616,421.31266,443.956291,433.456228
6,1.0,0.1,479.377674,340.575086,291.373391,305.599596,302.784001,526.132005,455.949125,473.524885,461.904682


In [103]:
best_alpha, best_l1 = best_cfg

best_model = MultiTaskElasticNet(
    alpha=best_alpha,
    l1_ratio=best_l1,
    max_iter=3000,
    random_state=42
)
best_model.fit(X_train_s, y_train)

y_val_pred  = best_model.predict(X_val_s)
y_test_pred = best_model.predict(X_test_s)

from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_multioutput(y_true, y_pred, split_name):
    print(f"\n=== {split_name} performance ===")
    for i, h in enumerate(HORIZONS):
        mae = mean_absolute_error(y_true[:, i], y_pred[:, i])
        rmse = mean_squared_error(y_true[:, i], y_pred[:, i], squared=False)
        print(f"Horizon {h:>2}h:  MAE={mae:.3f},  RMSE={rmse:.3f}")

evaluate_multioutput(y_val,  y_val_pred,  "Validation")
evaluate_multioutput(y_test, y_test_pred, "Test")



=== Validation performance ===
Horizon 12h:  MAE=167.283,  RMSE=344.265
Horizon 24h:  MAE=196.127,  RMSE=353.783
Horizon 48h:  MAE=237.520,  RMSE=406.180
Horizon 72h:  MAE=241.767,  RMSE=399.621

=== Test performance ===
Horizon 12h:  MAE=167.259,  RMSE=323.810
Horizon 24h:  MAE=194.969,  RMSE=345.779
Horizon 48h:  MAE=226.868,  RMSE=386.246
Horizon 72h:  MAE=226.044,  RMSE=380.819


## Random Forest (Hyperparamter Tuning,MIMO)

In [105]:
import numpy as np

print("Any NaNs in X_train?", X_train.isna().any().any())
print("Any NaNs in X_val?  ", X_val.isna().any().any())
print("Any NaNs in X_test? ", X_test.isna().any().any())

print("Any NaNs in y_train?", np.isnan(y_train).any())
print("Any NaNs in y_val?  ", np.isnan(y_val).any())
print("Any NaNs in y_test? ", np.isnan(y_test).any())


Any NaNs in X_train? False
Any NaNs in X_val?   False
Any NaNs in X_test?  False
Any NaNs in y_train? False
Any NaNs in y_val?   False
Any NaNs in y_test?  False


In [106]:
import numpy as np

max_tune_samples = 300_000  # adjust if memory/time is tight

if len(X_train) > max_tune_samples:
    rng = np.random.default_rng(42)
    tune_idx = rng.choice(len(X_train), size=max_tune_samples, replace=False)
    
    X_train_tune = X_train.iloc[tune_idx]
    y_train_tune = y_train[tune_idx]
else:
    X_train_tune = X_train
    y_train_tune = y_train

print("Tuning on", X_train_tune.shape[0], "samples.")


Tuning on 300000 samples.


In [107]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd

HORIZONS = [12, 24, 48, 72]

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3],
}

best_cfg = None
best_val_rmse_mean = np.inf
results = []

for n_est in param_grid["n_estimators"]:
    for depth in param_grid["max_depth"]:
        for min_split in param_grid["min_samples_split"]:
            for min_leaf in param_grid["min_samples_leaf"]:
                
                rf = RandomForestRegressor(
                    n_estimators=n_est,
                    max_depth=depth,
                    min_samples_split=min_split,
                    min_samples_leaf=min_leaf,
                    n_jobs=-1,
                    random_state=42,
                )
                
                rf.fit(X_train_tune, y_train_tune)   # MIMO training
                
                # Predict on validation (full val, not subsample)
                y_val_pred = rf.predict(X_val)
                
                mae_list = []
                rmse_list = []
                for i, h in enumerate(HORIZONS):
                    mae = mean_absolute_error(y_val[:, i], y_val_pred[:, i])
                    rmse = mean_squared_error(y_val[:, i], y_val_pred[:, i], squared=False)
                    mae_list.append(mae)
                    rmse_list.append(rmse)
                
                rmse_mean = np.mean(rmse_list)
                
                results.append({
                    "n_estimators": n_est,
                    "max_depth": depth,
                    "min_samples_split": min_split,
                    "min_samples_leaf": min_leaf,
                    "val_rmse_mean": rmse_mean,
                    "val_mae_12": mae_list[0],
                    "val_mae_24": mae_list[1],
                    "val_mae_48": mae_list[2],
                    "val_mae_72": mae_list[3],
                    "val_rmse_12": rmse_list[0],
                    "val_rmse_24": rmse_list[1],
                    "val_rmse_48": rmse_list[2],
                    "val_rmse_72": rmse_list[3],
                })
                
                if rmse_mean < best_val_rmse_mean:
                    best_val_rmse_mean = rmse_mean
                    best_cfg = (n_est, depth, min_split, min_leaf)

print("Best RF config:", best_cfg)

results_df = pd.DataFrame(results)
display(results_df.sort_values("val_rmse_mean").head(10))


Best RF config: (100, None, 2, 3)


Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,val_rmse_mean,val_mae_12,val_mae_24,val_mae_48,val_mae_72,val_rmse_12,val_rmse_24,val_rmse_48,val_rmse_72
23,100,,5,3,282.983614,117.338824,127.960574,153.048205,160.171105,246.066365,266.758007,306.538847,312.571236
21,100,,2,3,282.983614,117.338824,127.960574,153.048205,160.171105,246.066365,266.758007,306.538847,312.571236
20,100,,2,1,283.039988,116.047731,127.669979,153.037767,160.507952,244.737546,266.92525,306.962873,313.534282
19,100,20.0,5,3,283.157123,118.211423,128.482174,153.455056,160.531777,246.378616,266.868051,306.685382,312.696442
17,100,20.0,2,3,283.157123,118.211423,128.482174,153.455056,160.531777,246.378616,266.868051,306.685382,312.696442
22,100,,5,1,283.251061,116.95012,127.989081,153.357308,160.837637,245.660602,266.900261,306.945856,313.497525
16,100,20.0,2,1,283.270967,117.621892,128.444136,153.642853,160.91451,245.31132,267.039325,307.125569,313.607655
9,50,,2,3,283.335284,117.813566,128.448546,153.332248,160.63484,246.42941,267.266171,306.727751,312.917805
11,50,,5,3,283.335284,117.813566,128.448546,153.332248,160.63484,246.42941,267.266171,306.727751,312.917805
18,100,20.0,5,1,283.474345,118.288078,128.720355,153.866312,161.162918,246.147202,267.113721,307.06605,313.570408


In [108]:
best_n_est, best_depth, best_min_split, best_min_leaf = best_cfg

rf_best = RandomForestRegressor(
    n_estimators=best_n_est,
    max_depth=best_depth,
    min_samples_split=best_min_split,
    min_samples_leaf=best_min_leaf,
    n_jobs=-1,
    random_state=42,
)

rf_best.fit(X_train, y_train)   # full train, multi-output


In [109]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_val_pred  = rf_best.predict(X_val)
y_test_pred = rf_best.predict(X_test)

def evaluate_multioutput(y_true, y_pred, split_name):
    print(f"\n=== {split_name} performance (Random Forest MIMO) ===")
    for i, h in enumerate(HORIZONS):
        mae  = mean_absolute_error(y_true[:, i], y_pred[:, i])
        rmse = mean_squared_error(y_true[:, i], y_pred[:, i], squared=False)
        print(f"Horizon {h:>2}h:  MAE={mae:.3f},  RMSE={rmse:.3f}")

evaluate_multioutput(y_val,  y_val_pred,  "Validation")
evaluate_multioutput(y_test, y_test_pred, "Test")



=== Validation performance (Random Forest MIMO) ===
Horizon 12h:  MAE=110.796,  RMSE=238.475
Horizon 24h:  MAE=122.136,  RMSE=261.175
Horizon 48h:  MAE=148.032,  RMSE=303.520
Horizon 72h:  MAE=156.794,  RMSE=312.984

=== Test performance (Random Forest MIMO) ===
Horizon 12h:  MAE=108.037,  RMSE=230.633
Horizon 24h:  MAE=109.674,  RMSE=238.836
Horizon 48h:  MAE=123.461,  RMSE=267.246
Horizon 72h:  MAE=131.515,  RMSE=279.490


## LSTM MODEL 

In [142]:
# ============================================
# 1. Imports and basic setup
# ============================================
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

import optuna

# Reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


# ============================================
# 2. Multi-horizon targets (MIMO) per station
# ============================================
HORIZONS = [12, 24, 48, 72]  # hours ahead
TARGET_COLS = [f"y_{h}" for h in HORIZONS]

def add_mimo_targets(df, target="total_flow"):
    """
    For each station, create future total_flow targets:
    y_12, y_24, y_48, y_72 hours ahead.
    """
    df = df.sort_values(["station", "timestamp"]).copy()
    for h in HORIZONS:
        df[f"y_{h}"] = df.groupby("station")[target].shift(-h)
    return df

df_train_m = add_mimo_targets(df_train)
df_val_m   = add_mimo_targets(df_val)
df_test_m  = add_mimo_targets(df_test)

# Drop rows that do not have all horizons available
df_train_m = df_train_m.dropna(subset=TARGET_COLS).copy()
df_val_m   = df_val_m.dropna(subset=TARGET_COLS).copy()
df_test_m  = df_test_m.dropna(subset=TARGET_COLS).copy()

print("Train rows (LSTM):", len(df_train_m))
print("Val rows   (LSTM):", len(df_val_m))
print("Test rows  (LSTM):", len(df_test_m))


# ============================================
# 3. Define numeric feature set for LSTM
# ============================================
numeric_cols = df_train_m.select_dtypes(include=[np.number]).columns.tolist()

exclude_cols = set(TARGET_COLS)
for col in ["station", "id", "ID", "MS ID", "MS_ID"]:
    if col in numeric_cols:
        exclude_cols.add(col)

SEQ_FEATURES = [c for c in numeric_cols if c not in exclude_cols]

print("Number of LSTM numeric features:", len(SEQ_FEATURES))
print("Example features:", SEQ_FEATURES[:20])


# ============================================
# 4. Handle NaNs/inf in features + Standardize (FIXED)
# ============================================
# 4.1 Replace inf with NaN (critical)
for d in [df_train_m, df_val_m, df_test_m]:
    d[SEQ_FEATURES] = d[SEQ_FEATURES].replace([np.inf, -np.inf], np.nan)

# 4.2 Compute train means (may contain NaN if a feature is all-NaN in train)
train_means = df_train_m[SEQ_FEATURES].mean()

# 4.3 Drop features that are completely NaN in train (cannot be imputed)
bad_all_nan = train_means.index[train_means.isna()].tolist()
if bad_all_nan:
    print("Dropping all-NaN features (train):", bad_all_nan[:20], "..." if len(bad_all_nan) > 20 else "")
    SEQ_FEATURES = [c for c in SEQ_FEATURES if c not in bad_all_nan]
    train_means = df_train_m[SEQ_FEATURES].mean()

# 4.4 Impute using train means
for d in [df_train_m, df_val_m, df_test_m]:
    d[SEQ_FEATURES] = d[SEQ_FEATURES].fillna(train_means)

# 4.5 Hard safety check before scaling
def assert_finite_df(name, d, cols):
    x = d[cols].to_numpy()
    if not np.isfinite(x).all():
        bad = np.argwhere(~np.isfinite(x))
        r, c = bad[0]
        raise ValueError(f"{name}: non-finite value in column '{cols[c]}' at row index {r}")

assert_finite_df("train features", df_train_m, SEQ_FEATURES)
assert_finite_df("val features",   df_val_m,   SEQ_FEATURES)
assert_finite_df("test features",  df_test_m,  SEQ_FEATURES)

# 4.6 Scale (fit on train only)
scaler_seq = StandardScaler()
scaler_seq.fit(df_train_m[SEQ_FEATURES])

for d in [df_train_m, df_val_m, df_test_m]:
    d[SEQ_FEATURES] = scaler_seq.transform(d[SEQ_FEATURES])


# ============================================
# 5. Sequence builder (per station, variable seq_len)
# ============================================
def build_sequences(df, seq_features, target_cols, seq_len=24):
    """
    df must contain: 'station', 'timestamp', seq_features, target_cols
    Returns:
        X: (n_samples, seq_len, n_features)
        y: (n_samples, n_targets)
    """
    df = df.sort_values(["station", "timestamp"]).copy()

    X_list = []
    Y_list = []

    for station_id, g in df.groupby("station"):
        g = g.reset_index(drop=True)

        feat_mat = g[seq_features].values   # (T, F)
        targ_mat = g[target_cols].values    # (T, H)

        T = len(g)
        if T < seq_len:
            continue

        for end_idx in range(seq_len - 1, T):
            # targets should be clean, but keep guard
            if np.isnan(targ_mat[end_idx]).any():
                continue

            start_idx = end_idx - seq_len + 1
            X_seq = feat_mat[start_idx:end_idx + 1]   # (seq_len, F)
            y_vec = targ_mat[end_idx]                 # (H,)

            X_list.append(X_seq)
            Y_list.append(y_vec)

    if not X_list:
        raise ValueError("No sequences created – adjust seq_len or check data.")

    X = np.stack(X_list, axis=0)
    y = np.stack(Y_list, axis=0)
    return X, y


# ============================================
# 6. Metrics helpers
# ============================================
def eval_loader(model, loader, device):
    model.eval()
    all_true = []
    all_pred = []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            y_hat = model(xb)
            all_true.append(yb.detach().cpu().numpy())
            all_pred.append(y_hat.detach().cpu().numpy())
    y_true = np.concatenate(all_true, axis=0)
    y_pred = np.concatenate(all_pred, axis=0)
    return y_true, y_pred

def compute_mae_rmse_by_horizon(y_true, y_pred, horizons):
    mae_list = []
    rmse_list = []
    for i, h in enumerate(horizons):
        mae = mean_absolute_error(y_true[:, i], y_pred[:, i])
        rmse = math.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i]))
        mae_list.append(mae)
        rmse_list.append(rmse)
    return mae_list, rmse_list


# ============================================
# 7. LSTM MIMO model (with optional FC head)
# ============================================
class LSTMMIMO(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size=128,
        num_layers=2,
        dropout=0.2,
        n_outputs=4,
        fc_hidden=None,
    ):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        self.use_fc_hidden = fc_hidden is not None
        if not self.use_fc_hidden:
            self.fc = nn.Linear(hidden_size, n_outputs)
        else:
            self.fc1 = nn.Linear(hidden_size, fc_hidden)
            self.act = nn.ReLU()
            self.fc2 = nn.Linear(fc_hidden, n_outputs)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)   # h_n: (num_layers, batch, hidden)
        last_hidden = h_n[-1]        # (batch, hidden)

        if not self.use_fc_hidden:
            return self.fc(last_hidden)
        else:
            z = self.fc1(last_hidden)
            z = self.act(z)
            return self.fc2(z)


# ============================================
# 8. Training one config (for Optuna) (FIXED: grad clip + NaN guard)
# ============================================
def train_lstm_one_config(
    X_train, y_train,
    X_val, y_val,
    input_size,
    n_outputs,
    hidden_size,
    num_layers,
    dropout,
    lr,
    weight_decay,
    batch_size,
    max_epochs=30,
    patience=5,
    fc_hidden=None,
):
    X_train_t = torch.from_numpy(X_train).float()
    y_train_t = torch.from_numpy(y_train).float()
    X_val_t   = torch.from_numpy(X_val).float()
    y_val_t   = torch.from_numpy(y_val).float()

    train_ds = TensorDataset(X_train_t, y_train_t)
    val_ds   = TensorDataset(X_val_t,   y_val_t)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  drop_last=False)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, drop_last=False)

    model = LSTMMIMO(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        n_outputs=n_outputs,
        fc_hidden=fc_hidden,
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_metric = float("inf")
    best_state = None
    epochs_no_improve = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        running_loss = 0.0
        n_batches = 0

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad(set_to_none=True)
            y_hat = model(xb)

            # NaN/inf guard (fail fast)
            if torch.isnan(y_hat).any() or torch.isinf(y_hat).any():
                raise ValueError("Model output became NaN/inf during training.")

            loss = criterion(y_hat, yb)
            loss.backward()

            # Gradient clipping (stabilizes LSTM training)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        train_loss = running_loss / max(n_batches, 1)

        # validation
        y_val_true, y_val_pred = eval_loader(model, val_loader, device)

        # Guard before sklearn metrics
        if (not np.isfinite(y_val_true).all()) or (not np.isfinite(y_val_pred).all()):
            raise ValueError("Non-finite values in y_val_true/y_val_pred before MAE/RMSE.")

        mae_list, _ = compute_mae_rmse_by_horizon(y_val_true, y_val_pred, HORIZONS)
        avg_val_mae = float(np.mean(mae_list))

        print(f"Epoch {epoch:02d} | Train MSE={train_loss:.4f} | Val avg MAE={avg_val_mae:.4f}")

        if avg_val_mae < best_val_metric - 1e-3:
            best_val_metric = avg_val_mae
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping after {epoch} epochs.")
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val_metric


# ============================================
# 9. Training final model (no early stopping) (FIXED: grad clip + NaN guard)
# ============================================
def train_lstm_final(
    X_train, y_train,
    input_size,
    n_outputs,
    hidden_size,
    num_layers,
    dropout,
    lr,
    weight_decay,
    batch_size,
    max_epochs=30,
    fc_hidden=None,
):
    X_train_t = torch.from_numpy(X_train).float()
    y_train_t = torch.from_numpy(y_train).float()
    train_ds = TensorDataset(X_train_t, y_train_t)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)

    model = LSTMMIMO(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        n_outputs=n_outputs,
        fc_hidden=fc_hidden,
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(1, max_epochs + 1):
        model.train()
        running_loss = 0.0
        n_batches = 0

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad(set_to_none=True)
            y_hat = model(xb)

            if torch.isnan(y_hat).any() or torch.isinf(y_hat).any():
                raise ValueError("Model output became NaN/inf during final training.")

            loss = criterion(y_hat, yb)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        train_loss = running_loss / max(n_batches, 1)
        print(f"[FINAL TRAIN] Epoch {epoch:02d} | Train MSE={train_loss:.4f}")

    return model


# ============================================
# 10. Optuna objective (hyperparameter search) (FIXED: safer LR + deprecations)
# ============================================
def assert_finite_array(name, a):
    a = np.asarray(a)
    if not np.isfinite(a).all():
        idx = np.argwhere(~np.isfinite(a))[0]
        raise ValueError(f"{name} has NaN/inf at index {tuple(idx)}")

def objective(trial):
    seq_len      = trial.suggest_categorical("seq_len", [24, 48])
    hidden_size  = trial.suggest_categorical("hidden_size", [64, 96, 128, 192])
    num_layers   = trial.suggest_int("num_layers", 1, 3)
    dropout      = trial.suggest_float("dropout", 0.0, 0.3)

    # safer ranges + no deprecation warnings
    lr           = trial.suggest_float("lr", 1e-4, 8e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 5e-4, log=True)

    batch_size   = trial.suggest_categorical("batch_size", [128, 256])

    use_fc_head  = trial.suggest_categorical("use_fc_head", [False, True])
    fc_hidden    = None
    if use_fc_head:
        fc_hidden = trial.suggest_categorical("fc_hidden", [64, 128, 256])

    # build sequences
    X_train_seq, y_train_seq = build_sequences(df_train_m, SEQ_FEATURES, TARGET_COLS, seq_len=seq_len)
    X_val_seq,   y_val_seq   = build_sequences(df_val_m,   SEQ_FEATURES, TARGET_COLS, seq_len=seq_len)

    # finiteness checks
    assert_finite_array("X_train_seq", X_train_seq)
    assert_finite_array("y_train_seq", y_train_seq)
    assert_finite_array("X_val_seq",   X_val_seq)
    assert_finite_array("y_val_seq",   y_val_seq)

    input_size = X_train_seq.shape[2]
    n_outputs  = y_train_seq.shape[1]

    try:
        _, best_val_mae = train_lstm_one_config(
            X_train_seq, y_train_seq,
            X_val_seq,   y_val_seq,
            input_size=input_size,
            n_outputs=n_outputs,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            lr=lr,
            weight_decay=weight_decay,
            batch_size=batch_size,
            max_epochs=30,
            patience=5,
            fc_hidden=fc_hidden,
        )
        return best_val_mae
    except ValueError as e:
        # prune NaN/inf configs instead of killing the whole study
        msg = str(e).lower()
        if ("nan" in msg) or ("inf" in msg) or ("non-finite" in msg):
            raise optuna.TrialPruned(str(e))
        raise


# ============================================
# 11. Run Optuna study
# ============================================
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

print("Number of finished trials:", len(study.trials))
print("Best validation avg MAE:", study.best_value)
print("Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")


# ============================================
# 12. Retrain best LSTM on train+val and evaluate on test
# ============================================
best_params = study.best_trial.params
print("\nBest params (for final LSTM):", best_params)

best_seq_len   = best_params["seq_len"]
best_hidden    = best_params["hidden_size"]
best_layers    = best_params["num_layers"]
best_dropout   = best_params["dropout"]
best_lr        = best_params["lr"]
best_wd        = best_params["weight_decay"]
best_batch     = best_params["batch_size"]
best_use_fc    = best_params["use_fc_head"]
best_fc_hidden = best_params.get("fc_hidden", None) if best_use_fc else None

# Combine train+val
df_trainval_m = pd.concat([df_train_m, df_val_m], axis=0)

X_trainval_seq, y_trainval_seq = build_sequences(df_trainval_m, SEQ_FEATURES, TARGET_COLS, seq_len=best_seq_len)
X_test_seq,     y_test_seq     = build_sequences(df_test_m,     SEQ_FEATURES, TARGET_COLS, seq_len=best_seq_len)

# Final sanity checks
assert_finite_array("X_trainval_seq", X_trainval_seq)
assert_finite_array("y_trainval_seq", y_trainval_seq)
assert_finite_array("X_test_seq",     X_test_seq)
assert_finite_array("y_test_seq",     y_test_seq)

print("Train+Val seq shape:", X_trainval_seq.shape)
print("Test seq shape:    ", X_test_seq.shape)

input_size = X_trainval_seq.shape[2]
n_outputs  = y_trainval_seq.shape[1]

best_lstm_model = train_lstm_final(
    X_trainval_seq, y_trainval_seq,
    input_size=input_size,
    n_outputs=n_outputs,
    hidden_size=best_hidden,
    num_layers=best_layers,
    dropout=best_dropout,
    lr=best_lr,
    weight_decay=best_wd,
    batch_size=best_batch,
    max_epochs=30,
    fc_hidden=best_fc_hidden,
)

# Evaluate on test
X_test_t = torch.from_numpy(X_test_seq).float()
y_test_t = torch.from_numpy(y_test_seq).float()
test_ds  = TensorDataset(X_test_t, y_test_t)
test_loader = DataLoader(test_ds, batch_size=best_batch, shuffle=False, drop_last=False)

y_test_true, y_test_pred = eval_loader(best_lstm_model, test_loader, device)

if (not np.isfinite(y_test_true).all()) or (not np.isfinite(y_test_pred).all()):
    raise ValueError("Non-finite values in y_test_true/y_test_pred before MAE/RMSE.")

mae_list, rmse_list = compute_mae_rmse_by_horizon(y_test_true, y_test_pred, HORIZONS)

print("\n=== Final Test performance (LSTM MIMO, Optuna-tuned, numeric features) ===")
for i, h in enumerate(HORIZONS):
    print(f"Horizon {h:>2}h: MAE={mae_list[i]:.3f}, RMSE={rmse_list[i]:.3f}")


Using device: cpu
Train rows (LSTM): 1688253
Val rows   (LSTM): 430974
Test rows  (LSTM): 1081400
Number of LSTM numeric features: 36
Example features: ['route', 'total_flow', 'avg_speed', 'pct_observed', 'samples', 'district', 'abs_pm', 'length', 'lanes', 'ms_id', 'irm', 'hour', 'total_flow_lag_1', 'total_flow_lag_2', 'total_flow_lag_3', 'total_flow_lag_6', 'total_flow_lag_12', 'total_flow_lag_24', 'rolling_mean_24h', 'rolling_std_24h']
Dropping all-NaN features (train): ['irm'] 


[I 2025-12-21 23:01:16,772] A new study created in memory with name: no-name-57a985a3-25d9-4a32-8cde-e15780b21769


Epoch 01 | Train MSE=888286.5131 | Val avg MAE=204.9874
Epoch 02 | Train MSE=81608.0009 | Val avg MAE=165.1434
Epoch 03 | Train MSE=61305.1449 | Val avg MAE=156.8264
Epoch 04 | Train MSE=47235.2947 | Val avg MAE=148.6420
Epoch 05 | Train MSE=41653.5490 | Val avg MAE=147.5267
Epoch 06 | Train MSE=39218.9999 | Val avg MAE=148.6104
Epoch 07 | Train MSE=36999.8900 | Val avg MAE=150.5392
Epoch 08 | Train MSE=34598.4405 | Val avg MAE=143.5492
Epoch 09 | Train MSE=32771.6470 | Val avg MAE=139.6030
Epoch 10 | Train MSE=31579.9796 | Val avg MAE=140.5489
Epoch 11 | Train MSE=30753.5867 | Val avg MAE=146.5336
Epoch 12 | Train MSE=29984.3764 | Val avg MAE=141.3441
Epoch 13 | Train MSE=29366.8188 | Val avg MAE=141.5092
Epoch 14 | Train MSE=28852.4461 | Val avg MAE=143.9686
Early stopping after 14 epochs.


[I 2025-12-22 01:19:04,001] Trial 0 finished with value: 139.60297012329102 and parameters: {'seq_len': 48, 'hidden_size': 128, 'num_layers': 1, 'dropout': 0.18279657183045964, 'lr': 0.00017243169962096453, 'weight_decay': 0.00034854923494569655, 'batch_size': 256, 'use_fc_head': True, 'fc_hidden': 256}. Best is trial 0 with value: 139.60297012329102.


Epoch 01 | Train MSE=2621705.7906 | Val avg MAE=789.4817
Epoch 02 | Train MSE=2151011.8758 | Val avg MAE=690.6048
Epoch 03 | Train MSE=1796077.1652 | Val avg MAE=622.9527
Epoch 04 | Train MSE=1510598.8991 | Val avg MAE=573.8016
Epoch 05 | Train MSE=1279173.4918 | Val avg MAE=532.5050
Epoch 06 | Train MSE=1092885.3186 | Val avg MAE=493.6100
Epoch 07 | Train MSE=937913.8016 | Val avg MAE=454.2796
Epoch 08 | Train MSE=805808.4096 | Val avg MAE=416.3296
Epoch 09 | Train MSE=690995.0705 | Val avg MAE=378.0546
Epoch 10 | Train MSE=590861.3039 | Val avg MAE=344.8239
Epoch 11 | Train MSE=504595.3047 | Val avg MAE=314.2296
Epoch 12 | Train MSE=431245.8452 | Val avg MAE=292.6281
Epoch 13 | Train MSE=369205.5969 | Val avg MAE=268.7428
Epoch 14 | Train MSE=317047.7433 | Val avg MAE=252.8503
Epoch 15 | Train MSE=273347.0962 | Val avg MAE=239.3214
Epoch 16 | Train MSE=236618.8761 | Val avg MAE=225.5005
Epoch 17 | Train MSE=205944.4810 | Val avg MAE=211.6714
Epoch 18 | Train MSE=180184.7148 | Val avg

[I 2025-12-22 02:55:49,144] Trial 1 finished with value: 160.37986755371094 and parameters: {'seq_len': 24, 'hidden_size': 64, 'num_layers': 1, 'dropout': 0.11877656731480904, 'lr': 0.0003776461966532008, 'weight_decay': 7.863453923578373e-06, 'batch_size': 128, 'use_fc_head': False}. Best is trial 0 with value: 139.60297012329102.


Epoch 01 | Train MSE=284033.4902 | Val avg MAE=157.1277
Epoch 02 | Train MSE=46351.6253 | Val avg MAE=145.4572
Epoch 03 | Train MSE=33912.8262 | Val avg MAE=141.9714
Epoch 04 | Train MSE=30040.7267 | Val avg MAE=135.2995
Epoch 05 | Train MSE=27664.3493 | Val avg MAE=137.1155
Epoch 06 | Train MSE=25949.5624 | Val avg MAE=138.3983
Epoch 07 | Train MSE=24658.5912 | Val avg MAE=137.2494
Epoch 08 | Train MSE=23609.4686 | Val avg MAE=138.7376
Epoch 09 | Train MSE=22743.8419 | Val avg MAE=136.8513
Early stopping after 9 epochs.


[I 2025-12-22 05:43:41,105] Trial 2 finished with value: 135.29945182800293 and parameters: {'seq_len': 24, 'hidden_size': 128, 'num_layers': 3, 'dropout': 0.1398739486757031, 'lr': 0.00047412555827233585, 'weight_decay': 3.008165097293078e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=2294259.2660 | Val avg MAE=651.5102
Epoch 02 | Train MSE=1482749.0246 | Val avg MAE=530.2490
Epoch 03 | Train MSE=1012463.3921 | Val avg MAE=433.4208
Epoch 04 | Train MSE=703793.5889 | Val avg MAE=354.5431
Epoch 05 | Train MSE=493297.5230 | Val avg MAE=297.7001
Epoch 06 | Train MSE=350895.0014 | Val avg MAE=254.7436
Epoch 07 | Train MSE=252490.8693 | Val avg MAE=221.1709
Epoch 08 | Train MSE=184623.7303 | Val avg MAE=199.9077
Epoch 09 | Train MSE=138032.2261 | Val avg MAE=184.2266
Epoch 10 | Train MSE=106814.7962 | Val avg MAE=176.2647
Epoch 11 | Train MSE=86542.7910 | Val avg MAE=168.1982
Epoch 12 | Train MSE=72576.1783 | Val avg MAE=163.6987
Epoch 13 | Train MSE=61934.2868 | Val avg MAE=157.8394
Epoch 14 | Train MSE=53730.3491 | Val avg MAE=156.1455
Epoch 15 | Train MSE=47711.7626 | Val avg MAE=153.4488
Epoch 16 | Train MSE=43304.0829 | Val avg MAE=148.5932
Epoch 17 | Train MSE=39939.0743 | Val avg MAE=151.4136
Epoch 18 | Train MSE=37324.0479 | Val avg MAE=148.48

[I 2025-12-22 13:26:11,537] Trial 3 finished with value: 141.0974521636963 and parameters: {'seq_len': 48, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.2835048252827458, 'lr': 0.0006307009205788019, 'weight_decay': 1.2062676008709296e-06, 'batch_size': 128, 'use_fc_head': False}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=260988.3778 | Val avg MAE=146.9226
Epoch 02 | Train MSE=39085.6565 | Val avg MAE=141.6802
Epoch 03 | Train MSE=31941.2732 | Val avg MAE=141.9841
Epoch 04 | Train MSE=29010.5577 | Val avg MAE=137.9284
Epoch 05 | Train MSE=27166.7309 | Val avg MAE=139.7583
Epoch 06 | Train MSE=25804.5396 | Val avg MAE=140.0269
Epoch 07 | Train MSE=24747.7265 | Val avg MAE=139.8873
Epoch 08 | Train MSE=23823.6942 | Val avg MAE=141.8532
Epoch 09 | Train MSE=23084.4415 | Val avg MAE=140.1224
Early stopping after 9 epochs.


[I 2025-12-22 16:06:52,993] Trial 4 finished with value: 137.92837142944336 and parameters: {'seq_len': 48, 'hidden_size': 192, 'num_layers': 1, 'dropout': 0.04667802755387429, 'lr': 0.0005998145323909479, 'weight_decay': 0.00010353206835601686, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=2684883.3318 | Val avg MAE=820.3597
Epoch 02 | Train MSE=2297647.8178 | Val avg MAE=730.7983
Epoch 03 | Train MSE=1989562.4904 | Val avg MAE=666.6228
Epoch 04 | Train MSE=1734265.0883 | Val avg MAE=617.4141
Epoch 05 | Train MSE=1517627.4097 | Val avg MAE=577.8007
Epoch 06 | Train MSE=1333585.8886 | Val avg MAE=543.6396
Epoch 07 | Train MSE=1176849.7188 | Val avg MAE=507.5430
Epoch 08 | Train MSE=1041609.0367 | Val avg MAE=469.8899
Epoch 09 | Train MSE=921016.3394 | Val avg MAE=433.5579
Epoch 10 | Train MSE=813651.9175 | Val avg MAE=401.2734
Epoch 11 | Train MSE=718357.5126 | Val avg MAE=374.8584
Epoch 12 | Train MSE=633577.5967 | Val avg MAE=347.9872
Epoch 13 | Train MSE=558116.8781 | Val avg MAE=324.4159
Epoch 14 | Train MSE=491738.7573 | Val avg MAE=305.0246
Epoch 15 | Train MSE=433454.3251 | Val avg MAE=286.3428
Epoch 16 | Train MSE=382561.2933 | Val avg MAE=269.3656
Epoch 17 | Train MSE=338141.8355 | Val avg MAE=253.6048
Epoch 18 | Train MSE=299469.5738 | Val a

[I 2025-12-22 19:02:10,569] Trial 5 finished with value: 174.01996231079102 and parameters: {'seq_len': 24, 'hidden_size': 64, 'num_layers': 2, 'dropout': 0.058076191292492446, 'lr': 0.00028913167587516473, 'weight_decay': 0.00010666718837173085, 'batch_size': 128, 'use_fc_head': False}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=2803203.4691 | Val avg MAE=885.6512
Epoch 02 | Train MSE=2597472.8882 | Val avg MAE=827.5401
Epoch 03 | Train MSE=2415081.3623 | Val avg MAE=780.4769
Epoch 04 | Train MSE=2252531.4674 | Val avg MAE=741.2516
Epoch 05 | Train MSE=2106742.6438 | Val avg MAE=708.3741
Epoch 06 | Train MSE=1973935.8167 | Val avg MAE=679.7475
Epoch 07 | Train MSE=1851217.5591 | Val avg MAE=654.9623
Epoch 08 | Train MSE=1737670.7555 | Val avg MAE=631.0606
Epoch 09 | Train MSE=1632197.1376 | Val avg MAE=611.5437
Epoch 10 | Train MSE=1534143.9806 | Val avg MAE=593.4034
Epoch 11 | Train MSE=1443355.0732 | Val avg MAE=575.1620
Epoch 12 | Train MSE=1359080.5994 | Val avg MAE=560.7695
Epoch 13 | Train MSE=1280730.9828 | Val avg MAE=546.3141
Epoch 14 | Train MSE=1208282.6694 | Val avg MAE=530.8681
Epoch 15 | Train MSE=1140660.3700 | Val avg MAE=514.5329
Epoch 16 | Train MSE=1077289.4858 | Val avg MAE=498.8444
Epoch 17 | Train MSE=1017515.0709 | Val avg MAE=482.4942
Epoch 18 | Train MSE=960690.181

[I 2025-12-22 22:08:42,113] Trial 6 finished with value: 313.86082458496094 and parameters: {'seq_len': 24, 'hidden_size': 128, 'num_layers': 1, 'dropout': 0.011094582668396136, 'lr': 0.00013331706318015402, 'weight_decay': 0.00028683881838577097, 'batch_size': 256, 'use_fc_head': False}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=464606.9194 | Val avg MAE=161.4172
Epoch 02 | Train MSE=52458.0181 | Val avg MAE=148.2775
Epoch 03 | Train MSE=38908.0276 | Val avg MAE=141.7835
Epoch 04 | Train MSE=33739.9144 | Val avg MAE=140.8395
Epoch 05 | Train MSE=31561.8034 | Val avg MAE=140.1644
Epoch 06 | Train MSE=30080.7688 | Val avg MAE=139.9337
Epoch 07 | Train MSE=28933.3383 | Val avg MAE=139.4897
Epoch 08 | Train MSE=28011.5741 | Val avg MAE=141.6566
Epoch 09 | Train MSE=27241.5938 | Val avg MAE=140.7020
Epoch 10 | Train MSE=26602.1427 | Val avg MAE=141.4180
Epoch 11 | Train MSE=26053.1234 | Val avg MAE=142.7401
Epoch 12 | Train MSE=25518.8418 | Val avg MAE=140.0545
Early stopping after 12 epochs.


[I 2025-12-23 00:27:40,224] Trial 7 finished with value: 139.48967170715332 and parameters: {'seq_len': 48, 'hidden_size': 128, 'num_layers': 1, 'dropout': 0.13973381907150748, 'lr': 0.0005124113628062034, 'weight_decay': 8.556461033525865e-05, 'batch_size': 256, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=2739757.4359 | Val avg MAE=842.4169
Epoch 02 | Train MSE=2430941.3489 | Val avg MAE=766.3767
Epoch 03 | Train MSE=2178379.2796 | Val avg MAE=709.2198
Epoch 04 | Train MSE=1962990.7260 | Val avg MAE=664.0283
Epoch 05 | Train MSE=1772116.7047 | Val avg MAE=625.9785
Epoch 06 | Train MSE=1602332.4344 | Val avg MAE=595.4917
Epoch 07 | Train MSE=1451851.6273 | Val avg MAE=566.2780
Epoch 08 | Train MSE=1318711.5664 | Val avg MAE=543.5745
Epoch 09 | Train MSE=1200897.7543 | Val avg MAE=520.8731
Epoch 10 | Train MSE=1095901.5154 | Val avg MAE=494.5957
Epoch 11 | Train MSE=1001241.7233 | Val avg MAE=472.1403
Epoch 12 | Train MSE=914976.8914 | Val avg MAE=449.4621
Epoch 13 | Train MSE=835879.4852 | Val avg MAE=424.9232
Epoch 14 | Train MSE=763744.4531 | Val avg MAE=406.2322
Epoch 15 | Train MSE=697896.0798 | Val avg MAE=385.3488
Epoch 16 | Train MSE=637302.3783 | Val avg MAE=365.5727
Epoch 17 | Train MSE=582057.7048 | Val avg MAE=347.9329
Epoch 18 | Train MSE=531820.7073 | Va

[I 2025-12-23 05:33:05,293] Trial 8 finished with value: 214.0431022644043 and parameters: {'seq_len': 48, 'hidden_size': 128, 'num_layers': 1, 'dropout': 0.251561045511314, 'lr': 0.00011225019871639282, 'weight_decay': 1.002654251729281e-06, 'batch_size': 128, 'use_fc_head': False}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=758409.2972 | Val avg MAE=187.0823
Epoch 02 | Train MSE=73882.6501 | Val avg MAE=166.8026
Epoch 03 | Train MSE=64293.4651 | Val avg MAE=161.7550
Epoch 04 | Train MSE=50628.8792 | Val avg MAE=155.2564
Epoch 05 | Train MSE=41021.0099 | Val avg MAE=149.9497
Epoch 06 | Train MSE=38002.9858 | Val avg MAE=149.3010
Epoch 07 | Train MSE=34827.1180 | Val avg MAE=146.0250
Epoch 08 | Train MSE=31161.5938 | Val avg MAE=145.7017
Epoch 09 | Train MSE=29095.9076 | Val avg MAE=143.9759
Epoch 10 | Train MSE=27754.4067 | Val avg MAE=141.9228
Epoch 11 | Train MSE=26750.7786 | Val avg MAE=140.9322
Epoch 12 | Train MSE=25827.0713 | Val avg MAE=141.7650
Epoch 13 | Train MSE=25069.2377 | Val avg MAE=140.3488
Epoch 14 | Train MSE=24373.7806 | Val avg MAE=143.4617
Epoch 15 | Train MSE=23795.6431 | Val avg MAE=141.8383
Epoch 16 | Train MSE=23260.0325 | Val avg MAE=141.6295
Epoch 17 | Train MSE=22701.6178 | Val avg MAE=139.3617
Epoch 18 | Train MSE=22274.4746 | Val avg MAE=140.5797
Epoch 19 

[I 2025-12-23 18:19:37,689] Trial 9 finished with value: 139.36168479919434 and parameters: {'seq_len': 48, 'hidden_size': 128, 'num_layers': 3, 'dropout': 0.22776324392350739, 'lr': 0.00028133278163109516, 'weight_decay': 0.0004397701596305855, 'batch_size': 256, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=304693.6676 | Val avg MAE=156.4496
Epoch 02 | Train MSE=50421.2956 | Val avg MAE=143.4345
Epoch 03 | Train MSE=34770.9535 | Val avg MAE=138.8524
Epoch 04 | Train MSE=29497.9097 | Val avg MAE=140.2761
Epoch 05 | Train MSE=26964.8626 | Val avg MAE=136.9650
Epoch 06 | Train MSE=25075.2465 | Val avg MAE=135.8765
Epoch 07 | Train MSE=23512.3296 | Val avg MAE=139.8687
Epoch 08 | Train MSE=22287.8167 | Val avg MAE=138.0817
Epoch 09 | Train MSE=21257.7699 | Val avg MAE=138.4098
Epoch 10 | Train MSE=20392.3774 | Val avg MAE=138.2908
Epoch 11 | Train MSE=19608.9211 | Val avg MAE=139.0440
Early stopping after 11 epochs.


[I 2025-12-24 00:53:56,764] Trial 10 finished with value: 135.87645149230957 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 3, 'dropout': 0.101225242494378, 'lr': 0.00038378821501749413, 'weight_decay': 6.077813966908005e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 2 with value: 135.29945182800293.


Epoch 01 | Train MSE=279284.6351 | Val avg MAE=148.8024
Epoch 02 | Train MSE=39305.7303 | Val avg MAE=139.0697
Epoch 03 | Train MSE=29949.2799 | Val avg MAE=137.3662
Epoch 04 | Train MSE=26363.5798 | Val avg MAE=137.1802
Epoch 05 | Train MSE=24059.5869 | Val avg MAE=136.3881
Epoch 06 | Train MSE=22358.3973 | Val avg MAE=133.9233
Epoch 07 | Train MSE=21034.0140 | Val avg MAE=137.5202
Epoch 08 | Train MSE=19943.1887 | Val avg MAE=137.0705
Epoch 09 | Train MSE=19026.7000 | Val avg MAE=135.3519
Epoch 10 | Train MSE=18190.5133 | Val avg MAE=136.2853
Epoch 11 | Train MSE=17502.0238 | Val avg MAE=136.8981
Early stopping after 11 epochs.


[I 2025-12-24 07:14:03,337] Trial 11 finished with value: 133.9233112335205 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 3, 'dropout': 0.08964163752606763, 'lr': 0.0004045030802118628, 'weight_decay': 5.901669321983368e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=186442.6692 | Val avg MAE=144.2151
Epoch 02 | Train MSE=35226.5526 | Val avg MAE=140.0301
Epoch 03 | Train MSE=28850.5453 | Val avg MAE=139.8354
Epoch 04 | Train MSE=25861.3349 | Val avg MAE=139.2626
Epoch 05 | Train MSE=23940.6118 | Val avg MAE=139.0280
Epoch 06 | Train MSE=22458.3859 | Val avg MAE=138.1199
Epoch 07 | Train MSE=21372.4467 | Val avg MAE=136.0789
Epoch 08 | Train MSE=20372.7731 | Val avg MAE=136.7900
Epoch 09 | Train MSE=19586.8926 | Val avg MAE=135.9178
Epoch 10 | Train MSE=18894.0419 | Val avg MAE=137.3139
Epoch 11 | Train MSE=18282.1834 | Val avg MAE=136.8845
Epoch 12 | Train MSE=17786.9971 | Val avg MAE=136.7448
Epoch 13 | Train MSE=17364.2711 | Val avg MAE=137.2697
Epoch 14 | Train MSE=17015.1243 | Val avg MAE=136.7741
Early stopping after 14 epochs.


[I 2025-12-24 15:27:42,262] Trial 12 finished with value: 135.9177646636963 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 3, 'dropout': 0.17000210469070173, 'lr': 0.0007949108157047812, 'weight_decay': 4.8654761136106204e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=348580.6089 | Val avg MAE=161.3604
Epoch 02 | Train MSE=62921.8425 | Val avg MAE=151.6389
Epoch 03 | Train MSE=42839.8754 | Val avg MAE=143.6594
Epoch 04 | Train MSE=36142.1205 | Val avg MAE=142.2110
Epoch 05 | Train MSE=31330.6735 | Val avg MAE=139.4617
Epoch 06 | Train MSE=29414.4711 | Val avg MAE=141.2255
Epoch 07 | Train MSE=27999.7005 | Val avg MAE=139.2467
Epoch 08 | Train MSE=26850.4758 | Val avg MAE=142.2234
Epoch 09 | Train MSE=25915.7651 | Val avg MAE=137.8974
Epoch 10 | Train MSE=25149.9979 | Val avg MAE=135.3493
Epoch 11 | Train MSE=24426.9738 | Val avg MAE=138.8775
Epoch 12 | Train MSE=23839.6450 | Val avg MAE=137.6000
Epoch 13 | Train MSE=23366.2449 | Val avg MAE=137.7680
Epoch 14 | Train MSE=22867.5905 | Val avg MAE=135.6014
Epoch 15 | Train MSE=22490.5167 | Val avg MAE=138.9943
Early stopping after 15 epochs.


[I 2025-12-24 18:46:38,947] Trial 13 finished with value: 135.34930229187012 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 3, 'dropout': 0.07117707811692936, 'lr': 0.0004077622308173462, 'weight_decay': 1.682006029819981e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=516855.7652 | Val avg MAE=157.1177
Epoch 02 | Train MSE=61125.7373 | Val avg MAE=148.3840
Epoch 03 | Train MSE=41558.7641 | Val avg MAE=146.4435
Epoch 04 | Train MSE=36031.8955 | Val avg MAE=138.0881
Epoch 05 | Train MSE=30696.7399 | Val avg MAE=138.0244
Epoch 06 | Train MSE=28023.6283 | Val avg MAE=139.5586
Epoch 07 | Train MSE=26336.7248 | Val avg MAE=139.1256
Epoch 08 | Train MSE=25017.1608 | Val avg MAE=139.4393
Epoch 09 | Train MSE=23997.8156 | Val avg MAE=139.4142
Epoch 10 | Train MSE=23054.9509 | Val avg MAE=139.0334
Early stopping after 10 epochs.


[I 2025-12-24 22:30:07,389] Trial 14 finished with value: 138.02438163757324 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 2, 'dropout': 0.19673780715529038, 'lr': 0.0002585468986854149, 'weight_decay': 2.4972119495076417e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=336089.8868 | Val avg MAE=161.4560
Epoch 02 | Train MSE=57717.9074 | Val avg MAE=148.8668
Epoch 03 | Train MSE=39382.8713 | Val avg MAE=143.1752
Epoch 04 | Train MSE=31968.2203 | Val avg MAE=140.5323
Epoch 05 | Train MSE=28922.5236 | Val avg MAE=139.0856
Epoch 06 | Train MSE=26898.1297 | Val avg MAE=139.7355
Epoch 07 | Train MSE=25307.5352 | Val avg MAE=139.2145
Epoch 08 | Train MSE=24059.5566 | Val avg MAE=137.2347
Epoch 09 | Train MSE=22980.1167 | Val avg MAE=139.2110
Epoch 10 | Train MSE=22076.6139 | Val avg MAE=138.1128
Epoch 11 | Train MSE=21298.4518 | Val avg MAE=137.7496
Epoch 12 | Train MSE=20586.6966 | Val avg MAE=136.9756
Epoch 13 | Train MSE=19969.2824 | Val avg MAE=139.1564
Epoch 14 | Train MSE=19396.8252 | Val avg MAE=137.8435
Epoch 15 | Train MSE=18874.0063 | Val avg MAE=137.9800
Epoch 16 | Train MSE=18393.2891 | Val avg MAE=138.7812
Epoch 17 | Train MSE=17934.1955 | Val avg MAE=138.3741
Early stopping after 17 epochs.


[I 2025-12-25 08:05:44,895] Trial 15 finished with value: 136.97562789916992 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 3, 'dropout': 0.09729382516478127, 'lr': 0.0002179641614253948, 'weight_decay': 1.5509688548975008e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 256}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=389296.3747 | Val avg MAE=164.8741
Epoch 02 | Train MSE=65039.1047 | Val avg MAE=156.8596
Epoch 03 | Train MSE=46461.1763 | Val avg MAE=149.2650
Epoch 04 | Train MSE=41781.6123 | Val avg MAE=146.3369
Epoch 05 | Train MSE=36582.8605 | Val avg MAE=141.0688
Epoch 06 | Train MSE=34372.4619 | Val avg MAE=140.2212
Epoch 07 | Train MSE=33015.5121 | Val avg MAE=141.8196
Epoch 08 | Train MSE=31958.0506 | Val avg MAE=141.8921
Epoch 09 | Train MSE=31121.2252 | Val avg MAE=141.6164
Epoch 10 | Train MSE=30404.2920 | Val avg MAE=139.7038
Epoch 11 | Train MSE=29802.3856 | Val avg MAE=139.9544
Epoch 12 | Train MSE=29252.8847 | Val avg MAE=139.9010
Epoch 13 | Train MSE=28792.2317 | Val avg MAE=140.5877
Epoch 14 | Train MSE=28355.5815 | Val avg MAE=139.1037
Epoch 15 | Train MSE=27981.0564 | Val avg MAE=140.3638
Epoch 16 | Train MSE=27616.4142 | Val avg MAE=138.3142
Epoch 17 | Train MSE=27320.7520 | Val avg MAE=140.0763
Epoch 18 | Train MSE=27034.3575 | Val avg MAE=138.0690
Epoch 19 

[I 2025-12-25 11:07:58,437] Trial 16 finished with value: 138.06896018981934 and parameters: {'seq_len': 24, 'hidden_size': 64, 'num_layers': 3, 'dropout': 0.1422982715750114, 'lr': 0.0004726276037662199, 'weight_decay': 2.4508033260334865e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=201415.0059 | Val avg MAE=145.4687
Epoch 02 | Train MSE=37359.7056 | Val avg MAE=138.8119
Epoch 03 | Train MSE=31420.8225 | Val avg MAE=135.3271
Epoch 04 | Train MSE=28898.4725 | Val avg MAE=140.2875
Epoch 05 | Train MSE=27153.0307 | Val avg MAE=134.0841
Epoch 06 | Train MSE=25911.9836 | Val avg MAE=136.0683
Epoch 07 | Train MSE=24890.4163 | Val avg MAE=139.0135
Epoch 08 | Train MSE=24083.2980 | Val avg MAE=137.8493
Epoch 09 | Train MSE=23361.0051 | Val avg MAE=137.4012
Epoch 10 | Train MSE=22768.8035 | Val avg MAE=136.2285
Early stopping after 10 epochs.


[I 2025-12-25 12:32:20,555] Trial 17 finished with value: 134.0840892791748 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.019991406980689852, 'lr': 0.0007928137555241749, 'weight_decay': 3.3245558192360414e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=269588.5983 | Val avg MAE=152.2807
Epoch 02 | Train MSE=41498.2649 | Val avg MAE=140.0074
Epoch 03 | Train MSE=33254.9508 | Val avg MAE=140.0364
Epoch 04 | Train MSE=30310.2022 | Val avg MAE=139.7786
Epoch 05 | Train MSE=28381.0174 | Val avg MAE=138.2783
Epoch 06 | Train MSE=26891.2090 | Val avg MAE=137.7808
Epoch 07 | Train MSE=25760.9913 | Val avg MAE=137.1574
Epoch 08 | Train MSE=24875.4676 | Val avg MAE=139.3918
Epoch 09 | Train MSE=24089.7699 | Val avg MAE=141.2078
Epoch 10 | Train MSE=23428.1948 | Val avg MAE=137.1876
Epoch 11 | Train MSE=22888.2156 | Val avg MAE=138.1601
Epoch 12 | Train MSE=22385.0849 | Val avg MAE=135.8554
Epoch 13 | Train MSE=21934.3143 | Val avg MAE=136.2459
Epoch 14 | Train MSE=21535.7674 | Val avg MAE=139.0741
Epoch 15 | Train MSE=21165.1241 | Val avg MAE=139.7768
Epoch 16 | Train MSE=20832.8565 | Val avg MAE=139.6542
Epoch 17 | Train MSE=20553.9377 | Val avg MAE=137.2509
Early stopping after 17 epochs.


[I 2025-12-25 14:45:39,997] Trial 18 finished with value: 135.85537147521973 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.001137797412376719, 'lr': 0.0007487024700864935, 'weight_decay': 4.0922455834938574e-05, 'batch_size': 256, 'use_fc_head': True, 'fc_hidden': 256}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=249533.8382 | Val avg MAE=151.4566
Epoch 02 | Train MSE=42042.5544 | Val avg MAE=140.6876
Epoch 03 | Train MSE=33383.5741 | Val avg MAE=138.1679
Epoch 04 | Train MSE=30442.9111 | Val avg MAE=139.0662
Epoch 05 | Train MSE=28588.7579 | Val avg MAE=136.8446
Epoch 06 | Train MSE=27214.5432 | Val avg MAE=137.1700
Epoch 07 | Train MSE=26143.2931 | Val avg MAE=140.4338
Epoch 08 | Train MSE=25266.6812 | Val avg MAE=137.2312
Epoch 09 | Train MSE=24531.9731 | Val avg MAE=137.8746
Epoch 10 | Train MSE=23888.2900 | Val avg MAE=138.1539
Early stopping after 10 epochs.


[I 2025-12-25 16:10:17,017] Trial 19 finished with value: 136.8446216583252 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.029247727208351795, 'lr': 0.0006240517598688299, 'weight_decay': 3.4872387954872066e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 11 with value: 133.9233112335205.


Epoch 01 | Train MSE=554683.0875 | Val avg MAE=160.7711
Epoch 02 | Train MSE=64326.0953 | Val avg MAE=154.1983
Epoch 03 | Train MSE=45511.7942 | Val avg MAE=145.8251
Epoch 04 | Train MSE=39735.4932 | Val avg MAE=139.6499
Epoch 05 | Train MSE=34901.3491 | Val avg MAE=140.1053
Epoch 06 | Train MSE=31977.9353 | Val avg MAE=139.1503
Epoch 07 | Train MSE=30406.6757 | Val avg MAE=136.2493
Epoch 08 | Train MSE=29271.8935 | Val avg MAE=137.0428
Epoch 09 | Train MSE=28326.5109 | Val avg MAE=137.7484
Epoch 10 | Train MSE=27476.6941 | Val avg MAE=138.8310
Epoch 11 | Train MSE=26803.2081 | Val avg MAE=138.8510
Epoch 12 | Train MSE=26119.5104 | Val avg MAE=136.1509
Epoch 13 | Train MSE=25562.8532 | Val avg MAE=133.7383
Epoch 14 | Train MSE=25108.9810 | Val avg MAE=137.7418
Epoch 15 | Train MSE=24632.9182 | Val avg MAE=137.9981
Epoch 16 | Train MSE=24252.1654 | Val avg MAE=139.5212
Epoch 17 | Train MSE=23850.0891 | Val avg MAE=138.8706
Epoch 18 | Train MSE=23517.8079 | Val avg MAE=137.2863
Early sto

[I 2025-12-25 18:41:04,499] Trial 20 finished with value: 133.73832321166992 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.08480346545321482, 'lr': 0.000338292769713216, 'weight_decay': 1.2011095009674564e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=585535.0752 | Val avg MAE=166.8651
Epoch 02 | Train MSE=65895.4729 | Val avg MAE=157.2872
Epoch 03 | Train MSE=46761.0052 | Val avg MAE=146.9327
Epoch 04 | Train MSE=39942.3379 | Val avg MAE=141.2341
Epoch 05 | Train MSE=35508.7107 | Val avg MAE=140.5251
Epoch 06 | Train MSE=32247.4221 | Val avg MAE=142.0000
Epoch 07 | Train MSE=30543.4851 | Val avg MAE=138.2015
Epoch 08 | Train MSE=29319.3976 | Val avg MAE=139.6683
Epoch 09 | Train MSE=28354.8918 | Val avg MAE=138.6163
Epoch 10 | Train MSE=27543.4809 | Val avg MAE=138.6303
Epoch 11 | Train MSE=26798.6479 | Val avg MAE=139.5450
Epoch 12 | Train MSE=26182.1884 | Val avg MAE=141.1644
Early stopping after 12 epochs.


[I 2025-12-25 20:21:04,515] Trial 21 finished with value: 138.2014617919922 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.07707647921838799, 'lr': 0.00031349596209201923, 'weight_decay': 1.0666337097056402e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=877593.6039 | Val avg MAE=181.1512
Epoch 02 | Train MSE=72007.5903 | Val avg MAE=159.7414
Epoch 03 | Train MSE=58477.7291 | Val avg MAE=151.7993
Epoch 04 | Train MSE=44392.1048 | Val avg MAE=144.7759
Epoch 05 | Train MSE=39718.0342 | Val avg MAE=145.5488
Epoch 06 | Train MSE=36305.6999 | Val avg MAE=141.3870
Epoch 07 | Train MSE=33418.1315 | Val avg MAE=140.9053
Epoch 08 | Train MSE=31482.3934 | Val avg MAE=140.0561
Epoch 09 | Train MSE=30217.2041 | Val avg MAE=139.2312
Epoch 10 | Train MSE=29206.4650 | Val avg MAE=140.3888
Epoch 11 | Train MSE=28349.3728 | Val avg MAE=139.9558
Epoch 12 | Train MSE=27613.6639 | Val avg MAE=138.5795
Epoch 13 | Train MSE=26968.1051 | Val avg MAE=141.0058
Epoch 14 | Train MSE=26390.7292 | Val avg MAE=138.4772
Epoch 15 | Train MSE=25876.3125 | Val avg MAE=139.4117
Epoch 16 | Train MSE=25399.6693 | Val avg MAE=137.7891
Epoch 17 | Train MSE=24981.4605 | Val avg MAE=139.0874
Epoch 18 | Train MSE=24565.8403 | Val avg MAE=138.9034
Epoch 19 

[I 2025-12-25 23:14:56,188] Trial 22 finished with value: 137.78911018371582 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.03161031920215632, 'lr': 0.00020566058345911938, 'weight_decay': 3.075360598420957e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=480533.7542 | Val avg MAE=159.5552
Epoch 02 | Train MSE=63085.2855 | Val avg MAE=151.6738
Epoch 03 | Train MSE=43859.6236 | Val avg MAE=148.2107
Epoch 04 | Train MSE=38431.7994 | Val avg MAE=139.9894
Epoch 05 | Train MSE=33561.1548 | Val avg MAE=139.5879
Epoch 06 | Train MSE=30972.9872 | Val avg MAE=137.0133
Epoch 07 | Train MSE=29493.9253 | Val avg MAE=139.8050
Epoch 08 | Train MSE=28395.8554 | Val avg MAE=138.1059
Epoch 09 | Train MSE=27440.9224 | Val avg MAE=137.6333
Epoch 10 | Train MSE=26673.6503 | Val avg MAE=137.3563
Epoch 11 | Train MSE=26003.4750 | Val avg MAE=139.2148
Early stopping after 11 epochs.


[I 2025-12-26 00:46:51,383] Trial 23 finished with value: 137.01329231262207 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.09143701048769956, 'lr': 0.0003771556682252897, 'weight_decay': 5.380047922465392e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=530149.1455 | Val avg MAE=165.6089
Epoch 02 | Train MSE=66544.2266 | Val avg MAE=156.8574
Epoch 03 | Train MSE=48563.7879 | Val avg MAE=146.0517
Epoch 04 | Train MSE=40273.6336 | Val avg MAE=147.2529
Epoch 05 | Train MSE=35917.4167 | Val avg MAE=142.9268
Epoch 06 | Train MSE=32442.4993 | Val avg MAE=140.7843
Epoch 07 | Train MSE=30701.1622 | Val avg MAE=140.8018
Epoch 08 | Train MSE=29449.4534 | Val avg MAE=137.6203
Epoch 09 | Train MSE=28492.0067 | Val avg MAE=140.5315
Epoch 10 | Train MSE=27689.7940 | Val avg MAE=139.2095
Epoch 11 | Train MSE=26961.4682 | Val avg MAE=137.8811
Epoch 12 | Train MSE=26394.8429 | Val avg MAE=139.6529
Epoch 13 | Train MSE=25891.1892 | Val avg MAE=139.0613
Early stopping after 13 epochs.


[I 2025-12-26 02:36:05,465] Trial 24 finished with value: 137.62031173706055 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.11904364849762557, 'lr': 0.0003372805336706817, 'weight_decay': 1.1922593272521357e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=410829.5809 | Val avg MAE=160.3398
Epoch 02 | Train MSE=54141.0181 | Val avg MAE=144.6755
Epoch 03 | Train MSE=38263.7346 | Val avg MAE=141.4144
Epoch 04 | Train MSE=31548.3921 | Val avg MAE=137.0187
Epoch 05 | Train MSE=28498.0454 | Val avg MAE=140.7984
Epoch 06 | Train MSE=26517.2340 | Val avg MAE=138.0628
Epoch 07 | Train MSE=24951.4485 | Val avg MAE=137.1782
Epoch 08 | Train MSE=23746.8050 | Val avg MAE=136.7450
Epoch 09 | Train MSE=22708.1675 | Val avg MAE=136.1671
Epoch 10 | Train MSE=21865.8651 | Val avg MAE=139.1691
Epoch 11 | Train MSE=21144.4620 | Val avg MAE=137.5025
Epoch 12 | Train MSE=20463.2155 | Val avg MAE=136.8236
Epoch 13 | Train MSE=19860.6665 | Val avg MAE=138.2087
Epoch 14 | Train MSE=19324.4626 | Val avg MAE=138.5233
Early stopping after 14 epochs.


[I 2025-12-26 07:30:42,165] Trial 25 finished with value: 136.16705131530762 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 2, 'dropout': 0.04681714065798248, 'lr': 0.00023396558657020624, 'weight_decay': 2.1270764380991602e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=2594287.6610 | Val avg MAE=774.2896
Epoch 02 | Train MSE=2079851.5699 | Val avg MAE=670.2895
Epoch 03 | Train MSE=1704799.7411 | Val avg MAE=603.5558
Epoch 04 | Train MSE=1413996.6631 | Val avg MAE=551.0280
Epoch 05 | Train MSE=1181365.9777 | Val avg MAE=501.7033
Epoch 06 | Train MSE=991980.3836 | Val avg MAE=447.9650
Epoch 07 | Train MSE=829857.9109 | Val avg MAE=400.1158
Epoch 08 | Train MSE=691710.0024 | Val avg MAE=359.4363
Epoch 09 | Train MSE=574815.1883 | Val avg MAE=322.4950
Epoch 10 | Train MSE=477116.8730 | Val avg MAE=293.8452
Epoch 11 | Train MSE=396168.2216 | Val avg MAE=269.3187
Epoch 12 | Train MSE=329430.1058 | Val avg MAE=248.3990
Epoch 13 | Train MSE=274505.6660 | Val avg MAE=231.6960
Epoch 14 | Train MSE=229744.6464 | Val avg MAE=219.8270
Epoch 15 | Train MSE=193271.3296 | Val avg MAE=204.8210
Epoch 16 | Train MSE=163829.5451 | Val avg MAE=194.3529
Epoch 17 | Train MSE=140165.8668 | Val avg MAE=187.0908
Epoch 18 | Train MSE=121137.1418 | Val avg 

[I 2025-12-26 11:32:01,015] Trial 26 finished with value: 149.04664611816406 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.02728113964811686, 'lr': 0.0005532141303222043, 'weight_decay': 4.398509361025365e-06, 'batch_size': 256, 'use_fc_head': False}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=656999.0207 | Val avg MAE=169.2577
Epoch 02 | Train MSE=70752.3051 | Val avg MAE=158.9999
Epoch 03 | Train MSE=53323.3128 | Val avg MAE=150.8716
Epoch 04 | Train MSE=43824.1419 | Val avg MAE=146.6006
Epoch 05 | Train MSE=40322.8206 | Val avg MAE=145.2796
Epoch 06 | Train MSE=36666.6281 | Val avg MAE=142.0001
Epoch 07 | Train MSE=33883.3068 | Val avg MAE=139.8238
Epoch 08 | Train MSE=32251.3112 | Val avg MAE=140.3006
Epoch 09 | Train MSE=31075.0233 | Val avg MAE=141.1307
Epoch 10 | Train MSE=30117.2084 | Val avg MAE=137.9465
Epoch 11 | Train MSE=29271.9109 | Val avg MAE=138.4800
Epoch 12 | Train MSE=28615.4438 | Val avg MAE=139.1407
Epoch 13 | Train MSE=27957.5859 | Val avg MAE=135.9024
Epoch 14 | Train MSE=27411.0437 | Val avg MAE=138.4729
Epoch 15 | Train MSE=26895.0710 | Val avg MAE=136.8523
Epoch 16 | Train MSE=26443.9173 | Val avg MAE=138.3530
Epoch 17 | Train MSE=25983.7561 | Val avg MAE=141.9597
Epoch 18 | Train MSE=25625.2739 | Val avg MAE=139.5980
Early sto

[I 2025-12-26 14:01:52,079] Trial 27 finished with value: 135.90236854553223 and parameters: {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.1128549347970683, 'lr': 0.00018854976376279906, 'weight_decay': 7.689824682478167e-06, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 128}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=334100.6909 | Val avg MAE=157.3085
Epoch 02 | Train MSE=57346.2587 | Val avg MAE=145.6731
Epoch 03 | Train MSE=35817.6100 | Val avg MAE=136.0894
Epoch 04 | Train MSE=28246.4396 | Val avg MAE=137.6235
Epoch 05 | Train MSE=25230.5820 | Val avg MAE=136.0355
Epoch 06 | Train MSE=23394.0941 | Val avg MAE=136.9545
Epoch 07 | Train MSE=21961.6955 | Val avg MAE=136.5899
Epoch 08 | Train MSE=20831.7578 | Val avg MAE=135.5241
Epoch 09 | Train MSE=19906.2713 | Val avg MAE=137.6516
Epoch 10 | Train MSE=19083.7127 | Val avg MAE=134.9472
Epoch 11 | Train MSE=18320.5443 | Val avg MAE=138.4048
Epoch 12 | Train MSE=17727.0660 | Val avg MAE=135.7591
Epoch 13 | Train MSE=17167.9684 | Val avg MAE=136.4588
Epoch 14 | Train MSE=16655.6322 | Val avg MAE=136.6309
Epoch 15 | Train MSE=16145.2935 | Val avg MAE=136.3602
Early stopping after 15 epochs.


[I 2025-12-26 22:20:10,886] Trial 28 finished with value: 134.94724082946777 and parameters: {'seq_len': 24, 'hidden_size': 192, 'num_layers': 3, 'dropout': 0.0708301469353607, 'lr': 0.0004458716164604057, 'weight_decay': 0.00017863974102696879, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}. Best is trial 20 with value: 133.73832321166992.


Epoch 01 | Train MSE=1135255.8818 | Val avg MAE=256.4803
Epoch 02 | Train MSE=121816.9749 | Val avg MAE=182.6373
Epoch 03 | Train MSE=75831.8101 | Val avg MAE=171.6249
Epoch 04 | Train MSE=69872.0220 | Val avg MAE=169.4399
Epoch 05 | Train MSE=66570.7077 | Val avg MAE=169.6648
Epoch 06 | Train MSE=63870.9313 | Val avg MAE=164.9262
Epoch 07 | Train MSE=57273.2796 | Val avg MAE=159.7262
Epoch 08 | Train MSE=48040.1861 | Val avg MAE=157.0184
Epoch 09 | Train MSE=43911.2673 | Val avg MAE=153.4394
Epoch 10 | Train MSE=41806.8226 | Val avg MAE=153.6846
Epoch 11 | Train MSE=40409.7152 | Val avg MAE=153.9473
Epoch 12 | Train MSE=39288.1602 | Val avg MAE=152.4635
Epoch 13 | Train MSE=38256.3853 | Val avg MAE=152.0113
Epoch 14 | Train MSE=37314.9183 | Val avg MAE=151.4871
Epoch 15 | Train MSE=36217.6673 | Val avg MAE=150.6400
Epoch 16 | Train MSE=34706.0330 | Val avg MAE=146.3842
Epoch 17 | Train MSE=33192.8685 | Val avg MAE=149.1845
Epoch 18 | Train MSE=32110.3212 | Val avg MAE=145.1918
Epoch 1

[I 2025-12-27 04:43:12,984] Trial 29 finished with value: 142.2108211517334 and parameters: {'seq_len': 48, 'hidden_size': 64, 'num_layers': 3, 'dropout': 0.1706751269281414, 'lr': 0.0001651476330840796, 'weight_decay': 6.0711908845500406e-05, 'batch_size': 256, 'use_fc_head': True, 'fc_hidden': 256}. Best is trial 20 with value: 133.73832321166992.


Number of finished trials: 30
Best validation avg MAE: 133.73832321166992
Best params:
  seq_len: 24
  hidden_size: 96
  num_layers: 2
  dropout: 0.08480346545321482
  lr: 0.000338292769713216
  weight_decay: 1.2011095009674564e-05
  batch_size: 128
  use_fc_head: True
  fc_hidden: 64

Best params (for final LSTM): {'seq_len': 24, 'hidden_size': 96, 'num_layers': 2, 'dropout': 0.08480346545321482, 'lr': 0.000338292769713216, 'weight_decay': 1.2011095009674564e-05, 'batch_size': 128, 'use_fc_head': True, 'fc_hidden': 64}
Train+Val seq shape: (2078702, 24, 35)
Test seq shape:     (1041748, 24, 35)
[FINAL TRAIN] Epoch 01 | Train MSE=414814.1156
[FINAL TRAIN] Epoch 02 | Train MSE=63688.5856
[FINAL TRAIN] Epoch 03 | Train MSE=45559.1439
[FINAL TRAIN] Epoch 04 | Train MSE=39566.3001
[FINAL TRAIN] Epoch 05 | Train MSE=34572.0758
[FINAL TRAIN] Epoch 06 | Train MSE=31822.1935
[FINAL TRAIN] Epoch 07 | Train MSE=30127.4876
[FINAL TRAIN] Epoch 08 | Train MSE=28936.3562
[FINAL TRAIN] Epoch 09 | Tra

# STGCN 

In [74]:
# =========================
# Build graph objects for STGCN
# =========================

# Decide where to build the graph from (use TRAIN only)
df_graph = df_train.copy()

# Sanity check: required columns
required_cols = ["station", "upstream_station", "downstream_station"]
missing = [c for c in required_cols if c not in df_graph.columns]
if missing:
    raise ValueError(
        f"Missing columns in df_train needed for adjacency: {missing}\n"
        "Make sure you already created upstream_station/downstream_station."
    )

# Fix dtype + drop bad station ids
df_graph["station"] = pd.to_numeric(df_graph["station"], errors="coerce")
df_graph["upstream_station"] = pd.to_numeric(df_graph["upstream_station"], errors="coerce")
df_graph["downstream_station"] = pd.to_numeric(df_graph["downstream_station"], errors="coerce")

df_graph = df_graph.dropna(subset=["station"]).copy()
df_graph["station"] = df_graph["station"].astype(int)

# Create a GLOBAL station list and mapping (node ordering)
stations = np.sort(df_graph["station"].unique())
N = len(stations)
station2idx = {s: i for i, s in enumerate(stations)}
idx2station = {i: s for s, i in station2idx.items()}

print("Number of nodes (stations):", N)

# Build adjacency matrix A (N x N)
#    We'll add edges station <-> upstream and station <-> downstream (bidirectional)
A = np.zeros((N, N), dtype=np.float32)

# self-loops (optional here; STGCN often uses normalized adjacency with self-loop anyway)
np.fill_diagonal(A, 1.0)

# Get unique neighbor relationships per station to avoid repeated work
edges_df = (
    df_graph[["station", "upstream_station", "downstream_station"]]
    .drop_duplicates()
)

for _, row in edges_df.iterrows():
    s = int(row["station"])
    i = station2idx.get(s, None)
    if i is None:
        continue

    # upstream edge
    u = row["upstream_station"]
    if not pd.isna(u):
        u = int(u)
        j = station2idx.get(u, None)
        if j is not None:
            A[i, j] = 1.0
            A[j, i] = 1.0  # bidirectional

    # downstream edge
    d = row["downstream_station"]
    if not pd.isna(d):
        d = int(d)
        j = station2idx.get(d, None)
        if j is not None:
            A[i, j] = 1.0
            A[j, i] = 1.0  # bidirectional

# GCN-normalized adjacency (common for STGCN)
#    A_hat = D^{-1/2} (A) D^{-1/2}
deg = A.sum(axis=1)
deg_inv_sqrt = np.power(deg, -0.5, where=deg>0)
deg_inv_sqrt[~np.isfinite(deg_inv_sqrt)] = 0.0

D_inv_sqrt = np.diag(deg_inv_sqrt.astype(np.float32))
A_hat = D_inv_sqrt @ A @ D_inv_sqrt

print("Adjacency built.")
print("A shape:", A.shape, "A_hat shape:", A_hat.shape)
print("Edges (excluding self-loops):", int((A.sum() - np.trace(A)) / 2))


Number of nodes (stations): 1766
Adjacency built.
A shape: (1766, 1766) A_hat shape: (1766, 1766)
Edges (excluding self-loops): 1723


In [84]:
# =========================
#  Build STGCN tensors (X, y) from df_train/val/test
# =========================

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# -------------------------
# Settings you control
# -------------------------
INPUT_LEN = 24          # past 24 hours
HORIZON = 12            # predict t + 12 hours (your Option B base model)
BATCH_SIZE = 32

TARGET_COL = "total_flow"

# Choose node-level features available in df_train/val/test.
# Start simple with total_flow only (strong baseline). You can add more later.
NODE_FEATURE_COLS = [
    "total_flow",        # required
    # "avg_speed",        # optional (if you want)
    # "pct_observed",     # optional
    # "samples",          # optional
]

# If these exist, we can also add time features (same across nodes at each timestamp)
TIME_FEATURE_COLS = [
    "hour_sin", "hour_cos", "dow_sin", "dow_cos", "is_peak_hour", "is_weekend"
]

# -------------------------
# Helpers
# -------------------------
def _ensure_datetime(df):
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    # If your data is exactly hourly already, this is fine:
    df["timestamp"] = df["timestamp"].dt.floor("H")
    return df

def build_time_node_tensor(df, station2idx, feature_cols, timestamp_col="timestamp", station_col="station"):
    """
    Builds a dense tensor: X_time_node_feat with shape (T, N, F)
    Also returns timestamps used (sorted).
    Uses ONLY stations that exist in station2idx (matches adjacency).
    """
    df = _ensure_datetime(df)

    # Keep only stations that are in the graph (important!)
    df = df[df[station_col].isin(station2idx.keys())].copy()

    # Sort
    df = df.sort_values([timestamp_col, station_col])

    # Build sorted time index
    times = np.array(sorted(df[timestamp_col].unique()))
    N = len(station2idx)
    F = len(feature_cols)

    # Initialize (T, N, F) with NaNs (we'll fill)
    X = np.full((len(times), N, F), np.nan, dtype=np.float32)

    # Fast fill by grouping per timestamp
    time_to_row = {t: i for i, t in enumerate(times)}
    for _, row in df.iterrows():
        t = row[timestamp_col]
        s = row[station_col]
        i = time_to_row[t]
        j = station2idx[s]
        for k, col in enumerate(feature_cols):
            X[i, j, k] = row[col]

    return X, times

def fill_missing(X, method="ffill", fill_value=0.0):
    """
    Fill missing values in (T, N, F).
    method: 'ffill' (forward fill over time) then fill remaining with fill_value
    """
    X_filled = X.copy()
    T, N, F = X_filled.shape

    if method == "ffill":
        # Forward fill over time for each node-feature
        for j in range(N):
            for k in range(F):
                col = X_filled[:, j, k]
                # forward fill NaNs
                mask = np.isnan(col)
                if mask.all():
                    continue
                idx = np.where(~mask, np.arange(T), 0)
                np.maximum.accumulate(idx, out=idx)
                col_f = col[idx]
                X_filled[:, j, k] = col_f

    # Fill any remaining NaNs
    X_filled[np.isnan(X_filled)] = fill_value
    return X_filled

def fit_standardizer(X_train):
    """
    X_train: (T, N, F) float32
    Returns mean, std per feature over ALL time+nodes.
    """
    # reshape to (T*N, F)
    flat = X_train.reshape(-1, X_train.shape[-1])
    mean = flat.mean(axis=0)
    std = flat.std(axis=0) + 1e-6
    return mean.astype(np.float32), std.astype(np.float32)

def apply_standardizer(X, mean, std):
    return ((X - mean[None, None, :]) / std[None, None, :]).astype(np.float32)

def make_windows(X, y_target, input_len, horizon):
    """
    X: (T, N, F)
    y_target: (T, N)  (future target comes from total_flow column)
    Returns:
      Xw: (num_samples, F, input_len, N)
      yw: (num_samples, N)  predicting one horizon step ahead (t + horizon)
    """
    T = X.shape[0]
    N = X.shape[1]
    F = X.shape[2]

    X_list, y_list = [], []
    last_start = T - input_len - horizon
    for t0 in range(last_start):
        x_block = X[t0 : t0 + input_len]              # (Tin, N, F)
        y_t = y_target[t0 + input_len + horizon - 1]  # (N,)
        # transpose to (F, Tin, N) for STGCN-friendly shape
        x_block = np.transpose(x_block, (2, 0, 1))    # (F, Tin, N)
        X_list.append(x_block)
        y_list.append(y_t)

    Xw = np.stack(X_list, axis=0).astype(np.float32)
    yw = np.stack(y_list, axis=0).astype(np.float32)
    return Xw, yw

class STGCNDataset(Dataset):
    def __init__(self, Xw, yw):
        self.X = torch.from_numpy(Xw)  # (B, F, Tin, N)
        self.y = torch.from_numpy(yw)  # (B, N)
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# -------------------------
#  Build (T, N, F) tensors from each split
# -------------------------
# IMPORTANT: station2idx MUST come from the adjacency build step (N=1766)
# Example check:
print("Graph nodes (station2idx):", len(station2idx))

X_train_TNF, train_times = build_time_node_tensor(df_train, station2idx, NODE_FEATURE_COLS)
X_val_TNF,   val_times   = build_time_node_tensor(df_val,   station2idx, NODE_FEATURE_COLS)
X_test_TNF,  test_times  = build_time_node_tensor(df_test,  station2idx, NODE_FEATURE_COLS)

print("Raw shapes (T, N, F):")
print("Train:", X_train_TNF.shape, "Val:", X_val_TNF.shape, "Test:", X_test_TNF.shape)

# -------------------------
# Fill missing values (because some station-timestamps may be absent)
# -------------------------
X_train_TNF = fill_missing(X_train_TNF, method="ffill", fill_value=0.0)
X_val_TNF   = fill_missing(X_val_TNF,   method="ffill", fill_value=0.0)
X_test_TNF  = fill_missing(X_test_TNF,  method="ffill", fill_value=0.0)

# -------------------------
# Standardize features using TRAIN stats only (no leakage!)
# -------------------------
mean_f, std_f = fit_standardizer(X_train_TNF)
X_train_TNF = apply_standardizer(X_train_TNF, mean_f, std_f)
X_val_TNF   = apply_standardizer(X_val_TNF,   mean_f, std_f)
X_test_TNF  = apply_standardizer(X_test_TNF,  mean_f, std_f)

# -------------------------
# Build targets y = future total_flow (still in original scale OR standardized)
# We will train to predict standardized total_flow if total_flow is in NODE_FEATURE_COLS.
# -------------------------
# total_flow is feature index 0 if NODE_FEATURE_COLS = ["total_flow", ...]
flow_idx = NODE_FEATURE_COLS.index("total_flow")

y_train_TN = X_train_TNF[:, :, flow_idx]  # (T, N)
y_val_TN   = X_val_TNF[:, :, flow_idx]
y_test_TN  = X_test_TNF[:, :, flow_idx]

# -------------------------
# Create sliding windows for 12-step horizon training
# -------------------------
Xw_train, yw_train = make_windows(X_train_TNF, y_train_TN, INPUT_LEN, HORIZON)
Xw_val,   yw_val   = make_windows(X_val_TNF,   y_val_TN,   INPUT_LEN, HORIZON)
Xw_test,  yw_test  = make_windows(X_test_TNF,  y_test_TN,  INPUT_LEN, HORIZON)

print("Windowed shapes:")
print("Xw_train:", Xw_train.shape, "yw_train:", yw_train.shape)
print("Xw_val  :", Xw_val.shape,   "yw_val  :", yw_val.shape)
print("Xw_test :", Xw_test.shape,  "yw_test :", yw_test.shape)

# -------------------------
# DataLoaders 
# -------------------------
train_loader = DataLoader(STGCNDataset(Xw_train, yw_train), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader   = DataLoader(STGCNDataset(Xw_val,   yw_val),   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(STGCNDataset(Xw_test,  yw_test),  batch_size=BATCH_SIZE, shuffle=False)



Graph nodes (station2idx): 1766


  df["timestamp"] = df["timestamp"].dt.floor("H")
  df["timestamp"] = df["timestamp"].dt.floor("H")
  df["timestamp"] = df["timestamp"].dt.floor("H")


Raw shapes (T, N, F):
Train: (1069, 1766, 1) Val: (325, 1766, 1) Test: (709, 1766, 1)
Windowed shapes:
Xw_train: (1033, 1, 24, 1766) yw_train: (1033, 1766)
Xw_val  : (289, 1, 24, 1766) yw_val  : (289, 1766)
Xw_test : (673, 1, 24, 1766) yw_test : (673, 1766)


In [85]:
# =========================
# Full STGCN Model (PyTorch) + shape checks
# Input:  (B, F, T, N)
# Output: (B, N)  (predict total_flow at t + HORIZON for all N stations)
# =========================

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# -------------------------
# Convert adjacency to torch (A_hat should be numpy array)
# -------------------------
A_hat_torch = torch.tensor(A_hat, dtype=torch.float32, device=device)  # (N, N)
N_graph = A_hat_torch.shape[0]
print("A_hat_torch shape:", tuple(A_hat_torch.shape))

# -------------------------
# Graph Convolution (GCN) on each time step
# X: (B, C, T, N)
# GCN: X' = A_hat @ X (over N) then a linear projection C->C'
# -------------------------
class GCNLayer(nn.Module):
    def __init__(self, c_in, c_out, bias=True):
        super().__init__()
        self.theta = nn.Linear(c_in, c_out, bias=bias)

    def forward(self, x, A_hat):
        # x: (B, C, T, N)
        B, C, T, N = x.shape

        # (B, T, N, C)
        x_perm = x.permute(0, 2, 3, 1)

        # Graph propagation: (N,N) @ (B,T,N,C) -> (B,T,N,C)
        # We'll do einsum for clarity and speed.
        x_g = torch.einsum("nm,btnc->btmc", A_hat, x_perm)

        # Linear projection over channels: (B,T,N,C_out)
        x_out = self.theta(x_g)

        # Back to (B, C_out, T, N)
        return x_out.permute(0, 3, 1, 2)

# -------------------------
# Temporal Convolution Block
# conv over time dimension T (kernel along T), keeping N as "width"
# We'll use Conv2d with kernel (Kt, 1)
# -------------------------
class TemporalConv(nn.Module):
    def __init__(self, c_in, c_out, kernel_size=3, dropout=0.0):
        super().__init__()
        padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(c_in, c_out, kernel_size=(kernel_size, 1), padding=(padding, 0))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (B, C, T, N)
        x = self.conv(x)
        x = F.relu(x)
        x = self.dropout(x)
        return x

# -------------------------
# STGCN Block = TemporalConv -> GCN -> TemporalConv
# -------------------------
class STGCNBlock(nn.Module):
    def __init__(self, c_in, c_hidden, c_out, Kt=3, dropout=0.0):
        super().__init__()
        self.temp1 = TemporalConv(c_in, c_hidden, kernel_size=Kt, dropout=dropout)
        self.gcn   = GCNLayer(c_hidden, c_hidden)
        self.temp2 = TemporalConv(c_hidden, c_out, kernel_size=Kt, dropout=dropout)

        # residual (if shapes match)
        self.residual = None
        if c_in != c_out:
            self.residual = nn.Conv2d(c_in, c_out, kernel_size=(1, 1))

    def forward(self, x, A_hat):
        # x: (B, C, T, N)
        res = x
        x = self.temp1(x)
        x = self.gcn(x, A_hat)
        x = F.relu(x)
        x = self.temp2(x)

        if self.residual is not None:
            res = self.residual(res)
        x = x + res
        x = F.relu(x)
        return x

# -------------------------
# Full STGCN Model
# - stack 2 STGCN blocks (common)
# - final temporal pooling + linear head to (B, N)
# -------------------------
class STGCN(nn.Module):
    def __init__(self, num_nodes, in_channels, hidden_channels=64, Kt=3, dropout=0.1):
        super().__init__()
        self.block1 = STGCNBlock(in_channels, hidden_channels, hidden_channels, Kt=Kt, dropout=dropout)
        self.block2 = STGCNBlock(hidden_channels, hidden_channels, hidden_channels, Kt=Kt, dropout=dropout)

        # readout: reduce time dimension then predict per node
        # After blocks: (B, hidden, T, N)
        self.time_pool = nn.AdaptiveAvgPool2d((1, num_nodes))  # -> (B, hidden, 1, N)

        # final mapping hidden->1 per node
        self.head = nn.Conv2d(hidden_channels, 1, kernel_size=(1, 1))

    def forward(self, x, A_hat):
        # x: (B, F, T, N)
        x = self.block1(x, A_hat)
        x = self.block2(x, A_hat)

        x = self.time_pool(x)         # (B, hidden, 1, N)
        x = self.head(x)              # (B, 1, 1, N)
        x = x.squeeze(2).squeeze(1)   # (B, N)
        return x

# -------------------------
# Instantiate model (auto-infer in_channels from one batch)
# -------------------------
sample_X, sample_y = next(iter(train_loader))
B0, F0, T0, N0 = sample_X.shape

assert N0 == N_graph, f"Mismatch: batch has N={N0} but A_hat has N={N_graph}"
print("Sample batch shape:", tuple(sample_X.shape), "Target shape:", tuple(sample_y.shape))

model = STGCN(num_nodes=N0, in_channels=F0, hidden_channels=64, Kt=3, dropout=0.1).to(device)
print(model)

# -------------------------
# Quick forward pass sanity check
# -------------------------
with torch.no_grad():
    out = model(sample_X.to(device), A_hat_torch)
print("Model output shape:", tuple(out.shape))  # should be (B, N)


Device: cpu
A_hat_torch shape: (1766, 1766)
Sample batch shape: (32, 1, 24, 1766) Target shape: (32, 1766)
STGCN(
  (block1): STGCNBlock(
    (temp1): TemporalConv(
      (conv): Conv2d(1, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (gcn): GCNLayer(
      (theta): Linear(in_features=64, out_features=64, bias=True)
    )
    (temp2): TemporalConv(
      (conv): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (residual): Conv2d(1, 64, kernel_size=(1, 1), stride=(1, 1))
  )
  (block2): STGCNBlock(
    (temp1): TemporalConv(
      (conv): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (gcn): GCNLayer(
      (theta): Linear(in_features=64, out_features=64, bias=True)
    )
    (temp2): TemporalConv(
      (conv): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0

In [87]:
###### =========================
#Train STGCN (12h horizon) + Validation + Save Best
# Metrics: MAE + RMSE
# =========================

import torch
import numpy as np
from math import sqrt

# ---- helpers ----
def mae_torch(y_pred, y_true):
    return torch.mean(torch.abs(y_pred - y_true))

def rmse_torch(y_pred, y_true):
    return torch.sqrt(torch.mean((y_pred - y_true) ** 2))

@torch.no_grad()
def evaluate(model, loader, A_hat_torch, device):
    model.eval()
    total_mae = 0.0
    total_mse = 0.0
    total_count = 0

    for X, y in loader:
        X = X.to(device)                  # (B, F, T, N)
        y = y.to(device)                  # (B, N)

        pred = model(X, A_hat_torch)      # (B, N)

        # accumulate
        batch_size = y.shape[0]
        total_mae += torch.sum(torch.abs(pred - y)).item()
        total_mse += torch.sum((pred - y) ** 2).item()
        total_count += batch_size * y.shape[1]  # B*N

    mean_mae = total_mae / total_count
    mean_rmse = sqrt(total_mse / total_count)
    return mean_mae, mean_rmse

# -------------------------
# 0) Training config
# -------------------------
lr = 1e-3
weight_decay = 1e-5
epochs = 30
patience = 7  # early stop if val doesn't improve

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Optional: stabilize training
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3)

best_val_mae = float("inf")
best_state = None
no_improve = 0

train_history = []
val_history = []

# -------------------------
# 1) Train loop
# -------------------------
for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0
    running_count = 0

    for X, y in train_loader:
        X = X.to(device)    # (B, F, T, N)
        y = y.to(device)    # (B, N)

        optimizer.zero_grad()
        pred = model(X, A_hat_torch)      # (B, N)
        loss = criterion(pred, y)
        loss.backward()

        # optional gradient clipping (helps if things explode)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=3.0)

        optimizer.step()

        running_loss += loss.item() * X.shape[0]
        running_count += X.shape[0]

    train_loss = running_loss / running_count

    # -------------------------
    # 2) Validation metrics
    # -------------------------
    val_mae, val_rmse = evaluate(model, val_loader, A_hat_torch, device)

    scheduler.step(val_mae)

    train_history.append(train_loss)
    val_history.append((val_mae, val_rmse))

    print(
        f"Epoch {epoch:02d}/{epochs} | "
        f"Train MSE: {train_loss:.6f} | "
        f"Val MAE: {val_mae:.4f} | Val RMSE: {val_rmse:.4f} | "
        f"LR: {optimizer.param_groups[0]['lr']:.2e}"
    )

    # -------------------------
    # 3) Save best by Val MAE
    # -------------------------
    if val_mae < best_val_mae - 1e-6:
        best_val_mae = val_mae
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
        print("  New best model saved (by Val MAE).")
    else:
        no_improve += 1
        if no_improve >= patience:
            print("  Early stopping (no improvement).")
            break

# -------------------------
# 4) Load best model
# -------------------------
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"\nLoaded best model with Val MAE = {best_val_mae:.4f}")

# -------------------------
# 5) Final evaluation on TEST (12h horizon)
# -------------------------
test_mae, test_rmse = evaluate(model, test_loader, A_hat_torch, device)
print(f"\n TEST (12h horizon) | MAE: {test_mae:.4f} | RMSE: {test_rmse:.4f}")

# -------------------------
# 6) Save checkpoint
# -------------------------
save_path = "stgcn_best_12h.pth"
torch.save(model.state_dict(), save_path)
print(f"Saved best model to: {save_path}")


Epoch 01/30 | Train MSE: 0.161837 | Val MAE: 0.2392 | Val RMSE: 0.3649 | LR: 1.00e-03
  New best model saved (by Val MAE).
Epoch 02/30 | Train MSE: 0.144097 | Val MAE: 0.2386 | Val RMSE: 0.3554 | LR: 1.00e-03
  New best model saved (by Val MAE).
Epoch 03/30 | Train MSE: 0.143275 | Val MAE: 0.2348 | Val RMSE: 0.3530 | LR: 1.00e-03
  New best model saved (by Val MAE).
Epoch 04/30 | Train MSE: 0.144782 | Val MAE: 0.2460 | Val RMSE: 0.3605 | LR: 1.00e-03
Epoch 05/30 | Train MSE: 0.140495 | Val MAE: 0.2377 | Val RMSE: 0.3519 | LR: 1.00e-03
Epoch 06/30 | Train MSE: 0.138571 | Val MAE: 0.2325 | Val RMSE: 0.3497 | LR: 1.00e-03
  New best model saved (by Val MAE).
Epoch 07/30 | Train MSE: 0.139226 | Val MAE: 0.2373 | Val RMSE: 0.3532 | LR: 1.00e-03
Epoch 08/30 | Train MSE: 0.135905 | Val MAE: 0.2335 | Val RMSE: 0.3488 | LR: 1.00e-03
Epoch 09/30 | Train MSE: 0.134823 | Val MAE: 0.2303 | Val RMSE: 0.3483 | LR: 1.00e-03
  New best model saved (by Val MAE).
Epoch 10/30 | Train MSE: 0.138539 | Val M

In [88]:
# =========================
# Recursive Forecasting for 12/24/48/72 using 12h-trained STGCN
# =========================

import numpy as np
import torch
from math import sqrt

# We will forecast in multiples of 12 because your base model predicts +12
HORIZONS = [12, 24, 48, 72]
STEP = 12

# Confirm settings used earlier
INPUT_LEN = 24

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# ---- utility: compute MAE/RMSE over all nodes and samples ----
def compute_mae_rmse(y_pred, y_true):
    # y_pred, y_true: (num_samples, N)
    err = y_pred - y_true
    mae = np.mean(np.abs(err))
    rmse = np.sqrt(np.mean(err**2))
    return float(mae), float(rmse)

@torch.no_grad()
def recursive_predict_batch(model, A_hat_torch, X0, steps):
    """
    Recursive multi-step prediction using a +12h model.
    X0: torch tensor (B, F, T, N) initial input window
    steps: how many +12 steps to roll forward (e.g., 2 means +24h)
    Returns: y_hat at final horizon: (B, N)
    """
    X = X0.clone()

    for _ in range(steps):
        y_hat = model(X, A_hat_torch)  # (B, N)

        # We assume "total_flow" is feature index 0 (from Step 2)
        # Update the last time step feature channel for total_flow with the prediction
        # Shift the window left by 1 time step and append a new step.
        # Because our base horizon is 12h, we shift by 12 time steps? NO:
        # The model was trained with sequences sampled hourly, but it predicts at +12.
        # For recursion we advance the "virtual time" by 12 hours, so we shift by 12 slots.
        shift = STEP

        # X: (B, F, T, N)
        B, Fch, T, N = X.shape
        if shift >= T:
            raise ValueError("STEP must be smaller than INPUT_LEN (window length).")

        # Shift window forward by 'shift' hours
        X[:, :, :-shift, :] = X[:, :, shift:, :]

        # For the newly opened last 'shift' slots, we need to fill them.
        # We only have predictions for flow at the end of the horizon, not each intermediate hour.
        # So we fill all new slots with the same predicted value for total_flow.
        # (This is a common pragmatic approximation for recursive rollouts when step > 1.)
        # Feature idx of total_flow:
        flow_fidx = 0

        # Fill new timesteps for the flow channel with y_hat
        for tfill in range(shift):
            X[:, flow_fidx, T - shift + tfill, :] = y_hat

        # NOTE: For other features (avg_speed, time encodings), we leave them as-is in the new slots.
        # If you included time encodings as node features, we will handle that better in Step 5.5.

    return y_hat

# -------------------------
# Build arrays of predictions for each horizon on the TEST loader
# -------------------------
results = {}

for horizon in HORIZONS:
    steps = horizon // STEP  # 12->1, 24->2, 48->4, 72->6

    all_pred = []
    all_true = []

    for X, y in test_loader:
        X = X.to(device)  # (B, F, T, N)
        y = y.to(device)  # (B, N) --> this y is the +12 target, not +24 etc.

        # For fair evaluation of longer horizons, we need the true target at that horizon.
        # We'll rebuild it from the underlying time-series tensor used in Step 2.
        # So here, we will only collect predictions, and compute true labels below using tensors.

        y_hat = recursive_predict_batch(model, A_hat_torch, X, steps)  # (B, N)
        all_pred.append(y_hat.detach().cpu().numpy())

    results[horizon] = np.concatenate(all_pred, axis=0)

print(" Predictions generated for horizons:", list(results.keys()))


 Predictions generated for horizons: [12, 24, 48, 72]


In [89]:
# =========================
#  Build TRUE labels for 12/24/48/72 from X_test_TNF
# =========================

# flow is feature index 0 if NODE_FEATURE_COLS started with total_flow
flow_idx = 0

# True flow series on test split: (T, N)
y_test_series = X_test_TNF[:, :, flow_idx]  # standardized flow (same scale as training)

def true_labels_for_horizon(y_series_TN, input_len, horizon):
    """
    y_series_TN: (T, N)
    return yw: (num_samples, N) aligned with the SAME t0 loop used in make_windows
    """
    T = y_series_TN.shape[0]
    ys = []
    last_start = T - input_len - horizon
    for t0 in range(last_start):
        ys.append(y_series_TN[t0 + input_len + horizon - 1])
    return np.stack(ys, axis=0).astype(np.float32)

true_y = {}
for h in HORIZONS:
    true_y[h] = true_labels_for_horizon(y_test_series, INPUT_LEN, h)
    print(h, true_y[h].shape)


12 (673, 1766)
24 (661, 1766)
48 (637, 1766)
72 (613, 1766)


In [90]:
# =========================
# Metrics table for 12/24/48/72
# =========================

for h in HORIZONS:
    y_pred = results[h]
    y_true = true_y[h]

    # ensure same length (they should match)
    m = min(len(y_pred), len(y_true))
    y_pred = y_pred[:m]
    y_true = y_true[:m]

    mae, rmse = compute_mae_rmse(y_pred, y_true)
    print(f"Horizon {h:>2}h  | MAE: {mae:.4f} | RMSE: {rmse:.4f}")


Horizon 12h  | MAE: 0.2069 | RMSE: 0.3210
Horizon 24h  | MAE: 0.4431 | RMSE: 0.6938
Horizon 48h  | MAE: 0.5682 | RMSE: 0.8391
Horizon 72h  | MAE: 0.6686 | RMSE: 0.9738


In [92]:
flow_idx = 0

flow_mean = scaler.mean_[flow_idx]
flow_std = np.sqrt(scaler.var_[flow_idx])

print("Flow mean:", flow_mean)
print("Flow std:", flow_std)


Flow mean: 63.93297253757723
Flow std: 38.66041689479637


In [93]:
for h in HORIZONS:
    y_pred = results[h]
    y_true = true_y[h]

    m = min(len(y_pred), len(y_true))
    y_pred = y_pred[:m]
    y_true = y_true[:m]

    mae_z, rmse_z = compute_mae_rmse(y_pred, y_true)

    mae_real = mae_z * flow_std
    rmse_real = rmse_z * flow_std

    print(
        f"Horizon {h:>2}h | "
        f"MAE: {mae_real:.2f} veh/hr | "
        f"RMSE: {rmse_real:.2f} veh/hr"
    )


Horizon 12h | MAE: 8.00 veh/hr | RMSE: 12.41 veh/hr
Horizon 24h | MAE: 17.13 veh/hr | RMSE: 26.82 veh/hr
Horizon 48h | MAE: 21.97 veh/hr | RMSE: 32.44 veh/hr
Horizon 72h | MAE: 25.85 veh/hr | RMSE: 37.65 veh/hr


# GRAPH WAVENET

In [95]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# -------------------------
# CONFIG
# -------------------------
TIME_COL = "timestamp"
NODE_COL = "station"
TARGET_COL = "total_flow"
OPTIONAL_NODE_FEATURES = ["avg_speed"]  # will use only if present

SEQ_LEN = 24
HORIZON = 72
EVAL_HORIZONS = [12, 24, 48, 72]

BATCH_SIZE = 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

# -------------------------
# Basic cleaning (train only for graph build)
# -------------------------
df_graph = df_train.copy()
df_graph[TIME_COL] = pd.to_datetime(df_graph[TIME_COL]).dt.floor("H")

required_cols = [NODE_COL, "upstream_station", "downstream_station"]
missing = [c for c in required_cols if c not in df_graph.columns]
if missing:
    raise ValueError(f"df_train missing columns for adjacency: {missing}")

# Fix station types
for c in [NODE_COL, "upstream_station", "downstream_station"]:
    df_graph[c] = pd.to_numeric(df_graph[c], errors="coerce")

df_graph = df_graph.dropna(subset=[NODE_COL]).copy()
df_graph[NODE_COL] = df_graph[NODE_COL].astype(int)

stations = np.sort(df_graph[NODE_COL].unique())
N = len(stations)
station2idx = {s: i for i, s in enumerate(stations)}
idx2station = {i: s for s, i in station2idx.items()}

print(" Nodes (stations):", N)

# -------------------------
# Build adjacency A (N x N), symmetric, with self-loops
# -------------------------
A = np.zeros((N, N), dtype=np.float32)
np.fill_diagonal(A, 1.0)  # self-loops

edges = df_graph[[NODE_COL, "upstream_station", "downstream_station"]].drop_duplicates()

def add_edge(s, n):
    if pd.isna(n):
        return
    n = int(n)
    if n in station2idx:
        i = station2idx[int(s)]
        j = station2idx[n]
        A[i, j] = 1.0
        A[j, i] = 1.0  # undirected

for _, r in edges.iterrows():
    s = int(r[NODE_COL])
    add_edge(s, r["upstream_station"])
    add_edge(s, r["downstream_station"])

print(" A shape:", A.shape)
print(" Edges (excluding self-loops):", int((A.sum() - np.trace(A)) / 2))


DEVICE: cpu


  df_graph[TIME_COL] = pd.to_datetime(df_graph[TIME_COL]).dt.floor("H")


 Nodes (stations): 1766
 A shape: (1766, 1766)
 Edges (excluding self-loops): 1723


In [96]:
def row_normalize(A_np):
    A_np = A_np.astype(np.float32)
    rowsum = A_np.sum(axis=1, keepdims=True)
    rowsum[rowsum == 0] = 1.0
    return A_np / rowsum

A_rw  = row_normalize(A)
A_rwt = row_normalize(A.T)

supports = [
    torch.tensor(A_rw, dtype=torch.float32, device=DEVICE),
    torch.tensor(A_rwt, dtype=torch.float32, device=DEVICE)
]

print(" Supports:", len(supports), "each:", supports[0].shape)


 Supports: 2 each: torch.Size([1766, 1766])


In [97]:
def ensure_hourly(df):
    df = df.copy()
    df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
    return df

# Features for the model: always include total_flow + optional features if present
feature_cols = [TARGET_COL] + [c for c in OPTIONAL_NODE_FEATURES if c in df_train.columns]
print(" Feature columns:", feature_cols)

def build_TNF(df_split, times, stations, feature_cols):
    """
    Build (T, N, F) for a split using a fixed time index (times) and station list (stations).
    Missing values are forward-filled per station then zero-filled.
    """
    df_split = ensure_hourly(df_split)
    df_split = df_split[df_split[NODE_COL].isin(stations)].copy()

    # Reindex full grid
    full_index = pd.MultiIndex.from_product([times, stations], names=[TIME_COL, NODE_COL])
    df_full = df_split.set_index([TIME_COL, NODE_COL]).reindex(full_index).reset_index()

    # fill missing values per station
    df_full[feature_cols] = (
        df_full.groupby(NODE_COL)[feature_cols]
               .apply(lambda g: g.ffill().bfill())
               .reset_index(level=0, drop=True)
    )
    df_full[feature_cols] = df_full[feature_cols].fillna(0.0)

    # stack
    T = len(times)
    N = len(stations)
    F_dim = len(feature_cols)
    X = np.zeros((T, N, F_dim), dtype=np.float32)

    for fi, col in enumerate(feature_cols):
        pivot = df_full.pivot(index=TIME_COL, columns=NODE_COL, values=col).reindex(index=times, columns=stations)
        X[:, :, fi] = pivot.values.astype(np.float32)

    return X

# Important: each split should have its own time index
train_times = pd.date_range(ensure_hourly(df_train)[TIME_COL].min(), ensure_hourly(df_train)[TIME_COL].max(), freq="H")
val_times   = pd.date_range(ensure_hourly(df_val)[TIME_COL].min(),   ensure_hourly(df_val)[TIME_COL].max(),   freq="H")
test_times  = pd.date_range(ensure_hourly(df_test)[TIME_COL].min(),  ensure_hourly(df_test)[TIME_COL].max(),  freq="H")

X_train_TNF = build_TNF(df_train, train_times, stations, feature_cols)
X_val_TNF   = build_TNF(df_val,   val_times,   stations, feature_cols)
X_test_TNF  = build_TNF(df_test,  test_times,  stations, feature_cols)

print("Train TNF:", X_train_TNF.shape, "Val TNF:", X_val_TNF.shape, "Test TNF:", X_test_TNF.shape)


 Feature columns: ['total_flow', 'avg_speed']


  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  train_times = pd.date_range(ensure_hourly(df_train)[TIME_COL].min(), ensure_hourly(df_train)[TIME_COL].max(), freq="H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  val_times   = pd.date_range(ensure_hourly(df_val)[TIME_COL].min(),   ensure_hourly(df_val)[TIME_COL].max(),   freq="H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  test_times  = pd.date_range(ensure_hourly(df_test)[TIME_COL].min(),  ensure_hourly(df_test)[TIME_COL].max(),  freq="H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")
  df[TIME_COL] = pd.to_datetime(df[TIME_COL]).dt.floor("H")


Train TNF: (1069, 1766, 2) Val TNF: (325, 1766, 2) Test TNF: (709, 1766, 2)


In [98]:
class StandardScalerTNF:
    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, X):
        # X: (T,N,F)
        self.mean = X.mean(axis=(0,1), keepdims=True)
        self.std = X.std(axis=(0,1), keepdims=True)
        self.std[self.std == 0] = 1.0

    def transform(self, X):
        return (X - self.mean) / self.std

    def inverse_transform_feature(self, Z, fidx=0):
        # Z has feature space scaled; convert that feature back to original units
        return Z * float(self.std[..., fidx]) + float(self.mean[..., fidx])

scaler = StandardScalerTNF()
scaler.fit(X_train_TNF)

X_train = scaler.transform(X_train_TNF).astype(np.float32)
X_val   = scaler.transform(X_val_TNF).astype(np.float32)
X_test  = scaler.transform(X_test_TNF).astype(np.float32)

target_mean = float(scaler.mean[..., 0])
target_std  = float(scaler.std[..., 0])
print(" Target mean/std from TRAIN:", target_mean, target_std)


 Target mean/std from TRAIN: 1005.9266967773438 1343.13134765625


  target_mean = float(scaler.mean[..., 0])
  target_std  = float(scaler.std[..., 0])


In [99]:
class GWNWindowDataset(Dataset):
    def __init__(self, X_tnf, seq_len=24, horizon=72, target_feature_idx=0):
        self.X = X_tnf
        self.seq_len = seq_len
        self.horizon = horizon
        self.target_idx = target_feature_idx
        self.T = X_tnf.shape[0]

        self.valid_t = list(range(seq_len, self.T - horizon))

    def __len__(self):
        return len(self.valid_t)

    def __getitem__(self, idx):
        t = self.valid_t[idx]

        x_hist = self.X[t - self.seq_len:t]                 # (seq_len, N, F)
        y_fut  = self.X[t:t + self.horizon, :, self.target_idx]  # (horizon, N)

        # to (F, N, seq_len)
        x = torch.tensor(x_hist, dtype=torch.float32).permute(2, 1, 0).contiguous()
        # to (1, N, horizon)
        y = torch.tensor(y_fut, dtype=torch.float32).permute(1, 0).unsqueeze(0).contiguous()
        return x, y

train_ds = GWNWindowDataset(X_train, seq_len=SEQ_LEN, horizon=HORIZON)
val_ds   = GWNWindowDataset(X_val,   seq_len=SEQ_LEN, horizon=HORIZON)
test_ds  = GWNWindowDataset(X_test,  seq_len=SEQ_LEN, horizon=HORIZON)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

print(" Windows:", len(train_ds), len(val_ds), len(test_ds))


 Windows: 973 229 613


In [100]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# -------------------------
# Diffusion Graph Convolution (Graph WaveNet style)
# -------------------------
class DiffusionGraphConv(nn.Module):
    def __init__(self, in_channels, out_channels, supports_len, order=2):
        super().__init__()
        self.order = order
        self.supports_len = supports_len
        self.mlp = nn.Conv2d(in_channels * (supports_len * order + 1), out_channels, kernel_size=(1, 1))

    def forward(self, x, supports):
        # x: [B, C, N, T]
        out = [x]
        for a in supports:
            x1 = torch.einsum("nm,bcmt->bcnt", a, x)  # [B,C,N,T]
            out.append(x1)
            xk = x1
            for _ in range(2, self.order + 1):
                xk = torch.einsum("nm,bcmt->bcnt", a, xk)
                out.append(xk)
        h = torch.cat(out, dim=1)
        return self.mlp(h)

# -------------------------
# Graph WaveNet Model
# -------------------------
class GraphWaveNet(nn.Module):
    def __init__(
        self,
        num_nodes,
        in_dim,
        out_dim=1,
        horizon=72,
        blocks=4,
        layers=2,
        kernel_size=2,
        dropout=0.3,
        residual_channels=32,
        dilation_channels=32,
        skip_channels=256,
        end_channels=512,
        supports=None,
        adaptive_adj=True,
        apt_embed_dim=10,
        gcn_bool=True,
        gcn_order=2
    ):
        super().__init__()
        self.num_nodes = num_nodes
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.horizon = horizon
        self.dropout = dropout
        self.blocks = blocks
        self.layers = layers
        self.gcn_bool = gcn_bool
        self.adaptive_adj = adaptive_adj

        self.start_conv = nn.Conv2d(in_dim, residual_channels, kernel_size=(1, 1))

        self.supports = supports if supports is not None else []
        self.supports_len = len(self.supports)

        # Adaptive adjacency
        if adaptive_adj:
            self.nodevec1 = nn.Parameter(torch.randn(num_nodes, apt_embed_dim), requires_grad=True)
            self.nodevec2 = nn.Parameter(torch.randn(apt_embed_dim, num_nodes), requires_grad=True)
            self.supports_len += 1

        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()
        self.bn = nn.ModuleList()
        self.gconv = nn.ModuleList()

        receptive_field = 1
        for _ in range(blocks):
            additional_scope = kernel_size - 1
            new_dilation = 1
            for _ in range(layers):
                # Dilated temporal convolutions (along time axis)
                self.filter_convs.append(
                    nn.Conv2d(residual_channels, dilation_channels,
                              kernel_size=(1, kernel_size), dilation=(1, new_dilation))
                )
                self.gate_convs.append(
                    nn.Conv2d(residual_channels, dilation_channels,
                              kernel_size=(1, kernel_size), dilation=(1, new_dilation))
                )

                self.residual_convs.append(nn.Conv2d(dilation_channels, residual_channels, kernel_size=(1, 1)))
                self.skip_convs.append(nn.Conv2d(dilation_channels, skip_channels, kernel_size=(1, 1)))
                self.bn.append(nn.BatchNorm2d(residual_channels))

                if gcn_bool:
                    self.gconv.append(
                        DiffusionGraphConv(dilation_channels, residual_channels,
                                           supports_len=self.supports_len, order=gcn_order)
                    )

                receptive_field += additional_scope * new_dilation
                new_dilation *= 2

        self.receptive_field = receptive_field
        self.end_conv_1 = nn.Conv2d(skip_channels, end_channels, kernel_size=(1, 1))
        self.end_conv_2 = nn.Conv2d(end_channels, out_dim * horizon, kernel_size=(1, 1))

    def forward(self, x, supports_override=None):
        """
        x: [B, F, N, T]
        returns: [B, out_dim, N, horizon]
        """
        if x.size(-1) < self.receptive_field:
            x = F.pad(x, (self.receptive_field - x.size(-1), 0, 0, 0))

        x = self.start_conv(x)
        skip = None

        # supports: physical + adaptive
        supports = list(self.supports) if supports_override is None else list(supports_override)
        if self.adaptive_adj:
            adp = F.softmax(F.relu(torch.matmul(self.nodevec1, self.nodevec2)), dim=1)  # [N,N]
            supports = supports + [adp]

        for i in range(self.blocks * self.layers):
            residual = x

            # gated temporal conv
            filter_out = torch.tanh(self.filter_convs[i](x))
            gate_out = torch.sigmoid(self.gate_convs[i](x))
            x = filter_out * gate_out
            x = F.dropout(x, self.dropout, training=self.training)

            # skip
            s = self.skip_convs[i](x)
            skip = s if skip is None else (skip[..., -s.size(-1):] + s)

            # graph conv
            if self.gcn_bool:
                x = self.gconv[i](x, supports)
            else:
                x = self.residual_convs[i](x)

            # residual + norm
            x = x + residual[..., -x.size(-1):]
            x = self.bn[i](x)

        x = F.relu(skip)
        x = F.relu(self.end_conv_1(x))
        x = self.end_conv_2(x)  # [B, out_dim*horizon, N, T']
        x = x[:, :, :, -1]      # [B, out_dim*horizon, N]
        x = x.view(x.size(0), self.out_dim, self.horizon, self.num_nodes)
        x = x.permute(0, 1, 3, 2).contiguous()  # [B, out_dim, N, horizon]
        return x

# Instantiate
in_dim = len(feature_cols)
model = GraphWaveNet(
    num_nodes=N,
    in_dim=in_dim,
    out_dim=1,
    horizon=HORIZON,
    supports=supports,
    adaptive_adj=True,
    gcn_bool=True,
    dropout=0.3,
    residual_channels=32,
    dilation_channels=32,
    skip_channels=256,
    end_channels=512
).to(DEVICE)

print(model.__class__.__name__, "initialized.")


GraphWaveNet initialized.


In [None]:
import numpy as np

def mae_loss(pred, true):
    # pred/true: [B,1,N,H]
    return torch.mean(torch.abs(pred - true))

@torch.no_grad()
def evaluate_val_loss(model, loader):
    model.eval()
    losses = []
    for x, y in loader:
        x = x.to(DEVICE)  # [B,F,N,T]
        y = y.to(DEVICE)  # [B,1,N,H]
        pred = model(x)
        losses.append(mae_loss(pred, y).item())
    return float(np.mean(losses)) if losses else np.nan

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

EPOCHS = 25
PATIENCE = 6

best_val = float("inf")
best_state = None
no_improve = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_losses = []

    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()
        pred = model(x)
        loss = mae_loss(pred, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        train_losses.append(loss.item())

    train_loss = float(np.mean(train_losses))
    val_loss = evaluate_val_loss(model, val_loader)

    print(f"Epoch {epoch:02d}/{EPOCHS} | Train MAE(z): {train_loss:.4f} | Val MAE(z): {val_loss:.4f}")

    if val_loss < best_val - 1e-6:
        best_val = val_loss
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
        print("   Best model updated.")
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("   Early stopping.")
            break

# Load best
model.load_state_dict(best_state)
model.to(DEVICE)
print("Loaded best model with Val MAE(z):", best_val)

# Save checkpoint
torch.save(model.state_dict(), "gwn_best.pth")
print("Saved: gwn_best.pth")


In [None]:
from math import sqrt

@torch.no_grad()
def predict_full_horizon(model, loader):
    """
    Returns pred_z, true_z in scaled space:
      pred_z, true_z shape: [num_samples, 1, N, H]
    """
    model.eval()
    preds, trues = [], []
    for x, y in loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        pred = model(x)  # [B,1,N,H]
        preds.append(pred.detach().cpu().numpy())
        trues.append(y.detach().cpu().numpy())
    return np.concatenate(preds, axis=0), np.concatenate(trues, axis=0)

def to_real_units(z):
    # z is scaled flow; convert back to veh/hr using train mean/std for the target feature
    return z * target_std + target_mean

def mae_rmse(pred, true):
    err = pred - true
    mae = float(np.mean(np.abs(err)))
    rmse = float(np.sqrt(np.mean(err**2)))
    return mae, rmse

def report_metrics(pred_z, true_z, label):
    pred_real = to_real_units(pred_z)
    true_real = to_real_units(true_z)

    print(f"\n=== {label} metrics (veh/hr) ===")
    for h in EVAL_HORIZONS:
        k = h - 1  # 12h -> index 11
        p = pred_real[:, 0, :, k]  # [S,N]
        t = true_real[:, 0, :, k]
        m, r = mae_rmse(p, t)
        print(f"Horizon {h:>2}h | MAE: {m:.3f} | RMSE: {r:.3f}")

# VAL
val_pred_z, val_true_z = predict_full_horizon(model, val_loader)
report_metrics(val_pred_z, val_true_z, "VALIDATION")

# TEST
test_pred_z, test_true_z = predict_full_horizon(model, test_loader)
report_metrics(test_pred_z, test_true_z, "TEST")


In [None]:
import pandas as pd

# Save arrays for plots/thesis figures
np.save("gwn_val_pred_real.npy", to_real_units(val_pred_z))
np.save("gwn_val_true_real.npy", to_real_units(val_true_z))
np.save("gwn_test_pred_real.npy", to_real_units(test_pred_z))
np.save("gwn_test_true_real.npy", to_real_units(test_true_z))
print("Saved prediction arrays for VAL/TEST (real units).")

# Build a results table for TEST
rows = []
pred_real = to_real_units(test_pred_z)
true_real = to_real_units(test_true_z)

for h in EVAL_HORIZONS:
    k = h - 1
    p = pred_real[:, 0, :, k]
    t = true_real[:, 0, :, k]
    m, r = mae_rmse(p, t)
    rows.append({"Model": "GraphWaveNet", "Horizon_hours": h, "MAE": m, "RMSE": r})

gwn_results_df = pd.DataFrame(rows)
print(gwn_results_df)


###########################

In [None]:
BASE_HORIZON = 12   # train only for 12h
EVAL_HORIZONS = [12, 24, 48, 72]
SEQ_LEN = 24

In [None]:
class GWNWindowDataset_12(Dataset):
    def __init__(self, X_tnf, seq_len=24, horizon=12, target_feature_idx=0):
        self.X = X_tnf
        self.seq_len = seq_len
        self.horizon = horizon
        self.target_idx = target_feature_idx
        self.T = X_tnf.shape[0]
        self.valid_t = list(range(seq_len, self.T - horizon))

    def __len__(self):
        return len(self.valid_t)

    def __getitem__(self, idx):
        t = self.valid_t[idx]
        x_hist = self.X[t - self.seq_len:t]                        # (seq_len, N, F)
        y_fut  = self.X[t:t + self.horizon, :, self.target_idx]    # (horizon, N)

        x = torch.tensor(x_hist, dtype=torch.float32).permute(2, 1, 0).contiguous()  # (F,N,T)
        y = torch.tensor(y_fut, dtype=torch.float32).permute(1, 0).unsqueeze(0).contiguous()  # (1,N,H)

        return x, y

train_ds = GWNWindowDataset_12(X_train, seq_len=SEQ_LEN, horizon=BASE_HORIZON)
val_ds   = GWNWindowDataset_12(X_val,   seq_len=SEQ_LEN, horizon=BASE_HORIZON)
test_ds  = GWNWindowDataset_12(X_test,  seq_len=SEQ_LEN, horizon=BASE_HORIZON)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

print("Windows:", len(train_ds), len(val_ds), len(test_ds))


In [None]:
in_dim = len(feature_cols)

model = GraphWaveNet(
    num_nodes=N,
    in_dim=in_dim,
    out_dim=1,
    horizon=BASE_HORIZON,  # <-- KEY CHANGE
    supports=supports,
    adaptive_adj=True,
    gcn_bool=True,
    dropout=0.3,
    residual_channels=32,
    dilation_channels=32,
    skip_channels=256,
    end_channels=512
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)


In [None]:
# =========================
#  Train Graph WaveNet (+12h only) with Val monitoring + Best checkpoint
# Assumes you already have:
#   model, train_loader, val_loader, optimizer, DEVICE
# and BASE_HORIZON=12 was used in the dataset/model (so y shape is [B,1,N,12])
# =========================

import numpy as np
import torch

def mae_loss(pred, true):
    # pred/true: [B,1,N,12]
    return torch.mean(torch.abs(pred - true))

@torch.no_grad()
def evaluate_val_mae(model, loader):
    model.eval()
    losses = []
    for x, y in loader:
        x = x.to(DEVICE)  # [B,F,N,T]
        y = y.to(DEVICE)  # [B,1,N,12]
        pred = model(x)   # [B,1,N,12]
        losses.append(mae_loss(pred, y).item())
    return float(np.mean(losses)) if losses else np.nan

# ---- training config ----
EPOCHS = 25
PATIENCE = 6
CLIP_NORM = 5.0
SAVE_PATH = "gwn_best_12h.pth"

best_val = float("inf")
best_state = None
no_improve = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_losses = []

    for x, y in train_loader:
        x = x.to(DEVICE)  # [B,F,N,SEQ_LEN]
        y = y.to(DEVICE)  # [B,1,N,12]

        optimizer.zero_grad()
        pred = model(x)           # [B,1,N,12]
        loss = mae_loss(pred, y)  # scalar
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
        optimizer.step()

        train_losses.append(loss.item())

    train_mae = float(np.mean(train_losses))
    val_mae = evaluate_val_mae(model, val_loader)

    print(f"Epoch {epoch:02d}/{EPOCHS} | Train MAE(z): {train_mae:.4f} | Val MAE(z): {val_mae:.4f}")

    # ---- keep best by Val MAE ----
    if val_mae < best_val - 1e-6:
        best_val = val_mae
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
        print("   Best model updated.")
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("   Early stopping.")
            break

# ---- load best + save ----
if best_state is not None:
    model.load_state_dict(best_state)
    model.to(DEVICE)
    torch.save(model.state_dict(), SAVE_PATH)
    print(f" Loaded + saved best model: {SAVE_PATH} | Best Val MAE(z): {best_val:.4f}")
else:
    print(" No best_state saved (unexpected). Check training/validation loaders.")


In [None]:
@torch.no_grad()
def gwn_recursive_rollout(model, x0, steps, flow_feature_index=0):
    """
    model: trained for BASE_HORIZON=12
    x0: [B, F, N, SEQ_LEN] in scaled space
    steps: number of 12-hour blocks to roll (1=12h, 2=24h, 4=48h, 6=72h)
    Returns: final prediction for the last block at its LAST step: [B, N]
    """
    model.eval()
    x = x0.clone()  # [B,F,N,T]

    last_block_pred = None

    for _ in range(steps):
        y_block = model(x)             # [B,1,N,12]  (scaled)
        last_block_pred = y_block[:, 0, :, -1]  # take the last step of the 12h block -> [B,N]

        # Roll the input window forward by 12 hours:
        # shift left by 12 and append 12 new "future" steps.
        shift = BASE_HORIZON
        B, Fch, Nn, T = x.shape

        if shift >= T:
            raise ValueError("BASE_HORIZON must be smaller than SEQ_LEN (24).")

        # shift existing history left
        x[:, :, :, :-shift] = x[:, :, :, shift:]

        # fill the last 12 positions
        # We ONLY update total_flow channel. Other channels stay constant (approx).
        for tfill in range(shift):
            x[:, flow_feature_index, :, T - shift + tfill] = last_block_pred

    return last_block_pred  # [B,N] scaled


In [None]:
def true_labels_from_series(X_tnf_scaled, seq_len, horizon, flow_idx=0):
    """
    X_tnf_scaled: (T,N,F)
    return: (num_samples, N) for the horizon at t + horizon
    aligned with dataset indexing (t from seq_len ... T-horizon-1)
    """
    y_series = X_tnf_scaled[:, :, flow_idx]  # (T,N)
    ys = []
    for t in range(seq_len, X_tnf_scaled.shape[0] - horizon):
        ys.append(y_series[t + horizon - 1])
    return np.stack(ys, axis=0).astype(np.float32)

def inverse_units(z):
    return z * target_std + target_mean


In [None]:
from math import sqrt

def mae_rmse_np(p, t):
    err = p - t
    mae = float(np.mean(np.abs(err)))
    rmse = float(np.sqrt(np.mean(err**2)))
    return mae, rmse

@torch.no_grad()
def eval_recursive(model, loader, X_tnf_scaled, label="TEST", flow_feature_index=0):
    print(f"\n=== {label} Recursive Evaluation (veh/hr) ===")

    # true labels for each horizon
    true_dict = {h: true_labels_from_series(X_tnf_scaled, SEQ_LEN, h, flow_idx=0) for h in EVAL_HORIZONS}

    # collect predictions per horizon
    pred_dict = {h: [] for h in EVAL_HORIZONS}

    sample_cursor = 0  # aligns loader batches to true labels

    for x_batch, _ in loader:
        x_batch = x_batch.to(DEVICE)  # [B,F,N,T]
        B = x_batch.shape[0]

        for h in EVAL_HORIZONS:
            steps = h // BASE_HORIZON
            pred_scaled = gwn_recursive_rollout(model, x_batch, steps, flow_feature_index=flow_feature_index)  # [B,N]
            pred_dict[h].append(pred_scaled.detach().cpu().numpy())

        sample_cursor += B

    # compute metrics
    for h in EVAL_HORIZONS:
        pred_scaled_all = np.concatenate(pred_dict[h], axis=0)  # (S,N)
        true_scaled_all = true_dict[h]

        m = min(len(pred_scaled_all), len(true_scaled_all))
        pred_real = inverse_units(pred_scaled_all[:m])
        true_real = inverse_units(true_scaled_all[:m])

        mae, rmse = mae_rmse_np(pred_real, true_real)
        print(f"Horizon {h:>2}h | MAE: {mae:.3f} | RMSE: {rmse:.3f}")


In [None]:
# After training and loading best model:
eval_recursive(model, val_loader, X_val, label="VALIDATION", flow_feature_index=0)
eval_recursive(model, test_loader, X_test, label="TEST", flow_feature_index=0)
