In [2]:
# BASELINE SUITE
# Includes: Persistence, SeasonalNaive (FIXED for t+h), VAR, HistGBR, LSTM(Dropout+EarlyStop)

import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

from statsmodels.tsa.api import VAR

# LSTM 
import tensorflow as tf
from tensorflow.keras import layers, Model

# Config
# Path of the input CSV; it is read once with pd.read_csv in the load section.
DATA_PATH = "dataset_2023_2025.csv"
# Forecast horizons. Targets are built by shifting pm2_5 by -h rows per city,
# and the seasonal-naive baseline interprets h as a number of hours.
HORIZONS = [1, 6, 24]
# Row offsets used for the per-city pm2_5 lag features (columns pm2_5_lag{lag}).
LAGS = [1, 2, 6, 24]
# Seed passed to NumPy, TensorFlow and the HistGBR model.
SEED = 42

# Inclusive upper bounds of the train / validation / test windows (see time_split).
TRAIN_END = "2024-12-31 23:00:00"
VAL_END   = "2025-06-30 23:00:00"
TEST_END  = "2025-11-23 23:00:00"

# LSTM sequence settings
# BATCH is used for both model.fit and model.predict; EPOCHS is an upper bound
# because training runs with an EarlyStopping callback; DROPOUT is shared by the
# LSTM input dropout and every Dropout layer; LR is the Adam learning rate.
SEQ_LEN = 24   # 24 hours history
BATCH = 256
EPOCHS = 30
DROPOUT = 0.20
LR = 1e-3

# Seed the global NumPy and TensorFlow random generators.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# -----------------------------
# Utils
# -----------------------------
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def make_lag_features(df, lags):
    """Return a city/time-sorted copy of ``df`` with per-city pm2_5 lag columns.

    For every ``lag`` in ``lags`` a column ``pm2_5_lag{lag}`` is added that holds
    the ``pm2_5`` value ``lag`` rows earlier within the same ``city_id``.
    The shift is row-based, so it equals a time lag only when each city's rows
    are evenly spaced. The input frame is not modified.
    """
    ordered = df.sort_values(['city_id', 'datetime'])
    pm25_by_city = ordered.groupby('city_id')['pm2_5']
    lag_columns = {f'pm2_5_lag{k}': pm25_by_city.shift(k) for k in lags}
    return ordered.assign(**lag_columns)

def make_multi_horizon_targets(df, horizons):
    """Return a city/time-sorted copy of ``df`` with future-value columns.

    For every ``h`` in ``horizons`` two columns are added, both shifted ``h``
    rows ahead within each ``city_id``:

    * ``y_h{h}``    -- future ``pm2_5`` (the regression target)
    * ``pm10_h{h}`` -- future ``pm10`` (constraint check only)

    The input frame is not modified.
    """
    ordered = df.sort_values(['city_id', 'datetime'])
    by_city = ordered.groupby('city_id')
    future_columns = {}
    for step in horizons:
        future_columns[f'y_h{step}'] = by_city['pm2_5'].shift(-step)
        future_columns[f'pm10_h{step}'] = by_city['pm10'].shift(-step)
    return ordered.assign(**future_columns)

def time_split(df):
    """Split ``df`` chronologically into (train, val, test) copies.

    Boundaries come from the module constants TRAIN_END, VAL_END and TEST_END;
    each window is open on the left and closed on the right, so a row belongs
    to at most one split. Rows after TEST_END are dropped.
    """
    stamps = df['datetime']
    train_end, val_end, test_end = (
        pd.Timestamp(bound) for bound in (TRAIN_END, VAL_END, TEST_END)
    )

    in_train = stamps <= train_end
    in_val = (stamps > train_end) & (stamps <= val_end)
    in_test = (stamps > val_end) & (stamps <= test_end)

    return df[in_train].copy(), df[in_val].copy(), df[in_test].copy()

def constraint_violation_rate(pred_pm25, true_pm10):
    """Measure how often predicted PM2.5 exceeds the observed PM10.

    Returns a ``(rate, magnitude)`` pair of Python floats: ``rate`` is the
    fraction of elements with ``pred_pm25 > true_pm10`` and ``magnitude`` is
    the mean excess over those elements (0.0 when nothing violates).
    """
    excess = pred_pm25 - true_pm10
    violated = excess > 0
    rate = float(np.mean(violated))
    if not np.any(violated):
        return rate, 0.0
    return rate, float(np.mean(excess[violated]))

# -----------------------------
# Load
# -----------------------------
# Read the raw table and parse the timestamp column.
df = pd.read_csv(DATA_PATH)
df['datetime'] = pd.to_datetime(df['datetime'])

# Columns carried through the pipeline, grouped by role.
keep_cols = (
    ['city_id', 'city_name', 'lat', 'lon', 'datetime']
    + ['pm2_5', 'pm10']  # pm10 is never a model input, only used for the check
    + ['carbon_monoxide', 'nitrogen_dioxide', 'sulphur_dioxide', 'ozone']
    + ['doy_sin', 'doy_cos', 'hour_sin', 'hour_cos']
)
df = df.loc[:, keep_cols].copy()

# Add lag inputs first, then the future targets.
df = make_multi_horizon_targets(make_lag_features(df, LAGS), HORIZONS)

# Keep only rows where every lag, target and future-pm10 value is present.
needed = [f'pm2_5_lag{l}' for l in LAGS]
needed += [f'{prefix}_h{h}' for prefix in ('y', 'pm10') for h in HORIZONS]
df = df.dropna(subset=needed).copy()

train_df, val_df, test_df = time_split(df)

# Report the time span and size of each split.
for _label, _part in (("Train:", train_df), ("Val  :", val_df), ("Test :", test_df)):
    print(_label, _part['datetime'].min(), "->", _part['datetime'].max(), "rows", len(_part))

# -----------------------------
# Feature definitions (PM10 excluded!)
# -----------------------------
# Columns fed to the tabular model (see get_xy / build_tree_pipeline).
# num_features: gas concentrations, cyclic calendar encodings, the pm2_5 lag
# columns generated from LAGS, and the city coordinates.
# cat_features: the city identifier, which the tree pipeline one-hot encodes.
num_features = [
    'carbon_monoxide','nitrogen_dioxide','sulphur_dioxide','ozone',
    'doy_sin','doy_cos','hour_sin','hour_cos',
] + [f'pm2_5_lag{l}' for l in LAGS] + ['lat','lon']
cat_features = ['city_id']

def get_xy(dfi, h):
    """Extract model inputs and horizon-``h`` targets from a split frame.

    Returns ``(X, y, pm10_future)`` where ``X`` is a copy of the
    ``num_features + cat_features`` columns, ``y`` is ``y_h{h}`` as a float
    array and ``pm10_future`` is ``pm10_h{h}`` as a float array (the PM10 value
    at the target time, used only for the constraint check).
    """
    target = dfi[f'y_h{h}'].to_numpy(dtype=float)
    pm10_future = dfi[f'pm10_h{h}'].to_numpy(dtype=float)
    feature_cols = num_features + cat_features
    return dfi[feature_cols].copy(), target, pm10_future

# Baseline 0: Persistence (yhat = pm2_5 at time t)
def predict_persistence(dfi):
    """Return the current ``pm2_5`` of every row as a float array.

    The same values serve as the forecast for every horizon.
    """
    current_pm25 = dfi['pm2_5']
    return current_pm25.to_numpy(dtype=float)

# Baseline 1: Seasonal Naive
def fit_seasonal_naive(train):
    """Build the climatology tables used by the seasonal-naive baseline.

    Returns ``(clim, city_mean)``:

    * ``clim``      -- mean ``pm2_5`` indexed by (city_id, month, hour)
    * ``city_mean`` -- mean ``pm2_5`` indexed by city_id, used as a fallback

    ``train`` itself is left untouched.
    """
    stamps = train['datetime']
    keyed = train.assign(month=stamps.dt.month, hour=stamps.dt.hour)
    clim = keyed.groupby(['city_id', 'month', 'hour'])['pm2_5'].mean()
    city_mean = keyed.groupby('city_id')['pm2_5'].mean()
    return clim, city_mean

def predict_seasonal_naive(dfi, clim, city_mean, h):
    """Predict pm2_5 at ``t + h`` hours from the training climatology.

    Parameters
    ----------
    dfi : DataFrame with ``city_id`` and ``datetime`` columns (time ``t``).
    clim : Series of mean pm2_5 indexed by (city_id, month, hour), as returned
        by ``fit_seasonal_naive``.
    city_mean : Series of mean pm2_5 indexed by city_id; used when the
        (city, month, hour) cell of the *target* time is absent from ``clim``.
    h : forecast horizon in hours.

    Returns
    -------
    numpy float array with one prediction per row of ``dfi``, in row order.

    Raises
    ------
    KeyError if a city is missing from both ``clim`` and ``city_mean``.
    """
    # Month and hour are taken at the target time t+h, not at t.
    target_time = dfi['datetime'] + pd.to_timedelta(h, unit='h')
    cities = dfi['city_id'].tolist()
    months = target_time.dt.month.tolist()
    hours = target_time.dt.hour.tolist()

    # Plain dicts give O(1) hashing per row; per-row ``Series.loc`` on a
    # MultiIndex is far slower on the 100k+ rows this is called with.
    clim_lookup = clim.to_dict()
    city_lookup = city_mean.to_dict()

    pred = np.empty(len(cities), dtype=float)
    for i, key in enumerate(zip(cities, months, hours)):
        value = clim_lookup.get(key)
        pred[i] = city_lookup[key[0]] if value is None else value
    return pred

# Baseline 2: VAR per city on [PM2.5, CO, NO2, SO2, O3]
VAR_VARS = ['pm2_5','carbon_monoxide','nitrogen_dioxide','sulphur_dioxide','ozone']

def fit_var_models(train, maxlags=6):
    """Fit one VAR model per city on the VAR_VARS columns.

    Rows are ordered by ``datetime`` before fitting. Cities with fewer than
    200 training rows are skipped. Returns a dict mapping ``city_id`` to the
    fitted statsmodels results object (lag order fixed at ``maxlags`` because
    ``ic=None``, with a constant trend term).
    """
    chronological = train.sort_values('datetime')
    fitted = {}
    for city, rows in chronological.groupby('city_id'):
        series = rows[VAR_VARS].astype(float).to_numpy()
        if len(series) >= 200:
            fitted[city] = VAR(series).fit(maxlags=maxlags, ic=None, trend='c')
    return fitted

def predict_var_for_split(df_split, df_full, var_models, h):
    """Forecast pm2_5 at ``t + h`` for every row of ``df_split`` with per-city VARs.

    Parameters
    ----------
    df_split : DataFrame of rows at time ``t`` (needs ``city_id``, ``datetime``).
    df_full : DataFrame that supplies each city's history (``city_id``,
        ``datetime`` and the VAR_VARS columns). Within a city the ``datetime``
        values must be unique.
    var_models : dict ``city_id -> fitted VAR results`` from ``fit_var_models``.
    h : number of steps to forecast ahead.

    Returns
    -------
    Float array aligned with the row order of ``df_split``. Entries stay NaN
    when the city has no model, the timestamp is absent from ``df_full``, or
    fewer than ``k_ar`` history rows are available.

    Notes
    -----
    The history window is the last ``k_ar`` *rows* up to and including ``t``;
    it is assumed (not checked) that those rows are consecutive time steps.
    Output slots are addressed by position, so ``df_split`` may carry any
    index, including one with repeated labels.
    """
    out = np.full(len(df_split), np.nan, dtype=float)
    if len(df_split) == 0:
        return out

    split_cities = df_split['city_id'].to_numpy()
    split_times = pd.Index(df_split['datetime'])

    for cid in pd.unique(split_cities):
        res = var_models.get(cid)
        if res is None:
            continue
        k_ar = res.k_ar

        full = df_full.loc[df_full['city_id'] == cid, ['datetime'] + VAR_VARS]
        full = full.sort_values('datetime')
        full_X = full[VAR_VARS].to_numpy(dtype=float)

        # Positions of this city's rows inside df_split / inside its history.
        row_pos = np.flatnonzero(split_cities == cid)
        hist_end = pd.Index(full['datetime']).get_indexer(split_times[row_pos])

        # -1 marks "timestamp not found"; also require k_ar rows of history.
        usable = (hist_end >= 0) & (hist_end - k_ar + 1 >= 0)
        for i, p in zip(row_pos[usable], hist_end[usable]):
            hist = full_X[p - k_ar + 1:p + 1]
            fc = res.forecast(hist, steps=h)
            out[i] = fc[-1, 0]  # last step, PM2.5 column
    return out

# Baseline 3: Fast tree (HistGBR)
def build_tree_pipeline():
    """Build the preprocessing + HistGradientBoostingRegressor pipeline.

    Numeric columns (``num_features``) are standardised and ``city_id``
    (``cat_features``) is one-hot encoded; categories unseen during fit are
    encoded as all zeros.

    The encoder is forced to produce dense output. With the default sparse
    encoder, ColumnTransformer returns a sparse matrix whenever the stacked
    density falls below its ``sparse_threshold`` (0.3 by default), which
    happens once there are enough cities. HistGradientBoostingRegressor
    rejects sparse input, so the pipeline would then fail at fit time.
    """
    pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_features),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                cat_features,
            ),
        ]
    )
    model = HistGradientBoostingRegressor(
        max_depth=8,
        learning_rate=0.08,
        max_iter=400,
        random_state=SEED
    )
    return Pipeline([("pre", pre), ("model", model)])

# Baseline 4: LSTM (Dropout + EarlyStopping)
# Sequence per city: last SEQ_LEN hours of [pm2.5 + chems + time feats]
# Static: lat/lon + city_id one-hot
# SEQ_FEATURES fixes both the columns and their order along the last axis of
# the sequence tensor; the same list is used to fit and apply the sequence scaler.
SEQ_FEATURES = [
    'pm2_5',  # NOTE: uses observed history only up to time t
    'carbon_monoxide','nitrogen_dioxide','sulphur_dioxide','ozone',
    'doy_sin','doy_cos','hour_sin','hour_cos'
]
# STATIC_NUM is only used to size the static input (its length is counted).
# NOTE(review): STATIC_CAT is not referenced in the visible code;
# make_lstm_arrays hard-codes 'lat', 'lon' and 'city_id'.
STATIC_NUM = ['lat','lon']
STATIC_CAT = ['city_id']

def build_seq_index(dfi):
    """Return a city/time-sorted copy of ``dfi`` with a ``pos_in_city`` column.

    ``pos_in_city`` is the 0-based row counter within each ``city_id`` after
    sorting. The input frame is not modified.
    """
    ordered = dfi.sort_values(['city_id', 'datetime'])
    return ordered.assign(pos_in_city=ordered.groupby('city_id').cumcount())

def make_lstm_arrays(df_full, df_split, h, seq_len=24, city_encoder=None):
    """Build LSTM inputs for the rows of ``df_split`` using history from ``df_full``.

    For every row (time ``t``) of ``df_split`` whose timestamp exists in the
    city's history and has at least ``seq_len`` rows up to and including ``t``,
    one sample is produced. Rows that do not qualify are dropped silently.

    Parameters
    ----------
    df_full : DataFrame with ``city_id``, ``datetime`` and the SEQ_FEATURES
        columns (already scaled by the caller). Timestamps must be unique
        within a city.
    df_split : DataFrame with ``city_id``, ``datetime``, ``lat``, ``lon``,
        ``y_h{h}`` and ``pm10_h{h}``.
    h : horizon selecting the target / pm10 columns.
    seq_len : number of history rows per sequence (the window ends at ``t``).
    city_encoder : optional fitted one-hot encoder for ``city_id``. When
        omitted, a new encoder is fit on the module-level ``train_df`` (the
        original behaviour); pass one in to avoid refitting on every call.

    Returns
    -------
    X_seq : float32 array, shape (n, seq_len, len(SEQ_FEATURES))
    X_static : float32 array, shape (n, 2 + n_city_columns) -- lat, lon, one-hot
    y : float32 array, shape (n,)
    pm10_true : float32 array, shape (n,)
    meta : DataFrame with ``city_id`` and ``datetime`` for each sample

    Samples are ordered by city, then time. If no row qualifies, correctly
    shaped empty arrays are returned instead of raising.
    """
    df_full = df_full.sort_values(['city_id', 'datetime'])
    df_split = df_split.sort_values(['city_id', 'datetime'])

    if city_encoder is None:
        city_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        city_encoder.fit(train_df[['city_id']])

    # Offsets of the window relative to its last row: -(seq_len-1) .. 0.
    window = np.arange(-seq_len + 1, 1)

    seq_parts, stat_parts, y_parts, pm10_parts, meta_parts = [], [], [], [], []

    for cid, g in df_split.groupby('city_id'):
        full_c = df_full[df_full['city_id'] == cid]
        full_X = full_c[SEQ_FEATURES].to_numpy(dtype=float)

        # Position of each split timestamp in the city's history (-1 = absent).
        end_pos = pd.Index(full_c['datetime']).get_indexer(pd.Index(g['datetime']))
        keep = (end_pos >= 0) & (end_pos - seq_len + 1 >= 0)
        if not keep.any():
            continue

        # Gather all windows at once: (n_kept, seq_len, n_features).
        seq_parts.append(full_X[end_pos[keep][:, None] + window[None, :]])

        # Static vector is identical for every sample of the city.
        lat = float(g['lat'].iloc[0])
        lon = float(g['lon'].iloc[0])
        cid_ohe = city_encoder.transform(pd.DataFrame({'city_id': [cid]}))[0]
        stat_vec = np.concatenate([[lat, lon], cid_ohe], axis=0)
        stat_parts.append(np.tile(stat_vec, (int(keep.sum()), 1)))

        # Targets are already stored on the split rows.
        y_parts.append(g[f'y_h{h}'].to_numpy(dtype=float)[keep])
        pm10_parts.append(g[f'pm10_h{h}'].to_numpy(dtype=float)[keep])
        meta_parts.append(g.loc[keep, ['city_id', 'datetime']])

    if not seq_parts:
        n_static = 2 + len(city_encoder.get_feature_names_out())
        empty_meta = df_split[['city_id', 'datetime']].iloc[:0].reset_index(drop=True)
        return (
            np.empty((0, seq_len, len(SEQ_FEATURES)), dtype=np.float32),
            np.empty((0, n_static), dtype=np.float32),
            np.empty(0, dtype=np.float32),
            np.empty(0, dtype=np.float32),
            empty_meta,
        )

    X_seq = np.concatenate(seq_parts).astype(np.float32)
    X_stat = np.concatenate(stat_parts).astype(np.float32)
    y = np.concatenate(y_parts).astype(np.float32)
    pm10_true = np.concatenate(pm10_parts).astype(np.float32)
    meta = pd.concat(meta_parts, ignore_index=True)
    return X_seq, X_stat, y, pm10_true, meta

def build_lstm_model(seq_len, n_seq_feat, n_static_feat):
    """Build and compile the two-input LSTM regressor.

    Inputs are named ``"seq"`` (shape ``(seq_len, n_seq_feat)``) and
    ``"static"`` (shape ``(n_static_feat,)``); the single output is named
    ``"y"``. The model is compiled with Adam(LR) and MAE loss.

    Layers are created in a fixed order; keep that order when editing, since
    it may affect seeded weight initialisation and auto-generated layer names.
    """
    # Sequence branch: LSTM(64) -> Dense(64) -> Dropout.
    inp_seq = layers.Input(shape=(seq_len, n_seq_feat), name="seq")
    # NOTE(review): Masking uses the Keras default mask value; the sequences
    # built in this file are never padded, so this presumably has no effect.
    x = layers.Masking()(inp_seq)
    x = layers.LSTM(64, return_sequences=False, dropout=DROPOUT, recurrent_dropout=0.0)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(DROPOUT)(x)

    # Static branch: Dense(32) -> Dropout.
    inp_stat = layers.Input(shape=(n_static_feat,), name="static")
    s = layers.Dense(32, activation="relu")(inp_stat)
    s = layers.Dropout(DROPOUT)(s)

    # Fuse both branches and regress a single value.
    z = layers.Concatenate()([x, s])
    z = layers.Dense(64, activation="relu")(z)
    z = layers.Dropout(DROPOUT)(z)
    out = layers.Dense(1, name="y")(z)

    model = Model([inp_seq, inp_stat], out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(LR),
        loss="mae"
    )
    return model

# Run baselines and SAVE predictions
pred_rows = []

def add_preds(model_name, split_name, h, meta_df, y_true, y_pred, pm10_true):
    """Append one long-format prediction frame to the global ``pred_rows`` list.

    ``meta_df`` (city_id / datetime per row) is copied and extended with the
    columns ``split``, ``model``, ``horizon_h``, ``y_true``, ``y_pred`` and
    ``pm10_true``. Nothing is returned and ``meta_df`` is not modified.
    """
    block = meta_df.assign(
        split=split_name,
        model=model_name,
        horizon_h=h,
        y_true=y_true,
        y_pred=y_pred,
        pm10_true=pm10_true,
    )
    pred_rows.append(block)

# Persistence + SeasonalNaive
# The climatology is fit once on the training split and reused for all horizons.
clim, city_mean = fit_seasonal_naive(train_df)

for h in HORIZONS:
    target_col, pm10_col = f'y_h{h}', f'pm10_h{h}'

    # VAL
    meta_val = val_df[['city_id', 'datetime']].copy()
    yv = val_df[target_col].to_numpy(float)
    pm10v = val_df[pm10_col].to_numpy(float)

    persistence_val = predict_persistence(val_df)
    add_preds("Persistence", "val", h, meta_val, yv, persistence_val, pm10v)
    seasonal_val = predict_seasonal_naive(val_df, clim, city_mean, h)
    add_preds("SeasonalNaive", "val", h, meta_val, yv, seasonal_val, pm10v)

    # TEST
    meta_te = test_df[['city_id', 'datetime']].copy()
    yt = test_df[target_col].to_numpy(float)
    pm10t = test_df[pm10_col].to_numpy(float)

    persistence_te = predict_persistence(test_df)
    add_preds("Persistence", "test", h, meta_te, yt, persistence_te, pm10t)
    seasonal_te = predict_seasonal_naive(test_df, clim, city_mean, h)
    add_preds("SeasonalNaive", "test", h, meta_te, yt, seasonal_te, pm10t)

# VAR
# One model per city, fit on the training split only.
var_models = fit_var_models(train_df, maxlags=6)
for h in HORIZONS:
    # VAL -- rows without a VAR forecast fall back to persistence
    yhat_val = predict_var_for_split(val_df, df, var_models, h)
    miss = np.isnan(yhat_val)
    if miss.any():
        yhat_val[miss] = predict_persistence(val_df.iloc[np.flatnonzero(miss)])
    add_preds(
        model_name="VAR(6)",
        split_name="val",
        h=h,
        meta_df=val_df[['city_id', 'datetime']].copy(),
        y_true=val_df[f'y_h{h}'].to_numpy(float),
        y_pred=yhat_val,
        pm10_true=val_df[f'pm10_h{h}'].to_numpy(float),
    )

    # TEST -- same fallback rule
    yhat_test = predict_var_for_split(test_df, df, var_models, h)
    miss = np.isnan(yhat_test)
    if miss.any():
        yhat_test[miss] = predict_persistence(test_df.iloc[np.flatnonzero(miss)])
    add_preds(
        model_name="VAR(6)",
        split_name="test",
        h=h,
        meta_df=test_df[['city_id', 'datetime']].copy(),
        y_true=test_df[f'y_h{h}'].to_numpy(float),
        y_pred=yhat_test,
        pm10_true=test_df[f'pm10_h{h}'].to_numpy(float),
    )

# HistGBR
# A fresh pipeline is trained for every horizon; val/test are only predicted.
for h in HORIZONS:
    X_tr, y_tr, _ = get_xy(train_df, h)
    X_va, y_va, pm10_va = get_xy(val_df, h)
    X_te, y_te, pm10_te = get_xy(test_df, h)

    pipe = build_tree_pipeline()
    pipe.fit(X_tr, y_tr)

    yhat_va, yhat_te = pipe.predict(X_va), pipe.predict(X_te)

    meta_va_tree = val_df[['city_id', 'datetime']].copy()
    add_preds("HistGBR", "val", h, meta_va_tree, y_va, yhat_va, pm10_va)
    meta_te_tree = test_df[['city_id', 'datetime']].copy()
    add_preds("HistGBR", "test", h, meta_te_tree, y_te, yhat_te, pm10_te)

# LSTM
# Scale SEQ features using TRAIN ONLY (leakage-safe)
seq_scaler = StandardScaler()
seq_scaler.fit(train_df[SEQ_FEATURES].to_numpy(float))

def apply_seq_scaler(df_in):
    """Return a copy of ``df_in`` with the SEQ_FEATURES columns standardised.

    Uses the module-level ``seq_scaler`` fit on the training split. Only the
    SEQ_FEATURES columns change; the ``y_h*`` / ``pm10_h*`` columns stay in
    their original units.
    """
    df2 = df_in.copy()
    df2[SEQ_FEATURES] = seq_scaler.transform(df2[SEQ_FEATURES].to_numpy(float))
    return df2

# df_scaled supplies the history windows; the per-split frames supply the
# rows (time t) and their targets.
df_scaled = apply_seq_scaler(df)
train_scaled = apply_seq_scaler(train_df)
val_scaled = apply_seq_scaler(val_df)
test_scaled = apply_seq_scaler(test_df)

# Build static dimension size
# (number of STATIC_NUM columns + width of the city one-hot fit on train)
tmp_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit(train_df[['city_id']])
n_static = len(STATIC_NUM) + tmp_ohe.transform(train_df[['city_id']].iloc[:1]).shape[1]

for h in HORIZONS:
    # Build arrays
    # NOTE(review): the sequence windows are rebuilt for every horizon even
    # though only the target columns depend on h.
    Xs_tr, Xst_tr, y_tr, pm10_tr, meta_tr = make_lstm_arrays(df_scaled, train_scaled, h, seq_len=SEQ_LEN)
    Xs_va, Xst_va, y_va, pm10_va, meta_va = make_lstm_arrays(df_scaled, val_scaled, h, seq_len=SEQ_LEN)
    Xs_te, Xst_te, y_te, pm10_te, meta_te = make_lstm_arrays(df_scaled, test_scaled, h, seq_len=SEQ_LEN)

    # A new model is created and trained from scratch for each horizon.
    model = build_lstm_model(SEQ_LEN, Xs_tr.shape[-1], n_static)

    # Stop when validation loss has not improved for 3 epochs and roll back
    # to the best weights seen.
    cb = [
        tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    ]

    # Dict keys match the Input layer names defined in build_lstm_model.
    model.fit(
        {"seq": Xs_tr, "static": Xst_tr}, y_tr,
        validation_data=({"seq": Xs_va, "static": Xst_va}, y_va),
        epochs=EPOCHS, batch_size=BATCH, callbacks=cb, verbose=1
    )

    # Flatten predictions to 1-D so they line up with y / meta.
    yhat_va = model.predict({"seq": Xs_va, "static": Xst_va}, batch_size=BATCH).reshape(-1)
    yhat_te = model.predict({"seq": Xs_te, "static": Xst_te}, batch_size=BATCH).reshape(-1)

    add_preds("LSTM", "val", h, meta_va, y_va, yhat_va, pm10_va)
    add_preds("LSTM", "test", h, meta_te, y_te, yhat_te, pm10_te)

# Save predictions
# Stack every collected frame and order rows deterministically before writing.
pred_df = (
    pd.concat(pred_rows, ignore_index=True)
    .sort_values(['split', 'horizon_h', 'model', 'city_id', 'datetime'])
    .reset_index(drop=True)
)

OUT_PATH = "predictions_baselines.csv"
pred_df.to_csv(OUT_PATH, index=False)

n_models = pred_df['model'].nunique()
print(f"\nSaved: {OUT_PATH}  rows={len(pred_df)}  models={n_models}")

# Quick sanity summary on TEST
def summarize(pred_df, split="test"):
    """Print MAE / RMSE / R2 and PM2.5<=PM10 violation stats per horizon and model.

    Parameters
    ----------
    pred_df : long-format predictions with columns ``split``, ``horizon_h``,
        ``model``, ``y_true``, ``y_pred`` and ``pm10_true``.
    split : which split to report (matched against the ``split`` column).

    Horizons are printed in ascending order and models in order of first
    appearance. The model column is padded to the longest model name (at
    least 12 characters) so rows stay aligned. A model with no rows at a
    given horizon is skipped rather than passed to the metrics as empty input.
    """
    sub = pred_df[pred_df['split'] == split]
    models = sub['model'].unique()
    width = max([12] + [len(str(m)) for m in models])

    for h in sorted(sub['horizon_h'].unique()):
        print(f"\n=== {split.upper()} horizon h={h} ===")
        at_h = sub[sub['horizon_h'] == h]
        for m in models:
            s = at_h[at_h['model'] == m]
            if s.empty:
                continue
            y = s['y_true'].to_numpy(float)
            yhat = s['y_pred'].to_numpy(float)
            pm10 = s['pm10_true'].to_numpy(float)
            mae = mean_absolute_error(y, yhat)
            r = rmse(y, yhat)
            r2 = r2_score(y, yhat)
            vr, vm = constraint_violation_rate(yhat, pm10)
            print(f"{m:{width}s} | MAE {mae:7.3f} RMSE {r:7.3f} R2 {r2:6.3f} | viol {100*vr:5.2f}% (mean {vm:6.3f})")

summarize(pred_df, "test")

Train: 2023-01-02 00:00:00 -> 2024-12-31 23:00:00 rows 508080
Val  : 2025-01-01 00:00:00 -> 2025-06-30 23:00:00 rows 125976
Test : 2025-07-01 00:00:00 -> 2025-11-22 23:00:00 rows 100920
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

Saved: predictions_baselines.csv  rows=3403440  models=5

=== TEST horizon h=1 ===
HistGBR      | MAE   3.454 RMSE   5.770 R2  0.943 | viol 33.49% (mean  3.460)
LSTM         | MAE   4.270 RMSE   6.064 R2  0.937 | viol 47.31% (mean  2.671)
Persistence  | MAE   2.757 RMSE   5.044 R2  0.956 | viol 28.20% (mean  3.299)
SeasonalNaive | MAE  12.214 RMSE  17.205 R2  0.493 | viol 51.50% (mean 10.441)
VAR(6)       | MAE   2.079 RMSE   3.791 R2  0.975 | viol 29.85% (mean  2.284)

=== TEST horizon h=6 ===
HistGBR      | MAE   8.188 RMSE  12.361 R2  0.741 | viol 45.34% (mean  7.112)
LSTM         | MAE   7.518 RMSE  11.298 R2  0.784 | viol 50.67% (mean 