In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from statsmodels.tsa.statespace.structural import UnobservedComponents as UCM
from sklearn.metrics import mean_absolute_error

plt.style.use("seaborn-v0_8")
DATA_PATH = Path("../data/ts2024.csv")

# --- Cell 1 (UPDATED) ---
# Load the series and put it in chronological order.
# The timestamps are parsed BEFORE sorting: sorting the raw strings would be a
# lexicographic sort, which is only chronological when every value shares one
# zero-padded format (and format="mixed" exists precisely because they may not).
df = (pd.read_csv(DATA_PATH)
        .assign(DateTime=lambda raw: pd.to_datetime(raw["DateTime"],
                                                    format="mixed",
                                                    errors="raise"))
        .sort_values("DateTime")
        .reset_index(drop=True))


In [26]:
# --- Cell 2 (REPLACE with this version) ---
def make_exog(frame: pd.DataFrame, add_holidays=False) -> pd.DataFrame:
    """Build the integer design matrix of calendar dummies for ``frame``.

    Columns (always present, in this order, regardless of which hours or
    weekdays actually occur in ``frame``):
       • 23 hour-of-day dummies ``hr_1`` … ``hr_23``   (hour 0 is the baseline)
       • 6 weekday dummies      ``dow_1`` … ``dow_6``  (Monday=0 is the baseline)
       • optional December-holiday dummies ``xmas_eve``, ``xmas_day``,
         ``boxing``, ``nye`` when ``add_holidays`` is true.

    Parameters
    ----------
    frame : DataFrame with a ``DateTime`` column (datetime64 or parseable strings).
    add_holidays : append the four December-holiday indicator columns.

    Returns
    -------
    DataFrame of 0/1 integers sharing ``frame``'s index.

    Raises
    ------
    ValueError
        If any ``DateTime`` entry is missing or cannot be parsed.
    """
    # ensure DateTime is datetime64 dtype; unparseable values become NaT
    dt = pd.to_datetime(frame["DateTime"], errors="coerce")

    if dt.isnull().any():
        raise ValueError("Some 'DateTime' entries could not be parsed.")

    # Fixed category sets keep the column layout identical for every slice
    # (train / valid / test), even when a slice does not contain every hour or
    # weekday. Without them, get_dummies would derive the columns -- and the
    # dropped baseline -- from whatever values happen to be present.
    hour = pd.Series(pd.Categorical(dt.dt.hour, categories=list(range(24))),
                     index=dt.index)
    weekday = pd.Series(pd.Categorical(dt.dt.dayofweek, categories=list(range(7))),
                        index=dt.index)

    # basic seasonal dummies (both pieces share dt's index, so this is a plain
    # side-by-side concatenation)
    exog = pd.concat(
        [pd.get_dummies(hour, prefix="hr", drop_first=True),
         pd.get_dummies(weekday, prefix="dow", drop_first=True)],
        axis=1,
    )

    if add_holidays:
        # common December holiday dummies, keyed by (month, day)
        specials = {
            "xmas_eve": (12, 24),
            "xmas_day": (12, 25),
            "boxing":   (12, 26),
            "nye":      (12, 31),
        }
        for name, (m, d) in specials.items():
            exog[name] = ((dt.dt.month == m) & (dt.dt.day == d)).astype(int)

    # get_dummies may yield bool columns; the model expects numeric regressors
    return exog.astype(int)


In [27]:
# Sanity check: first timestamps and the column dtype after parsing.
stamps = df["DateTime"]
print(stamps.head(), stamps.dtype, sep="\n")


0   2015-01-01 00:00:00
1   2015-01-01 01:00:00
2   2015-01-01 02:00:00
3   2015-01-01 03:00:00
4   2015-01-01 04:00:00
Name: DateTime, dtype: datetime64[ns]
datetime64[ns]


In [28]:
# --- Cell 3 ---
H = 744  # length (in rows) of both the validation and the test window

# Chronological split: everything before the last 2*H rows is training data,
# followed by H validation rows and finally H test rows.
train, valid, test = df.iloc[:-2*H], df.iloc[-2*H:-H], df.iloc[-H:]
splits = (train, valid, test)

# Variant A: hour + weekday dummies only.
X_train_A, X_valid_A, X_test_A = [make_exog(part, add_holidays=False) for part in splits]

# Variant B: same dummies plus the December-holiday indicators.
X_train_B, X_valid_B, X_test_B = [make_exog(part, add_holidays=True) for part in splits]

print("Design-matrix shapes:",
      "\n A:", *(m.shape for m in (X_train_A, X_valid_A, X_test_A)),
      "\n B:", *(m.shape for m in (X_train_B, X_valid_B, X_test_B)))


Design-matrix shapes: 
 A: (16056, 29) (744, 29) (744, 29) 
 B: (16056, 33) (744, 33) (744, 33)


In [29]:
# Display the loaded frame. In the rendered output the final rows show no X
# value, i.e. the target is missing at the end of the series.
df

Unnamed: 0,DateTime,Date,Hour,X
0,2015-01-01 00:00:00,2015-01-01,0,0.0146
1,2015-01-01 01:00:00,2015-01-01,1,0.0148
2,2015-01-01 02:00:00,2015-01-01,2,0.0101
3,2015-01-01 03:00:00,2015-01-01,3,0.0060
4,2015-01-01 04:00:00,2015-01-01,4,0.0055
...,...,...,...,...
17539,2016-12-31 19:00:00,2016-12-31,19,
17540,2016-12-31 20:00:00,2016-12-31,20,
17541,2016-12-31 21:00:00,2016-12-31,21,
17542,2016-12-31 22:00:00,2016-12-31,22,


In [30]:
# --- Cell 4 ---
def fit_ucm(endog, model_kwargs, exog=None):
    """Build an UnobservedComponents model and return its fitted results.

    ``model_kwargs`` is unpacked into the model constructor; ``exog`` is an
    optional regressor matrix passed alongside ``endog``. Optimizer output is
    silenced (``disp=False``).
    """
    model = UCM(endog, exog=exog, **model_kwargs)
    fitted = model.fit(disp=False)
    return fitted

def forecast_and_score(fit_res, n_periods, exog_future=None, truth=None):
    """Forecast ahead and return ``(forecast, MAE)``.

    Parameters
    ----------
    fit_res : fitted results object exposing ``forecast(steps=..., exog=...)``.
    n_periods : number of steps to forecast.
    exog_future : regressors for the forecast horizon (required when the model
        was fitted with ``exog``), otherwise ``None``.
    truth : optional observed values for the horizon, compared positionally
        with the forecast.

    Returns
    -------
    (pred, mae) where ``pred`` is whatever ``fit_res.forecast`` returns and
    ``mae`` is the mean absolute error over the horizon entries whose truth is
    observed. ``mae`` is ``None`` when ``truth`` is ``None`` or contains no
    observed (non-NaN) value -- e.g. a hold-out window whose targets are not
    published yet.

    Raises
    ------
    ValueError
        If ``truth`` and the forecast have different lengths.
    """
    pred = fit_res.forecast(steps=n_periods, exog=exog_future)
    if truth is None:
        return pred, None

    truth_arr = np.asarray(truth, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)
    if len(truth_arr) != len(pred_arr):
        raise ValueError(
            f"truth has {len(truth_arr)} entries but the forecast has {len(pred_arr)}."
        )

    # Score only where the target is known; the metric itself rejects NaN input.
    observed = ~np.isnan(truth_arr)
    if not observed.any():
        return pred, None

    mae = mean_absolute_error(truth_arr[observed], pred_arr[observed])
    return pred, mae

In [None]:
# --- Cell 5 ---
# Trigonometric seasonal terms handed to the model's ``freq_seasonal`` option.
# Only the daily cycle is active; a weekly term
# ({"period": 168, "harmonics": 10}) is currently disabled.
SEASON_LIST = [dict(period=24, harmonics=12)]  # daily

# Model A specification: local linear trend + the seasonal terms above.
MODEL_A = {"level": "local linear trend", "freq_seasonal": SEASON_LIST}

In [33]:
# --- Cell 6 ---
# Both candidates are fitted on the training target and scored on the
# validation window (H steps ahead).
y_train, y_valid = train["X"], valid["X"]

# Model A: LLT + freq_seasonal, no regressors
fit_A = fit_ucm(y_train, model_kwargs=MODEL_A)
pred_A, mae_A = forecast_and_score(fit_A, n_periods=H, truth=y_valid)

# Model B: identical specification + exog dummies (hour, dow, holidays)
fit_B = fit_ucm(y_train, model_kwargs=MODEL_A, exog=X_train_B)
pred_B, mae_B = forecast_and_score(
    fit_B,
    n_periods=H,
    exog_future=X_valid_B,
    truth=y_valid,
)

print(f"Validation MAE\n  Model A (no dummies): {mae_A:.4f}\n  Model B (with dummies): {mae_B:.4f}")



KeyboardInterrupt: 

In [None]:
# --- Cell 7 ---
# Refit the validation winner on train + validation, then forecast the test window.
trainval = pd.concat([train, valid])
model_kwargs = MODEL_A

# Decide once which candidate won on validation MAE.
use_dummies = mae_B < mae_A
if use_dummies:
    winner = "Model B (with dummies)"
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    use_exog = pd.concat([X_train_B, X_valid_B])
    exog_test = X_test_B
else:
    winner = "Model A (no dummies)"
    use_exog = None
    exog_test = None

# The test window may have no observed target (the frame ends in missing X
# values); only score it when every target is present, since the MAE metric
# rejects NaN input.
test_truth = test["X"] if test["X"].notna().all() else None

final_fit = fit_ucm(trainval["X"], model_kwargs=model_kwargs, exog=use_exog)
test_pred, test_mae = forecast_and_score(final_fit, n_periods=H, exog_future=exog_test, truth=test_truth)

mae_text = f"{test_mae:.4f}" if test_mae is not None else "n/a (test targets are missing)"
print(f"Selected model: {winner}\nTest MAE: {mae_text}")

In [None]:
# --- Cell 8 ---
# Overlay the test-window forecast on the actual series.
fig, ax = plt.subplots(figsize=(10, 3))
timestamps = test["DateTime"]
ax.plot(timestamps, test["X"], color="black", lw=0.8, label="Actual")
ax.plot(timestamps, test_pred, color="tab:red", label="Forecast")
ax.set_title(f"{winner} – Forecast on test set")
ax.set_ylabel("X")
ax.legend()
fig.tight_layout()

In [None]:
# --- Cell 9 ---
# Export the test-window forecast next to its timestamps.
out_path = "../data/ts2024_test_forecast_winner.csv"
out = test[["DateTime"]].copy()
# Assign by position: a Series would be aligned on index labels, and any
# mismatch between the forecast's index and the test rows' index would
# silently write NaN. A length mismatch now raises instead.
out["forecast_X"] = np.asarray(test_pred)
out.to_csv(out_path, index=False)
print(f"Saved forecast → {out_path}")