# 02 GARCH Baseline (GARCH(1,1)-t)

Build the benchmark volatility model using an expanding-window GARCH(1,1)-t process.

Key properties:
- One-step-ahead variance forecast
- Strict no-lookahead implementation
- Evaluation with both `MSE` and `QLIKE`
- Rolling out-of-sample metrics using split definitions from `01_data_pipeline.ipynb`


In [None]:
from __future__ import annotations

from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd

# Resolve the project root: use the current working directory when it holds
# `src/`, otherwise fall back to its parent directory.
launch_dir = Path.cwd().resolve()
PROJECT_ROOT = launch_dir if (launch_dir / "src").exists() else launch_dir.parent

# Register the root on sys.path (once) so the `src.*` imports resolve.
root_entry = str(PROJECT_ROOT)
if root_entry not in sys.path:
    sys.path.append(root_entry)

from src.evaluation import evaluate_forecasts
from src.models.garch import rolling_garch_forecast
from src.utils import set_seed

# Seed via the project helper so repeated runs of the notebook are comparable.
set_seed(42)

# Widen pandas' console rendering so wide frames are shown without truncation.
display_options = {
    "display.max_columns": 100,
    "display.width": 130,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [None]:
# Both inputs live in the processed-data directory under the project root.
processed_dir = PROJECT_ROOT / "data" / "processed"
data_path = processed_dir / "sp500_log_returns.csv"
splits_path = processed_dir / "rolling_splits.csv"

# Window-boundary columns of the splits file that are parsed as datetimes.
split_date_cols = [
    "train_start_date",
    "train_end_date",
    "val_start_date",
    "val_end_date",
    "test_start_date",
    "test_end_date",
]

# Load the return series, order it by date, and renumber rows from zero.
returns_raw = pd.read_csv(data_path, parse_dates=["date"])
df = returns_raw.sort_values("date").reset_index(drop=True)

splits_df = pd.read_csv(splits_path, parse_dates=split_date_cols)

print(f"Processed rows: {len(df):,}")
print(f"Rolling splits: {len(splits_df):,}")
df.head()


In [None]:
# Column names and forecast settings for the baseline; the settings are meant
# to stay aligned with how the splits were constructed.
RETURN_COL, TARGET_VAR_COL, PRED_VAR_COL = "log_return", "sq_return", "garch_pred_var"
# NOTE(review): presumably counted in observations (rows of the return
# series) -- confirm against rolling_garch_forecast and the split builder.
MIN_TRAIN_SIZE, REFIT_EVERY = 756, 21

# Run the forecaster over the full return series and store its output
# alongside the inputs in `df`.
garch_settings = {
    "min_train_size": MIN_TRAIN_SIZE,
    "refit_every": REFIT_EVERY,
}
df[PRED_VAR_COL] = rolling_garch_forecast(returns=df[RETURN_COL], **garch_settings)

# Preview the most recent rows: return, target, and prediction side by side.
preview_cols = ["date", RETURN_COL, TARGET_VAR_COL, PRED_VAR_COL]
df[preview_cols].tail()


In [None]:
# Build prediction panel by test windows so this baseline uses the same split protocol as deep models.
# Each split contributes the rows of `df` whose date falls inside its test
# window (both ends inclusive) and that have both a target and a prediction.
prediction_frames = []
for split in splits_df.itertuples(index=False):
    mask = (df["date"] >= split.test_start_date) & (df["date"] <= split.test_end_date)
    panel = df.loc[mask, ["date", TARGET_VAR_COL, PRED_VAR_COL]].copy()
    panel = panel.dropna(subset=[TARGET_VAR_COL, PRED_VAR_COL])

    # Skip windows that yield no usable rows. Appending empty frames would
    # keep `prediction_frames` non-empty and make the guard below pass even
    # when no prediction exists for any test window.
    if panel.empty:
        continue

    panel = panel.rename(
        columns={
            TARGET_VAR_COL: "y_true_var",
            PRED_VAR_COL: "y_pred_var",
        }
    )
    # Descriptor columns identifying the split and the model that produced the rows.
    panel["split_id"] = int(split.split_id)
    panel["variant"] = "baseline"
    panel["architecture"] = "garch11_t"
    panel["train_loss"] = "mse"
    prediction_frames.append(panel)

# Fail loudly when there are no splits or no window produced a single prediction row.
if not prediction_frames:
    raise ValueError("No GARCH predictions were generated for test windows.")

predictions_df = pd.concat(prediction_frames, ignore_index=True)
predictions_df = predictions_df.sort_values(["split_id", "date"]).reset_index(drop=True)
print(f"Prediction rows (split-panel): {len(predictions_df):,}")
predictions_df.head()


In [None]:
# Descriptor columns shared by both aggregation levels.
model_keys = ["variant", "architecture", "train_loss"]

# Metrics grouped per split, then grouped across all splits per model.
metrics_by_split = evaluate_forecasts(predictions_df, group_cols=["split_id", *model_keys])
metrics_overall = evaluate_forecasts(predictions_df, group_cols=list(model_keys))

print("Overall baseline metrics:")
display(metrics_overall)

print("First split-level rows:")
metrics_by_split.head()


In [None]:
# Average per date so each date is drawn once, in case a date appears in more
# than one split's test window.
value_cols = ["y_true_var", "y_pred_var"]
plot_df = predictions_df.groupby("date", as_index=False)[value_cols].mean()

# (column, legend label, line opacity) for each series, in drawing order.
series_specs = (
    ("y_true_var", "Realized Variance (r_t^2)", 0.75),
    ("y_pred_var", "GARCH Predicted Variance", 0.85),
)

fig, ax = plt.subplots(figsize=(12, 4))
dates = plot_df["date"]
for column, legend_label, opacity in series_specs:
    ax.plot(dates, plot_df[column], label=legend_label, alpha=opacity)

ax.set(
    title="GARCH(1,1)-t: Realized vs Predicted Variance",
    xlabel="Date",
    ylabel="Variance",
)
ax.grid(alpha=0.25)
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Output directory for this baseline's artifacts; created if it does not exist.
pred_dir = PROJECT_ROOT / "reports" / "predictions"
pred_dir.mkdir(parents=True, exist_ok=True)

pred_path = pred_dir / "garch_baseline_predictions.csv"
split_metrics_path = pred_dir / "garch_baseline_metrics_by_split.csv"
overall_metrics_path = pred_dir / "garch_baseline_metrics_overall.csv"

# Write every frame first (without the index column), then report the paths.
outputs = (
    (predictions_df, pred_path),
    (metrics_by_split, split_metrics_path),
    (metrics_overall, overall_metrics_path),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

print(
    f"Saved predictions: {pred_path}",
    f"Saved split metrics: {split_metrics_path}",
    f"Saved overall metrics: {overall_metrics_path}",
    sep="\n",
)
