## 03 – Point models

Evaluate naive baselines and train tree-based models (gradient boosting, random forest); compare MAE/RMSE on a holdout slice.


In [None]:
# Ensure project root is importable dynamically
import sys, os
from pathlib import Path

def find_project_root(start: Path | None = None) -> Path:
    if start is None:
        start = Path.cwd()
    for p in [start, *start.parents]:
        if (p / "src").is_dir() and (p / "requirements.txt").exists():
            return p
        if (p / ".git").exists():
            return p
    return start

# Resolve the project root once and put it at the front of sys.path
# (only if it is not already listed) so that `src.*` imports resolve.
PROJECT_ROOT = find_project_root()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit

from pathlib import Path
from src.data import load_time_series, rename_region_columns_to_standard
from src.features import add_calendar_features, add_lagged_load_features, select_feature_columns
from src.models import (
    baseline_persistence,
    baseline_same_hour_last_week,
    train_point_models,
    evaluate_point_models,
)
from src.evaluation import summarize_metrics

# Input locations, all relative to the detected project root.
RAW_PATH = PROJECT_ROOT / "time_series_60min_singleindex.csv"
FEAT_PATH = PROJECT_ROOT / "data" / "processed_features.parquet"
DATETIME_COL = "utc_timestamp"

# Prefer the cached feature table; rebuild it from the raw CSV only when
# the parquet file is missing.
if not FEAT_PATH.exists():
    df = load_time_series(str(RAW_PATH), datetime_col=DATETIME_COL)
    df = rename_region_columns_to_standard(df, region="DE")
    if "load_mw" not in df.columns:
        raise ValueError("Could not find Germany load column; verify source columns.")
    # Calendar features first, then lags/rolling windows of the target;
    # rows left incomplete by the lag/rolling step are dropped.
    eng = add_lagged_load_features(
        add_calendar_features(df),
        target_col="load_mw",
        lag_hours=(1, 24, 168),
        rolling_windows=(24, 168),
    ).dropna()
else:
    eng = pd.read_parquet(FEAT_PATH)

# Split the engineered table into design matrix, target and feature names.
X, y, feature_names = select_feature_columns(eng, target_col="load_mw")
print("Data ready:", X.shape, "features=", len(feature_names))


ValueError: Could not infer datetime column name in CSV.

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


def _mae_rmse(actual, predicted):
    """Return MAE and RMSE of *predicted* against *actual* as plain floats."""
    return {
        "MAE": float(mean_absolute_error(actual, predicted)),
        "RMSE": float(np.sqrt(mean_squared_error(actual, predicted))),
    }


# Chronological holdout: the first 80% of rows train the models, the
# remaining 20% are kept for validation (no shuffling).
split_idx = int(0.8 * len(X))
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_valid, y_valid = X.iloc[split_idx:], y.iloc[split_idx:]

models = train_point_models(X_train, y_train)

# Baselines are computed from the full target history and then cut down
# to the validation rows.
persist_pred = baseline_persistence(y, horizon_hours=1).iloc[split_idx:]
weekly_pred = baseline_same_hour_last_week(y).iloc[split_idx:]

# Keep only rows where both baselines produced a value, and align the
# truth series to that same subset.
mask = persist_pred.notna() & weekly_pred.notna()
persist_pred = persist_pred[mask]
weekly_pred = weekly_pred[mask]
y_valid_baselines = y_valid.loc[mask.index][mask]

# ML models are scored on the full validation slice (already aligned).
results = evaluate_point_models(models, X_valid, y_valid)

# Baselines are scored on the NaN-free subset with the same two metrics.
results["baseline_persistence"] = _mae_rmse(y_valid_baselines, persist_pred)
results["baseline_weekly"] = _mae_rmse(y_valid_baselines, weekly_pred)

summ = summarize_metrics(results)
summ


In [None]:
# Visualize a recent window of predictions vs truth.
# The window is the last 7*24 rows of the validation slice (one week if
# the data is hourly -- TODO confirm the sampling interval).
plot_slice = y_valid.index[-7*24:]

# Slice once and reuse, rather than repeating the .loc lookup per line.
truth_window = y_valid.loc[plot_slice]
features_window = X_valid.loc[plot_slice]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(truth_window.index, truth_window.values, label="truth", lw=1.5)
ax.plot(truth_window.index, models.gradient_boosting.predict(features_window), label="GBR", lw=1.2)
ax.plot(truth_window.index, models.random_forest.predict(features_window), label="RF", lw=1.0)
ax.legend()
ax.set_title("Validation window: predictions vs truth")
ax.set_xlabel("")
fig.tight_layout()

# savefig does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
figures_dir = PROJECT_ROOT / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(str(figures_dir / "point_models_validation_window.png"), dpi=150)
plt.show()
