# 07 — Hyperparameter Optimisation (HPO)

This notebook loads **pre-computed** HPO results (`reports/hpo_results.json`) and
`models/best_params.json` generated by `python -m src.models.hpo`.

**Pipeline recap**
- Expanding-window **time-series CV** (4 folds, non-overlapping validation)
- **XGBoost**: 12 randomised configs × 4 folds, with early stopping
- **LSTM**: 6 randomised configs × 3 folds, 10 epochs max per trial

We visualise:
1. CV fold schematic
2. XGBoost HPO results — mean CV score per config (regression and classification)
3. LSTM HPO learning-rate / hidden-dim landscape
4. Best params summary table

In [None]:
import json, pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
# Silence library warnings so the rendered outputs stay readable.
# NOTE(review): this is a blanket filter and also hides deprecation notices —
# narrow it to specific categories if any of them matter.
warnings.filterwarnings('ignore')

# Artifact locations, relative to the directory the notebook is run from.
REPORTS = pathlib.Path('../reports')
MODELS  = pathlib.Path('../models')
FIGS    = pathlib.Path('../figures')
# parents=True keeps this working if FIGS is later pointed at a nested path.
FIGS.mkdir(parents=True, exist_ok=True)

# Read both artifacts as UTF-8 explicitly instead of relying on the
# platform's locale default, which can differ between machines.
with open(REPORTS / 'hpo_results.json', encoding='utf-8') as f:
    hpo = json.load(f)
with open(MODELS / 'best_params.json', encoding='utf-8') as f:
    best = json.load(f)

# Quick sanity output: total search time and the top-level sections available.
print(f"HPO elapsed: {hpo['elapsed_seconds']:.0f}s")
print(f"Keys: {list(hpo.keys())}")

## 1 — Time-Series CV Fold Schematic

In [None]:
# Schematic of the expanding-window split: the pool is cut into n_splits + 1
# equal chunks; fold k trains on everything before edge k+1 and validates on
# the chunk that follows it.
n_rows = 34000  # CV pool size (hard-coded here — presumably mirrors the HPO run; verify)
n_splits = 4
boundaries = [int(round(i * n_rows / (n_splits + 1))) for i in range(n_splits + 2)]

TRAIN_COLOR, VAL_COLOR = '#3b82f6', '#f97316'

fig, ax = plt.subplots(figsize=(10, 3))
# Pair each training end edge with the edge that closes its validation chunk.
fold_edges = zip(boundaries[1:], boundaries[2:])
for fold_idx, (train_end, val_end) in enumerate(fold_edges):
    is_first = fold_idx == 0
    # Only the first fold carries legend labels so each entry appears once.
    ax.barh(
        fold_idx, train_end,
        left=0, height=0.6, color=TRAIN_COLOR,
        label='Train' if is_first else '',
    )
    ax.barh(
        fold_idx, val_end - train_end,
        left=train_end, height=0.6, color=VAL_COLOR,
        label='Val' if is_first else '',
    )

ax.set_yticks(range(n_splits))
ax.set_yticklabels([f'Fold {i+1}' for i in range(n_splits)])
ax.set_xlabel('Row index (sorted by timestamp_ns)')
ax.set_title('Expanding-Window Time-Series CV')
ax.legend(loc='lower right')
ax.invert_yaxis()  # Fold 1 on top
fig.tight_layout()
fig.savefig(FIGS / 'hpo_cv_folds.png', dpi=150)
plt.show()

## 2 — XGBoost Regression HPO Results

In [None]:
# Build one row per XGBoost regression trial: flattened hyperparameters
# followed by the trial's mean/std score, ordered by ascending mean score.
xgb_reg = hpo['xgb_regression']
xgb_reg_trials = pd.DataFrame(xgb_reg['all_results'])

# Each 'params' entry is a dict; json_normalize spreads its keys into columns.
params_df = pd.json_normalize(xgb_reg_trials['params'])
score_cols = xgb_reg_trials[['mean_score', 'std_score']]
df_xgb_reg = pd.concat([params_df, score_cols], axis=1).sort_values('mean_score')

print(f"Metric: {xgb_reg['metric']}  |  Best: {xgb_reg['best_score']:.4f}")
print(f"Best params: {xgb_reg['best_params']}\n")

# Show the first 12 rows with mean_score shaded by the reversed RdYlGn colormap.
df_xgb_reg.head(12).style.background_gradient(subset=['mean_score'], cmap='RdYlGn_r')

In [None]:
# One horizontal bar per trial, in the row order of df_xgb_reg (sorted by
# mean score). Tick labels are 1-based positions in that sorted order;
# error bars are std_score (presumably the spread across CV folds — confirm).
fig, ax = plt.subplots(figsize=(8, 4))
labels = list(map(str, range(1, len(df_xgb_reg) + 1)))
ax.barh(
    labels,
    df_xgb_reg['mean_score'],
    xerr=df_xgb_reg['std_score'],
    color='#3b82f6',
    alpha=0.8,
)
ax.set(
    xlabel=f"CV Mean {xgb_reg['metric'].upper()}",
    ylabel='Config #',
    title='XGBoost Regression — HPO Configs (sorted)',
)
fig.tight_layout()
fig.savefig(FIGS / 'hpo_xgb_regression.png', dpi=150)
plt.show()

## 3 — XGBoost Classification HPO Results

In [None]:
# XGBoost classification trials: flatten hyperparameters, attach scores,
# order by ascending mean score, then plot one bar per trial.
# NOTE(review): ascending order puts the best trial first only if the metric
# is lower-is-better — confirm against xgb_clf['metric'].
xgb_clf = hpo['xgb_classification']
xgb_clf_trials = pd.DataFrame(xgb_clf['all_results'])
params_df_clf = pd.json_normalize(xgb_clf_trials['params'])
clf_score_cols = xgb_clf_trials[['mean_score', 'std_score']]
df_xgb_clf = pd.concat([params_df_clf, clf_score_cols], axis=1).sort_values('mean_score')

print(f"Metric: {xgb_clf['metric']}  |  Best: {xgb_clf['best_score']:.4f}")
print(f"Best params: {xgb_clf['best_params']}\n")

fig, ax = plt.subplots(figsize=(8, 4))
# Tick labels are 1-based positions in the sorted frame.
labels = list(map(str, range(1, len(df_xgb_clf) + 1)))
ax.barh(
    labels,
    df_xgb_clf['mean_score'],
    xerr=df_xgb_clf['std_score'],
    color='#f97316',
    alpha=0.8,
)
ax.set(
    xlabel=f"CV Mean {xgb_clf['metric'].upper()}",
    ylabel='Config #',
    title='XGBoost Classification — HPO Configs (sorted)',
)
fig.tight_layout()
fig.savefig(FIGS / 'hpo_xgb_classification.png', dpi=150)
plt.show()

## 4 — LSTM HPO Results

In [None]:
# ── LSTM regression ──
# LSTM regression trials: hyperparameter columns followed by the mean/std
# score, ordered by ascending mean score.
lstm_reg = hpo['lstm_regression']
lstm_reg_trials = pd.DataFrame(lstm_reg['all_results'])

# Spread each trial's params dict into its own columns.
lp_reg = pd.json_normalize(lstm_reg_trials['params'])
df_lstm_reg = pd.concat(
    [lp_reg, lstm_reg_trials[['mean_score', 'std_score']]],
    axis=1,
).sort_values('mean_score')

print(f"LSTM Regression — Metric: {lstm_reg['metric']}  |  Best: {lstm_reg['best_score']:.4f}")
print(f"Best params: {lstm_reg['best_params']}\n")
df_lstm_reg

In [None]:
# ── LSTM classification ──
lstm_clf = hpo['lstm_classification']
df_lstm_clf = pd.DataFrame(lstm_clf['all_results'])
lp_clf = pd.json_normalize(df_lstm_clf['params'])
df_lstm_clf = pd.concat([lp_clf, df_lstm_clf[['mean_score', 'std_score']]], axis=1)
df_lstm_clf = df_lstm_clf.sort_values('mean_score')

print(f"LSTM Classification — Metric: {lstm_clf['metric']}  |  Best: {lstm_clf['best_score']:.4f}")
print(f"Best params: {lstm_clf['best_params']}\n")
df_lstm_clf

In [None]:
# LSTM search landscape: each point is one trial, placed at its
# (learning_rate, hidden_dim) and coloured by its mean CV score.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axis, trials frame, panel title) for the two LSTM tasks. The point colour
# comes from the colormap, so no per-panel colour is needed.
panels = [
    (axes[0], df_lstm_reg, 'LSTM Regression (MSE Loss)'),
    (axes[1], df_lstm_clf, 'LSTM Classification (BCE Loss)'),
]

for ax, df_l, title in panels:
    scatter = ax.scatter(
        df_l['learning_rate'], df_l['hidden_dim'],
        s=200, c=df_l['mean_score'], cmap='viridis_r',
        edgecolors='black', linewidths=0.5,
    )
    plt.colorbar(scatter, ax=ax, label='Mean CV Loss')
    # Learning rates are strictly positive and typically span orders of
    # magnitude; a log axis keeps the small values from collapsing together.
    ax.set_xscale('log')
    ax.set_xlabel('Learning Rate')
    ax.set_ylabel('Hidden Dim')
    ax.set_title(title)

plt.tight_layout()
fig.savefig(FIGS / 'hpo_lstm_landscape.png', dpi=150)
plt.show()

## 5 — Best Parameters Summary

In [None]:
def _summary_row(task_key):
    """Return one summary-table record built from the HPO results under task_key."""
    r = hpo[task_key]
    return {
        'Task': task_key,
        'Model': 'XGBoost' if 'xgb' in task_key else 'LSTM',
        'Metric': r['metric'],
        'Best Score': round(r['best_score'], 4),
        'N Configs': r['n_configs'],
        'N Splits': r['n_splits'],
        # Stringified so the whole dict fits in a single table cell.
        'Best Params': str(r['best_params']),
    }


# One row per (model, task) search, in a fixed display order.
summary_rows = [
    _summary_row(task_key)
    for task_key in ('xgb_regression', 'xgb_classification', 'lstm_regression', 'lstm_classification')
]

df_summary = pd.DataFrame(summary_rows)
print(f"Total HPO time: {hpo['elapsed_seconds']:.0f}s\n")
df_summary

In [None]:
# Dump the selected parameters, then list the artifact files with their sizes.
print('best_params.json:')
print(json.dumps(best, indent=2), end='\n\n')  # trailing blank line before the listing
print('Saved artifacts:')
artifacts = (REPORTS / 'hpo_results.json', MODELS / 'best_params.json')
for p in artifacts:
    size_bytes = p.stat().st_size
    print(f'  ✓ {p}  ({size_bytes:,} bytes)')