# Model Optimization: Bayesian Search with Optuna

**Objective:** Minimize validation MAE by searching the XGBoost hyperparameter space.
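**Method:** Bayesian Optimization (TPE - Tree-structured Parzen Estimator). Unlike Grid Search, TPE uses the results of previous trials to choose which hyperparameters to try next, so it usually reaches a good minimum in fewer trials (a global minimum is not guaranteed).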

**Target:** `target_points_next_3`

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.metrics import mean_absolute_error
from pathlib import Path

# Robust Path Handling
def resolve_base_dir(cwd):
    """Return the project root for the given working directory.

    When the notebook is launched from the project's ``notebooks`` folder the
    root is its parent; otherwise the working directory itself is treated as
    the root.

    Only the final path component is inspected (case-insensitively). A
    substring test on the whole path would misfire whenever any ancestor
    directory merely contains the word "notebooks" (for example
    ``/home/me/notebooks_archive/fpl``), moving the root one level too high.

    Args:
        cwd: Working directory as a ``Path`` or path-like string.

    Returns:
        ``Path`` of the directory expected to contain ``data/processed``.
    """
    cwd = Path(cwd)
    if cwd.name.lower() == 'notebooks':
        return cwd.parent
    return cwd


current_path = Path.cwd()
BASE_DIR = resolve_base_dir(current_path)
PROCESSED_DIR = BASE_DIR / "data" / "processed"

# Raise Optuna's log threshold to WARNING so messages below that level are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# 1. Load & Prep (Same as Model Lab)
df = pd.read_csv(PROCESSED_DIR / "fpl_features_production.csv")

# Whitelist Preprocessing: keep every numeric column, then add the key columns
# that are present in the file but were not picked up as numeric.
keys = ['season', 'GW', 'element', 'target_points_next_3']
numeric_df = df.select_dtypes(include=[np.number])
model_cols = numeric_df.columns.tolist()
extra_cols = [col for col in keys if col in df.columns and col not in model_cols]
model_cols = model_cols + extra_cols
model_df = df.loc[:, model_cols].copy()

# Validation uses the LAST COMPLETED SEASON (2024-25): this keeps the search
# from overfitting to 2025-26 and leaves enough data to train on.
TRAIN_SEASONS = ['2021-22', '2022-23', '2023-24']
VAL_SEASON = '2024-25'

# Target and identifier columns are removed from the feature matrices.
non_feature_cols = ['target_points_next_3', 'season', 'GW', 'element']

# Filter each split once and derive both X and y from the same rows.
train_rows = model_df[model_df['season'].isin(TRAIN_SEASONS)]
val_rows = model_df[model_df['season'] == VAL_SEASON]

X_train = train_rows.drop(columns=non_feature_cols)
y_train = train_rows['target_points_next_3']

X_val = val_rows.drop(columns=non_feature_cols)
y_val = val_rows['target_points_next_3']

print(f"Optimization Set: Train {len(X_train):,} | Val {len(X_val):,}")

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBoost regressor and return its validation MAE.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    The validation set drives early stopping, pruning and the returned score.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters and
            to report intermediate values for pruning.

    Returns:
        Mean absolute error of the fitted model on the validation set.
    """
    # 1. Define Search Space
    params = {
        'objective': 'reg:absoluteerror',
        'n_estimators': 1000,  # Fixed high number, handled by early stopping
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': -1,
        'random_state': 42,
    }

    # 2. Train with Pruning
    # The observation key is "<eval set name>-<metric>"; the only eval set
    # passed to fit() below is named "validation_0" by XGBoost.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-mae")

    # eval_metric, early_stopping_rounds and callbacks are given to the
    # constructor: passing them to fit() is deprecated and newer XGBoost
    # releases reject them there.
    model = xgb.XGBRegressor(
        **params,
        eval_metric="mae",
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    # Keep the early-stopped tree count with the trial so it can be inspected
    # later; the sampled params alone only carry the fixed n_estimators cap.
    best_iteration = getattr(model, "best_iteration", None)
    if best_iteration is not None:
        trial.set_user_attr("best_iteration", int(best_iteration))

    # 3. Evaluate
    preds = model.predict(X_val)
    return mean_absolute_error(y_val, preds)

In [None]:
# Single source of truth for the trial count, so the banner and the actual
# number of trials cannot drift apart.
N_TRIALS = 50
# Seed the sampler so the sequence of suggested hyperparameters is repeatable;
# the model itself is already seeded (random_state=42) inside objective().
SEARCH_SEED = 42

print(f"Starting Bayesian Optimization ({N_TRIALS} Trials)...")
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print("\n--- OPTIMIZATION COMPLETE ---")
print(f"Best MAE: {study.best_value:.4f}")
print("Best Params:")
print(study.best_params)

In [None]:
# Save best params to config for production use
import json

# Start from the best trial's parameters and restore the fixed settings.
best_params = {
    **study.best_params,
    'n_estimators': 1000,
    'objective': 'reg:absoluteerror',
}

# Create the config directory on first use, then write the JSON file.
config_path = BASE_DIR / "config" / "best_model_params.json"
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(best_params, indent=4))

print(f"Best parameters saved to {config_path}")