# 03 – Hyperparameter Tuning with Bayesian Optimization
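Use the `bayesian-optimization` package (`bayes_opt`) to tune the hyperparameters of a `RandomForestClassifier` on the credit-card dataset, scoring each candidate by ROC AUC on a held-out validation split.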

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

# Load the raw dataset; the 'Class' column is the target, everything else is a feature.
csv_path = Path('../data/raw/creditcard.csv')
df = pd.read_csv(csv_path)
y = df['Class']
X = df.drop(columns=['Class'])

# Stratified 80/20 hold-out split with a fixed seed so runs are repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# 'Amount' is always scaled; 'Time' is scaled only when the column is present.
cols_to_scale = ['Amount']
if 'Time' in X.columns:
    cols_to_scale.append('Time')

# Fit the scaler on training rows only, then apply the same fitted
# statistics to both splits so no validation data leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_valid[cols_to_scale] = scaler.transform(X_valid[cols_to_scale])

def cv_auc(n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf):
    """Objective function: validation ROC AUC of a random forest.

    Fits a ``RandomForestClassifier`` on the module-level ``X_train`` /
    ``y_train`` and scores it on ``X_valid`` / ``y_valid``.

    NOTE: despite the name, no cross-validation is performed here; the
    score comes from the single hold-out split.

    Args:
        n_estimators: Number of trees; truncated to an integer with ``int()``.
        max_depth: Tree depth; truncated with ``int()``. A truncated value
            of 0 or less is passed to the classifier as ``None``.
        max_features: Value passed as ``max_features`` after being clamped
            into the range [0.1, 1.0].
        min_samples_split: Truncated with ``int()`` before use.
        min_samples_leaf: Truncated with ``int()`` before use.

    Returns:
        ROC AUC computed from the predicted probabilities in column 1 of
        ``predict_proba`` on the validation set.
    """
    from sklearn.ensemble import RandomForestClassifier

    # The caller may supply continuous values, so coerce the
    # integer-valued hyperparameters before building the model.
    tree_count = int(n_estimators)
    depth = int(max_depth)
    split_min = int(min_samples_split)
    leaf_min = int(min_samples_leaf)

    # Keep the feature setting inside [0.1, 1.0].
    feature_fraction = max(min(max_features, 1.0), 0.1)

    forest_kwargs = dict(
        n_estimators=tree_count,
        max_depth=None if depth <= 0 else depth,
        max_features=feature_fraction,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1,
    )
    model = RandomForestClassifier(**forest_kwargs)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, positive_scores)

# Search space: (lower, upper) bounds for every argument of cv_auc.
# The integer-valued hyperparameters are sampled as continuous values
# and truncated inside cv_auc.
pbounds = dict(
    n_estimators=(100, 600),
    max_depth=(4, 20),
    max_features=(0.2, 1.0),
    min_samples_split=(2, 20),
    min_samples_leaf=(1, 10),
)

optimizer = BayesianOptimization(f=cv_auc, pbounds=pbounds, random_state=42, verbose=2)

# Run the search with init_points=8 and n_iter=20.
optimizer.maximize(init_points=8, n_iter=20)

# optimizer.max is indexed with ["params"] in the next cell, so what is
# printed here is the whole best-result record, not only the parameters.
print('Best params:', optimizer.max)

|   iter    |  target   | n_esti... | max_depth | max_fe... | min_sa... | min_sa... |
-------------------------------------------------------------------------------------
| [35m2        [39m | [35m0.9737882[39m | [35m177.99726[39m | [35m4.9293377[39m | [35m0.8929409[39m | [35m12.820070[39m | [35m7.3726532[39m |
| [39m3        [39m | [39m0.9609985[39m | [39m110.29224[39m | [39m19.518557[39m | [39m0.8659541[39m | [39m5.8221039[39m | [39m2.6364247[39m |
| [35m4        [39m | [35m0.9750126[39m | [35m191.70225[39m | [35m8.8678758[39m | [35m0.6198051[39m | [35m9.7750103[39m | [35m3.6210622[39m |
| [35m5        [39m | [35m0.9788734[39m | [35m405.92644[39m | [35m6.2319017[39m | [35m0.4337157[39m | [35m8.5945131[39m | [35m5.1046298[39m |


In [None]:
# Report the best record found by the optimizer (score together with params).
print("Best result:", optimizer.max)

# Pull out just the raw hyperparameter values.
raw_params = optimizer.max["params"]

# Convert to the types used when the forest is built: the integer-valued
# settings are truncated with int(), and a truncated depth of 0 or less
# becomes None.
depth = int(raw_params["max_depth"])
best_params = {
    "n_estimators": int(raw_params["n_estimators"]),
    "max_depth": depth if depth > 0 else None,
    "max_features": float(raw_params["max_features"]),  # kept as a float
    "min_samples_split": int(raw_params["min_samples_split"]),
    "min_samples_leaf": int(raw_params["min_samples_leaf"]),
}
best_params