# Hyperparameter Tuning

In [None]:
!pip install -r requirements.txt --quiet

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import (train_test_split, GridSearchCV, 
                                     RandomizedSearchCV, cross_val_score)
from sklearn.metrics import f1_score

## Load and split data

In [None]:
# Load the breast-cancer dataset that ships with scikit-learn and pull out the
# feature matrix (X) and target vector (y).
data = load_breast_cancer()
X, y = data.data, data.target
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

## Model Initialization

In [None]:
from xgboost import XGBClassifier
# Base estimator handed to both the grid search and the random search.
# NOTE(review): use_label_encoder is reported as deprecated in recent XGBoost
# releases — confirm against the version pinned in requirements.txt.
xgb_clf = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='logloss')

## Grid Search Hyperparameter Tuning using scikit-learn

In [None]:
# Candidate values for the exhaustive grid search: three options for each of
# the three tuned parameters.
grid_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1]
}

In [None]:
# Number of parameter combinations in the grid (3 * 3 * 3 = 27).
len(grid_params['n_estimators']) * len(grid_params['max_depth']) * len(grid_params['learning_rate'])

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over every combination in grid_params, scored by F1 under
# 5-fold cross-validation. n_jobs=-1 asks scikit-learn to use all CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_clf, 
    param_grid=grid_params, 
    cv=5, scoring='f1', 
    verbose=1, n_jobs=-1)

In [None]:
# Run the grid search on the training split only; the test split is kept
# aside for the final comparison.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning grid-search parameters and the F1 score that the best
# estimator achieves on the held-out test split.
print("Best Grid Search Params:", grid_search.best_params_)
print("Grid Search F1 Score:", f1_score(y_test, grid_search.best_estimator_.predict(X_test)))

## Random Search Hyperparameter Tuning using scikit-learn

In [None]:
# Discrete candidate pools that the random search samples from:
#   n_estimators  -> 50, 60, ..., 190
#   max_depth     -> 3 through 9
#   learning_rate -> 30 evenly spaced values from 0.01 to 0.3
#   subsample     -> 0.6, 0.8 or 1.0
random_params = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.3, 30),
    'subsample': [0.6, 0.8, 1.0]
}

In [None]:
# Randomized search: evaluates 20 sampled combinations from random_params,
# scored by F1 under 5-fold cross-validation. random_state pins the sampling
# so the same candidates are drawn on every run.
random_search = RandomizedSearchCV(
    estimator=xgb_clf, 
    param_distributions=random_params, 
    n_iter=20, cv=5, scoring='f1', 
    verbose=1, random_state=42, 
    n_jobs=-1)

In [None]:
# Run the randomized search on the training split only.
random_search.fit(X_train, y_train)

In [None]:
# Report the winning random-search parameters and the F1 score that the best
# estimator achieves on the held-out test split.
print("Best Random Search Params:", random_search.best_params_)
print("Random Search F1 score:", f1_score(y_test, 
                                          random_search.best_estimator_.predict(X_test)))


## Bayesian Optimization with Optuna

In [None]:
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

### Define and Run Optuna Study

In [None]:
def objective(trial):
    """Score one Optuna trial by mean cross-validated F1.

    Asks ``trial`` for a value of each tuned hyperparameter, builds an
    ``XGBClassifier`` from those values plus the fixed settings, and evaluates
    it with 5-fold cross-validation on the notebook's ``X_train``/``y_train``.

    Args:
        trial: The Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        The mean of the per-fold F1 scores (higher is better).
    """
    # Hyperparameters explored by the study. The suggest_* calls are kept in
    # the same order as the search space is declared.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }
    # Settings that stay constant across every trial.
    fixed = {
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'n_jobs': -1,
    }
    candidate = XGBClassifier(**searched, **fixed)
    fold_scores = cross_val_score(candidate, X_train, y_train,
                                  cv=5, scoring='f1')
    return fold_scores.mean()

In [None]:
# Seed the TPE sampler so the hyperparameters it suggests are repeatable from
# run to run, in line with the random_state=42 used for the train/test split
# and the randomized search. Without a seed every execution explores a
# different set of trials, which makes the method comparison unstable.
tpe_sampler = TPESampler(seed=42) #Bayesian optimization sampler
# Create a study object that maximizes the value returned by `objective`
# (the mean cross-validated F1 score).
study = optuna.create_study(direction='maximize', 
                            sampler=tpe_sampler)
# Evaluate 30 trials.
study.optimize(objective, n_trials=30)

### Get Best Parameters and Retrain Model

In [None]:
# Hyperparameters of the best trial recorded by the study; the bare name on
# the last line displays them as the cell output.
best_params = study.best_params
best_params

In [None]:
# Rebuild the classifier from the best Optuna parameters, add the same fixed
# settings that `objective` uses, and fit it on the whole training split.
best_model = XGBClassifier(**best_params, 
                           use_label_encoder=False, 
                           eval_metric='logloss', n_jobs=-1)
best_model.fit(X_train, y_train)

### Get best Optuna metrics

In [None]:
# F1 score of the retrained Optuna model on the held-out test split.
optuna_f1 = f1_score(y_test, best_model.predict(X_test))
print("Best Optuna (Bayesian) Params:", best_params)
print(f"Optuna Bayesian F1-Score on Test Set: {optuna_f1:.4f}")

### Visualize Optuna process

In [None]:
import optuna.visualization as vis
# Show Optuna's optimization-history figure for the study.
vis.plot_optimization_history(study).show()

In [None]:
# Show Optuna's hyperparameter-importance figure for the study.
vis.plot_param_importances(study).show()

## Evaluate methods

In [None]:
import pandas as pd

# Test-set F1 for each tuning method, converted to plain Python floats.
grid_f1 = float(f1_score(y_test, 
                         grid_search.best_estimator_.predict(X_test)))
random_f1 = float(f1_score(y_test, 
                           random_search.best_estimator_.predict(X_test)))
# optuna_f1 was computed in an earlier cell; only the type is normalized here.
optuna_f1 = float(optuna_f1)

# One row per tuning method, ready for tabular display and plotting.
hp_results_df = pd.DataFrame([
    {'Method': 'Grid Search', 'F1_Score': grid_f1},
    {'Method': 'Random Search', 'F1_Score': random_f1},
    {'Method': 'Optuna Bayesian', 'F1_Score': optuna_f1}])

In [None]:
# Display the comparison table as the cell output.
hp_results_df

### Visualize comparisons

In [158]:
%%writefile hyperparameter/visualize.py
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_hp_comparison(results_df, 
                       metric_cols, 
                       x_col_name='Method',
                       figsize=(12, 6),
                       x_col_fontsize=12,
                       title='Hyperparameter Tuning Comparison',
                       title_fontsize=14,
                       label_fontsize=10,
                       label_color='black',
                       label_position='above',
                       x_label_rotation=30,
                       show_legend=True,
                       **barplot_kwargs):
    """Draw a labelled bar chart comparing metrics across tuning methods.

    All arguments are validated before any matplotlib call, so an invalid
    argument raises immediately and does not leave an empty figure behind.

    Args:
        results_df: DataFrame with one row per method. It must contain
            ``x_col_name`` and every column named in ``metric_cols``.
        metric_cols: A column name, or a sequence of column names, to plot.
            With more than one metric the bars are grouped per method.
        x_col_name: Column whose values label the x axis.
        figsize: Figure size passed to ``plt.figure``.
        x_col_fontsize: Font size of the x-axis label.
        title: Figure title.
        title_fontsize: Font size of the title.
        label_fontsize: Font size of the value labels drawn on the bars.
        label_color: Color of the value labels.
        label_position: ``'above'`` puts the value labels just past the bar
            edge; any value starting with ``'cent'`` (e.g. ``'center'``) puts
            them in the middle of the bar. Case-insensitive.
        x_label_rotation: Rotation in degrees for the x tick labels, or
            ``None`` to leave them untouched.
        show_legend: Whether to show the metric legend. The legend is only
            drawn when more than one metric is plotted.
        **barplot_kwargs: Extra keyword arguments forwarded to
            ``sns.barplot`` (for example ``palette``).

    Raises:
        ValueError: If ``label_position`` is not recognised or
            ``metric_cols`` is empty.
        KeyError: If ``x_col_name`` or any metric column is missing from
            ``results_df``.
    """
    # --- Validate arguments up front, before any figure is created. ---
    position = label_position.lower()
    if position == 'above':
        label_type, padding = 'edge', 3
    elif position.startswith('cent'):
        label_type, padding = 'center', 0
    else:
        raise ValueError("label_position must be either 'above' or 'center'.")

    if isinstance(metric_cols, str):
        metric_cols = [metric_cols]
    else:
        metric_cols = list(metric_cols)
    if not metric_cols:
        raise ValueError("metric_cols must name at least one column.")

    missing = [col for col in (x_col_name, *metric_cols)
               if col not in results_df.columns]
    if missing:
        raise KeyError(f"Columns not found in results_df: {missing}")

    # --- Reshape to long form: one row per (method, metric) pair. ---
    multiple_metrics = len(metric_cols) > 1
    plot_df = results_df.copy()
    if multiple_metrics:
        plot_df = pd.melt(plot_df, id_vars=[x_col_name], value_vars=metric_cols,
                          var_name='Metric', value_name='Value')
    else:
        plot_df['Metric'] = metric_cols[0]
        plot_df = plot_df.rename(columns={metric_cols[0]: 'Value'})

    # --- Draw. ---
    plt.figure(figsize=figsize)

    ax = sns.barplot(data=plot_df, x=x_col_name, y='Value', hue='Metric', 
                     dodge=multiple_metrics, **barplot_kwargs)

    plt.title(title, fontsize=title_fontsize)
    # Leave 15% headroom above the tallest bar for the value labels. Skip the
    # explicit limit when the maximum is not positive (or is NaN), because it
    # would produce an inverted or invalid axis range.
    tallest = plot_df['Value'].max()
    if tallest > 0:
        plt.ylim(0, tallest * 1.15)
    plt.xlabel(x_col_name, fontsize=x_col_fontsize)
    plt.ylabel('Value')

    if x_label_rotation is not None:
        plt.xticks(rotation=x_label_rotation, ha='right')

    # Annotate every bar with its value to four decimal places.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.4f', label_type=label_type,
                     padding=padding, fontsize=label_fontsize,
                     color=label_color)

    if show_legend and multiple_metrics:
        plt.legend(title='Metric')
    else:
        # An empty legend hides the one seaborn adds automatically for `hue`.
        plt.legend([], [], frameon=False)

    plt.tight_layout()
    plt.show()

Overwriting hyperparameter/visualize.py


In [None]:
from hyperparameter.visualize import plot_hp_comparison

In [None]:
# Plot the F1 score of each tuning method, with the value labels drawn in the
# middle of the bars and a grey palette forwarded to seaborn.
plot_hp_comparison(hp_results_df, metric_cols=['F1_Score'],
                   x_col_name='Method', palette='Greys', 
                   label_position='center')