In [16]:
import pandas as pd
import time

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import optuna 
from optuna.samplers import TPESampler

- Load and prepare the data.

In [3]:
digits = datasets.load_digits()

n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

X_train, X_test, y_train, y_test = train_test_split(data,
    digits.target, test_size=0.25, shuffle=False)

- Define RFC and sample space.

In [5]:
rfc = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100,150,200],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 6, 7]
}

- Perform the tuning using GridSearchCV.

In [8]:
gs = GridSearchCV(estimator=rfc,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=2)

In [9]:
time_start = time.time()
gs.fit(X_train, y_train)
time_grid = time.time() - time_start

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


  warn(


In [11]:
values_grid = [810, gs.best_index_+1, gs.best_score_, time_grid]
columns = ['Number of iterations', 'Iteration Number of Optimal Hyperparamters', 'Score', 'Time Elapsed (s)']
results_grid = pd.DataFrame([values_grid], columns = columns)

- Perform the tuning using RandomizedSearchCV.

In [13]:
rs = RandomizedSearchCV(estimator=rfc,
    param_distributions=param_grid,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=100)

In [14]:
time_start = time.time()
rs.fit(X_train, y_train)
time_random = time.time() - time_start

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(


In [15]:
values_grid = [[100, rs.best_index_+1, rs.best_score_, time_random]]
results_random = pd.DataFrame(values_grid, columns = columns)

- Perform the tuning using Bayesian Optimization.

In [17]:
def objective(trial):
    """return the f1-score"""

    n_estimators = trial.suggest_int('n_estimators', low=100, high=200, step=50)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    min_samples_split = trial.suggest_int('min_samples_split', low=2, high=4, step=1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', low=1, high=5, step=1)
    max_depth = trial.suggest_int('max_depth', low=5, high=7, step=1)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])

    rfc = RandomForestClassifier(n_estimators=n_estimators, 
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        max_features=max_features,
        random_state=42)

    score = cross_val_score(estimator=rfc, 
        X=X_train, 
        y=y_train, 
        scoring='f1_micro',
        cv=5,
        n_jobs=-1).mean()
    
    return score

In [18]:
study = optuna.create_study(sampler=TPESampler(), direction='maximize')

[32m[I 2023-02-07 15:31:33,990][0m A new study created in memory with name: no-name-a1d3ec11-7170-45fb-b184-8e414b0b5d4b[0m


In [19]:
time_start = time.time()
study.optimize(objective, n_trials=100)
time_bayesian = time.time() - time_start

[32m[I 2023-02-07 15:31:43,770][0m Trial 0 finished with value: 0.8945724907063196 and parameters: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.8945724907063196.[0m
[32m[I 2023-02-07 15:31:44,012][0m Trial 1 finished with value: 0.9027536830510808 and parameters: {'n_estimators': 100, 'criterion': 'entropy', 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 5, 'max_features': 'auto'}. Best is trial 1 with value: 0.9027536830510808.[0m
[32m[I 2023-02-07 15:31:44,405][0m Trial 2 finished with value: 0.9064573867547846 and parameters: {'n_estimators': 200, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 5, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9064573867547846.[0m
[32m[I 2023-02-07 15:31:44,774][0m Trial 3 finished with value: 0.9072063885446784 and parameters: {'n_estimators': 200, 'criterion': 'gini', '

In [20]:
values_bayesian = [100, study.best_trial.number, study.best_trial.value, time_bayesian]
results_bayesian = pd.DataFrame([values_bayesian], columns = columns)

- Compare the results.

In [21]:
df = results_grid.append(results_random).append(results_bayesian)
df.index = ['Grid Search', 'Random Search', 'Bayesian Optimization']
df

  df = results_grid.append(results_random).append(results_bayesian)


Unnamed: 0,Number of iterations,Iteration Number of Optimal Hyperparamters,Score,Time Elapsed (s)
Grid Search,810,680,0.935426,146.484093
Random Search,100,72,0.935426,17.360056
Bayesian Optimization,100,60,0.935426,45.013147
