In [1]:
import sys
sys.version_info

sys.version_info(major=3, minor=7, micro=13, releaselevel='final', serial=0)

# Read dataset and Vanilla RF

In [2]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
x, y = load_iris(return_X_y=True)
print(x.shape, y.shape)

(150, 4) (150,)


In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
print(x_train.shape, x_test.shape)

(105, 4) (45, 4)


In [5]:
rf = RandomForestClassifier(random_state=42)

In [6]:
rf.fit(x_train, y_train)

RandomForestClassifier(random_state=42)

In [7]:
rf.score(x_test, y_test)

0.9555555555555556

# Tuning
https://towardsdatascience.com/optimise-your-hyperparameter-tuning-with-hyperopt-861573239eb5  
https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a

In [8]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [9]:
%%capture
!pip install -U hyperopt
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [10]:
def objective(search_space):
    model = RandomForestClassifier(**search_space, random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [11]:
search_space = {
    'n_estimators': hp.randint('n_estimators', 200, 1000),
    'max_depth': hp.randint('max_depth', 10, 200),               
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),                
    'min_samples_leaf': hp.randint('min_samples_leaf', 1, 10),       
    'criterion': hp.choice('criterion', ['gini', 'entropy']),        
    'max_features': hp.choice('max_features', ['sqrt', 'log2'])
}

# Tree of Parzen Estimators (Bayesian)
algorithm = tpe.suggest
MAX_EVALS = 6 * 20
bayes_trials = Trials()

In [12]:
optimize = fmin(
    fn=objective,
    space=search_space,
    algo=algorithm,
    max_evals=MAX_EVALS,
    trials=bayes_trials,
)

100%|██████████| 120/120 [02:05<00:00,  1.04s/trial, best loss: -0.9555555555555556]


In [13]:
best_params = space_eval(search_space, optimize)
best_params

{'criterion': 'entropy',
 'max_depth': 28,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 0.1607131131788666,
 'n_estimators': 750}

In [14]:
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(x_train, y_train)
best_rf.score(x_test, y_test)

0.9555555555555556

In [15]:
# bayes_trials.results