# Chapter: 5
## Section: Options for improving model performance

In [None]:
!pip install ray
!pip install "ray[tune]" tune-sklearn
!pip install bayesian-optimization
!pip install scikit-optimize

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
import time
# One seed shared by the data split, the CV splitter and the model so runs are repeatable.
random_state = 42

# Load the digits data and hold out 20% of the samples as a test set.
digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=random_state, test_size=0.2
)

# Candidate values for each hyperparameter; the grid search evaluates every
# combination (5 x 3 = 15 settings).
parameter_grid = {"max_depth": [2, 5, 10, 15, 20], "min_samples_split": [2, 5, 7]}

# Shuffled, stratified 5-fold cross-validation used to score each setting.
stratified_kfold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Time the search with perf_counter: unlike time.time() it is monotonic, so the
# measured duration cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
sklearn_gridsearch = GridSearchCV(
    estimator=RFC(n_estimators=10, random_state=random_state),
    param_grid=parameter_grid,
    cv=stratified_kfold_cv,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
# Fit every hyperparameter combination on the training split.
sklearn_gridsearch.fit(x_train, y_train)
elapsed_seconds = time.perf_counter() - start_time

print("--- %s seconds ---" % elapsed_seconds)
print("Best hyperparameters: {}".format(sklearn_gridsearch.best_params_))
print("Best score: {}".format(sklearn_gridsearch.best_score_))

--- 7.430998802185059 seconds ---
Best hyperparameters: {'max_depth': 10, 'min_samples_split': 7}
Best score: 0.9484949670925282


In [3]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the same hyperparameter values as the grid search above,
# but only n_iter sampled settings are evaluated instead of all combinations.
# perf_counter is monotonic, so the timing is unaffected by system clock changes.
start_time = time.perf_counter()
sklearn_randomsearch = RandomizedSearchCV(
    estimator=RFC(n_estimators=10, random_state=random_state),
    param_distributions=parameter_grid,
    cv=stratified_kfold_cv,
    random_state=random_state,  # makes the sampled settings repeatable
    n_iter=5,  # number of hyperparameter settings to sample
    n_jobs=-1,  # run the fits in parallel on all available cores
)
# Fit the sampled settings with cross-validation on the training split.
sklearn_randomsearch.fit(x_train, y_train)
elapsed_seconds = time.perf_counter() - start_time

print("--- %s seconds ---" % elapsed_seconds)
print("Best hyperparameters: {}".format(sklearn_randomsearch.best_params_))
print("Best score: {}".format(sklearn_randomsearch.best_score_))

--- 2.3692941665649414 seconds ---
Best hyperparameters: {'min_samples_split': 7, 'max_depth': 15}
Best score: 0.942934572202865


In [5]:
# Requires the tune-sklearn package (installed in the first cell); importing it
# directly fails loudly with ImportError if the package is missing.
from tune_sklearn import TuneGridSearchCV

# Grid search over the same hyperparameter grid, executed through Ray Tune.
# perf_counter is monotonic, so the timing is unaffected by system clock changes.
start_time = time.perf_counter()
tune_gridsearch = TuneGridSearchCV(
    RFC(n_estimators=10, random_state=random_state),
    parameter_grid,
    cv=stratified_kfold_cv,
    early_stopping=True,  # terminate unpromising configurations
    max_iters=10,  # maximum number of iterations run for a given hyperparameter set
)
# Fit the grid search with cross-validation on the training split.
tune_gridsearch.fit(x_train, y_train)
elapsed_seconds = time.perf_counter() - start_time

print("--- %s seconds ---" % elapsed_seconds)
print("Best hyperparameters: {}".format(tune_gridsearch.best_params_))
print("Best score: {}".format(tune_gridsearch.best_score_))

2023-07-25 19:39:24,082	INFO worker.py:1621 -- Started a local Ray instance.
2023-07-25 19:39:29,360	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-07-25 19:39:29,379	INFO tune.py:666 -- [output] This will use the new output engine with verbosity 0. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+-------------------------------------------------------------------+
| Configuration for experiment     _Trainable_2023-07-25_19-39-29   |
+-------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator            |
| Scheduler                        AsyncHyperBandScheduler          |
| Number of trials                 15                               |
+-------------------------------------------------------------------+

View detailed results here: /root/ray_results/_Trainable_2023-07-25_19-39-29
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/_Trainable_2023-07-25_19-39-29`

--- 30.83316421508789 seconds ---
Best hyperparameters: {'max_depth': 10, 'min_samples_split': 7, 'n_estimators': 10}
Best score: 0.948494967092528


In [6]:
# Requires tune-sklearn plus bayesian-optimization and scikit-optimize (installed
# in the first cell); importing tune_sklearn directly fails loudly if it is missing.
from tune_sklearn import TuneSearchCV

# Bayesian-optimization search over the same hyperparameter values, run through Ray Tune.
# perf_counter is monotonic, so the timing is unaffected by system clock changes.
start_time = time.perf_counter()
tune_bayessearch = TuneSearchCV(
    RFC(n_estimators=10, random_state=random_state),
    parameter_grid,
    search_optimization="bayesian",
    cv=stratified_kfold_cv,
    n_trials=3,  # number of parameter settings that are sampled
    early_stopping=True,  # terminate unpromising configurations
    max_iters=10,  # maximum number of iterations run for a given hyperparameter set
    random_state=random_state,
)
# Fit the sampled settings with cross-validation on the training split.
tune_bayessearch.fit(x_train, y_train)
elapsed_seconds = time.perf_counter() - start_time

print("--- %s seconds ---" % elapsed_seconds)
print("Best hyperparameters: {}".format(tune_bayessearch.best_params_))
print("Best score: {}".format(tune_bayessearch.best_score_))

2023-07-25 19:39:47,796	INFO tune.py:666 -- [output] This will use the new output engine with verbosity 0. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+-------------------------------------------------------------------+
| Configuration for experiment     _Trainable_2023-07-25_19-39-47   |
+-------------------------------------------------------------------+
| Search algorithm                 SearchGenerator                  |
| Scheduler                        AsyncHyperBandScheduler          |
| Number of trials                 3                                |
+-------------------------------------------------------------------+

View detailed results here: /root/ray_results/_Trainable_2023-07-25_19-39-47
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/_Trainable_2023-07-25_19-39-47`





--- 10.24624228477478 seconds ---
Best hyperparameters: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 10}
Best score: 0.941545683313976
