In [2]:
# install hyperopt <- DONT RUN IF IN `requirements.txt`
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting networkx>=2.2 (from hyperopt)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting future (from hyperopt)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting tqdm (from hyperopt)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading networkx-3.3-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloadi

In [3]:
import os
import pickle
import click
import mlflow
import numpy as np

"""
hyperopt is a library that uses Bayesian methods to find the best set of hyperparameters.
    - STATUS_OK -> the status value an objective returns to tell hyperopt that a run executed successfully.
    - Trials -> an object that keeps track of information about each run.
    - fmin -> a function that minimises the objective/loss function.
    - hp -> a module for defining the search space (e.g. the available values for hyperparameters).
    - tpe -> the algorithm that controls the optimisation logic (Tree-structured Parzen Estimator); please see:
        https://hyperopt.github.io/hyperopt/
        https://proceedings.neurips.cc/paper_files/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
"""
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [4]:
# ---- IMPORTANT: when launching via the MLflow CLI, use `mlflow run --no-conda`
# to run without conda. ------
# Runs are recorded in the SQLite file `mlflow.db` and grouped under a single
# experiment name.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name="random-forest-hyperopt")

2024/05/26 07:01:00 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/mlopsZoomCamp/02-experiment-tracking/homework/mlruns/2', creation_time=1716706860280, experiment_id='2', last_update_time=1716706860280, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [5]:
def load_pickle(filename: str):
    """Unpickle and return the object stored in ``filename``.

    The file is opened in binary mode and is closed again before the loaded
    object is returned. Only use this on files from a trusted source:
    unpickling can execute arbitrary code.
    """
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [6]:
def run_optimization(data_path: str, num_trials: int):
    """Tune a RandomForestRegressor with hyperopt, logging every trial to MLflow.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each is unpickled
    into an ``(X, y)`` pair) and runs ``num_trials`` TPE evaluations. Each
    evaluation is recorded as its own MLflow run, tagged with the model name
    and carrying the sampled parameters and the validation RMSE.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``.
        num_trials: Number of evaluations, passed to ``fmin`` as ``max_evals``.
            Must be at least 1.

    Returns:
        dict: The best parameter set found, resolved through the search space
        so it has the same keys that were passed to ``RandomForestRegressor``
        during the search (including the fixed ``random_state``).

    Raises:
        ValueError: If ``num_trials`` is less than 1.
    """
    # Imported locally because the notebook's import cell does not bring in
    # `space_eval`; hyperopt itself is already a dependency of this notebook.
    from hyperopt import space_eval

    # With no evaluations there is no best trial to report, so fail early with
    # a clear message instead of deep inside hyperopt.
    if num_trials < 1:
        raise ValueError(f"num_trials must be at least 1, got {num_trials}")

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        """Fit one forest with ``params`` and report its validation RMSE."""
        with mlflow.start_run():
            mlflow.set_tag('model', 'RandomForestRegressor')
            mlflow.log_params(params)
            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric('rmse', rmse)

        # hyperopt minimises the value stored under 'loss'.
        return {'loss': rmse, 'status': STATUS_OK}

    # quniform samples are wrapped in scope.int so the estimator receives
    # integers; random_state is a fixed value rather than a searched one.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    best = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

    # fmin reports the raw sampled values keyed by label; space_eval maps them
    # back through the search space so the integer casts and the fixed
    # random_state are part of the returned parameters.
    return space_eval(search_space, best)


In [7]:
# Run the search: 15 trials over the pickled splits stored in ./output.
run_optimization(data_path='./output', num_trials=15)

100%|██████████| 15/15 [01:04<00:00,  4.28s/trial, best loss: 5.335419588556921]
