In [3]:
import os
import pickle
import click
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [4]:
# MLflow configuration: the tracking store is a SQLite file named in the URI,
# and every run started afterwards is grouped under the experiment below.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "random-forest-hyperopt"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


<Experiment: artifact_location=('file:///C:/Users/SebastianSy/Desktop/231110_Backup Gateway '
 'Final/Administration/Other/Sinn/Start-up and Job/Programmieren/MLOps '
 'Zoomcamp/week 2/mlruns/4'), creation_time=1716815613222, experiment_id='4', last_update_time=1716815613222, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [5]:
def load_pickle(filename: str):
    """Return the object deserialized from the pickle file at ``filename``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(filename, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded


In [12]:
def run_optimization(data_path: str, num_trials: int):
    """Tune a RandomForestRegressor with hyperopt, logging each trial to MLflow.

    Reads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each is unpacked
    into an ``(X, y)`` pair) and runs a TPE search. Every evaluation is
    recorded as its own MLflow run containing the sampled hyperparameters,
    the validation RMSE, and the fitted model.

    Args:
        data_path: Directory that contains ``train.pkl`` and ``val.pkl``.
        num_trials: Passed to hyperopt as ``max_evals``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        """Fit one forest with ``params`` and report its validation RMSE."""
        # The context manager closes the run on exit (also on exceptions),
        # so no explicit mlflow.end_run() is needed.
        with mlflow.start_run():
            # params holds exactly the keys of search_space:
            # max_depth, n_estimators, min_samples_split, min_samples_leaf,
            # random_state.
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)

            mlflow.log_metric("rmse", rmse)
            # Log the *fitted* estimator, not the RandomForestRegressor class.
            mlflow.sklearn.log_model(rf, artifact_path="artifact")
            # Tag with the model's class name so runs can be filtered by it;
            # tagging with the estimator object would store its full repr.
            mlflow.set_tag("model", type(rf).__name__)

        return {'loss': rmse, 'status': STATUS_OK}

    # quniform samples floats; scope.int casts them to the ints sklearn expects.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }
    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )


In [14]:
# Search settings for this notebook run.
DATA_PATH = "./output"  # directory passed to run_optimization as data_path
NUM_TRIALS = 15         # number of hyperopt evaluations

run_optimization(data_path=DATA_PATH, num_trials=NUM_TRIALS)

  0%|                                                                           | 0/15 [00:00<?, ?trial/s, best loss=?]





  7%|███▎                                              | 1/15 [00:33<07:48, 33.48s/trial, best loss: 5.370086069268862]





 13%|██████▋                                           | 2/15 [00:46<04:41, 21.62s/trial, best loss: 5.370086069268862]





 20%|██████████                                        | 3/15 [00:57<03:22, 16.85s/trial, best loss: 5.370086069268862]





 27%|█████████████▎                                    | 4/15 [01:19<03:26, 18.73s/trial, best loss: 5.357490752366866]





 33%|████████████████▋                                 | 5/15 [01:35<02:55, 17.59s/trial, best loss: 5.357490752366866]





 40%|████████████████████                              | 6/15 [02:06<03:19, 22.22s/trial, best loss: 5.354695072530291]





 47%|███████████████████████▎                          | 7/15 [02:32<03:09, 23.64s/trial, best loss: 5.354695072530291]





 53%|██████████████████████████▋                       | 8/15 [02:43<02:16, 19.44s/trial, best loss: 5.354695072530291]





 60%|██████████████████████████████                    | 9/15 [03:06<02:03, 20.64s/trial, best loss: 5.354695072530291]





 67%|████████████████████████████████▋                | 10/15 [03:24<01:39, 19.91s/trial, best loss: 5.354695072530291]





 73%|███████████████████████████████████▉             | 11/15 [03:41<01:15, 18.91s/trial, best loss: 5.335419588556921]





 80%|███████████████████████████████████████▏         | 12/15 [03:59<00:55, 18.66s/trial, best loss: 5.335419588556921]





 87%|██████████████████████████████████████████▍      | 13/15 [04:11<00:33, 16.65s/trial, best loss: 5.335419588556921]





 93%|█████████████████████████████████████████████▋   | 14/15 [04:30<00:17, 17.17s/trial, best loss: 5.335419588556921]





100%|█████████████████████████████████████████████████| 15/15 [05:00<00:00, 20.04s/trial, best loss: 5.335419588556921]
