In [2]:
import os
import pickle
import click
import mlflow
import numpy as np

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# MLflow tracking configuration: runs are stored in a local SQLite database
# and grouped under a single named experiment.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "random-forest-hyperopt"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the notebook still displays the Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='/workspaces/Testrepo/mlruns/2', creation_time=1723539801480, experiment_id='2', last_update_time=1723539801480, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [3]:

def load_pickle(filename: str):
    """Read a pickle file and return the object stored in it.

    Args:
        filename: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload

In [13]:
def run_optimization(data_path: str, num_trials: int):
    """Tune a RandomForestRegressor with hyperopt, logging each trial to MLflow.

    Args:
        data_path: Directory holding ``train.pkl`` and ``val.pkl``; each file
            is unpickled into an ``(X, y)`` pair.
        num_trials: Maximum number of hyperopt evaluations (``max_evals``).

    Returns:
        None. Every trial is recorded as its own MLflow run (parameters,
        the ``error`` metric, the fitted model and a ``model`` tag).
    """
    # NOTE: these files are unpickled, so data_path must point at trusted data.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        """Fit one forest with ``params`` and return its validation loss."""
        # The context manager closes the run when the block exits, so no
        # explicit mlflow.end_run() is needed inside it.
        with mlflow.start_run():
            # params holds exactly the search-space keys: max_depth,
            # n_estimators, min_samples_split, min_samples_leaf, random_state.
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)

            # Loss = share of validation rows whose prediction is not exactly
            # equal to the target.
            # NOTE(review): exact equality is an unusual loss for a regressor;
            # confirm whether a classifier or an RMSE loss was intended.
            error = 1 - np.mean(y_pred == y_val)
            print(error)

            mlflow.log_metric("error", error)
            # Log the fitted estimator, not the RandomForestRegressor class,
            # so the stored artifact can actually be loaded and used.
            mlflow.sklearn.log_model(rf, artifact_path="artifact")
            # The tag value is the estimator object; MLflow stringifies it.
            mlflow.set_tag("model", rf)
        return {'loss': error, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }
    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )


In [11]:
from sklearn.linear_model import LogisticRegression, Ridge

# Baseline model: logistic regression with a fixed seed.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)

# Load the pickled (features, target) pairs for training and validation.
data_dir = "./output"
train_file = os.path.join(data_dir, "train.pkl")
val_file = os.path.join(data_dir, "val.pkl")
X_train, y_train = load_pickle(train_file)
X_val, y_val = load_pickle(val_file)



In [None]:
# Fit the baseline model and record its validation accuracy in MLflow.
# Logging happens inside an explicit run: a bare mlflow.log_metric call with
# no active run makes MLflow open one implicitly and leave it open, which
# then makes later mlflow.start_run() calls fail until mlflow.end_run() runs.
with mlflow.start_run():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Accuracy = share of validation rows predicted exactly right.
    accuracy = np.mean(y_pred == y_val)
    mlflow.log_metric("accuracy", accuracy)
# Show the complementary error rate.
print(1 - accuracy)

In [19]:
# Run the hyperopt search: at most 15 trials on the pickled data in ./output.
# Each trial prints its loss; hyperopt's progress bar shows the best so far.
run_optimization(data_path="./output", num_trials=15)

0.8099173553719008                                    
  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]




1.0                                                                             
  7%|▋         | 1/15 [00:02<00:28,  2.01s/trial, best loss: 0.8099173553719008]




0.8429752066115702                                                              
 13%|█▎        | 2/15 [00:04<00:25,  1.93s/trial, best loss: 0.8099173553719008]




0.768595041322314                                                               
 20%|██        | 3/15 [00:05<00:23,  1.92s/trial, best loss: 0.8099173553719008]




0.7851239669421488                                                              
 27%|██▋       | 4/15 [00:07<00:20,  1.90s/trial, best loss: 0.768595041322314]




0.7520661157024793                                                             
 33%|███▎      | 5/15 [00:09<00:18,  1.89s/trial, best loss: 0.768595041322314]




0.7933884297520661                                                              
 40%|████      | 6/15 [00:11<00:16,  1.87s/trial, best loss: 0.7520661157024793]




0.7851239669421488                                                              
 47%|████▋     | 7/15 [00:13<00:15,  1.88s/trial, best loss: 0.7520661157024793]




0.768595041322314                                                               
 53%|█████▎    | 8/15 [00:15<00:13,  1.87s/trial, best loss: 0.7520661157024793]




0.7603305785123967                                                              
 60%|██████    | 9/15 [00:17<00:11,  1.91s/trial, best loss: 0.7520661157024793]




0.6528925619834711                                                               
 67%|██████▋   | 10/15 [00:19<00:09,  1.87s/trial, best loss: 0.7520661157024793]




0.8099173553719008                                                               
 73%|███████▎  | 11/15 [00:20<00:07,  1.85s/trial, best loss: 0.6528925619834711]




0.7933884297520661                                                               
 80%|████████  | 12/15 [00:22<00:05,  1.90s/trial, best loss: 0.6528925619834711]




0.6859504132231404                                                               
 87%|████████▋ | 13/15 [00:24<00:03,  1.89s/trial, best loss: 0.6528925619834711]




0.768595041322314                                                                
 93%|█████████▎| 14/15 [00:26<00:01,  1.89s/trial, best loss: 0.6528925619834711]




100%|██████████| 15/15 [00:28<00:00,  1.90s/trial, best loss: 0.6528925619834711]


In [15]:
# Close whatever MLflow run is still active (for example one that MLflow
# opened implicitly when something was logged outside a start_run block).
# NOTE(review): presumably a no-op when no run is active; confirm.
mlflow.end_run()