# Chapter 8
## Section: Tracking Machine Learning Experiments

In [None]:
!pip install mlflow==2.1.1

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.datasets import load_breast_cancer
import mlflow
import mlflow.sklearn

# Seed NumPy's global random generator so repeated runs behave the same.
np.random.seed(42)

def eval_metrics(actual, pred, pred_proba):
    """Compute the two evaluation metrics tracked in this notebook.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values, compared against ``actual`` for the RMSE.
        pred_proba: Predicted scores, compared against ``actual`` for the ROC-AUC.

    Returns:
        A ``(rmse, roc_auc)`` tuple.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), roc_auc_score(actual, pred_proba)

In [None]:
# Load scikit-learn's breast cancer dataset as a feature matrix and a label vector.
X, y = load_breast_cancer(return_X_y=True)
# Split the data into training and test sets (0.7 / 0.3) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

experiment_name = "mlflow-randomforest-cancer"

# Create the experiment only when no experiment with this name exists yet;
# otherwise reuse the id of the existing one.
existing_exp = mlflow.get_experiment_by_name(experiment_name)
if existing_exp is None:
    # NOTE(review): "..." looks like a placeholder artifact location - confirm
    # the intended path/URI before running against a real tracking server.
    experiment_id = mlflow.create_experiment(experiment_name, artifact_location="...")
else:
    experiment_id = existing_exp.experiment_id
# Make it the active experiment; as the cell's last expression it is also displayed.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='...', creation_time=1690294524586, experiment_id='777485854782028460', last_update_time=1690294524586, lifecycle_stage='active', name='mlflow-randomforest-cancer', tags={}>

In [None]:
# Train one random forest per ensemble size and record each one as its own MLflow run.
for idx, n_estimators in enumerate([5, 10, 20]):
    rf = RF(n_estimators=n_estimators, random_state=42)
    rf.fit(X_train, y_train)

    pred_probs = rf.predict_proba(X_test)
    pred_labels = rf.predict(X_test)
    # Calculate RMSE and ROC-AUC for the random forest model's predictions on
    # the test set. The ROC-AUC is computed from the second column (index 1)
    # of the predicted probabilities.
    rmse, roc_auc = eval_metrics(actual=y_test,
                                 pred=pred_labels,
                                 pred_proba=pred_probs[:, 1])

    # start mlflow
    RUN_NAME = f"run_{idx}"
    with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
        # retrieve run id
        RUN_ID = run.info.run_id
        # track parameters
        mlflow.log_param("n_estimators", n_estimators)
        # track metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("roc_auc", roc_auc)
        # track model
        mlflow.sklearn.log_model(rf, "model")



In [None]:
from mlflow.tracking import MlflowClient

experiment_name = "mlflow-randomforest-cancer"

client = MlflowClient()
# Retrieve experiment information. The lookup yields None when no experiment
# has this name, so fail with an explicit message instead of an AttributeError.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} does not exist; "
        "run the tracking cells above first."
    )
experiment_id = experiment.experiment_id

In [None]:
# retrieve runs information (parameter: 'n_estimators', metric: 'roc_auc')
experiment_info = mlflow.search_runs([experiment_id])
# extracting run ids for the specified experiment
runs_id = experiment_info.run_id.values
# fetch every run once and reuse it for both the parameter and the metric
# (previously each run was requested from the tracking client twice)
runs_data = [client.get_run(run_id).data for run_id in runs_id]
# extracting parameters of different runs
runs_param = [run_data.params["n_estimators"] for run_data in runs_data]
# extracting roc-auc across different runs
runs_metric = [run_data.metrics["roc_auc"] for run_data in runs_data]

In [None]:
# Retrieve the model artifact from the best run.
# Sort by ROC-AUC in DESCENDING order so that row 0 is the highest-scoring run;
# without an explicit direction the ordering is ascending and row 0 would be
# the run with the lowest ROC-AUC.
df = mlflow.search_runs([experiment_id], order_by=["metrics.roc_auc DESC"])
best_run_id = df.loc[0, "run_id"]
# mlflow.artifacts.download_artifacts replaces the deprecated
# MlflowClient.download_artifacts; it returns the local path of the download.
best_model_path = mlflow.artifacts.download_artifacts(run_id=best_run_id, artifact_path="model")
best_model = mlflow.sklearn.load_model(best_model_path)
print("Best model: {}".format(best_model))

Best model: RandomForestClassifier(n_estimators=5, random_state=42)


  best_model_path = client.download_artifacts(best_run_id, "model")


In [None]:
# Clean-up step 1: delete every run listed in `runs_id`
# (make sure you are certain about deleting the runs).
for tracked_run_id in runs_id:
    client.delete_run(tracked_run_id)

# Clean-up step 2: delete the experiment itself
# (make sure you are certain about deleting the experiment).
client.delete_experiment(experiment_id)