# MLflow Basics

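This notebook provides a quick overview of machine learning model training with MLflow Tracking: autologging scikit-learn runs, logging extra metrics by hand, loading a logged model back, and tuning hyperparameters with Hyperopt.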

In [None]:
import mlflow
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
 
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
import warnings
warnings.filterwarnings('ignore')

import mlflow.spark
import os
import shutil
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session with the mlflow-spark package on its classpath.
# NOTE(review): no visible cell uses `spark`; it appears to be needed only if SparkTrials is enabled.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "org.mlflow:mlflow-spark:1.11.0")
    .getOrCreate()
)

## Load data
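This tutorial uses the Wine Quality dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/), which describes different wine samples. The red and white samples are stored in separate CSV files; they are loaded individually and combined into one table, with an `is_red` column marking where each row came from.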

In [None]:
# Both CSV files live in the same UCI directory and use ';' as the field separator.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"

# Tag each sample with its colour before merging so the information is kept as a feature.
red_wine = pd.read_csv(url + 'winequality-red.csv', sep=";")
red_wine['is_red'] = 1.0
white_wine = pd.read_csv(url + 'winequality-white.csv', sep=";")
white_wine['is_red'] = 0.0

# ignore_index=True renumbers the combined rows 0..n-1. Without it, the red and white
# frames each keep their own 0-based labels and the combined index contains duplicates,
# which makes any later label-based lookup (.loc, index alignment) ambiguous.
data_df = pd.concat([red_wine, white_wine], axis=0, ignore_index=True)
data_df.head()

In [None]:
# Binary target: True for wines whose quality score is 7 or higher.
data_label = data_df['quality'].ge(7)
# Features: every column except the raw quality score the target was derived from.
data = data_df.drop(columns=['quality'])

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data,
    data_label,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Point the MLflow client at the tracking server.
# MLFLOW_TRACKING_URI is honoured when it is set (and non-empty), so the notebook can
# also run outside Docker; otherwise the original Docker-host address is used.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI") or "http://host.docker.internal:5000"
)

In [None]:
# Turn on MLflow autologging so the fit() calls in the cells below record their
# models, parameters and training metrics without explicit log_* calls.
mlflow.autolog()

In [None]:
# Baseline run: gradient boosting with scikit-learn's default hyperparameters.
with mlflow.start_run(run_name='gradient_boost') as run:
    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0)
    # Autologging captures the fitted model, its parameters and the training metrics
    model.fit(X_train, y_train)

    # Score the held-out data using the second predict_proba column
    predicted_probs = model.predict_proba(X_test)
    roc_auc = sklearn.metrics.roc_auc_score(
        y_true=y_test,
        y_score=predicted_probs[:, 1],
    )

    # Autologging does not cover the test-set AUC, so record it explicitly
    mlflow.log_metric(key="test_auc", value=roc_auc)
    print("Test AUC of: {}".format(roc_auc))

In [None]:
# Second run: same classifier, but with 200 boosting stages.
with mlflow.start_run(run_name='gradient_boost') as run:
    model_2 = sklearn.ensemble.GradientBoostingClassifier(
        n_estimators=200,
        random_state=0,
    )
    # Autologging captures the fitted model, its parameters and the training metrics
    model_2.fit(X_train, y_train)

    # Score the held-out data using the second predict_proba column
    predicted_probs = model_2.predict_proba(X_test)
    roc_auc = sklearn.metrics.roc_auc_score(
        y_true=y_test,
        y_score=predicted_probs[:, 1],
    )

    # Autologging does not cover the test-set AUC, so record it explicitly
    mlflow.log_metric(key="test_auc", value=roc_auc)
    print("Test AUC of: {}".format(roc_auc))

![MLflow runs for the gradient boosting models](./images/mlflow_gradientboost.png)

## Load models

In [None]:
# A logged model can be reloaded from any notebook or job through its run ID.
# mlflow.pyfunc.load_model wraps it behind a common predict() API.
# `run` is the run object left over from the most recent `with mlflow.start_run(...)` cell,
# i.e. the run that trained model_2 when the cells are executed in order.
model_loaded = mlflow.pyfunc.load_model('runs:/{run_id}/model'.format(run_id=run.info.run_id))

predictions_original = model_2.predict(X_test)
predictions_loaded = model_loaded.predict(X_test)

# The reloaded model must reproduce the in-memory model's predictions exactly
assert np.array_equal(predictions_loaded, predictions_original)

# Hyperparameter Tuning

## Hyperopt
Hyperopt is a Python library for hyperparameter tuning. 

In [11]:

# Hyperparameter ranges explored by Hyperopt.
# quniform yields floats, so scope.int casts the integer-valued parameters;
# learning_rate is sampled log-uniformly between exp(-3) and exp(0) = 1.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 20, 1000, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    max_depth=scope.int(hp.quniform('max_depth', 2, 5, 1)),
)
 
def train_model(params):
    """Hyperopt objective: fit one gradient boosting model and report its loss.

    Args:
        params: Keyword arguments forwarded to GradientBoostingClassifier
            (one sample drawn from the search space).

    Returns:
        A dict with 'status' set to STATUS_OK and 'loss' set to the negated
        test AUC, so that minimizing the loss maximizes the AUC.
    """
    # Enable autologging on each worker
    mlflow.autolog()

    # Every trial gets its own run, nested under whichever run is currently active
    with mlflow.start_run(run_name='inner_run', nested=True):
        classifier = sklearn.ensemble.GradientBoostingClassifier(random_state=0, **params)
        classifier.fit(X_train, y_train)

        # Tuning is driven by the test AUC here;
        # in production settings, you could use a separate validation set instead
        positive_scores = classifier.predict_proba(X_test)[:, 1]
        test_auc = sklearn.metrics.roc_auc_score(y_test, positive_scores)
        mlflow.log_metric('test_auc', test_auc)

    # fmin minimizes the loss, so hand back the negated AUC
    return {'status': STATUS_OK, 'loss': -1 * test_auc}
 
# Trials() keeps the trial history in this process.
# To distribute the tuning across Spark workers, use SparkTrials(parallelism=8) instead:
# greater parallelism speeds processing, but each hyperparameter trial has less
# information from other trials. On smaller clusters or Databricks Community Edition
# try parallelism=2.
spark_trials = Trials()

# Parent run; each trial started by train_model is recorded as a nested 'inner_run'.
with mlflow.start_run(run_name='gb_hyperopt') as run:
    # Use hyperopt to find the parameters yielding the highest AUC
    best_params = fmin(
        train_model,
        search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=spark_trials,
    )

 28%|██▊       | 9/32 [01:05<02:47,  7.29s/trial, best loss: -0.9034264687141583]

2022/12/12 03:37:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:37:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:37:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 31%|███▏      | 10/32 [01:09<02:18,  6.30s/trial, best loss: -0.9034264687141583]

2022/12/12 03:37:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:37:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:37:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 34%|███▍      | 11/32 [01:16<02:18,  6.58s/trial, best loss: -0.9034264687141583]

2022/12/12 03:37:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:37:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:37:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 38%|███▊      | 12/32 [01:23<02:10,  6.51s/trial, best loss: -0.9034264687141583]

2022/12/12 03:37:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:37:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:37:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 41%|████      | 13/32 [01:33<02:26,  7.72s/trial, best loss: -0.9034264687141583]

2022/12/12 03:37:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:37:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:37:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 44%|████▍     | 14/32 [01:42<02:24,  8.01s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 47%|████▋     | 15/32 [01:47<02:01,  7.15s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 50%|█████     | 16/32 [01:53<01:49,  6.81s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 53%|█████▎    | 17/32 [01:58<01:31,  6.13s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 56%|█████▋    | 18/32 [02:05<01:28,  6.35s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 59%|█████▉    | 19/32 [02:08<01:10,  5.39s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 62%|██████▎   | 20/32 [02:16<01:13,  6.11s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 66%|██████▌   | 21/32 [02:21<01:03,  5.78s/trial, best loss: -0.9034264687141583]

2022/12/12 03:38:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 69%|██████▉   | 22/32 [02:35<01:23,  8.35s/trial, best loss: -0.9059991079393398]

2022/12/12 03:38:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:38:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:38:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 72%|███████▏  | 23/32 [02:51<01:35, 10.56s/trial, best loss: -0.9067199248120301]

2022/12/12 03:39:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:39:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:39:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 75%|███████▌  | 24/32 [03:06<01:37, 12.14s/trial, best loss: -0.9067876258442717]

2022/12/12 03:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:39:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 78%|███████▊  | 25/32 [03:21<01:30, 12.90s/trial, best loss: -0.9101647126290302]

2022/12/12 03:39:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:39:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:39:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 81%|████████▏ | 26/32 [03:35<01:18, 13.13s/trial, best loss: -0.9101647126290302]

2022/12/12 03:39:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:39:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:39:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 84%|████████▍ | 27/32 [03:49<01:06, 13.40s/trial, best loss: -0.9101647126290302]

2022/12/12 03:40:12 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:40:12 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:40:12 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 88%|████████▊ | 28/32 [04:02<00:52, 13.24s/trial, best loss: -0.9101647126290302]

2022/12/12 03:40:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:40:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:40:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 91%|█████████ | 29/32 [04:14<00:38, 12.95s/trial, best loss: -0.9101647126290302]

2022/12/12 03:40:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:40:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:40:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 94%|█████████▍| 30/32 [04:25<00:24, 12.30s/trial, best loss: -0.9101647126290302]

2022/12/12 03:40:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:40:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:40:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



 97%|█████████▋| 31/32 [04:39<00:13, 13.04s/trial, best loss: -0.9101647126290302]

2022/12/12 03:41:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.

2022/12/12 03:41:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.

2022/12/12 03:41:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.



100%|██████████| 32/32 [04:50<00:00,  9.09s/trial, best loss: -0.9101647126290302]


## Search runs to retrieve the best model

In [12]:
# Rank the runs by test AUC (most recent first when tied) and keep the top one
ranked_runs = mlflow.search_runs(
    order_by=['metrics.test_auc DESC', 'start_time DESC'],
    max_results=10,
)
best_run = ranked_runs.iloc[0]

print('Best Run')
print('AUC: {}'.format(best_run["metrics.test_auc"]))
print('Num Estimators: {}'.format(best_run["params.n_estimators"]))
print('Max Depth: {}'.format(best_run["params.max_depth"]))
print('Learning Rate: {}'.format(best_run["params.learning_rate"]))

# Reload the winning model from its run and try it on the first five test rows
best_model_pyfunc = mlflow.pyfunc.load_model('runs:/{run_id}/model'.format(run_id=best_run["run_id"]))
best_model_predictions = best_model_pyfunc.predict(X_test.head(5))
print("Test Predictions: {}".format(best_model_predictions))


Best Run
AUC: 0.9101647126290302
Num Estimators: 949
Max Depth: 5
Learning Rate: 0.32079649376965114
Test Predictions: [False False False False False]
