# MLflow Basics

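This notebook provides a quick overview of machine learning model training with MLflow Tracking. It expects an MLflow tracking server to be reachable at `http://127.0.0.1:5000` (for example, one started locally with `mlflow server --host 127.0.0.1 --port 5000`) before the training cell is run.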

In [1]:
import os

import mlflow
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope

## Load data
This tutorial uses the Wine Quality dataset, which describes red and white wine samples. The dataset comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/).

In [6]:
# Base location of the UCI Wine Quality files; both CSVs are semicolon-delimited.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"

# Load each colour separately and tag it with an `is_red` indicator so the
# wine type is still available as a feature after the two frames are merged.
red_wine = pd.read_csv(url + 'winequality-red.csv', sep=";")
red_wine['is_red'] = 1.0
white_wine = pd.read_csv(url + 'winequality-white.csv', sep=";")
white_wine['is_red'] = 0.0

# Each frame arrives with its own 0..n-1 index. ignore_index=True renumbers the
# stacked rows so every label in `data_df` is unique (otherwise a lookup such
# as data_df.loc[0] would return one red and one white row).
data_df = pd.concat([red_wine, white_wine], axis=0, ignore_index=True)
data_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1.0


In [7]:
# Binary target: True for wines whose quality score is 7 or higher.
data_label = data_df['quality'].ge(7)
# Feature table: every column except the raw quality score the target is derived from.
data = data_df.drop(columns=['quality'])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data,
    data_label,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Point the MLflow client at the tracking server. When the MLFLOW_TRACKING_URI
# environment variable is set it takes precedence over the local default, so
# the notebook can target another server without editing this cell.
# A server must already be listening at the chosen address; otherwise the first
# run-creating call fails with a connection error.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

In [12]:
# Turn on MLflow autologging before any model is fit, so the training run
# later in the notebook is recorded without explicit logging calls.
mlflow.autolog()

2022/12/11 18:15:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/12/11 18:15:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2022/12/11 18:15:33 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.
2022/12/11 18:15:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [13]:
with mlflow.start_run(run_name='gradient_boost') as run:
    # With autologging enabled, fitting the estimator records the model, its
    # parameters and the training metrics in this run.
    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0)
    model.fit(X_train, y_train)

    # Score the held-out rows; column 1 of predict_proba is the probability of
    # the positive (True) class, which is what ROC AUC needs.
    predicted_probs = model.predict_proba(X_test)
    roc_auc = sklearn.metrics.roc_auc_score(
        y_true=y_test,
        y_score=predicted_probs[:, 1],
    )

    # Autologging does not cover test-set metrics, so record the AUC explicitly.
    mlflow.log_metric(key="test_auc", value=roc_auc)
    print("Test AUC of: {}".format(roc_auc))

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/runs/create failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f07d56b7be0>: Failed to establish a new connection: [Errno 111] Connection refused'))