In [3]:
import os
import warnings
import sys
import mlflow
import mlflow.sklearn

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

import logging


In [17]:


# Wine Quality Sample
def train_rf(max_depth=10, n_estimators=100):

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        average_param = "weighted"
        accuracy = accuracy_score(actual, pred)
        precision = precision_score(actual, pred, average= average_param)
        recall = recall_score(actual, pred, average= average_param)
        f1_metrics = f1_score(actual, pred, average= average_param)
        return accuracy, precision, recall, f1_metrics


    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    csv_url =\
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    try:
        data = pd.read_csv(csv_url, sep=';')
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    with mlflow.start_run():
        # Execute RandomForest
        lr = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate Metrics
        y_pred = lr.predict(test_x)
        (accuracy, precision, recall, f1_metrics) = eval_metrics(test_y, y_pred)
        
        # Print out metrics
        print("RandomForest model (n_estimators =%f, max_depth=%f):" % (n_estimators, max_depth))
        print("  Accuracy: %s" % accuracy)
        print("  Precision: %s" % precision)
        print("  Recall: %s" % recall)
        print("  F1-score: %s" % f1_metrics)

        # param recorded with mlflow:
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Precision", precision)
        mlflow.log_metric("Recall", recall)
        mlflow.log_metric("F1-Score", f1_metrics)
        # mlflow.log_metric("Average Parameter", average_param)

        mlflow.sklearn.log_model(lr, "model")

        # Write scores to a file
      #  with open("metrics.txt", 'w') as outfile:
       #     outfile.write("  RMSE: %s" % rmse)
        #    outfile.write("  MAE: %s" % mae)
         #   outfile.write("  R2: %s" % r2)

In [18]:
# Start the training
train_rf(10, 100)
train_rf()
train_rf(15,50)

RandomForest model (n_estimators =100.000000, max_depth=10.000000):
  Accuracy: 0.66
  Precision: 0.6223964216671088
  Recall: 0.66
  F1-score: 0.6337299848306085
RandomForest model (n_estimators =100.000000, max_depth=10.000000):
  Accuracy: 0.66
  Precision: 0.6223964216671088
  Recall: 0.66
  F1-score: 0.6337299848306085
RandomForest model (n_estimators =50.000000, max_depth=15.000000):
  Accuracy: 0.66
  Precision: 0.6223964216671088
  Recall: 0.66
  F1-score: 0.6337299848306085
