In [1]:
%load_ext autoreload
%autoreload 2

import udf_utils
from sklearn import linear_model
from sklearn import ensemble
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as sk_prep
import itertools

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


## MLFlow demo

This notebook contains code that runs some examples of how to use MLflow in machine learning problems.

### Regression examples

1. First, we need to define the data to use, in this case the *problem_type* is **regression**

In [2]:
# Experiment name handed to every run_experiments_autolog call in the regression section
experiment_name = 'Regression problem1'

# Regression helper; generate_data() is expected to return the object whose
# run_experiments_autolog method is used below (defined in udf_utils).
demo_regression = udf_utils.ML_FLOW(problem_type='regression').generate_data()

2. Then, we need to define the model(s) and the hyperparameters to use in the training

A first model could be a linear regression. This can be our base model.

In [3]:
# Values of `fit_intercept` tried for the baseline, one run each
fit_intercept = [True, False]

# Extra user-defined values passed along with every run
other_params = {
    'udf_model_type': 'Linear Regression',
    'udf_regularization': 'None',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> LinearRegression pipeline.

    Every keyword argument is forwarded unchanged to ``LinearRegression``.
    """
    steps = [
        ('scaler', sk_prep.StandardScaler()),
        ('regression', linear_model.LinearRegression(**kwargs)),
    ]
    return Pipeline(steps)


for intercept_value in fit_intercept:
    hyper_params = dict(fit_intercept=intercept_value)
    demo_regression.run_experiments_autolog(
        experiment_name,
        'Base line model',
        model_function,
        hyper_params,
        other_params,
    )

2022/10/26 17:51:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Then we can start to use regularization or more complex models to see if we get an improvement.

In [4]:
# Search space: each intercept setting is combined with each Ridge `alpha`
fit_intercept = [True, False]
regularization = [0.1, 0.5, 1.0, 2.0, 5.0, 10, 50, 100]

# Extra user-defined values passed along with every run
other_params = {
    'udf_model_type': 'Ridge Regression',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> Ridge pipeline; kwargs go straight to ``Ridge``."""
    scaler_step = ('scaler', sk_prep.StandardScaler())
    ridge_step = ('regression', linear_model.Ridge(**kwargs))
    return Pipeline([scaler_step, ridge_step])


# Nested loops visit the combinations in the same order as
# itertools.product(fit_intercept, regularization).
for intercept_value in fit_intercept:
    for alpha_value in regularization:
        hyper_params = dict(fit_intercept=intercept_value, alpha=alpha_value)
        demo_regression.run_experiments_autolog(
            experiment_name,
            'Complex model',
            model_function,
            hyper_params,
            other_params,
        )

2022/10/26 17:51:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:55 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:51:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:07 INFO mlflow.tracking.fluent: A

In [5]:
# Search space: each intercept setting is combined with each Lasso `alpha`
fit_intercept = [True, False]
regularization = [0.1, 0.5, 1.0, 2.0, 5.0, 10, 50, 100]

# Extra user-defined values passed along with every run
other_params = {
    'udf_model_type': 'Lasso Regression',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> Lasso pipeline; kwargs go straight to ``Lasso``."""
    lasso = linear_model.Lasso(**kwargs)
    return Pipeline(steps=[('scaler', sk_prep.StandardScaler()), ('regression', lasso)])


for intercept_value, alpha_value in itertools.product(fit_intercept, regularization):
    hyper_params = dict(fit_intercept=intercept_value, alpha=alpha_value)
    demo_regression.run_experiments_autolog(
        experiment_name,
        'Complex model',
        model_function,
        hyper_params,
        other_params,
    )

2022/10/26 17:52:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:34 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:52 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:52:55 INFO mlflow.tracking.fluent: A

### Classification examples


1. First, we need to define the data to use, in this case the *problem_type* is **classification**

In [6]:
# Experiment name handed to every run_experiments_autolog call in the classification section
experiment_name = 'Classification problem'

# Classification helper set up for a two-class problem; generate_data() is expected
# to return the object whose run_experiments_autolog method is used below.
demo_classification = udf_utils.ML_FLOW(problem_type='classification').generate_data(n_classes=2)

2. Then, we need to define the model(s) and the hyperparameters to use in the training

A first model could be a logistic regression. This can be our base model.

In [7]:
# Hyper parameters to use
fit_intercept = [True, False]

# Parameters defined by the user to log.
# scikit-learn's LogisticRegression applies an L2 penalty with C=1.0 unless told
# otherwise, so this baseline IS regularized. Record that instead of 'None' so the
# logged metadata matches the model that is actually trained.
other_params = {
    'udf_model_type': 'Logistic Regression',
    'udf_regularization': 'l2 (sklearn default, C=1.0)',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> LogisticRegression pipeline.

    Every keyword argument is forwarded unchanged to ``LogisticRegression``;
    anything not supplied keeps the scikit-learn default (including the L2 penalty).
    """
    udf_pipeline = Pipeline([
        ('scaler', sk_prep.StandardScaler()),
        ('classification', linear_model.LogisticRegression(**kwargs)),
    ])

    return udf_pipeline


for intercept_value in fit_intercept:
    hyper_params = {'fit_intercept': intercept_value}
    demo_classification.run_experiments_autolog(experiment_name, 'Base line model', model_function, hyper_params, other_params)

2022/10/26 17:53:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Then we can start to use regularization or more complex models to see if we get an improvement.

In [8]:
# Search space: intercept setting x penalty type x inverse regularization strength C
fit_intercept = [True, False]
penalty = ['l2', 'l1']
regularization = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]

# Extra user-defined values passed along with every run
other_params = {
    'udf_model_type': 'Logistic Regression',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> LogisticRegression pipeline that uses the saga solver.

    Every keyword argument is forwarded to ``LogisticRegression`` next to
    ``solver='saga'``, so ``solver`` itself must not be passed in ``kwargs``.
    """
    # The saga solver accepts every penalty type used in this grid
    classifier = linear_model.LogisticRegression(solver='saga', **kwargs)
    return Pipeline(steps=[('scaler', sk_prep.StandardScaler()), ('classification', classifier)])


# Nested loops visit the combinations in the same order as
# itertools.product(fit_intercept, penalty, regularization).
for intercept_value in fit_intercept:
    for penalty_type in penalty:
        for c_value in regularization:
            hyper_params = dict(
                fit_intercept=intercept_value,
                penalty=penalty_type,
                C=c_value,
            )
            demo_classification.run_experiments_autolog(
                experiment_name,
                'Complex model',
                model_function,
                hyper_params,
                other_params,
            )

2022/10/26 17:53:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:52 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:53:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:54:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:54:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2022/10/26 17:54:16 INFO mlflow.tra

In [9]:
# Search space for the elastic-net penalty: intercept x C x l1_ratio
fit_intercept = [True, False]
penalty = ['elasticnet']
regularization = [0.001, 0.01, 0.05, 0.1, 0.5, 0.9]
l1_ratio = [0.25, 0.5, 0.75]

# Extra user-defined values passed along with every run
other_params = {
    'udf_model_type': 'Logistic Regression',
    'udf_scaler': 'Standard',
}


def model_function(**kwargs):
    """Return a StandardScaler -> LogisticRegression pipeline that uses the saga solver.

    Every keyword argument is forwarded to ``LogisticRegression`` next to
    ``solver='saga'``, so ``solver`` itself must not be passed in ``kwargs``.
    """
    # The saga solver accepts every penalty type, including elastic net
    scaler_step = ('scaler', sk_prep.StandardScaler())
    classifier_step = ('classification', linear_model.LogisticRegression(solver='saga', **kwargs))
    return Pipeline([scaler_step, classifier_step])


for intercept_value, penalty_type, c_value, l1_ratio_value in itertools.product(
    fit_intercept, penalty, regularization, l1_ratio
):
    hyper_params = dict(
        fit_intercept=intercept_value,
        penalty=penalty_type,
        C=c_value,
        l1_ratio=l1_ratio_value,
    )
    demo_classification.run_experiments_autolog(
        experiment_name,
        'Complex model',
        model_function,
        hyper_params,
        other_params,
    )

2022/10/26 17:56:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2022/10/26 17:56:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2022/10/26 17:56:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2022/10/26 17:56:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:56:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:57:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 17:57:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/

We can try with different kinds of algorithms like **random forest**

In [10]:
# Hyper parameters to use
n_estimators = [10, 20, 30, 40]
# NOTE: scikit-learn documents 'entropy' and 'log_loss' as the same split criterion
# (Shannon information gain), so those two settings train equivalent forests.
# 'log_loss' is only accepted by scikit-learn >= 1.1.
criterion = ['gini', 'entropy', 'log_loss']
max_features = ['sqrt', 'log2', 0.25, 0.5, 0.75]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Random Forest'}


def model_function(**kwargs):
    """Return a single-step pipeline wrapping a RandomForestClassifier.

    Keyword arguments are forwarded to ``RandomForestClassifier``. A fixed
    ``random_state`` is supplied by default so that repeated runs of the grid are
    reproducible and differences between runs come from the hyper parameters, not
    from the seed. Callers can still override it by passing ``random_state``.
    """
    forest_params = {'random_state': 42, **kwargs}
    udf_pipeline = Pipeline([('classification', ensemble.RandomForestClassifier(**forest_params))])

    return udf_pipeline


for n_estimators_value, criterion_type, max_features_value in itertools.product(n_estimators, criterion, max_features):
    hyper_params = {'n_estimators': n_estimators_value,
                    'criterion': criterion_type,
                    'max_features': max_features_value}

    demo_classification.run_experiments_autolog(experiment_name, 'Ensemble', model_function, hyper_params, other_params)

2022/10/26 18:00:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:00:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:00:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:00:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:00:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:04 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/10/26 18:01:38 INFO mlflow.tracking.fluent: A