In [16]:
%load_ext autoreload
%autoreload 2

import udf_utils
from sklearn import linear_model
from sklearn import ensemble
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as sk_prep
import itertools
import re
import mlflow

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## MLFlow demo

This notebook have code to run some examples to use mlflow in machine learning problems

--------
### Regression examples
--------

1. First, we need to define the data to use, in this case the *problem_type* is **regression**

In [None]:
demo_regression = udf_utils.ML_FLOW(problem_type = 'regression').generate_data()
experiment_name = 'Regression problem'

2. Then, we need to define the model(s) and the hyperparameters to use in the trainning

A first model could be a linear regression. This can be our base model.

In [None]:
# Hyper parameters to use
fit_intercept = [True, False]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Linear Regression', 'udf_regularization': 'None', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('regression', linear_model.LinearRegression(**kwargs))])
    
    return udf_pipeline

for intercept_value in fit_intercept:
    hyper_params = {'fit_intercept': intercept_value}
    demo_regression.run_experiments_autolog(experiment_name, 'Base line model', model_function, hyper_params, other_params)

Then we can start to use regularization os use complex model to see if we have an improvement.

In [None]:
# Hyper parameters to use
fit_intercept = [True, False]
regularization = [0.1, 0.5, 1.0, 2.0, 5.0, 10, 50, 100]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Ridge Regression', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('regression', linear_model.Ridge(**kwargs))])
    
    return udf_pipeline

for intercept_value, alpha_value in itertools.product(fit_intercept, regularization):
    hyper_params = {'fit_intercept': intercept_value,
                    'alpha': alpha_value}
    
    demo_regression.run_experiments_autolog(experiment_name, 'Complex model', model_function, hyper_params, other_params)

In [None]:
# Hyper parameters to use
fit_intercept = [True, False]
regularization = [0.1, 0.5, 1.0, 2.0, 5.0, 10, 50, 100]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Lasso Regression', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('regression', linear_model.Lasso(**kwargs))])
    
    return udf_pipeline

for intercept_value, alpha_value in itertools.product(fit_intercept, regularization):
    hyper_params = {'fit_intercept': intercept_value,
                    'alpha': alpha_value}
    
    demo_regression.run_experiments_autolog(experiment_name, 'Complex model', model_function, hyper_params, other_params)

--------
### Classification examples
--------


1. First, we need to define the data to use, in this case the *problem_type* is **classification**

In [2]:
demo_classification = udf_utils.ML_FLOW(problem_type = 'classification').generate_data(n_classes = 2)
experiment_name = 'Classification problem'

2. Then, we need to define the model(s) and the hyperparameters to use in the trainning

A first model could be a logistic regression. This can be our base model.

In [3]:
fit_intercept = [True, False]
other_params = {'udf_model_type': 'Logistic Regression', 'udf_regularization': 'None', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('classification', linear_model.LogisticRegression(**kwargs))])
    
    return udf_pipeline

for intercept_value in fit_intercept:
    hyper_params = {'fit_intercept': intercept_value}
    demo_classification.run_experiments_autolog(experiment_name, 'Base line model', model_function, hyper_params, other_params)

2022/11/16 11:03:04 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  y = column_or_1d(y, warn=True)


AttributeError: 'DataFrame' object has no attribute 'reshape'

Then we can start to use regularization os use complex model to see if we have an improvement.

In [None]:
# Hyper parameters to use
fit_intercept = [True, False]
penalty = ['l2', 'l1']
regularization = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Logistic Regression', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    # Solver saga allow all the penalties
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('classification', linear_model.LogisticRegression(**kwargs, solver = 'saga'))])
    
    return udf_pipeline

for intercept_value, penalty_type, c_value in itertools.product(fit_intercept, penalty, regularization):
    hyper_params = {'fit_intercept': intercept_value,
                    'penalty': penalty_type,
                    'C': c_value}
    
    demo_classification.run_experiments_autolog(experiment_name, 'Complex model', model_function, hyper_params, other_params)

In [None]:
# Hyper parameters to use
fit_intercept = [True, False]
penalty = ['elasticnet']
regularization = [0.001, 0.01, 0.05, 0.1, 0.5, 0.9]
l1_ratio = [0.25, 0.5, 0.75]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Logistic Regression', 'udf_scaler': 'Standard'}

def model_function(**kwargs):
    
    # Solver saga allow all the penalties
    udf_pipeline = Pipeline([('scaler', sk_prep.StandardScaler()), ('classification', linear_model.LogisticRegression(**kwargs, solver = 'saga'))])
    
    return udf_pipeline

for intercept_value, penalty_type, c_value, l1_ratio_value in itertools.product(fit_intercept, penalty, regularization, l1_ratio):
    hyper_params = {'fit_intercept': intercept_value,
                    'penalty': penalty_type,
                    'C': c_value,
                    'l1_ratio': l1_ratio_value}
    
    demo_classification.run_experiments_autolog(experiment_name, 'Complex model', model_function, hyper_params, other_params)

We can try with different kinds of algorithms like **random forest**

In [None]:
# Hyper parameters to use
n_estimators = [10, 20, 30, 40]
criterion = ['gini', 'entropy', 'log_loss']
max_features = ['sqrt', 'log2', 0.25, 0.5, 0.75]

# Parameters defined by the user to log
other_params = {'udf_model_type': 'Random Forest'}

def model_function(**kwargs):
    
    udf_pipeline = Pipeline([('classification', ensemble.RandomForestClassifier(**kwargs))])
    
    return udf_pipeline

for n_estimators_value, criterion_type, max_features_value in itertools.product(n_estimators, criterion, max_features):
    hyper_params = {'n_estimators': n_estimators_value,
                    'criterion': criterion_type,
                    'max_features': max_features_value}
    
    demo_classification.run_experiments_autolog(experiment_name, 'Ensemble', model_function, hyper_params, other_params)

--------
# Load Model
--------

In [17]:
experiments_list = mlflow.search_experiments(filter_string = "name='{}'".format(experiment_name))
id_list = [experiment.experiment_id for experiment in experiments_list]
id_list

['2']

In [24]:
# Regression metric
#metric_name = 'metrics.test_rms'

# Classification metric
metric_name = 'metrics.test_accuracy_score'

var_artifact_uri = mlflow.search_runs(id_list) \
                         .sort_values(by = metric_name, ascending = False) \
                         .artifact_uri[0]

var_artifact_uri

'file:///mnt/c/Users/V%C3%ADctor%20Samayoa/Documents/Qbitz/Git/mlflow-demo/mlruns/2/70cf77d29da244218a66f94f27ccb4d7/artifacts'

In [25]:
var_regex = re.search(r'/mlruns/.*/(.*)/artifacts', var_artifact_uri)
model_path = 'runs:/' + var_regex.group(1) + '/model'
model_path

'runs:/70cf77d29da244218a66f94f27ccb4d7/model'

In [29]:
my_model = mlflow.sklearn.load_model(model_path)

In [30]:
# Regression
#my_model.predict(demo_regression.x_train)

# Clasification
my_model.predict(demo_classification.x_test)

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1])

In [31]:
# Clasification
my_model.predict_proba(demo_classification.x_test)

array([[0.98411086, 0.01588914],
       [0.73570781, 0.26429219],
       [0.91359349, 0.08640651],
       [0.56545424, 0.43454576],
       [0.46633733, 0.53366267],
       [0.05807924, 0.94192076],
       [0.77281579, 0.22718421],
       [0.36056152, 0.63943848],
       [0.23249973, 0.76750027],
       [0.5772653 , 0.4227347 ],
       [0.97899389, 0.02100611],
       [0.53600734, 0.46399266],
       [0.04761788, 0.95238212],
       [0.33461577, 0.66538423],
       [0.48170992, 0.51829008],
       [0.41003707, 0.58996293],
       [0.93969499, 0.06030501],
       [0.12136402, 0.87863598],
       [0.18157623, 0.81842377],
       [0.17666669, 0.82333331],
       [0.8406243 , 0.1593757 ],
       [0.05433343, 0.94566657],
       [0.01422153, 0.98577847],
       [0.99406542, 0.00593458],
       [0.39996821, 0.60003179],
       [0.37944285, 0.62055715],
       [0.5942216 , 0.4057784 ],
       [0.06764768, 0.93235232],
       [0.31045108, 0.68954892],
       [0.51444362, 0.48555638],
       [0.