# Introduction

Experiments with MLflow, following this [video tutorial](https://www.youtube.com/watch?v=WbicniUy_u0).

In [1]:
# Import third-party libraries
import mlflow
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split

# Report the installed MLflow version so the recorded outputs can be tied to it
print('MLflow Version: ', mlflow.__version__)

MLflow Version:  2.2.2


In [2]:
# Print the package-level help text for mlflow (module docstring and package contents)
help(mlflow)

Help on package mlflow:

NAME
    mlflow

DESCRIPTION
    The ``mlflow`` module provides a high-level "fluent" API for starting and managing MLflow runs.
    For example:
    
    .. code:: python
    
        import mlflow
    
        mlflow.start_run()
        mlflow.log_param("my", "param")
        mlflow.log_metric("score", 100)
        mlflow.end_run()
    
    You can also use the context manager syntax like this:
    
    .. code:: python
    
        with mlflow.start_run() as run:
            mlflow.log_param("my", "param")
            mlflow.log_metric("score", 100)
    
    which automatically terminates the run at the end of the ``with`` block.
    
    The fluent tracking API is not currently threadsafe. Any concurrent callers to the tracking API must
    implement mutual exclusion manually.
    
    For a lower level API, see the :py:mod:`mlflow.client` module.

PACKAGE CONTENTS
    __main__
    _doctor
    _spark_autologging
    artifacts (package)
    azure (package)
 

# Basic Code

In [3]:
# Make 'basic_code' the active experiment for the runs logged below;
# the returned Experiment object is displayed as the cell output
mlflow.set_experiment(experiment_name='basic_code')

2023/03/19 22:38:59 INFO mlflow.tracking.fluent: Experiment with name 'basic_code' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/s.porreca/Projects/cheat_sheets/machine_learning/mlflow/mlruns/176222180829175951', creation_time=1679261939948, experiment_id='176222180829175951', last_update_time=1679261939948, lifecycle_stage='active', name='basic_code', tags={}>

In [4]:
# Start an MLflow run with a randomised name. The `with` block ends the run
# automatically on exit, so no explicit `mlflow.end_run()` call is needed.
with mlflow.start_run(run_name='Run Name {}'.format(np.random.randint(1, 100))) as mlflow_run:

    # Show run ID (`run_id` replaces the deprecated `run_uuid` attribute)
    print('MLflow Run: ', mlflow_run.info.run_id)

    # Show Run Name
    print('MLflow Run: ', mlflow_run.info.run_name)

    # Set a Run tag
    mlflow.set_tag('tag', 'Tag {}'.format(np.random.randint(1, 100)))

    # Log a Parameter
    mlflow.log_param('parameter_1', np.random.randint(1, 100))

    # Log a Metric
    mlflow.log_metric('accuracy', np.random.randint(0, 100))

MLflow Run:  3918cf37c28945b38967dec892726529
MLflow Run:  Run Name 24


# Regression Example

In [5]:
# Load the training data: the first CSV column becomes the index and
# the 'date' column is parsed into datetimes
data = pd.read_csv(
    './../../data/books_sold_train.csv',
    index_col=0,
    parse_dates=['date'],
)

In [6]:
# Preview the first rows of the loaded data
data.head()

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [7]:
# Engineer a day-of-month feature from the parsed 'date' column
# (adds the 'day' column to `data` in place)
data['day'] = data['date'].dt.day

In [8]:
# Define the feature groups used by the preprocessing pipelines and the label column
numerical_features = ['day']
categorical_features = ['product', 'store', 'country']
labels = ['num_sold']

In [9]:
# Preprocessing for numerical features: a single standard-scaling step
numerical_features_pipeline = Pipeline([('numerical_scaler', StandardScaler())])

In [10]:
# Preprocessing for categorical features: one-hot encoding, with
# handle_unknown='ignore' so categories unseen during fit do not raise at transform time
categorical_features_pipeline = Pipeline([
    ('categorical_one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [11]:
# Bundle the preprocessing steps: each (name, pipeline, columns) entry routes
# a feature group through its dedicated pipeline
data_preprocessor = ColumnTransformer([
    ('numerical_preprocessing', numerical_features_pipeline, numerical_features),
    ('categorical_preprocessing', categorical_features_pipeline, categorical_features),
])

In [12]:
# Build the feature matrix (numerical columns first, then categorical) and the label frame
X = data.loc[:, numerical_features + categorical_features]
y = data.loc[:, labels]

In [13]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
# Define experiment name
mlflow_experiment_name = 'Regression Example'

# Retrieve the experiment if it already exists, otherwise create it.
# An explicit lookup replaces a blanket `except Exception`, so genuine failures
# from the tracking backend surface instead of being masked by the fallback.
mlflow_experiment = mlflow.get_experiment_by_name(mlflow_experiment_name)

if mlflow_experiment is None:
    mlflow_experiment_id = mlflow.create_experiment(name=mlflow_experiment_name)
else:
    mlflow_experiment_id = mlflow_experiment.experiment_id