# MLFlow experiment

In [1]:
import numpy as np
import pandas as pd
import math
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from mlflow.models import infer_signature
from sklearn.metrics import mean_squared_error
import pickle

#imports for type hinting
from typing import Tuple, Dict, Any
from sklearn.base import BaseEstimator

In [2]:
print("MLFlow: {}".format(mlflow.__version__))

MLFlow: 2.14.0


## Functions definition

In [3]:
#imports  dataset and splits it into X matrix and y vector
def preprocess_split(path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    dataset = pd.read_csv(path)
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    return train_test_split(X, y)

#standartizes the dataset the first time and saves the scaler
def preprocess_first_scale(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    with open('models/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    X_test = scaler.transform(X_test)
    return X_train, X_test

#standartizes the dataset the using already saved scaler
def preprocess_scale(X_test: np.ndarray) -> np.ndarray:
    scaler = StandardScaler()
    with open('models/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    X_test = scaler.transform(X_test)
    return X_test

In [4]:
#trains a given ml model on the gien dataset, measures and logs the performance
def train(model: BaseEstimator, X_train: np.ndarray, y_train: np.ndarray) -> None:
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    mlflow.log_metric("Training Accuracy", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")

In [5]:
#evaluates and logs a models's performance on a test set
def evaluate(model: BaseEstimator, X_test: np.ndarray, y_test: np.ndarray) -> Tuple[float, float]:
    y_pred = model.predict(X_test)
    r2 =  r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mean_squared_error", mse)
    return r2, mse

In [6]:
#functions for saving and loading sklearn models
def model_save(model: BaseEstimator, path: str) -> None:
    with open(path, 'wb') as f:
        pickle.dump(model, f)

def model_load(path: str) -> BaseEstimator:
    with open(path, 'rb') as f:
        model = pickle.load(f)
    return model

In [7]:
#this function creates a new mlflow run. One has to provide sklearn model, name of run, path to the dataset, and hyperparameters used 
def mlflow_run(model: BaseEstimator, name: str, path: str, params: Dict[str, Any]) -> Tuple[Dict[str, Any], float]:
    with mlflow.start_run(run_name=name):
        X_train, X_test, y_train, y_test = preprocess_split(path)
        X_train, X_test = preprocess_first_scale(X_train, X_test)
        train(model, X_train, y_train)
        r2, mse = evaluate(model, X_test, y_test)
        mlflow.log_params(params)
        mlflow.set_tag("Training Info", "Regression model for integration data")
        signature = infer_signature(X_train, model.predict(X_train))
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=name,
            signature=signature,
            input_example=X_train,
            registered_model_name=name,
        )
    return model_info, r2

## Setting experiment

In [8]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8000")

In [9]:
mlflow.set_experiment("Integration experiment")

<Experiment: artifact_location='mlflow-artifacts:/413595962434341677', creation_time=1721302419534, experiment_id='413595962434341677', last_update_time=1721302419534, lifecycle_stage='active', name='Integration experiment', tags={}>

In [10]:
#definting parameters for the 
sgd_params = {
    "max_iter": 1000,
    "loss": "squared_error",
    "penalty": "l2",
    "eta0": 0.01,
    "tol": 1e-3,
    "random_state": 42,
}

svr_params = {
    "kernel":'rbf', 
    "degree":3,
}
deg = [2, 3, 4, 5]

scores = list()
infos = list()

In [11]:
sgd = SGDRegressor(**sgd_params)

In [12]:
info_sgd, r2 = mlflow_run(sgd, "sgd_regressor", "data/trigo.csv", sgd_params)
scores.append(r2)
infos.append(info_sgd)

Train Accuracy: 98.402%


Registered model 'sgd_regressor' already exists. Creating a new version of this model...
2024/07/20 16:58:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sgd_regressor, version 2
Created version '2' of model 'sgd_regressor'.


In [13]:
for i in deg:
    svr_params["degree"] = i
    svr = SVR(**svr_params)
    info_svr, r2 = mlflow_run(svr, "svr_regressor with degree {}".format(str(i)), "data/trigo.csv", svr_params)
    scores.append(r2)
    infos.append(info_svr)

Train Accuracy: 98.478%


Registered model 'svr_regressor with degree 2' already exists. Creating a new version of this model...
2024/07/20 16:58:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svr_regressor with degree 2, version 2
Created version '2' of model 'svr_regressor with degree 2'.


Train Accuracy: 98.578%


Registered model 'svr_regressor with degree 3' already exists. Creating a new version of this model...
2024/07/20 16:58:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svr_regressor with degree 3, version 2
Created version '2' of model 'svr_regressor with degree 3'.


Train Accuracy: 98.472%


Registered model 'svr_regressor with degree 4' already exists. Creating a new version of this model...
2024/07/20 16:58:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svr_regressor with degree 4, version 2
Created version '2' of model 'svr_regressor with degree 4'.


Train Accuracy: 98.455%


Registered model 'svr_regressor with degree 5' already exists. Creating a new version of this model...
2024/07/20 16:58:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svr_regressor with degree 5, version 2
Created version '2' of model 'svr_regressor with degree 5'.


## Load the model back for predictions as a generic Python Function model

In [14]:
print(scores)

[0.9861494226625116, 0.9833225039917809, 0.9811579886620382, 0.982810041989036, 0.9843699097582242]


In [15]:
uri_of_model = infos[scores.index(max(scores))].model_uri

In [16]:
loaded_model = mlflow.pyfunc.load_model(uri_of_model)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|███████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 76.65it/s]


In [17]:
X_train, X_test, y_train, y_test = preprocess_split("data/trigo.csv")
X_train, X_test = preprocess_first_scale(X_train, X_test)

In [18]:
predictions = loaded_model.predict(X_test)

In [19]:
result = pd.DataFrame(X_test)
result["actual_value"] = y_test
result["predicted_value"] = predictions

result[:4]

Unnamed: 0,0,1,2,3,4,actual_value,predicted_value
0,-0.238604,1.084306,1.653479,0.208665,-0.47068,1.701229,1.683011
1,0.123408,0.311506,0.217718,0.229903,-0.250455,1.013131,1.043823
2,-0.067284,0.387019,0.619371,0.609699,-0.239971,1.262944,1.26424
3,-0.299455,0.272204,-0.352473,-0.644197,0.520732,0.657563,0.677355


In [20]:
model_save(loaded_model, "models/model.pkl")