In [72]:
!pip install mlflow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [73]:
executions = 1

### Perform experiment with different runs

In [81]:

import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC

# Load classification dataset
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

X_train = train_df[[str(i) for i in range(0,30)]]
y_train = train_df['y']
X_test = test_df[[str(i) for i in range(0,30)]]
y_test = test_df['y']

# Set up MLFlow experiment
mlflow.set_experiment(f"Breast Cancer Classification Experiment#: {executions}")

# Define models and their parameter variations within the dictionary
model_params = {
    "Logistic Regression": (LogisticRegression, [
        {"max_iter": 100, "C": 1.0},
        {"max_iter": 200, "C": 0.5}
    ]),
    "Random Forest": (RandomForestClassifier, [
        {"n_estimators": 50, "max_depth": 5},
        {"n_estimators": 100, "max_depth": None}
    ]),
    "Support Vector Classifier": (SVC, [
        {"C": 1.0, "kernel": "linear"},
        {"C": 0.5, "kernel": "rbf"}
    ])
}

# Train models with parameter settings
for model_name, (ModelClass, params_list) in model_params.items():
    for idx, params in enumerate(params_list):
        with mlflow.start_run(run_name=f"Model: {model_name} Params#: {idx}"):
            # Initialize and train model
            model = ModelClass(**params)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Calculate metrics
            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions)
            recall = recall_score(y_test, predictions)
            f1 = f1_score(y_test, predictions)

            # Log model parameters and metrics
            mlflow.log_param("model_name", model_name)
            for param_key, param_value in params.items():
                mlflow.log_param(param_key, param_value)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)
            mlflow.sklearn.log_model(model, "model")

            print(f"{model_name} with parameters {params} logged - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


2024/10/14 11:57:10 INFO mlflow.tracking.fluent: Experiment with name 'Breast Cancer Classification Experiment#: 2' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression with parameters {'max_iter': 100, 'C': 1.0} logged - Accuracy: 0.956140350877193, Precision: 0.9459459459459459, Recall: 0.9859154929577465, F1: 0.9655172413793104




Logistic Regression with parameters {'max_iter': 200, 'C': 0.5} logged - Accuracy: 0.956140350877193, Precision: 0.9459459459459459, Recall: 0.9859154929577465, F1: 0.9655172413793104




Random Forest with parameters {'n_estimators': 50, 'max_depth': 5} logged - Accuracy: 0.9649122807017544, Precision: 0.958904109589041, Recall: 0.9859154929577465, F1: 0.9722222222222222




Random Forest with parameters {'n_estimators': 100, 'max_depth': None} logged - Accuracy: 0.9649122807017544, Precision: 0.958904109589041, Recall: 0.9859154929577465, F1: 0.9722222222222222




Support Vector Classifier with parameters {'C': 1.0, 'kernel': 'linear'} logged - Accuracy: 0.956140350877193, Precision: 0.9459459459459459, Recall: 0.9859154929577465, F1: 0.9655172413793104




Support Vector Classifier with parameters {'C': 0.5, 'kernel': 'rbf'} logged - Accuracy: 0.9385964912280702, Precision: 0.9102564102564102, Recall: 1.0, F1: 0.9530201342281879


### Select best model and save (register) it

In [82]:
import mlflow
from mlflow.exceptions import RestException

# Define model name
model_name = "BreastCancerClassifierBestModel"

# Get the experiment and the best run from the experiment
# experiment_name = "Breast Cancer Classification with Automated Model Selection"
experiment = mlflow.get_experiment_by_name(name=f"Breast Cancer Classification Experiment#: {executions}")
best_run = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.f1_score DESC"], max_results=1
)



In [83]:
best_run

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.precision,metrics.f1_score,metrics.accuracy,metrics.recall,params.model_name,params.max_depth,params.n_estimators,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.name
0,653adfd7b72642c19f685f9c8a83a941,677389738210410175,FINISHED,file:///Users/stefanandonov/Documents/GitWorks...,2024-10-14 09:57:14.374000+00:00,2024-10-14 09:57:15.522000+00:00,0.958904,0.972222,0.964912,0.985915,Random Forest,,100,Model: Random Forest Params#: 1,LOCAL,stefanandonov,"[{""run_id"": ""653adfd7b72642c19f685f9c8a83a941""...",/Users/stefanandonov/Documents/GitWorkspace/eu...


In [84]:
from mlflow import MlflowException

if not best_run.empty:
    best_run_id = best_run.iloc[0].run_id
    best_f1_score = best_run.iloc[0]["metrics.f1_score"]

    # Check if the model already exists in the registry
    try:

        client = mlflow.tracking.MlflowClient()
        versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production", "Archived"])
        current_best_version = max(versions, key=lambda x: x.version)

        # Get the current best F1 score from the registry
        current_best_f1_score = mlflow.get_run(current_best_version.run_id).data.metrics["f1_score"]

        # Compare the new model's F1 score with the existing best version
        if best_f1_score > current_best_f1_score:
            # Register the new model as a new version
            mlflow.register_model(f"runs:/{best_run_id}/model", model_name)
            print(f"New model registered as a new version with F1 score {best_f1_score}")
        else:
            print("Current model has an equal or better F1 score. No new version registered.")

    except MlflowException:
        # If the model does not exist in the registry, register it for the first time
        mlflow.register_model(
            model_uri=f"runs:/{best_run_id}/model",
            name=model_name
        )
        print(f"Model registered for the first time with F1 score {best_f1_score}")


New model registered as a new version with F1 score 0.9722222222222222


  versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production", "Archived"])
Registered model 'BreastCancerClassifierBestModel' already exists. Creating a new version of this model...
Created version '2' of model 'BreastCancerClassifierBestModel'.


In [85]:
executions+=1

In [80]:
executions

2