# 1. Preamboli

In [None]:
import pandas as pd
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from mlflow.tracking import MlflowClient

import os
import re

import platform
import sys
import json

import pickle, tempfile

## Utils

In [None]:
def save_dataset_version(dataset_name: str, df: pd.DataFrame):
    """Save ``df`` as the next numbered CSV version of a dataset.

    Files are written to ``datasets/<dataset_name>/vNN.csv`` (relative to the
    current working directory), where ``NN`` is a zero-padded, incrementing
    version number that starts at 1.

    Args:
        dataset_name: Name of the sub-folder created under ``datasets/``.
        df: Data to persist; it is written without the index column.

    Returns:
        A ``(filepath, version_number)`` tuple describing the file just written.
    """
    dataset_dir = os.path.join("datasets", dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)

    # ``fullmatch`` with "two or more digits": look-alikes such as
    # "v01.csv.bak" are ignored, and numbering keeps working past v99
    # (a strict two-digit pattern would miss v100.csv and overwrite it).
    version_pattern = re.compile(r"v(\d{2,})\.csv")
    matches = (version_pattern.fullmatch(name) for name in os.listdir(dataset_dir))
    versions = [int(m.group(1)) for m in matches if m]

    # No existing version -> start from v01.
    version_number = max(versions, default=0) + 1

    filepath = os.path.join(dataset_dir, f"v{version_number:02d}.csv")
    df.to_csv(filepath, index=False)

    return filepath, version_number

In [None]:
def load_dataset(dataset_name):
    """Load a supported scikit-learn dataset, version it on disk and split it.

    The full dataframe (features plus a ``target`` column) is first saved
    through ``save_dataset_version`` and then split into train and test sets
    with ``random_state=42``.

    Args:
        dataset_name: ``"iris_dataset"`` (50% of the rows held out for test)
            or ``"breast_cancer_dataset"`` (30% held out for test).

    Returns:
        ``(X_train, X_test, y_train, y_test, dataset_version, dataset_path, df)``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    # name -> (scikit-learn loader, test fraction used for that dataset)
    supported = {
        "iris_dataset": (load_iris, 0.5),
        "breast_cancer_dataset": (load_breast_cancer, 0.3),
    }
    try:
        loader, test_size = supported[dataset_name]
    except KeyError:
        # Without this guard the function would fail later with an
        # UnboundLocalError that hides the real problem.
        raise ValueError(
            f"Unsupported dataset {dataset_name!r}; expected one of {sorted(supported)}"
        ) from None

    bunch = loader()
    df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    df['target'] = bunch.target

    # Store a versioned local copy (DVC or Git LFS would be alternatives).
    dataset_path, dataset_version = save_dataset_version(dataset_name, df)

    # Train/test split
    X = df[bunch.feature_names]
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    return X_train, X_test, y_train, y_test, dataset_version, dataset_path, df

In [None]:
# Helper that trains one model and logs the whole experiment to MLflow
def train_and_log_model(model_name="IrisClassifier",
                        model_dict=None,
                        X_train=None,
                        X_test=None,
                        y_train=None,
                        y_test=None,
                        dataset_version=None,
                        dataset_name=None,
                        ):
    """Train a classifier, evaluate it and log everything to an MLflow run.

    Args:
        model_name: Name under which the model is registered in MLflow; also
            used as the file name of the pickled copy stored in the artifacts.
        model_dict: Model configuration. ``"model"`` selects the estimator
            (``"RandomForest"`` or ``"LogisticRegression"``); the other keys
            are its hyper-parameters (``n_estimators`` and ``max_depth`` for
            the forest, ``max_iter`` for the logistic regression). When
            omitted, a 100-tree random forest with ``max_depth=None`` is used.
        X_train, X_test, y_train, y_test: Train/test split. ``X_train`` and
            ``y_train`` are used as pandas objects (``.columns``, ``.head``,
            ``.iloc``).
        dataset_version: Dataset version, logged as a param when not ``None``.
        dataset_name: Dataset name, logged as a param when not ``None``.

    Returns:
        The id of the MLflow run that was created.

    Raises:
        ValueError: If ``model_dict["model"]`` is not a supported model type.
    """
    # Local import: the file only imports sklearn submodules, so the top-level
    # package name is not otherwise in scope for reading ``__version__``.
    import sklearn

    # A fresh dict per call instead of a shared mutable default argument.
    if model_dict is None:
        model_dict = {
            "model": "RandomForest",
            "n_estimators": 100,
            "max_depth": None,
        }

    model_class = model_dict["model"]
    # Validate before opening a run so a typo does not leave an empty run
    # behind (and does not surface later as an unbound ``model``).
    if model_class not in ("RandomForest", "LogisticRegression"):
        raise ValueError(
            f"Unsupported model type {model_class!r}; "
            "expected 'RandomForest' or 'LogisticRegression'"
        )

    with mlflow.start_run() as run:
        # 1. Build and fit the model, logging its hyper-parameters
        if model_class == "RandomForest":
            model = RandomForestClassifier(
                n_estimators=model_dict["n_estimators"],
                max_depth=model_dict["max_depth"],
                random_state=42,
            )
            model.fit(X_train, y_train)

            mlflow.log_param("n_estimators", model_dict["n_estimators"])
            mlflow.log_param("max_depth", model_dict["max_depth"])

            # Feature importances, kept as a JSON artifact for later analysis
            feature_importances = dict(zip(X_train.columns, model.feature_importances_))
            mlflow.log_dict(feature_importances, "feature_importances.json")
        else:
            model = LogisticRegression(max_iter=model_dict["max_iter"], random_state=42)
            model.fit(X_train, y_train)

            mlflow.log_param("max_iter", model_dict["max_iter"])

        # 2. Predictions and scores
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds, average="weighted")
        rec = recall_score(y_test, preds, average="weighted")
        f1 = f1_score(y_test, preds, average="weighted")

        # 3. Main run parameters
        mlflow.log_param("model_class", model_class)
        if dataset_version is not None:
            mlflow.log_param("dataset_version", dataset_version)
        if dataset_name is not None:
            mlflow.log_param("dataset_name", dataset_name)

        # 4. Scores. Logged as real metrics at full precision...
        mlflow.log_metrics({
            "accuracy": float(acc),
            "precision_weighted": float(prec),
            "recall_weighted": float(rec),
            "f1_weighted": float(f1),
        })
        # ...and still as rounded params, because existing code reads the
        # scores back from the run params.
        mlflow.log_param("accuracy", round(float(acc), 2))
        mlflow.log_param("precision_weighted", round(float(prec), 2))
        mlflow.log_param("recall_weighted", round(float(rec), 2))
        mlflow.log_param("f1_weighted", round(float(f1), 2))

        # --- Local package (pickle + environment description) uploaded as artifacts
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, f"{model_name}.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            # Environment information
            env_info = {
                "python_version": sys.version,
                "platform": platform.platform(),
                "mlflow_version": mlflow.__version__,
                # The actual version string, not just the package name
                "sklearn_version": sklearn.__version__,
            }
            env_path = os.path.join(tmpdir, "environment_info.json")
            with open(env_path, "w") as f:
                json.dump(env_info, f, indent=2)

            # Standard (unpinned) requirements list
            reqs_path = os.path.join(tmpdir, "requirements.txt")
            with open(reqs_path, "w") as f:
                f.write("scikit-learn\nmlflow\npandas\nnumpy\n")

            # Upload the folder content (model + environment files)
            mlflow.log_artifacts(tmpdir, artifact_path="model_package")

        # 5. Log the model with its conda environment and register it, in a
        # single call: logging twice to the same "model" path only repeated
        # the upload.
        # N.B. Not every model has to be registered: registration could be
        # dropped here and done afterwards for selected runs only.
        conda_env = mlflow.sklearn.get_default_conda_env()
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            conda_env=conda_env,
            registered_model_name=model_name,
        )

        # 6. Two example rows (features + target) from the training set
        sample_input = X_train.head(2).copy()
        sample_input["target"] = y_train.iloc[:2].values
        mlflow.log_table(sample_input, "sample_input.parquet")

        # 7. Short summary
        print(f"Run {run.info.run_id} - Acc: {acc:.4f}")

        return run.info.run_id

# 2. Import e versioning del dataset

In [None]:
# Example to run; the later cells branch on this value.
example_name = "breast_cancer" # supported values: "breast_cancer" / "iris"

In [None]:
# Dataset identifier: "breast_cancer_dataset" or "iris_dataset"
dataset_name = example_name + "_dataset"

# Load the data, store a new versioned copy on disk and split train/test
(
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_version,
    dataset_path,
    df,
) = load_dataset(dataset_name)

print(f"Dataset salvato in: {dataset_path}")
print(f"Versione del dataset: {dataset_version}")

# Preview of the first two rows (displayed as the cell output)
df.head(2)

# 3. Setup MLflow Tracking

In [None]:
# Per impostare l'URI di tracciamento 
# mlflow.set_tracking_uri("")

In [None]:
# Set the MLflow experiment name, derived from the selected example
mlflow.set_experiment(f"{example_name}_Classification")

## Runs

In [None]:
# Model configurations to compare: one dict per run. "model" selects the
# estimator, the remaining keys are its hyper-parameters.
D_randomforest_1 = {
    "model": "RandomForest",
    "n_estimators": 10,
    "max_depth": 3,
}

D_randomforest_2 = {
    "model": "RandomForest",
    "n_estimators": 20,
    "max_depth": 5,
}

D_logistic_1 = {
    "model": "LogisticRegression",
    "max_iter": 10,
}

# Uses a different iteration budget than D_logistic_1: an identical copy
# would only train and register the same model twice.
D_logistic_2 = {
    "model": "LogisticRegression",
    "max_iter": 100,
}

D_list = [D_randomforest_1, D_randomforest_2, D_logistic_1, D_logistic_2]

In [None]:
# Registered-model name, passed as model_name to train_and_log_model
MODEL_NAME = f"{example_name}_Classifier"

In [None]:
# Arguments shared by every run: same registered name, same split,
# same dataset metadata.
common_args = dict(
    model_name=MODEL_NAME,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_version=dataset_version,
    dataset_name=dataset_name,
)

# One experiment run per configuration
for D in D_list:
    run_id = train_and_log_model(model_dict=D, **common_args)

# 4. Gestione dei modelli e MLflow Model Registry

In [None]:
client = MlflowClient()

# 1. Fetch every registered version of the model
versions = client.search_model_versions(f"name='{MODEL_NAME}'")

# 2. Find the version whose run has the best accuracy
best_run_id = None
best_accuracy = -1.0
best_model_version = None

for v in versions:
    run_id = v.run_id
    if not run_id:
        # A version without a source run has no scores to compare.
        continue

    run_data = client.get_run(run_id).data
    metrics = run_data.metrics
    params = run_data.params

    # Prefer the logged metric (full precision); fall back to the rounded
    # "accuracy" param for runs that only stored the score there.
    acc = metrics.get("accuracy")
    if acc is None:
        acc = params.get("accuracy")

    if acc is not None and float(acc) > best_accuracy:
        best_accuracy = float(acc)
        best_run_id = run_id
        best_model_version = v.version

if best_model_version is None:
    # Avoid reporting the -1.0 placeholder as if it were a real score.
    print(f"Nessuna versione di '{MODEL_NAME}' con accuracy disponibile.")
else:
    print(f"Miglior modello trovato: run_id={best_run_id}, versione={best_model_version}, accuracy={best_accuracy:.4f}")

In [None]:
# 3. Promote the best version (here to "Production"; "Staging" is the
#    other option mentioned for this step)
if best_model_version is not None:
    promotion = {
        "name": MODEL_NAME,
        "version": best_model_version,
        "stage": "Production",
        # Move the versions currently in Production out of that stage
        "archive_existing_versions": True,
    }
    client.transition_model_version_stage(**promotion)
    print(f"Il modello versione {best_model_version} è stato promosso a Production.")

# 5. Inference

In [None]:
# Load the registered model that is in the "Production" stage
model_uri = f"models:/{MODEL_NAME}/Production"
model = mlflow.pyfunc.load_model(model_uri)

In [None]:
# Build new input rows (same column schema used for training)
if example_name == "iris":
    # Three hand-written Iris samples
    new_data = pd.DataFrame({
        "sepal length (cm)": [5.1, 6.2, 5.9],
        "sepal width (cm)":  [3.5, 2.9, 3.0],
        "petal length (cm)": [1.4, 4.3, 5.1],
        "petal width (cm)":  [0.2, 1.3, 1.8],
    })


elif example_name == "breast_cancer":
    # Three hand-written Breast Cancer samples
    new_data = pd.DataFrame({
        "mean radius": [14.0, 20.0, 13.5],
        "mean texture": [20.0, 30.0, 15.0],
        "mean perimeter": [90.0, 130.0, 85.0],
        "mean area": [600.0, 1200.0, 500.0],
        "mean smoothness": [0.1, 0.15, 0.09],
        "mean compactness": [0.1, 0.2, 0.08],
        "mean concavity": [0.05, 0.1, 0.04],
        "mean concave points": [0.02, 0.05, 0.01],
        "mean symmetry": [0.2, 0.3, 0.18],
        "mean fractal dimension": [0.06, 0.07, 0.05],
        "radius error": [0.3, 0.4, 0.2],
        "texture error": [1.5, 2.0, 1.2],
        "perimeter error": [2.5, 3.5, 2.0],
        "area error": [20.0, 30.0, 15.0],
        "smoothness error": [0.005, 0.007, 0.004],
        "compactness error": [0.02, 0.03, 0.015],
        "concavity error": [0.02, 0.025, 0.01],
        "concave points error": [0.01, 0.015, 0.008],
        "symmetry error": [0.02, 0.03, 0.015],
        "fractal dimension error": [0.003, 0.004, 0.002],
        "worst radius": [16.0, 25.0, 14.5],
        "worst texture": [25.0, 40.0, 20.0],
        "worst perimeter": [110.0, 160.0, 95.0],
        "worst area": [800.0, 1500.0, 600.0],
        "worst smoothness": [0.15, 0.2, 0.12],
        "worst compactness": [0.25, 0.3, 0.2],
        "worst concavity": [0.15, 0.2, 0.1],
        "worst concave points": [0.07, 0.1, 0.05],
        "worst symmetry": [0.3, 0.4, 0.25],
        "worst fractal dimension": [0.08, 0.1, 0.07],
    })

else:
    # Fail here with a clear message instead of a NameError on new_data below.
    raise ValueError(
        f"Unsupported example_name {example_name!r}; expected 'iris' or 'breast_cancer'"
    )


# Run inference with the loaded model
predictions = model.predict(new_data)

print("Inferenza:")
print(predictions)