# Tracking de modelos con mlflow

In [32]:
import mlflow.sklearn
from mlflow import log_metric, log_param
from mlflow.tracking import MlflowClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import  root_mean_squared_error, accuracy_score
from sklearn.feature_extraction import  DictVectorizer
import os
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import dagshub
import mlflow
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import mlflow.data

In [33]:
def read_dataframe(filename):

    df = pd.read_csv(filename)

    categorical = ['label', 'device']
    df[categorical] = df[categorical].astype(str)

    return df

In [34]:
df = read_dataframe("../data/waze_dataset.csv")

In [35]:
df.dropna(inplace=True)  # Eliminar valores faltantes
X = df[['sessions', 'drives', 'total_sessions']]  # Seleccionar características
y = df['label'].apply(lambda x: 1 if x == 'retained' else 0)  # Convertir a variable binaria (1 para retenido, 0 para no retenido)

In [36]:
dagshub.init(repo_owner='Parcex10', repo_name='PROYECTO_OSKU', mlflow=True)

In [37]:
MLFLOW_TRACKING_URI = "https://dagshub.com/colome8/PROYECTO_OSKU.mlflow"
print("MLFLOW_TRACKING_URI:", MLFLOW_TRACKING_URI)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) 
mlflow.set_experiment("model-experiment")

MLFLOW_TRACKING_URI: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow


2024/11/25 02:17:38 INFO mlflow.tracking.fluent: Experiment with name 'model-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/cb8402797f8c4bb5a397fb2e0337559a', creation_time=1732522658508, experiment_id='0', last_update_time=1732522658508, lifecycle_stage='active', name='model-experiment', tags={}>

In [38]:
# Datos de ejemplo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
models = [
    
    {"model": LogisticRegression,
     "params": {},
     },
    
    {"model": DecisionTreeClassifier,
     "params": {},
     },
    
    {"model": RandomForestClassifier,
     "params": {}, 
     },

    {"model": SVC,
     "params": {"probability": True}
    },

]

In [40]:
dv = DictVectorizer()

In [41]:
with mlflow.start_run(run_name="Nested Runs"):
    for model in models:
        
        model_class = model["model"]
        model_name = model_class.__name__
        params = model["params"]
        
        with mlflow.start_run(run_name=model_name,nested=True):
            
            for param, value in params.items():
                log_param(param, value)

            ml_model = model_class(**params)
           
            ml_model.fit(X_train, y_train)
    
            y_pred = ml_model.predict(X_test)
            
            rmse = root_mean_squared_error(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("accuracy", accuracy)

            # Verifica y crea el directorio 'models' si no existe
            os.makedirs("models", exist_ok=True)
            
            # !mkdir models
            with open("models/preprocessor.b", "wb") as f_out:
                pickle.dump(dv, f_out)
                
            mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

            # Registrar el modelo
            mlflow.sklearn.log_model(ml_model, model_name)

            print(f"Modelo registrado con precisión: {accuracy}")

2024/11/25 02:17:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/c5599441a2ad46ddbb6ae20de981154b.
2024/11/25 02:17:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


Modelo registrado con precisión: 0.7897777777777778


2024/11/25 02:18:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTreeClassifier at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/a50c1163fe664e168df18277935a432f.
2024/11/25 02:18:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


Modelo registrado con precisión: 0.6531111111111111


2024/11/25 02:19:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestClassifier at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/24421b5ad755405989c7ba967f85c856.


Modelo registrado con precisión: 0.7348888888888889


2024/11/25 02:19:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.
2024/11/25 02:19:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/fb91d1ff47a941d0b13ffb0aa836eee8.
2024/11/25 02:19:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


Modelo registrado con precisión: 0.7897777777777778


2024/11/25 02:19:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Nested Runs at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/41912d4889fd4f2a8b732db361d89376.
2024/11/25 02:19:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


In [42]:
mlflow.end_run()

Hyperparameter tuning

In [43]:
from sklearn.model_selection import RandomizedSearchCV

# Modelos y parámetros
models = [
    {
        "model": LogisticRegression,
        "params": {"C": [0.1, 1.0, 10], "solver": ["liblinear", "lbfgs"]},
    },
    {
        "model": DecisionTreeClassifier,
        "params": {"max_depth": [3, 5, 10], "min_samples_split": [2, 5, 10]},
    },
    {
        "model": RandomForestClassifier,
        "params": {"n_estimators": [50, 100], "max_depth": [5, 10, None]},
    },
]

In [44]:
# Ejecutar runs anidados
with mlflow.start_run(run_name="Hyperparameter Tuning"):
    for model in models:
        model_class = model["model"]
        model_name = model_class.__name__
        param_grid = model["params"]

        with mlflow.start_run(run_name=model_name, nested=True):
            # Loguear el modelo y su grid de hiperparámetros
            mlflow.log_param("param_grid", param_grid)

            # Configurar Randomized search
            randomized_search = RandomizedSearchCV(
            estimator=model_class(),
            param_distributions=param_grid,
            n_iter=9,  
            scoring="accuracy",
            cv=2,
            n_jobs=-1
            )


            # Ajustar modelo
            start_time = time.time()

            randomized_search.fit(X_train, y_train)
            duration = time.time() - start_time

            # Mejor modelo y resultados
            best_model = randomized_search.best_estimator_
            best_params = randomized_search.best_params_
            best_score = randomized_search.best_score_

            # Predicciones en conjunto de prueba
            y_pred = best_model.predict(X_test)
            rmse = root_mean_squared_error(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)

            # Loguear métricas y parámetros
            mlflow.log_param("best_params", best_params)
            mlflow.log_metric("cv_accuracy", best_score)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("tuning_duration", duration)

            # Guardar el preprocesador
            os.makedirs("models", exist_ok=True)
            with open("models/preprocessor.b", "wb") as f_out:
                pickle.dump(dv, f_out)

            mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

            # Registrar el mejor modelo
            mlflow.sklearn.log_model(best_model, model_name)

            print(f"Modelo '{model_name}' registrado con precisión en prueba: {accuracy:.4f}")


2024/11/25 02:20:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/3b3d394e26674ee08bb550d1d8ae00f5.


Modelo 'LogisticRegression' registrado con precisión en prueba: 0.7898


2024/11/25 02:20:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


Modelo 'DecisionTreeClassifier' registrado con precisión en prueba: 0.7896


2024/11/25 02:20:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTreeClassifier at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/a11ec8070c2440af9fe3be70d313c757.
2024/11/25 02:20:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.
2024/11/25 02:21:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestClassifier at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/1736b91b81ea4bceb099784f39677b89.


Modelo 'RandomForestClassifier' registrado con precisión en prueba: 0.7898


2024/11/25 02:21:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.
2024/11/25 02:21:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run Hyperparameter Tuning at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0/runs/eb68a2595c764120887863dbbdc30584.
2024/11/25 02:21:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/colome8/PROYECTO_OSKU.mlflow/#/experiments/0.


Champion y Challenger

In [45]:
# Nombre del experimento
experiment_name = "model-experiment"  # Ajusta el nombre si es diferente
client = MlflowClient()

# Obtener el ID del experimento
experiment_id = client.get_experiment_by_name(experiment_name).experiment_id

# Buscar y ordenar las runs por accuracy
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="",
    order_by=["metrics.accuracy DESC"],  # Ordenar por mayor accuracy
    max_results=10  # Opcional: limitar el número de runs
)

Registrar modelo

In [46]:
# Registrar el experimento en el Model Registry
model_registry_name = "model-experiment"

try:
    client.get_registered_model(model_registry_name)
    print(f"El registro de modelos '{model_registry_name}' ya existe.")
except:
    client.create_registered_model(model_registry_name)
    print(f"Registro de modelos '{model_registry_name}' creado.")


# Asignar Champion y Challenger
if len(runs) >= 2:
    # Run con mayor accuracy
    best_run = runs[0]
    second_best_run = runs[1]

    # Registrar modelos
    best_model_version = client.create_model_version(
        name=model_registry_name,
        source=f"runs:/{best_run.info.run_id}/model",  # Ruta del modelo en la run
        run_id=best_run.info.run_id
    )

    second_best_model_version = client.create_model_version(
        name=model_registry_name,
        source=f"runs:/{second_best_run.info.run_id}/model",
        run_id=second_best_run.info.run_id
    )

    # Asignar Champion
    client.transition_model_version_stage(
        name=model_registry_name,
        version=best_model_version.version,
        stage="Production"
    )
    client.set_registered_model_alias(model_registry_name, "Champion", best_model_version.version)

    # Asignar Challenger
    client.transition_model_version_stage(
        name=model_registry_name,
        version=second_best_model_version.version,
        stage="Staging"
    )
    client.set_registered_model_alias(model_registry_name, "Challenger", second_best_model_version.version)

    print(f"Champion: Run ID {best_run.info.run_id}, Accuracy: {best_run.data.metrics['accuracy']}")
    print(f"Challenger: Run ID {second_best_run.info.run_id}, Accuracy: {second_best_run.data.metrics['accuracy']}")
else:
    print("No hay suficientes runs para asignar Champion y Challenger.")



2024/11/25 02:21:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model-experiment, version 1


Registro de modelos 'model-experiment' creado.


2024/11/25 02:21:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model-experiment, version 2
  client.transition_model_version_stage(
  client.transition_model_version_stage(


Champion: Run ID 1736b91b81ea4bceb099784f39677b89, Accuracy: 0.7897777777777778
Challenger: Run ID 3b3d394e26674ee08bb550d1d8ae00f5, Accuracy: 0.7897777777777778
