In [1]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import dagshub
from dagshub import DAGsHubLogger
import mlflow
import pickle

In [2]:
def read_dataframe(filename):
    """Load the dataset CSV and cast its categorical columns to strings.

    Parameters
    ----------
    filename : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame. Non-missing values in the ``label`` and ``device``
        columns are converted to ``str``; missing values in those columns are
        kept as missing so that a later ``dropna()`` can still remove them.
    """
    df = pd.read_csv(filename)

    categorical = ['label', 'device']
    # A bare ``astype(str)`` can turn NaN into the literal text "nan", which
    # ``dropna()`` no longer recognises as missing. Remember where the real
    # values are and blank out everything else again after the cast.
    is_present = df[categorical].notna()
    df[categorical] = df[categorical].astype(str).where(is_present)

    return df

In [3]:
# Load the Waze CSV from the project's data directory.
dataset_path = "../data/waze_dataset.csv"
df = read_dataframe(dataset_path)

In [4]:
# Remove every row that contains at least one missing value.
df.dropna(inplace=True)

# Feature matrix: the columns used as model inputs.
feature_columns = ['sessions', 'drives', 'total_sessions']
X = df[feature_columns]

# Binary target: 1 when the label is 'retained', 0 for any other value.
y = (df['label'] == 'retained').astype('int64')

In [5]:
# Split the data into training and test sets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# NOTE(review): `target` is not referenced by any other cell visible in this
# notebook, and the target actually modelled is built from the 'label' column.
# Confirm whether this assignment is a leftover that can be removed.
target = 'duration'

Definir los `datasets` como objetos de `mlflow` para poder rastrearlos (tracking)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [8]:
import mlflow.data

# Feature frames as NumPy arrays.
X_train_array, X_test_array = (frame.values for frame in (X_train, X_test))

# Target series as flattened NumPy arrays.
y_train_array, y_test_array = (
    series.to_numpy().ravel() for series in (y_train, y_test)
)

Definir el `tracking URI` y el nombre del experimento

In [9]:
import mlflow

# Initialise the DagsHub integration for this repository with MLflow enabled.
# This has to run before the tracking URI is read below.
dagshub.init(repo_owner='Parcex10', repo_name='PROYECTO_OSKU', mlflow=True)

# Read the tracking URI MLflow is currently configured with and show it, so
# the reader can check that runs go to the expected server.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print("MLFLOW_TRACKING_URI:", MLFLOW_TRACKING_URI)

# Set the tracking URI explicitly (to the value just read) and select the
# experiment that all runs in this notebook are recorded under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="model_experiment")

MLFLOW_TRACKING_URI: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow


<Experiment: artifact_location='mlflow-artifacts:/57bd1ce7c19a48e0a7d07f64f62db70c', creation_time=1730432745321, experiment_id='0', last_update_time=1730432745321, lifecycle_stage='active', name='model_experiment', tags={}>

In [10]:
from dagshub import get_repo_bucket_client
# boto3 client bound to the repository's storage bucket.
s3 = get_repo_bucket_client("Parcex10/PROYECTO_OSKU")

# Upload the local dataset file to the repository bucket.
upload_args = {
    "Bucket": "PROYECTO_OSKU",               # name of the repo
    "Filename": "../data/waze_dataset.csv",  # local path of the file to upload
    "Key": "waze_dataset.csv",               # remote path to upload the file to
}
s3.upload_file(**upload_args)

In [11]:
# Turn on MLflow autologging for scikit-learn.
# NOTE(review): the registration cell later uses "runs:/<run_id>/model";
# presumably that "model" artifact is the one written by autologging. Confirm.
mlflow.sklearn.autolog()

In [12]:
# Wrap the NumPy train/test arrays as MLflow dataset objects.
# NOTE(review): no cell visible in this notebook passes these objects to an
# MLflow logging call. Confirm they are logged somewhere, otherwise they are unused.
training_dataset = mlflow.data.from_numpy(
    X_train_array,
    targets=y_train_array,
    name="train_waze_dataset",
)
validation_dataset = mlflow.data.from_numpy(
    X_test_array,
    targets=y_test_array,
    name="test_waze_dataset",
)

#### Nested Runs

Vamos a ver cómo podemos encadenar ejecuciones; para ello, vamos a definir varios modelos a entrenar:

In [13]:
# Seed for the estimators that use randomness, so repeated executions of the
# notebook train the same models and the RMSE-based ranking used to pick the
# registered model is reproducible. Same value as the train/test split seed.
RANDOM_STATE = 42

# Each entry pairs an estimator class with the keyword arguments used to
# instantiate it in the training loop.
models = [

    {"model": LogisticRegression,
     "params": {},
     },

    {"model": DecisionTreeClassifier,
     "params": {"random_state": RANDOM_STATE},
     },

    {"model": RandomForestClassifier,
     "params": {"random_state": RANDOM_STATE},
     },

    # probability=True enables probability estimates; random_state seeds the
    # shuffling used to compute them.
    {"model": SVC,
     "params": {"probability": True, "random_state": RANDOM_STATE},
     },

]

In [14]:
# Preprocessor object that the training cell pickles and logs as an artifact.
# NOTE(review): no cell visible in this notebook fits or applies `dv`, so it is
# serialized unfitted. Confirm this is intentional.
dv = DictVectorizer()

In [15]:
# Serialize the preprocessor once, before any run is opened. The file does not
# depend on the model being trained, so rewriting it on every loop iteration
# was redundant work.
# NOTE(review): `dv` is never fitted in the visible notebook, so this artifact
# is an unfitted DictVectorizer. Confirm it is still wanted.
os.makedirs("models", exist_ok=True)  # create 'models' if it does not exist
with open("models/preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

# One parent run groups a nested child run per candidate model.
with mlflow.start_run(run_name="Nested Runs"):
    for model in models:

        model_class = model["model"]
        model_name = model_class.__name__
        params = model["params"]

        with mlflow.start_run(run_name=model_name, nested=True):

            ml_model = model_class(**params)
            ml_model.fit(X_train, y_train)

            y_pred = ml_model.predict(X_test)

            # With 0/1 labels the RMSE is the square root of the
            # misclassification rate, so lower is still better. The metric
            # name is kept because the registration cell sorts on metrics.rmse.
            rmse = root_mean_squared_error(y_test, y_pred)
            mlflow.log_metric("rmse", rmse)

            # Also record test-set accuracy, which is easier to read for a classifier.
            mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

            # Attach the serialized preprocessor to this child run.
            mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/11/07 16:44:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow/#/experiments/0/runs/75a39eb80b6f4173a39a2a7180e31b6d.
2024/11/07 16:44:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow/#/experiments/0.
2024/11/07 16:44:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTreeClassifier at: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow/#/experiments/0/runs/4470edee59a04f65bcd65d97a006108b.
2024/11/07 16:44:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow/#/experiments/0.
2024/11/07 16:45:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestClassifier at: https://dagshub.com/Parcex10/PROYECTO_OSKU.mlflow/#/experiments/0/runs/eed6d489c9ea49b1aeca918

In [16]:
# Find the run with the lowest test RMSE in the experiment.
# - filter_string keeps only runs that logged an `rmse` metric, so the parent
#   "Nested Runs" run (which logs none) can never be selected regardless of how
#   the backend orders missing values.
# - max_results=1 avoids downloading every run just to use the first one.
best_runs = mlflow.search_runs(
    experiment_names=["model_experiment"],
    filter_string="metrics.rmse >= 0",
    order_by=['metrics.rmse ASC'],
    max_results=1,
    output_format="list",
)

# Fail with an explicit message instead of an IndexError when nothing matches.
if not best_runs:
    raise RuntimeError(
        "No runs with an 'rmse' metric were found in experiment 'model_experiment'."
    )
run_ = best_runs[0]

run_id = run_.info.run_id
# NOTE(review): assumes the selected run has a model logged under the "model"
# artifact path (presumably by sklearn autologging). Confirm.
run_uri = f"runs:/{run_id}/model"

# Register that run's model in the MLflow Model Registry.
result = mlflow.register_model(
    model_uri=run_uri,
    name="waze-model"
)

Successfully registered model 'waze-model'.
2024/11/07 16:46:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: waze-model, version 1
Created version '1' of model 'waze-model'.
