
## Tarea y actividad en clase.

1. Hacer merge de la rama que trabajamos a main.
2. Crear una nueva rama que se llame `feat: tarea 5`.
3. Crear un nuevo `jupyter-notebook` llamado `challenger-experiments.ipynb` en la rama creada anteriormente
4. Hacer dos `parent experiments` con `Gradient Boost` y `Random Forest` regressors en donde cada uno tenga `child experiments` con búsqueda de hiperparámetros. Puede usar cualquier librería con la que se sienta cómodo: `hyperopt`, `optuna`, `scikit-learn` (Grid Search, Random Search, Halving Search, etc.).
5. Registrar el modelo con la mejor métrica `validation-rmse` de los obtenidos en dichos experimentos en el `model registry` en el mismo modelo ya previamente creado `nyc-taxi-model`.
6. Asígnele el alias `challenger`
7. Descargue en la carpeta `data` el conjunto de datos correspondiente a marzo del 2025
8. Use ese conjunto de datos para probarlo sobre los modelos con el alias `champion` y `challenger`
9. Obtenga la métrica de cada modelo
10. Decida si el nuevo modelo `challenger` debe ser promovido a `champion` o no. Use los criterios que usted como Data Scientist considere relevantes y justifique la respuesta.
11. Abrir un `PR` con los cambios hechos en la rama `feat: tarea 5` hacia la rama `main`.


Habrá dos entregas divididas de la siguiente manera:

1. **Trabajo en clase hoy Martes 21 de Octubre de 2025.** Para esta entrega, hacer un commit con el siguiente mensaje `feat: entrega trabajo en clase` con los avances realizados en clase.

2. **Tarea: Martes 28 de Octubre de 2025 a las 19:55.** Esta entrega debe contener todo lo descrito anteriormente

In [5]:
import os, mlflow
from dotenv import load_dotenv

# Load the variables from the .env file; override=True lets them replace
# values that are already present in the process environment.
load_dotenv(override=True)  # Loads the variables from the .env file
# Experiment path used by every run logged from this notebook.
EXPERIMENT_NAME = "/Users/oscar.josue2204@gmail.com/nyc-taxi-experiments"

# Point MLflow at Databricks before selecting the experiment, so the
# experiment is looked up on that tracking server.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [4]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

In [5]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (trip length in minutes), keeps only the trips
    lasting between 1 and 60 minutes inclusive, and casts the pickup and
    dropoff location IDs to strings so they can be used as categories.

    Args:
        filename: Path (or any source accepted by ``pd.read_parquet``) of a
            file containing the ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``
            columns.

    Returns:
        A new, independent DataFrame holding the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice of the unfiltered one
    # (this is what triggers pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
# January 2025 trips are the training set; February 2025 trips are the
# validation set used to score every hyper-parameter trial.
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

In [7]:
def preprocess(df, dv):
    """Encode a trips frame with an already-fitted vectorizer.

    Adds a ``PU_DO`` column to ``df`` in place (pickup and dropoff IDs joined
    with an underscore), then converts the ``PU_DO`` and ``trip_distance``
    columns to a list of per-row dicts and passes it to ``dv.transform``.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns; it is modified in place.
        dv: Object exposing ``transform(records)``.

    Returns:
        Whatever ``dv.transform`` returns for the row records.
    """
    feature_columns = ['PU_DO', 'trip_distance']

    pickup, dropoff = df['PULocationID'], df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)

In [8]:
# Feature layout: one combined pickup/dropoff category plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']

# Build the combined "<pickup>_<dropoff>" key on the training frame.
df_train['PU_DO'] = (
    df_train['PULocationID'].astype(str)
    + '_'
    + df_train['DOLocationID'].astype(str)
)

# The vectorizer learns its feature vocabulary from the training records only.
dv = DictVectorizer()
train_dicts = df_train[[*categorical, *numerical]].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation rows are encoded with the already-fitted vectorizer.
X_val = preprocess(df_val, dv)


In [9]:
# Regression target: the 'duration' column, taken as plain arrays for
# the training and validation frames.
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

# MODELADO

In [10]:
# Wrap the training and validation data as named MLflow datasets.
# NOTE(review): X_train / X_val presumably are SciPy sparse matrices (the
# DictVectorizer output); if so, `.data` is the flat array of stored values,
# not the row-by-feature matrix, and its length will not match the targets.
# Confirm this is what should be recorded.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

In [16]:
import math
import optuna
import pathlib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
import math
import mlflow
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from mlflow.models.signature import infer_signature

def objective(trial: optuna.trial.Trial):
    """Optuna objective: fit one RandomForestRegressor and score it on validation.

    Every call opens a nested MLflow run, tags it with the model family, and
    logs the hyper-parameters, the ``rmse`` metric and the fitted model under
    the name ``model``. The data is read from the module-level ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Root mean squared error on the validation set (the value to minimise).
    """
    # Sampled values, requested in a fixed order.
    max_features = trial.suggest_float("max_features", math.exp(-5), math.exp(-1), log=True)
    n_estimators = trial.suggest_int("n_estimators", 5, 100)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_impurity_decrease = trial.suggest_float("min_impurity_decrease", 0.0, 0.5)

    # Sampled values combined with the settings that stay fixed in every trial.
    params = dict(
        max_features=max_features,
        n_estimators=n_estimators,
        criterion="absolute_error",
        min_samples_split=min_samples_split,
        min_impurity_decrease=min_impurity_decrease,
        random_state=42,
        n_jobs=-1,
    )

    # Child run of whichever MLflow run is currently active.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "RandomForestRegressor")
        mlflow.log_params(params)

        forest = RandomForestRegressor(**params)
        forest.fit(X_train, y_train)

        val_predictions = forest.predict(X_val)
        rmse = root_mean_squared_error(y_val, val_predictions)
        mlflow.log_metric("rmse", rmse)

        # Store the fitted model together with its input/output signature.
        mlflow.sklearn.log_model(
            sk_model=forest,
            name="model",
            input_example=X_val[:5],
            signature=infer_signature(X_val, val_predictions)
        )

    # Value Optuna minimises.
    return rmse


In [19]:


# Autologging is enabled for scikit-learn, but model artifacts are logged
# explicitly below, so automatic model logging is switched off.
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Create the Optuna study (seeded sampler, RMSE is minimised)
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimisation: parent run with one nested run per trial
# ------------------------------------------------------------
with mlflow.start_run(run_name="RandomForest Hyperparameter Optimization (Optuna)"):
    study.optimize(objective, n_trials=3)

    # --------------------------------------------------------
    # Recover and log the best hyper-parameters
    # --------------------------------------------------------
    # study.best_params only contains the values Optuna sampled. The settings
    # that the objective keeps fixed (criterion, random_state, n_jobs) have to
    # be added back; without "criterion" the final model would be trained with
    # scikit-learn's default criterion and would not be the configuration
    # that produced the best trial.
    best_params = {
        **study.best_params,
        "criterion": "absolute_error",
        "random_state": 42,
        "n_jobs": -1,
    }

    mlflow.log_params(best_params)

    # Tags of the parent run
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "RandomForestRegressor",
        "feature_set_version": 1,
    })

    # Refit on the training data with the best configuration.
    final_model = RandomForestRegressor(**best_params)
    final_model.fit(X_train, y_train)

    # Evaluate on the validation set
    y_pred = final_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer next to the model so that the same
    # encoding can be reproduced when the model is used later.
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # --------------------------------------------------------
    # Log the final model in MLflow
    # --------------------------------------------------------
    # The input example is a dense 5-row frame with one column per feature.
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
    signature = infer_signature(input_example, y_val[:5])

    # `name=` matches the keyword used by the trial objective; it replaces the
    # deprecated `artifact_path=` keyword.
    mlflow.sklearn.log_model(
        sk_model=final_model,
        name="model",
        input_example=input_example,
        signature=signature
    )


[I 2025-10-28 12:51:47,476] A new study created in memory with name: no-name-0a393faa-b1b5-4cde-8bab-a020f626b7c3
2025/10/28 13:16:33 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 13:16:36,934] Trial 0 finished with value: 9.350831937975855 and parameters: {'max_features': 0.03014188565274509, 'n_estimators': 96, 'min_samples_split': 15, 'min_impurity_decrease': 0.2993292420985183}. Best is trial 0 with value: 9.350831937975855.


🏃 View run crawling-jay-550 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/d2e2fda1c0a64235a134b974a30bd648
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 13:18:54 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 13:18:57,762] Trial 1 finished with value: 9.409568152252904 and parameters: {'max_features': 0.012576498083161084, 'n_estimators': 19, 'min_samples_split': 3, 'min_impurity_decrease': 0.4330880728874676}. Best is trial 0 with value: 9.350831937975855.


🏃 View run debonair-sponge-767 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/55a5ae766b6a494781315a18f84bc0dd
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 14:08:34 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 14:08:37,920] Trial 2 finished with value: 9.189108126735146 and parameters: {'max_features': 0.074605581687201, 'n_estimators': 72, 'min_samples_split': 2, 'min_impurity_decrease': 0.48495492608099716}. Best is trial 2 with value: 9.189108126735146.


🏃 View run defiant-foal-471 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/15b5bad0d14c406eada119024f98f625
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 14:08:55 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run RandomForest Hyperparameter Optimization (Optuna) at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/d680b2b7d25e43549f1f3b2518cc526e
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


In [20]:

def objective(trial: optuna.trial.Trial):
    """Optuna objective: fit one GradientBoostingRegressor and score it on validation.

    Every call opens a nested MLflow run, tags it with the model family, and
    logs the hyper-parameters, the ``rmse`` metric and the fitted model under
    the name ``model``. The data is read from the module-level ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``.

    This function shares the name ``objective`` with the RandomForest
    objective defined earlier in the file, so it replaces that one once this
    definition is executed.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Root mean squared error on the validation set (the value to minimise).
    """
    # Hyper-parameters to optimise; only random_state is fixed.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42
    }

    # Child run of whichever MLflow run is currently active.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "GradientBoostingRegressor")
        mlflow.log_params(params)

        # Training
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)

        # Prediction and metric
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

        # Store the model. `name=` is the keyword the RandomForest objective
        # uses as well; it replaces the deprecated `artifact_path=` keyword.
        signature = infer_signature(X_val, y_pred)
        mlflow.sklearn.log_model(
            sk_model=model,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    return rmse


In [22]:

# Enable scikit-learn autologging; model artifacts are logged explicitly
# below, so automatic model logging is switched off.
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Create the Optuna study (seeded sampler, RMSE is minimised)
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimisation: parent run with one nested run per trial
# ------------------------------------------------------------
with mlflow.start_run(run_name="GradientBoosting Hyperparameter Optimization (Optuna)"):
    study.optimize(objective, n_trials=5)

    # --------------------------------------------------------
    # Recover and log the best hyper-parameters
    # --------------------------------------------------------
    # study.best_params holds the sampled values only, so the fixed
    # random_state is added back before refitting.
    best_params = study.best_params
    best_params["random_state"] = 42
    mlflow.log_params(best_params)

    # Tags of the parent run
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "GradientBoostingRegressor",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # Train the final model with the best parameters
    # --------------------------------------------------------
    final_model = GradientBoostingRegressor(**best_params)
    final_model.fit(X_train, y_train)

    # Evaluate on the validation set
    y_pred = final_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # Save additional artifacts (the fitted preprocessor)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # --------------------------------------------------------
    # Log the final model in MLflow
    # --------------------------------------------------------
    # The input example is a dense 5-row frame with one column per feature.
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
    signature = infer_signature(input_example, y_val[:5])

    # `name=` replaces the deprecated `artifact_path=` keyword and matches the
    # keyword already used by the RandomForest objective in this file.
    mlflow.sklearn.log_model(
        sk_model=final_model,
        name="model",
        input_example=input_example,
        signature=signature
    )


[I 2025-10-28 16:34:03,706] A new study created in memory with name: no-name-a09954b6-d218-481d-ac6b-5aa375464670
2025/10/28 16:34:13 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 16:34:17,106] Trial 0 finished with value: 6.337732400879279 and parameters: {'n_estimators': 144, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_samples_split': 13, 'min_samples_leaf': 2, 'subsample': 0.662397808134481, 'max_features': 'log2'}. Best is trial 0 with value: 6.337732400879279.


🏃 View run respected-shrike-928 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/74bb795873034621af8b72363257ffc0
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 16:34:46 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 16:34:53,884] Trial 1 finished with value: 5.5346346494645875 and parameters: {'n_estimators': 227, 'learning_rate': 0.010725209743171997, 'max_depth': 10, 'min_samples_split': 17, 'min_samples_leaf': 3, 'subsample': 0.6727299868828402, 'max_features': None}. Best is trial 1 with value: 5.5346346494645875.


🏃 View run rogue-goat-39 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/9fb61d17aeb44e3d9595ae53c8fc70dc
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 16:35:04 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 16:35:08,064] Trial 2 finished with value: 8.545785847910508 and parameters: {'n_estimators': 158, 'learning_rate': 0.02692655251486473, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 3, 'subsample': 0.7465447373174767, 'max_features': 'log2'}. Best is trial 1 with value: 5.5346346494645875.


🏃 View run youthful-turtle-936 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/70e3a72c24494fb0b231561630c5ad96
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 16:35:17 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 16:35:21,087] Trial 3 finished with value: 8.274920154595872 and parameters: {'n_estimators': 179, 'learning_rate': 0.07500118950416987, 'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 2, 'subsample': 0.6260206371941118, 'max_features': 'log2'}. Best is trial 1 with value: 5.5346346494645875.


🏃 View run powerful-kite-417 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/2b704c3de0e547fe93d9ad4033f022d9
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 16:35:30 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 16:35:34,793] Trial 4 finished with value: 8.798031985440314 and parameters: {'n_estimators': 126, 'learning_rate': 0.013940346079873234, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 2, 'subsample': 0.798070764044508, 'max_features': 'log2'}. Best is trial 1 with value: 5.5346346494645875.


🏃 View run burly-gnat-699 at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/3b195abf143149529773ad0e2867e01e
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


2025/10/28 16:36:12 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run GradientBoosting Hyperparameter Optimization (Optuna) at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901/runs/35d2893a971f485694b47c5dad9a6476
🧪 View experiment at: dbc-f2fdebc8-23c1.cloud.databricks.com/ml/experiments/986703737805901


In [18]:
# Registered-model name shared by the registration, alias and loading cells.
model_name = "workspace.default.nyc-taxi-model"

In [20]:
# Ask for the run whose logged "model" should be registered. Whitespace that
# comes along with a pasted ID is stripped so the runs:/ URI is well-formed,
# and an empty answer is rejected before contacting the registry.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Use the shared model_name constant instead of repeating the literal, so
# every cell registers and reads the same registry entry.
result = mlflow.register_model(
    model_uri=run_uri,
    name=model_name
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


MlflowException: Run 'd680b2b7d25e43549f1f3b2518cc526e' not found

In [21]:
# Search the experiment for its runs, lowest RMSE first. The filter keeps only
# runs that actually logged an "rmse" metric, so the metric lookup below
# cannot raise KeyError and the fallback message is accurate.
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    filter_string="metrics.rmse >= 0",
    order_by=["metrics.rmse ASC"],
    output_format="list"
)

# Pick the best run (first entry of the ordered list), if there is one.
if runs:
    best_run = runs[0]
    print("🏆 Champion Run encontrado:")
    print(f"Run ID: {best_run.info.run_id}")
    print(f"RMSE: {best_run.data.metrics['rmse']}")
    print(f"Params: {best_run.data.params}")
else:
    print("⚠️ No se encontraron runs con métrica RMSE.")

NameError: name 'EXPERIMENT_NAME' is not defined

In [15]:
# Register the artifact logged as "model" in `best_run` as a new version of
# the registered model `model_name`; `result` describes the created version.
result = mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name=model_name
)

NameError: name 'best_run' is not defined

In [27]:
from mlflow import MlflowClient

# Client object used for model-registry operations such as setting aliases.
client = MlflowClient()

In [13]:
# The freshly registered version competes against the current champion, so it
# receives the "challenger" alias: this is the alias the assignment asks for
# and the one the evaluation cell loads with `@challenger`. Using the champion
# alias here would promote the model before it has been compared.
model_version = result.version
new_alias = "challenger"

client.set_registered_model_alias(
    name=model_name,
    alias=new_alias,
    version=model_version
)

NameError: name 'result' is not defined

In [26]:
import mlflow
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

import mlflow.pyfunc

# March 2025 trips, encoded with the vectorizer fitted on the training data.
df_marzo = read_dataframe("../data/green_tripdata_2025-03.parquet")
X_marzo = preprocess(df_marzo, dv)
y_marzo = df_marzo[target].values

# Load both registry aliases as pyfunc models: champion first, then challenger.
model_version_uri = f"models:/{model_name}@champion"
champion_version = mlflow.pyfunc.load_model(model_version_uri)
model_version_uri = f"models:/{model_name}@challenger"
challenger_version = mlflow.pyfunc.load_model(model_version_uri)

# Score each model on the same March data.
y_pred_champion, y_pred_challenger = (
    champion_version.predict(X_marzo),
    challenger_version.predict(X_marzo),
)
rmse_champion, rmse_challenger = (
    root_mean_squared_error(y_marzo, y_pred_champion),
    root_mean_squared_error(y_marzo, y_pred_challenger),
)

print(f"Champion RMSE:  {rmse_champion:.4f}")
print(f"Challenger RMSE: {rmse_challenger:.4f}")


MlflowException: Registered model alias champion not found.