### Challenger experiments
---
Libraries and preprocessing (the same as the previous notebooks)

In [171]:
import os
import mlflow
from dotenv import load_dotenv

load_dotenv(override=True)
EXPERIMENT_NAME = "/Users/rafaeltakata0105@gmail.com/nyc-taxi-experiments"

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [172]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
import optuna 

In [173]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]
    return df

In [174]:
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_test = read_dataframe('../data/green_tripdata_2025-02.parquet')
#We now validate the data with March
df_val = read_dataframe('../data/green_tripdata_2025-03.parquet')

In [175]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

In [176]:
def preprocess(df, dv):
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [177]:
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_test = preprocess(df_test, dv)
X_val = preprocess(df_val, dv)

In [178]:
target = 'duration'

y_train = df_train[target].values
y_test = df_test[target].values
y_val = df_val[target].values

Log datasets

In [179]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
test_dataset = mlflow.data.from_numpy(X_test.data, targets=y_test, name="green_tripdata_2025-02")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-03")

### Model training:

In [180]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from mlflow.models.signature import infer_signature
from optuna.samplers import TPESampler
import pathlib
import numpy as np
from sklearn.metrics import root_mean_squared_error

In [181]:
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()

### Random Forest Regressor

Objective Function:

In [182]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna y MLflow
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # 1. Hiperparámetros MUESTREADOS por Optuna
    params = {
        # n_estimators: número de árboles
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        # max_depth: profundidad máxima del árbol
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        # min_samples_split: min muestras para dividir un nodo
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        # min_samples_leaf: min muestras en una hoja
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        # max_features: características a considerar para la mejor división
        "max_features": trial.suggest_float("max_features", 0.1, 1.0, log=False),
        "random_state": 42,
        "n_jobs": -1,
    }

    # 2. Run anidado para registrar cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest_regressor")
        mlflow.log_params(params)

        # 3. Crear y Entrenar el Modelo
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        # 4. Predicción y Métrica en Validación
        y_pred = model.predict(X_test)
        # Usaremos el RMSE, que es la raíz cuadrada del MSE
        rmse = root_mean_squared_error(y_test, y_pred)

        # 5. Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)
        
        # 6. Guardar el modelo del trial (incluye la firma y un ejemplo de entrada)
        
        # Crear la "signature"
        signature = infer_signature(X_test, y_pred)
        
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            input_example=X_test[:5],
            signature=signature,
            # Le pasamos el trial de Optuna para que MLflow lo registre como un Tag
            metadata={"optuna.trial_id": trial.number} 
        )

    # Optuna minimiza el valor retornado
    return rmse

Study run:

In [183]:
# ------------------------------------------------------------
# Configurar y Ejecutar la Optimización de Optuna
# ------------------------------------------------------------

sampler = TPESampler(seed=42)
# direction="minimize" porque queremos minimizar el RMSE
study = optuna.create_study(direction="minimize", sampler=sampler) 

# Run "padre" para agrupar toda la búsqueda de Optuna
with mlflow.start_run(run_name="Random Forest Regressor Optuna Tuning"):
    study.optimize(objective, n_trials=10) # Reducido a 20 por ser un ejemplo
    
    # --------------------------------------------------------
    # Registrar los mejores resultados en el run "padre"
    # --------------------------------------------------------
    best_params = study.best_params
    best_rmse = study.best_value
    
    # Registrar métricas y parámetros finales en el run padre
    mlflow.log_params(best_params)

    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "RandomForestRegressor",
        "feature_set_version": 1,
    })

    mlflow.log_metric("best_rmse_test", best_rmse)

    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")
    
    print(f"\nMejor RMSE encontrado: {best_rmse:.4f}")
    print("Mejores Parámetros:", best_params)

    # --------------------------------------------------------
    # Reentrenar y registrar el modelo FINAL
    # --------------------------------------------------------
    print("\nEntrenando y registrando el modelo final con los mejores parámetros...")
    final_model = RandomForestRegressor(**best_params)
    final_model.fit(X_train, y_train)

    # Predicción y registro de métrica final
    y_pred = final_model.predict(X_val)
    final_rmse = root_mean_squared_error(y_val, y_pred)

    # Guardar el modelo final de forma explícita en el run padre
    signature = infer_signature(X_val, y_pred)

    mlflow.sklearn.log_model(
        sk_model=final_model,
        name="model",
        input_example=X_val[:5],
        signature=signature,
        # Opcional: Registrar en el Model Registry
        # registered_model_name="RandomForestRegressor_Optimized",
    )
    mlflow.log_metric("validation-rmse", final_rmse)
    print("Modelo final registrado en MLflow.")

[I 2025-10-25 17:15:04,647] A new study created in memory with name: no-name-c9ff885a-5922-4473-a578-76f2c6014b2e


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:15:34 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:16:17,230] Trial 0 finished with value: 5.443526012385376 and parameters: {'n_estimators': 200, 'max_depth': 48, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 0.24041677639819287}. Best is trial 0 with value: 5.443526012385376.


🏃 View run funny-slug-308 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/92b66d3a308f4ec79ec808b2daf29be0
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:16:29 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:16:31,876] Trial 1 finished with value: 5.607836371190919 and parameters: {'n_estimators': 100, 'max_depth': 7, 'min_samples_split': 18, 'min_samples_leaf': 7, 'max_features': 0.737265320016441}. Best is trial 0 with value: 5.443526012385376.


🏃 View run handsome-lark-959 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/a8fb60609d7445308cebcf26d73a52d8
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:16:44 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:17:00,497] Trial 2 finished with value: 5.344099250244436 and parameters: {'n_estimators': 50, 'max_depth': 49, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 0.26364247048639056}. Best is trial 2 with value: 5.344099250244436.


🏃 View run victorious-skink-133 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/5a016626331f4816acdea4e2b071f9af
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:17:13 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:17:25,650] Trial 3 finished with value: 5.478119002140051 and parameters: {'n_estimators': 100, 'max_depth': 18, 'min_samples_split': 11, 'min_samples_leaf': 5, 'max_features': 0.36210622617823773}. Best is trial 2 with value: 5.344099250244436.


🏃 View run painted-trout-81 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/3f8b69c516c2437eb515958e202fad9a
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:17:41 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:18:00,695] Trial 4 finished with value: 5.540601532065293 and parameters: {'n_estimators': 350, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 0.5104629857953323}. Best is trial 2 with value: 5.344099250244436.


🏃 View run skillful-wasp-193 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/a4cbe876378e45aa86237d7ea5417f37
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:18:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:18:38,463] Trial 5 finished with value: 6.082324007237481 and parameters: {'n_estimators': 400, 'max_depth': 14, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': 0.14180537144799796}. Best is trial 2 with value: 5.344099250244436.


🏃 View run wistful-quail-219 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/cff6dfa35bac43baaf63f535544cdf92
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:18:57 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:19:18,500] Trial 6 finished with value: 5.584261779348117 and parameters: {'n_estimators': 350, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 0.9690688297671034}. Best is trial 2 with value: 5.344099250244436.


🏃 View run puzzled-asp-934 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/07249705b8dc4fff80ef7bd6e5403399
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:19:40 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:20:25,178] Trial 7 finished with value: 5.493096242328059 and parameters: {'n_estimators': 450, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 0.4961372443656412}. Best is trial 2 with value: 5.344099250244436.


🏃 View run illustrious-skunk-1 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/3f469e3686b74471931ec5ed17469429
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:20:37 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:20:49,315] Trial 8 finished with value: 5.5123146490637644 and parameters: {'n_estimators': 100, 'max_depth': 27, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 0.3329019834400152}. Best is trial 2 with value: 5.344099250244436.


🏃 View run sneaky-sponge-971 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/1757317e4834426986f962bf98314b97
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:21:04 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:21:37,084] Trial 9 finished with value: 5.514804345153128 and parameters: {'n_estimators': 350, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': 0.26636900997297436}. Best is trial 2 with value: 5.344099250244436.


🏃 View run youthful-mouse-627 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/2d80610d39094c8baf27f1d0b8ce39c1
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975

Mejor RMSE encontrado: 5.3441
Mejores Parámetros: {'n_estimators': 50, 'max_depth': 49, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 0.26364247048639056}

Entrenando y registrando el modelo final con los mejores parámetros...




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:21:56 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


Modelo final registrado en MLflow.
🏃 View run Random Forest Regressor Optuna Tuning at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/0f4d83ee81c240929dbba8f74c281aad
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975


Register the model
 (Manual method)

In [184]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="workspace.default.nyc-taxi-model"
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Created version '7' of model 'workspace.default.nyc-taxi-model'.


### Gradient Boosting Regressor

Objective function:

In [185]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna y MLflow
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # 1. Hiperparámetros MUESTREADOS por Optuna
    params = {
        # n_estimators (número de etapas de boosting)
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        # learning_rate (tasa de aprendizaje, logarítmica para exploración eficiente)
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        # max_depth (profundidad máxima de cada estimador)
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        # min_samples_split
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        # min_samples_leaf
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        # subsample (fracción de muestras usadas para ajustar los estimadores individuales)
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # max_features (características a considerar para la mejor división)
        "max_features": trial.suggest_categorical("max_features", ['sqrt', 'log2', None]),
        "random_state": 42,
    }

    # 2. Run anidado para registrar cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "Gradient_boosting_regressor")
        mlflow.log_params(params)

        # 3. Crear y Entrenar el Modelo
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)

        # 4. Predicción y Métrica en Validación
        y_pred = model.predict(X_test)
        # Usaremos el RMSE, que es la raíz cuadrada del MSE
        rmse = root_mean_squared_error(y_test, y_pred)

        # 5. Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)
        
        # 6. Guardar el modelo del trial (incluye la firma y un ejemplo de entrada)
        
        # Crear la "signature"
        signature = infer_signature(X_test, y_pred)
        
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            input_example=X_test[:5],
            signature=signature,
            # Le pasamos el trial de Optuna para que MLflow lo registre como un Tag
            metadata={"optuna.trial_id": trial.number} 
        )

    # Optuna minimiza el valor retornado
    return rmse

Study Optimization Runs

In [186]:
# ------------------------------------------------------------
# Configurar y Ejecutar la Optimización de Optuna
# ------------------------------------------------------------

sampler = TPESampler(seed=42)
# direction="minimize" porque queremos minimizar el RMSE
study = optuna.create_study(direction="minimize", sampler=sampler) 

# Run "padre" para agrupar toda la búsqueda de Optuna
with mlflow.start_run(run_name="Gradient Boosting Regressor"):
    study.optimize(objective, n_trials=10) 

    # --------------------------------------------------------
    # Registrar los mejores resultados en el run "padre"
    # --------------------------------------------------------
    best_params = study.best_params
    best_rmse = study.best_value
    
    # Registrar métricas y parámetros finales en el run padre
    mlflow.log_params(best_params)

    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "GradientBoostingRegressor",
        "feature_set_version": 1,
    })

    mlflow.log_metric("best_rmse_test", best_rmse)

    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")
    
    print(f"\nMejor RMSE encontrado: {best_rmse:.4f}")
    print("Mejores Parámetros:", best_params)

    # --------------------------------------------------------
    # Reentrenar y registrar el modelo FINAL
    # --------------------------------------------------------
    print("\nEntrenando y registrando el modelo final con los mejores parámetros...")
    final_model = GradientBoostingRegressor(**best_params)
    final_model.fit(X_train, y_train)

    # Predicción y registro de métrica final
    y_pred = final_model.predict(X_val)
    final_rmse = root_mean_squared_error(y_val, y_pred)

    # Guardar el modelo final de forma explícita en el run padre
    signature = infer_signature(X_val, y_pred)

    mlflow.sklearn.log_model(
        sk_model=final_model,
        name="model",
        input_example=X_val[:5],
        signature=signature,
    
    )
    mlflow.log_metric("validation-rmse", final_rmse)
    print("Modelo final registrado en MLflow.")

[I 2025-10-25 17:22:44,412] A new study created in memory with name: no-name-6c2861d1-40e1-4a99-ac94-caf3005184ee


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:22:58 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:23:00,874] Trial 0 finished with value: 5.5583158005865165 and parameters: {'n_estimators': 200, 'learning_rate': 0.36808608148776095, 'max_depth': 12, 'min_samples_split': 13, 'min_samples_leaf': 2, 'subsample': 0.662397808134481, 'max_features': 'log2'}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run adaptable-deer-326 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/c52575bdebe6436d9e78b1bf0ee5267a
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:24:27 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:25:00,706] Trial 1 finished with value: 7.211518600238097 and parameters: {'n_estimators': 400, 'learning_rate': 0.001136467270001117, 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 3, 'subsample': 0.6727299868828402, 'max_features': None}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run orderly-owl-545 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/344fd90274e148939b341b0905030b2e
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:25:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:25:19,322] Trial 2 finished with value: 8.843164321957916 and parameters: {'n_estimators': 250, 'learning_rate': 0.006109683510122491, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'subsample': 0.7465447373174767, 'max_features': 'log2'}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run trusting-koi-131 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/48c0525942a94ceda5097da3701af63f
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:25:31 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:25:33,760] Trial 3 finished with value: 8.536614581697073 and parameters: {'n_estimators': 300, 'learning_rate': 0.039710847107924725, 'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 2, 'subsample': 0.6260206371941118, 'max_features': 'log2'}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run gaudy-colt-899 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/5c99d0737caa4a73a0defca3ed8289a8
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:25:47 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:25:50,848] Trial 4 finished with value: 9.022007166858565 and parameters: {'n_estimators': 200, 'learning_rate': 0.0018349072049055448, 'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 2, 'subsample': 0.798070764044508, 'max_features': 'log2'}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run dashing-hound-941 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/d16db68740844abf90c686c33a7751fb
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:26:06 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:26:10,205] Trial 5 finished with value: 8.629424941346821 and parameters: {'n_estimators': 350, 'learning_rate': 0.006939031266619744, 'max_depth': 9, 'min_samples_split': 12, 'min_samples_leaf': 2, 'subsample': 0.9878338511058234, 'max_features': 'log2'}. Best is trial 0 with value: 5.5583158005865165.


🏃 View run amazing-sponge-741 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/6d699081b6384fb894021f2d5236d24b
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:26:26 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:26:29,268] Trial 6 finished with value: 5.438115737910744 and parameters: {'n_estimators': 300, 'learning_rate': 0.3076882474301538, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 1, 'subsample': 0.7301321323053057, 'max_features': None}. Best is trial 6 with value: 5.438115737910744.


🏃 View run abrasive-tern-254 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/c6e0e1f75e5d426c9450ac87d87985b4
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:26:44 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:26:47,901] Trial 7 finished with value: 8.354247714577198 and parameters: {'n_estimators': 200, 'learning_rate': 0.005731044951044763, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 9, 'subsample': 0.6298202574719083, 'max_features': 'sqrt'}. Best is trial 6 with value: 5.438115737910744.


🏃 View run suave-jay-929 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/25063936d3704ff9bd058ee4946020d7
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:27:02 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:27:04,783] Trial 8 finished with value: 5.4495353582478305 and parameters: {'n_estimators': 50, 'learning_rate': 0.15882027186184094, 'max_depth': 12, 'min_samples_split': 15, 'min_samples_leaf': 8, 'subsample': 0.6296178606936361, 'max_features': None}. Best is trial 6 with value: 5.438115737910744.


🏃 View run wistful-gull-95 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/9eacc52a34c148a2990770d13bbed4fa
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:27:16 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-25 17:27:19,083] Trial 9 finished with value: 9.000186057603118 and parameters: {'n_estimators': 350, 'learning_rate': 0.007817787178411377, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 4, 'subsample': 0.8918424713352255, 'max_features': 'log2'}. Best is trial 6 with value: 5.438115737910744.


🏃 View run silent-stork-586 at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/169536f1eb384e008a5631cbcb7c3dba
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975

Mejor RMSE encontrado: 5.4381
Mejores Parámetros: {'n_estimators': 300, 'learning_rate': 0.3076882474301538, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 1, 'subsample': 0.7301321323053057, 'max_features': None}

Entrenando y registrando el modelo final con los mejores parámetros...




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/25 17:27:36 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


Modelo final registrado en MLflow.
🏃 View run Gradient Boosting Regressor at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975/runs/ea4eba19af22445db9c80f8c67f41ce9
🧪 View experiment at: https://dbc-79ec72f9-c392.cloud.databricks.com/ml/experiments/530562432550975


In [187]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="workspace.default.nyc-taxi-model"
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Created version '8' of model 'workspace.default.nyc-taxi-model'.


Evaluate and name the models:

In [188]:
from mlflow import MlflowClient
import mlflow.pyfunc

In [189]:
model_name = "workspace.default.nyc-taxi-model" 
versions = [2, 5, 6]  
metric_name = "validation-rmse"

In [190]:
client = MlflowClient()

In [191]:
versions_metadata = client.search_model_versions(f"name='{model_name}'")
ranked_versions = []

for version in versions_metadata:
    run_id = version.run_id
    try:
        metric_value = client.get_run(run_id).data.metrics[metric_name]
        ranked_versions.append([version._version, metric_value])
    except KeyError:
            print(f"Advertencia: La versión {version._version} no tiene la métrica '{metric_name}' registrada.")
    print(f"La version {version._version} del modelo nyc-taxi-model tiene un rmse de {metric_value}")

La version 8 del modelo nyc-taxi-model tiene un rmse de 5.98646478833004
La version 7 del modelo nyc-taxi-model tiene un rmse de 5.930294509169692
La version 6 del modelo nyc-taxi-model tiene un rmse de 5.994918174473722
La version 5 del modelo nyc-taxi-model tiene un rmse de 5.931136039429667
Advertencia: La versión 4 no tiene la métrica 'validation-rmse' registrada.
La version 4 del modelo nyc-taxi-model tiene un rmse de 5.931136039429667
Advertencia: La versión 3 no tiene la métrica 'validation-rmse' registrada.
La version 3 del modelo nyc-taxi-model tiene un rmse de 5.931136039429667
La version 2 del modelo nyc-taxi-model tiene un rmse de 5.862100491730689
La version 1 del modelo nyc-taxi-model tiene un rmse de 5.862100491730689


In [192]:
ranked_versions = sorted(ranked_versions, key=lambda x: x[1])
ranked_versions

[['2', 5.862100491730689],
 ['1', 5.862100491730689],
 ['7', 5.930294509169692],
 ['5', 5.931136039429667],
 ['8', 5.98646478833004],
 ['6', 5.994918174473722]]

Let's add the aliases:

In [193]:
champion = ranked_versions[0][0]
challenger = ranked_versions[1][0]


In [194]:
all_versions = client.search_model_versions(f"name='{model_name}'")

print("\n--- Asignando Nuevos Alias ---")

# 4. Asignar el Alias 'champion'
client.set_registered_model_alias(
    name=model_name, 
    alias='champion', 
    version=champion
)
print(f"✅ Alias 'champion' asignado a la Versión {champion}.")

# 5. Asignar el Alias 'challenger'
client.set_registered_model_alias(
    name=model_name, 
    alias='challenger', 
    version=challenger
)
print(f"📢 Alias 'challenger' asignado a la Versión {challenger}.")


--- Asignando Nuevos Alias ---
✅ Alias 'champion' asignado a la Versión 2.
📢 Alias 'challenger' asignado a la Versión 1.
