In [8]:
import pathlib
import pickle

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [9]:
# Trip-record parquet files: the 2024-01 file feeds training, the 2024-02 file
# feeds validation.
train_path = 'data/green_tripdata_2024-01.parquet'
val_path = 'data/green_tripdata_2024-02.parquet'

df_train = pd.read_parquet(train_path)
df_val = pd.read_parquet(val_path)

In [10]:
# Target: trip duration in minutes (drop-off timestamp minus pick-up timestamp).
# The 'duration' column is added to both frames in place.
for trips in (df_train, df_val):
    elapsed = trips['lpep_dropoff_datetime'] - trips['lpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

# NOTE(review): durations are used unfiltered. The validation RMSE reported by
# the searches in this notebook is ~77 minutes, which may indicate extreme
# durations in the data -- confirm whether an outlier filter is wanted.

# Single-feature model: trip distance is the only predictor.
feature_cols = ['trip_distance']
X_train, y_train = df_train[feature_cols], df_train['duration']
X_val, y_val = df_val[feature_cols], df_val['duration']

In [11]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of sklearn's ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Shared tracking state, updated by the objective functions through `global`:
# the lowest validation RMSE seen so far and the fitted model that produced it.
# Starting at +inf means the first evaluated trial always becomes the best.
best_rmse, best_model = float("inf"), None

In [12]:
def objective_gb(params):
    """Hyperopt objective for GradientBoostingRegressor.

    Fits the model on the module-level ``X_train`` / ``y_train``, scores it
    with RMSE on ``X_val`` / ``y_val``, and logs the parameters, the metric
    and the fitted model to a nested MLflow run. When the trial's RMSE is
    strictly lower than the module-level ``best_rmse``, both ``best_rmse``
    and ``best_model`` are replaced.

    Args:
        params: Keyword arguments forwarded to ``GradientBoostingRegressor``.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    global best_rmse, best_model
    with mlflow.start_run(nested=True):
        regressor = GradientBoostingRegressor(**params)
        regressor.fit(X_train, y_train)
        val_rmse = rmse(y_val, regressor.predict(X_val))

        # One nested run per trial: parameters, score and the fitted model.
        mlflow.log_params(params)
        mlflow.log_metric('rmse', val_rmse)
        mlflow.sklearn.log_model(regressor, artifact_path="model")

        # Keep the best model seen so far.
        if val_rmse < best_rmse:
            best_rmse, best_model = val_rmse, regressor

        return {'loss': val_rmse, 'status': STATUS_OK}

In [13]:
# Hyperopt search space for GradientBoostingRegressor.
# hp.quniform yields floats, so integer-valued hyperparameters are wrapped in
# scope.int before they reach the estimator.
search_space_gb = dict(
    # 50 to 500 trees in steps of 25.
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 500, 25)),
    # Log-uniform over exp(-3) .. exp(0), i.e. roughly 0.05 to 1.0.
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    max_depth=scope.int(hp.quniform('max_depth', 2, 10, 1)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
    # Fixed seed passed to every trial's estimator.
    random_state=42,
)

In [14]:
# Parent MLflow run for the Gradient Boosting search; every trial evaluated by
# objective_gb opens its own nested run underneath it.
with mlflow.start_run(run_name="GradientBoostingExperiment"):
    trials_gb = Trials()
    # fmin returns the raw sampled value per label: quniform draws stay floats
    # (e.g. 225.0), the scope.int casts are not applied, and constants such as
    # random_state are absent.
    best_params_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials_gb
    )
    # Resolve the raw assignment through the search space so the parent run
    # records exactly the keyword arguments the best estimator was built with.
    mlflow.log_params(space_eval(search_space_gb, best_params_gb))

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]




  5%|▌         | 1/20 [00:20<06:27, 20.37s/trial, best loss: 79.6131056874737]




 10%|█         | 2/20 [00:28<03:53, 12.95s/trial, best loss: 78.0291897411658]




 15%|█▌        | 3/20 [00:38<03:21, 11.86s/trial, best loss: 78.0291897411658]




 20%|██        | 4/20 [01:12<05:25, 20.37s/trial, best loss: 78.0291897411658]




 25%|██▌       | 5/20 [01:31<04:59, 19.97s/trial, best loss: 78.0040159580805]




 30%|███       | 6/20 [01:37<03:32, 15.19s/trial, best loss: 77.33721729299118]




 35%|███▌      | 7/20 [01:43<02:40, 12.31s/trial, best loss: 77.33721729299118]




 40%|████      | 8/20 [02:02<02:54, 14.55s/trial, best loss: 77.33721729299118]




 45%|████▌     | 9/20 [02:33<03:35, 19.56s/trial, best loss: 77.33721729299118]




 50%|█████     | 10/20 [02:57<03:29, 20.91s/trial, best loss: 77.33721729299118]




 55%|█████▌    | 11/20 [03:13<02:55, 19.51s/trial, best loss: 77.33721729299118]




 60%|██████    | 12/20 [03:31<02:32, 19.06s/trial, best loss: 77.33721729299118]




 65%|██████▌   | 13/20 [03:43<01:58, 16.94s/trial, best loss: 77.33721729299118]




 70%|███████   | 14/20 [03:59<01:39, 16.64s/trial, best loss: 77.33721729299118]




 75%|███████▌  | 15/20 [04:05<01:06, 13.39s/trial, best loss: 77.33721729299118]




 80%|████████  | 16/20 [04:11<00:44, 11.20s/trial, best loss: 77.33721729299118]




 85%|████████▌ | 17/20 [04:16<00:27,  9.08s/trial, best loss: 77.24229901339169]




 90%|█████████ | 18/20 [04:40<00:27, 13.66s/trial, best loss: 77.24229901339169]




 95%|█████████▌| 19/20 [05:02<00:16, 16.23s/trial, best loss: 77.24229901339169]




100%|██████████| 20/20 [05:17<00:00, 15.90s/trial, best loss: 77.24229901339169]


In [15]:
def objective_rf(params):
    """Hyperopt objective for RandomForestRegressor.

    Fits the model on the module-level ``X_train`` / ``y_train``, scores it
    with RMSE on ``X_val`` / ``y_val``, and logs the parameters, the metric
    and the fitted model to a nested MLflow run. When the trial's RMSE is
    strictly lower than the module-level ``best_rmse``, both ``best_rmse``
    and ``best_model`` are replaced.

    Args:
        params: Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    global best_rmse, best_model
    with mlflow.start_run(nested=True):
        forest = RandomForestRegressor(**params)
        forest.fit(X_train, y_train)
        val_rmse = rmse(y_val, forest.predict(X_val))

        # One nested run per trial: parameters, score and the fitted model.
        mlflow.log_params(params)
        mlflow.log_metric('rmse', val_rmse)
        mlflow.sklearn.log_model(forest, artifact_path="model")

        # Keep the best model seen so far.
        if val_rmse < best_rmse:
            best_rmse, best_model = val_rmse, forest

        return {'loss': val_rmse, 'status': STATUS_OK}

In [16]:
# Hyperopt search space for RandomForestRegressor.
# hp.quniform yields floats, so integer-valued hyperparameters are wrapped in
# scope.int before they reach the estimator.
search_space_rf = dict(
    # 50 to 500 trees in steps of 25.
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 500, 25)),
    max_depth=scope.int(hp.quniform('max_depth', 2, 20, 1)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
    # Fixed seed passed to every trial's estimator.
    random_state=42,
)

In [17]:
# Parent MLflow run for the Random Forest search; every trial evaluated by
# objective_rf opens its own nested run underneath it.
with mlflow.start_run(run_name="RandomForestExperiment"):
    trials_rf = Trials()
    # fmin returns the raw sampled value per label: quniform draws stay floats
    # (e.g. 225.0), the scope.int casts are not applied, and constants such as
    # random_state are absent.
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials_rf
    )
    # Resolve the raw assignment through the search space so the parent run
    # records exactly the keyword arguments the best estimator was built with.
    mlflow.log_params(space_eval(search_space_rf, best_params_rf))

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]




  5%|▌         | 1/20 [00:30<09:36, 30.35s/trial, best loss: 77.81407600837389]




 10%|█         | 2/20 [00:58<08:44, 29.14s/trial, best loss: 77.81407600837389]




 15%|█▌        | 3/20 [01:15<06:38, 23.45s/trial, best loss: 77.72820142607888]




 20%|██        | 4/20 [01:30<05:20, 20.04s/trial, best loss: 77.38031019889793]




 25%|██▌       | 5/20 [01:51<05:06, 20.44s/trial, best loss: 77.2248719263206] 




 30%|███       | 6/20 [02:01<03:56, 16.91s/trial, best loss: 77.2248719263206]




 35%|███▌      | 7/20 [02:20<03:50, 17.74s/trial, best loss: 77.2248719263206]




 40%|████      | 8/20 [02:41<03:44, 18.70s/trial, best loss: 77.2248719263206]




 45%|████▌     | 9/20 [03:09<03:57, 21.57s/trial, best loss: 77.2248719263206]




 50%|█████     | 10/20 [03:15<02:46, 16.70s/trial, best loss: 77.2248719263206]




 55%|█████▌    | 11/20 [03:30<02:25, 16.16s/trial, best loss: 77.2248719263206]




 60%|██████    | 12/20 [03:38<01:50, 13.82s/trial, best loss: 77.14146726597599]




 65%|██████▌   | 13/20 [03:56<01:46, 15.19s/trial, best loss: 77.14146726597599]




 70%|███████   | 14/20 [04:08<01:24, 14.06s/trial, best loss: 77.14146726597599]




 75%|███████▌  | 15/20 [04:13<00:56, 11.35s/trial, best loss: 77.14146726597599]




 80%|████████  | 16/20 [04:24<00:44, 11.12s/trial, best loss: 77.14146726597599]




 85%|████████▌ | 17/20 [04:36<00:34, 11.48s/trial, best loss: 77.14146726597599]




 90%|█████████ | 18/20 [04:41<00:19,  9.50s/trial, best loss: 77.14146726597599]




 95%|█████████▌| 19/20 [04:47<00:08,  8.48s/trial, best loss: 77.14146726597599]




100%|██████████| 20/20 [04:57<00:00, 14.86s/trial, best loss: 77.14146726597599]


In [18]:
# Register the best model found across all searches run in this kernel.
# Fail before opening a run if no search has produced a model yet (e.g. the
# cells were executed out of order), so no empty MLflow run is left behind.
if best_model is None:
    raise RuntimeError(
        "No trained model is available; run the hyperparameter searches before registering."
    )

with mlflow.start_run(run_name="Register Best Model"):
    # Store the winning validation score with the run that owns the registered
    # model version, so the version can be traced back to its RMSE.
    mlflow.log_metric('rmse', best_rmse)
    mlflow.sklearn.log_model(best_model, artifact_path="model", registered_model_name="nyc-taxi-model")
    print(f"El modelo con mejor RMSE ({best_rmse}) ha sido registrado.")

Successfully registered model 'nyc-taxi-model'.
Created version '1' of model 'nyc-taxi-model'.


El modelo con mejor RMSE (77.14146726597599) ha sido registrado.
