In [1]:
# Create the directory if it doesn't exist
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

La sintaxis del comando no es correcta.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1330k  100 1330k    0     0  1893k      0 --:--:-- --:--:-- --:--:-- 1903k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2464k      0 --:--:-- --:--:-- --:--:-- 2487k


In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [3]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [5]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [6]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [7]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [8]:
import dagshub
import mlflow


dagshub.init(url="https://dagshub.com/Pacolaz/nyc-taxi-time-prediction", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/95f7e9b740a14573ae117bfbbbc17c06', creation_time=1730258370273, experiment_id='0', last_update_time=1730258370273, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [9]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [10]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("Pacolaz/nyc-taxi-time-prediction")

# Upload file
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-01.parquet",  # local path of file to upload
    Key="train_data.parquet",  # remote path where to upload the file
)

# Download file
# s3.download_file(
#     Bucket="nyc-taxi-time-prediction",  # name of the repo
#     Key="train_data.parquet",  #  remote path from where to download the file
#     Filename="local.csv",  # local path where to download the file
# )

In [11]:
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-02.parquet",  # local path of file to upload
    Key="eval_data.parquet",  # remote path where to upload the file
)

# Download file
# s3.download_file(
#     Bucket="nyc-taxi-time-prediction",  # name of the repo
#     Key="train_data.parquet",  #  remote path from where to download the file
#     Filename="local.csv",  # local path where to download the file
# )

In [12]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [13]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [14]:
def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "xgboost")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        
        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")
         
        # Predict in the val dataset
        y_pred = booster.predict(valid)
        
        # Calculate metric
        rmse = root_mean_squared_error(y_val, y_pred)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [15]:
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"
    
    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )
        
    y_pred = booster.predict(valid)
    
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:7.24715                           
[1]	validation-rmse:6.25749                           
[2]	validation-rmse:5.76748                           
[3]	validation-rmse:5.53527                           
[4]	validation-rmse:5.42608                           
[5]	validation-rmse:5.37003                           
[6]	validation-rmse:5.34328                           
[7]	validation-rmse:5.32848                           
[8]	validation-rmse:5.31684                           
[9]	validation-rmse:5.31045                           
[10]	validation-rmse:5.30648                          
[11]	validation-rmse:5.30050                          
[12]	validation-rmse:5.29764                          
[13]	validation-rmse:5.29207                          
[14]	validation-rmse:5.28516                          
[15]	validation-rmse:5.28406                          
[16]	validation-rmse:5.28025                          
[17]	validation-rmse:5.27933                          
[18]	valid






2024/10/29 21:27:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-moose-103 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a99e8df6520d4d2484dec09b35fd9546.

2024/10/29 21:27:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.77829                                                     
[1]	validation-rmse:8.47186                                                     
[2]	validation-rmse:8.19051                                                     
[3]	validation-rmse:7.93269                                                     
[4]	validation-rmse:7.69267                                                     
[5]	validation-rmse:7.47659                                                     
[6]	validation-rmse:7.27882                                                     
[7]	validation-rmse:7.09135                                                     
[8]	validation-rmse:6.92871                                                     
[9]	validation-rmse:6.77631                                                     
[10]	validation-rmse:6.63723                                                    
[11]	validation-rmse:6.50936                                                    
[12]	validation-rmse:6.39611






2024/10/29 21:32:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run respected-mule-816 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/b3f8a8141bd94254814859e01f129c64.

2024/10/29 21:32:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.19866                                                     
[1]	validation-rmse:5.55913                                                     
[2]	validation-rmse:5.42291                                                     
[3]	validation-rmse:5.38649                                                     
[4]	validation-rmse:5.37017                                                     
[5]	validation-rmse:5.35716                                                     
[6]	validation-rmse:5.34393                                                     
[7]	validation-rmse:5.34207                                                     
[8]	validation-rmse:5.33776                                                     
[9]	validation-rmse:5.33507                                                     
[10]	validation-rmse:5.33340                                                    
[11]	validation-rmse:5.33197                                                    
[12]	validation-rmse:5.32656






2024/10/29 21:34:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-lark-226 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/4bda502f022c4de0a0f26a5667094bc9.

2024/10/29 21:34:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.78444                                                     
[1]	validation-rmse:8.48303                                                     
[2]	validation-rmse:8.20535                                                     
[3]	validation-rmse:7.94984                                                     
[4]	validation-rmse:7.71518                                                     
[5]	validation-rmse:7.49982                                                     
[6]	validation-rmse:7.30298                                                     
[7]	validation-rmse:7.12299                                                     
[8]	validation-rmse:6.95838                                                     
[9]	validation-rmse:6.80877                                                     
[10]	validation-rmse:6.67234                                                    
[11]	validation-rmse:6.54857                                                    
[12]	validation-rmse:6.43596






2024/10/29 21:37:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run adaptable-finch-600 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a8ffa99f787d46a7a369da67326d47cf.

2024/10/29 21:37:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:5.49943                                                     
[1]	validation-rmse:5.47758                                                     
[2]	validation-rmse:5.44875                                                     
[3]	validation-rmse:5.44312                                                     
[4]	validation-rmse:5.44150                                                     
[5]	validation-rmse:5.43279                                                     
[6]	validation-rmse:5.43126                                                     
[7]	validation-rmse:5.41072                                                     
[8]	validation-rmse:5.40734                                                     
[9]	validation-rmse:5.40870                                                     
[10]	validation-rmse:5.41066                                                    
[11]	validation-rmse:5.40852                                                    
[12]	validation-rmse:5.40963






2024/10/29 21:38:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-chimp-544 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/55e10ca787844e4f803d754ad28b7b3c.

2024/10/29 21:38:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.05532                                                     
[1]	validation-rmse:5.40037                                                     
[2]	validation-rmse:5.26460                                                     
[3]	validation-rmse:5.22876                                                     
[4]	validation-rmse:5.21300                                                     
[5]	validation-rmse:5.20725                                                     
[6]	validation-rmse:5.20211                                                     
[7]	validation-rmse:5.19922                                                     
[8]	validation-rmse:5.19808                                                     
[9]	validation-rmse:5.19656                                                     
[10]	validation-rmse:5.19826                                                    
[11]	validation-rmse:5.19786                                                    
[12]	validation-rmse:5.19705






2024/10/29 21:39:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run amazing-sheep-600 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/984dbad02e4141a3b3842f39fca85766.

2024/10/29 21:39:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.54242                                                     
[1]	validation-rmse:8.05077                                                     
[2]	validation-rmse:7.63198                                                     
[3]	validation-rmse:7.27109                                                     
[4]	validation-rmse:6.96691                                                     
[5]	validation-rmse:6.70708                                                     
[6]	validation-rmse:6.49013                                                     
[7]	validation-rmse:6.30722                                                     
[8]	validation-rmse:6.15453                                                     
[9]	validation-rmse:6.02737                                                     
[10]	validation-rmse:5.92138                                                    
[11]	validation-rmse:5.83486                                                    
[12]	validation-rmse:5.76067






2024/10/29 21:40:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-donkey-1 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2dfd5b393a114f7e8e3cf4a9cb959c92.

2024/10/29 21:40:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:5.59813                                                     
[1]	validation-rmse:5.40117                                                    
[2]	validation-rmse:5.35857                                                    
[3]	validation-rmse:5.35225                                                    
[4]	validation-rmse:5.33606                                                    
[5]	validation-rmse:5.33368                                                    
[6]	validation-rmse:5.32504                                                    
[7]	validation-rmse:5.31983                                                    
[8]	validation-rmse:5.31534                                                    
[9]	validation-rmse:5.30506                                                    
[10]	validation-rmse:5.29968                                                   
[11]	validation-rmse:5.29688                                                   
[12]	validation-rmse:5.29453           






2024/10/29 21:41:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run handsome-stag-543 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/aac21a356dc64189af12b4a709d0dc3f.

2024/10/29 21:41:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.76642                                                    
[1]	validation-rmse:8.44991                                                    
[2]	validation-rmse:8.15973                                                    
[3]	validation-rmse:7.89408                                                    
[4]	validation-rmse:7.65133                                                    
[5]	validation-rmse:7.43015                                                    
[6]	validation-rmse:7.22849                                                    
[7]	validation-rmse:7.04547                                                    
[8]	validation-rmse:6.87918                                                    
[9]	validation-rmse:6.72891                                                    
[10]	validation-rmse:6.59253                                                   
[11]	validation-rmse:6.46957                                                   
[12]	validation-rmse:6.35865            






2024/10/29 21:42:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run angry-finch-195 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/3740c6224b1e4af1973d07c1a0743e53.

2024/10/29 21:42:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.27181                                                    
[1]	validation-rmse:7.60194                                                    
[2]	validation-rmse:7.07362                                                    
[3]	validation-rmse:6.66019                                                    
[4]	validation-rmse:6.34204                                                    
[5]	validation-rmse:6.09920                                                    
[6]	validation-rmse:5.91372                                                    
[7]	validation-rmse:5.77337                                                    
[8]	validation-rmse:5.66757                                                    
[9]	validation-rmse:5.58684                                                    
[10]	validation-rmse:5.52638                                                   
[11]	validation-rmse:5.48091                                                   
[12]	validation-rmse:5.44649            






2024/10/29 21:43:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-calf-426 at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a2bcb4fb47f242588639e98d63d49ded.

2024/10/29 21:43:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [19:36<00:00, 117.67s/trial, best loss: 5.184572086147766]
[0]	validation-rmse:6.05532
[1]	validation-rmse:5.40037
[2]	validation-rmse:5.26460
[3]	validation-rmse:5.22876
[4]	validation-rmse:5.21300
[5]	validation-rmse:5.20725
[6]	validation-rmse:5.20211
[7]	validation-rmse:5.19922
[8]	validation-rmse:5.19808
[9]	validation-rmse:5.19656
[10]	validation-rmse:5.19826
[11]	validation-rmse:5.19786
[12]	validation-rmse:5.19705
[13]	validation-rmse:5.19588
[14]	validation-rmse:5.19482
[15]	validation-rmse:5.19392
[16]	validation-rmse:5.18929
[17]	validation-rmse:5.18471
[18]	validation-rmse:5.18373
[19]	validation-rmse:5.18284
[20]	validation-rmse:5.18370
[21]	validation-rmse:5.18353
[22]	validation-rmse:5.17709
[23]	validation-rmse:5.17698
[24]	validation-rmse:5.17828
[25]	validation-rmse:5.17898
[26]	validation-rmse:5.17806
[27]	validation-rmse:5.17552
[28]	validation-rmse:5.17266
[29]	validation-rmse:5.17418
[30]	validation-rmse:5.17489
[31]	validation-rmse:5.17432


2024/10/29 21:43:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2adf1f25498a47a3b8028e882ad4a9e1.
2024/10/29 21:43:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [16]:
best_params

{'learning_rate': 0.6087681395216564,
 'max_depth': 47,
 'min_child_weight': 2.1228251865839693,
 'reg_alpha': 0.010706662118787161,
 'reg_lambda': 0.19200106429052627,
 'seed': 42,
 'objective': 'reg:squarederror'}

In [17]:
run_id = "15ad68e1a5fb493294e1444770c3eaa0"
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Successfully registered model 'nyc-taxi-model'.


MlflowException: API request to endpoint /api/2.0/mlflow/runs/get failed with error code 400 != 200. Response body: '"repo not associated with run"'

In [None]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
model_version = "1"

# create "champion" alias for version 1 of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)