In [4]:
!if not exist ..\data mkdir ..\data

In [5]:
# Create the directory if it doesn't exist
# !mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1330k  100 1330k    0     0  2587k      0 --:--:-- --:--:-- --:--:-- 2603k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2605k      0 --:--:-- --:--:-- --:--:-- 2633k


In [6]:
import pickle
import pandas as pd
from sklearn.metrics import  mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [7]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [9]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [10]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [11]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [16]:
pip install dagshub

Collecting dagshub
  Obtaining dependency information for dagshub from https://files.pythonhosted.org/packages/39/09/64d87ab8f3d8dbd25d45b602d2a400bf45ebeab232bc84f3a8375b269be3/dagshub-0.3.35-py3-none-any.whl.metadata
  Downloading dagshub-0.3.35-py3-none-any.whl.metadata (11 kB)
Collecting fusepy>=3 (from dagshub)
  Downloading fusepy-3.0.1.tar.gz (11 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx~=0.23.0 (from dagshub)
  Obtaining dependency information for httpx~=0.23.0 from https://files.pythonhosted.org/packages/ac/a2/0260c0f5d73bdf06e8d3fc1013a82b9f0633dc21750c9e3f3cb1dba7bb8c/httpx-0.23.3-py3-none-any.whl.metadata
  Downloading httpx-0.23.3-py3-none-any.whl.metadata (7.1 kB)
Collecting rich~=13.1.0 (from dagshub)
  Obtaining dependency information for rich~=13.1.0 from https://files.pythonhosted.org/packages/55/19/8b1ed0f3ea49306b8115afe84e8e5cd92925d732260efc75e4e3e3089bf0/rich-13.1.0-py3-none-any.whl.

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastapi 0.109.2 requires typing-extensions>=4.8.0, but you have typing-extensions 4.7.1 which is incompatible.


In [12]:
import dagshub
import mlflow


dagshub.init(url="https://dagshub.com/Pacolaz/nyc-taxi-time-predicition", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow


<Experiment: artifact_location='mlflow-artifacts:/2275d465861f4be68b5709db9f27801d', creation_time=1726630130833, experiment_id='0', last_update_time=1726630130833, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [14]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("Pacolaz/nyc-taxi-time-predicition")

# Upload file
s3.upload_file(
    Bucket="nyc-taxi-time-predicition",  # name of the repo
    Filename="../data/green_tripdata_2024-01.parquet",  # local path of file to upload
    Key="train_data.parquet",  # remote path where to upload the file
)

s3.upload_file(
    Bucket="nyc-taxi-time-predicition",  # name of the repo
    Filename="../data/green_tripdata_2024-02.parquet",  # local path of file to upload
    Key="eval_data.parquet",  # remote path where to upload the file
)

In [15]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [16]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [17]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [22]:
def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "xgboost")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        
        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")
         
        # Predict in the val dataset
        y_pred = booster.predict(valid)
        
        # Calculate metric
        rmse = mean_squared_error(y_val, y_pred)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [23]:
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"
    
    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )
        
    y_pred = booster.predict(valid)
    
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:8.68304                           
[1]	validation-rmse:8.29912                           
[2]	validation-rmse:7.94991                           
[3]	validation-rmse:7.64568                           
[4]	validation-rmse:7.36622                           
[5]	validation-rmse:7.12515                           
[6]	validation-rmse:6.90721                           
[7]	validation-rmse:6.71574                           
[8]	validation-rmse:6.53591                           
[9]	validation-rmse:6.38373                           
[10]	validation-rmse:6.25457                          
[11]	validation-rmse:6.13001                          
[12]	validation-rmse:6.02803                          
[13]	validation-rmse:5.93688                          
[14]	validation-rmse:5.85949                          
[15]	validation-rmse:5.78695                          
[16]	validation-rmse:5.72715                          
[17]	validation-rmse:5.67645                          
[18]	valid






2024/09/20 12:21:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run indecisive-frog-511 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/c42e438715644e5eb70ab7087a15f91e.

2024/09/20 12:21:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:6.02507                                                      
[1]	validation-rmse:5.38364                                                      
[2]	validation-rmse:5.25972                                                      
[3]	validation-rmse:5.23267                                                      
[4]	validation-rmse:5.22050                                                      
[5]	validation-rmse:5.21821                                                      
[6]	validation-rmse:5.21949                                                      
[7]	validation-rmse:5.22025                                                      
[8]	validation-rmse:5.21979                                                      
[9]	validation-rmse:5.21719                                                      
[10]	validation-rmse:5.21627                                                     
[11]	validation-rmse:5.21295                                                     
[12]	validation-






2024/09/20 12:22:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-skink-987 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/fa78399309464e77a8c144fed4ea0baa.

2024/09/20 12:22:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:7.98874                                                      
[1]	validation-rmse:7.17618                                                     
[2]	validation-rmse:6.60117                                                     
[3]	validation-rmse:6.20248                                                     
[4]	validation-rmse:5.92959                                                     
[5]	validation-rmse:5.74514                                                     
[6]	validation-rmse:5.62102                                                     
[7]	validation-rmse:5.53588                                                     
[8]	validation-rmse:5.47840                                                     
[9]	validation-rmse:5.43713                                                     
[10]	validation-rmse:5.40971                                                    
[11]	validation-rmse:5.39087                                                    
[12]	validation-rmse:5.3768






2024/09/20 12:23:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-quail-586 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/1c56c7b5affd4e1d952fcbb9ed99bc25.

2024/09/20 12:23:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:6.60446                                                     
[1]	validation-rmse:5.84493                                                    
[2]	validation-rmse:5.63030                                                    
[3]	validation-rmse:5.54967                                                    
[4]	validation-rmse:5.52707                                                    
[5]	validation-rmse:5.51099                                                    
[6]	validation-rmse:5.50125                                                    
[7]	validation-rmse:5.49512                                                    
[8]	validation-rmse:5.48756                                                    
[9]	validation-rmse:5.48431                                                    
[10]	validation-rmse:5.47829                                                   
[11]	validation-rmse:5.47489                                                   
[12]	validation-rmse:5.46797           






2024/09/20 12:24:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-sow-222 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/597165aee9814d7aa592c94e39b2806c.

2024/09/20 12:24:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:5.86747                                                    
[1]	validation-rmse:5.43192                                                    
[2]	validation-rmse:5.35458                                                    
[3]	validation-rmse:5.33879                                                    
[4]	validation-rmse:5.32581                                                    
[5]	validation-rmse:5.29830                                                    
[6]	validation-rmse:5.29516                                                    
[7]	validation-rmse:5.29352                                                    
[8]	validation-rmse:5.28342                                                    
[9]	validation-rmse:5.27730                                                    
[10]	validation-rmse:5.27303                                                   
[11]	validation-rmse:5.27059                                                   
[12]	validation-rmse:5.26727            






2024/09/20 12:24:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-cow-759 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/7ea0d01c70614990aa00ce1e309bee07.

2024/09/20 12:24:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:8.61766                                                    
[1]	validation-rmse:8.17870                                                    
[2]	validation-rmse:7.79092                                                    
[3]	validation-rmse:7.44843                                                    
[4]	validation-rmse:7.14816                                                    
[5]	validation-rmse:6.88325                                                    
[6]	validation-rmse:6.65334                                                    
[7]	validation-rmse:6.45163                                                    
[8]	validation-rmse:6.27868                                                    
[9]	validation-rmse:6.12706                                                    
[10]	validation-rmse:5.99576                                                   
[11]	validation-rmse:5.88383                                                   
[12]	validation-rmse:5.78684            






2024/09/20 12:27:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-dove-467 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/02470ebfc03f4e7c93214a1b5cab043b.

2024/09/20 12:27:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:5.50217                                                     
[1]	validation-rmse:5.48288                                                     
[2]	validation-rmse:5.47959                                                     
[3]	validation-rmse:5.47672                                                     
[4]	validation-rmse:5.46788                                                     
[5]	validation-rmse:5.46233                                                     
[6]	validation-rmse:5.43822                                                     
[7]	validation-rmse:5.43687                                                     
[8]	validation-rmse:5.43930                                                     
[9]	validation-rmse:5.44389                                                     
[10]	validation-rmse:5.44585                                                    
[11]	validation-rmse:5.44652                                                    
[12]	validation-rmse:5.44657






2024/09/20 12:28:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run grandiose-gnu-250 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/1304ab9053c24c009c9eff0dccc27957.

2024/09/20 12:28:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:7.89821                                                     
[1]	validation-rmse:7.05074                                                     
[2]	validation-rmse:6.47437                                                     
[3]	validation-rmse:6.08981                                                     
[4]	validation-rmse:5.84109                                                     
[5]	validation-rmse:5.68162                                                     
[6]	validation-rmse:5.58032                                                     
[7]	validation-rmse:5.51014                                                     
[8]	validation-rmse:5.46698                                                     
[9]	validation-rmse:5.43559                                                     
[10]	validation-rmse:5.41451                                                    
[11]	validation-rmse:5.40018                                                    
[12]	validation-rmse:5.38845






2024/09/20 12:29:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-grub-234 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/0dd2751be863404abc61369af8e9c485.

2024/09/20 12:29:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:6.48374                                                     
[1]	validation-rmse:5.66266                                                     
[2]	validation-rmse:5.43174                                                     
[3]	validation-rmse:5.36069                                                     
[4]	validation-rmse:5.33729                                                     
[5]	validation-rmse:5.32398                                                     
[6]	validation-rmse:5.31565                                                     
[7]	validation-rmse:5.31081                                                     
[8]	validation-rmse:5.30727                                                     
[9]	validation-rmse:5.30472                                                     
[10]	validation-rmse:5.30044                                                    
[11]	validation-rmse:5.29520                                                    
[12]	validation-rmse:5.29142






2024/09/20 12:30:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run fun-snake-951 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/ee7aa6fd3a674e0b8f18e9af2d2b1edb.

2024/09/20 12:30:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



[0]	validation-rmse:8.76445                                                     
[1]	validation-rmse:8.44894                                                     
[2]	validation-rmse:8.16113                                                     
[3]	validation-rmse:7.89950                                                     
[4]	validation-rmse:7.66205                                                     
[5]	validation-rmse:7.44681                                                     
[6]	validation-rmse:7.25255                                                     
[7]	validation-rmse:7.07796                                                     
[8]	validation-rmse:6.91960                                                     
[9]	validation-rmse:6.77766                                                     
[10]	validation-rmse:6.64874                                                    
[11]	validation-rmse:6.53489                                                    
[12]	validation-rmse:6.43092






2024/09/20 12:30:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-grouse-517 at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/2e956e6569834f879d85e622f5e50a12.

2024/09/20 12:30:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.



100%|██████████| 10/10 [12:02<00:00, 72.26s/trial, best loss: 26.979939829364316]
[0]	validation-rmse:8.61766
[1]	validation-rmse:8.17870
[2]	validation-rmse:7.79092
[3]	validation-rmse:7.44843
[4]	validation-rmse:7.14816
[5]	validation-rmse:6.88325
[6]	validation-rmse:6.65334
[7]	validation-rmse:6.45163
[8]	validation-rmse:6.27868
[9]	validation-rmse:6.12706
[10]	validation-rmse:5.99576
[11]	validation-rmse:5.88383
[12]	validation-rmse:5.78684
[13]	validation-rmse:5.70439
[14]	validation-rmse:5.63102
[15]	validation-rmse:5.57067
[16]	validation-rmse:5.51819
[17]	validation-rmse:5.47272
[18]	validation-rmse:5.43400
[19]	validation-rmse:5.40252
[20]	validation-rmse:5.37486
[21]	validation-rmse:5.35070
[22]	validation-rmse:5.33010
[23]	validation-rmse:5.31352
[24]	validation-rmse:5.29731
[25]	validation-rmse:5.28512
[26]	validation-rmse:5.27341
[27]	validation-rmse:5.26353
[28]	validation-rmse:5.25550
[29]	validation-rmse:5.24872
[30]	validation-rmse:5.24147
[31]	validation-rmse:5.23634


2024/09/20 12:33:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0/runs/7c13a3061a234ee1a427accf7db41c5e.
2024/09/20 12:33:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pacolaz/nyc-taxi-time-predicition.mlflow/#/experiments/0.


In [24]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Ingrese el run_id7c13a3061a234ee1a427accf7db41c5e


Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 12:33:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [25]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
model_version = "1"

# create "champion" alias for version 1 of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726857237358, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-20 12:34:00.607826', last_updated_timestamp=1726857241268, name='nyc-taxi-model', run_id='7c13a3061a234ee1a427accf7db41c5e', run_link='', source='mlflow-artifacts:/2275d465861f4be68b5709db9f27801d/7c13a3061a234ee1a427accf7db41c5e/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [26]:
import mlflow.pyfunc

model_name = "nyc-taxi-model"
alias = "champion"

model_uri = f"models:/{model_name}@{alias}"

champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([19.1048  , 28.31477 ,  9.504406, ..., 47.04458 , 14.088727,
       19.8717  ], dtype=float32)