In [1]:
!python -V

Python 3.13.5


In [2]:
import pandas as pd

%pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org mlflow==3.1.1

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pickle

In [4]:
%pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
import mlflow
from pathlib import Path

# Locate the project root: the directory of this file when run as a script,
# otherwise (e.g. in a notebook, where __file__ is undefined) the current
# working directory.
try:
    project_root = Path(__file__).parent.resolve()
except NameError:
    project_root = Path.cwd().resolve()

# The SQLite tracking database lives at <project_root>/mlflow.db.
db_path = project_root / "mlflow.db"

# Build the SQLite URI from the POSIX form of the path so Windows and
# Linux paths are written the same way.
db_uri = "sqlite:///" + db_path.as_posix()
print(f"Using MLflow tracking URI: {db_uri}")

mlflow.set_tracking_uri(db_uri)

# Selecting the experiment forces the tracking database to be initialised.
try:
    mlflow.set_experiment("nyc-taxi-experiment")
except Exception as exc:
    print(f"初始化失败: {exc}")
    # Report whether the database file was at least created on disk.
    print(
        f"文件已创建，但权限不足: {db_path}"
        if db_path.exists()
        else "文件未创建，路径可能无效或权限不足"
    )
else:
    print("实验创建成功，数据库已初始化")

Using MLflow tracking URI: sqlite:////Users/bytedance/Documents/ByteRAG/experiment_tracking/mlflow.db


2025/07/01 14:23:03 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/01 14:23:03 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


实验创建成功，数据库已初始化


In [7]:
def read_dataframe(filename):
    """Load a taxi trip CSV and prepare it for duration modelling.

    The file must contain the columns ``lpep_pickup_datetime``,
    ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Args:
        filename: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A new DataFrame with:
        * both datetime columns parsed with ``pd.to_datetime``;
        * a ``duration`` column holding the trip length in minutes;
        * only the trips whose duration is between 1 and 60 minutes inclusive;
        * ``PULocationID`` / ``DOLocationID`` converted to strings.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, so a column cannot end up with mixed
    # types (the situation pandas warns about when reading large CSVs).
    df = pd.read_csv(filename, low_memory=False)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() gives an independent frame, so the column assignment below never
    # writes into a slice of the unfiltered data.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Load the trips, then hold out a random 20 % for validation:
# 80 % of the rows are sampled (seeded) for training and the rest form
# the validation split.
df = read_dataframe('./data/taxi_tripdata.csv')
df_train = df.sample(frac=0.8, random_state=42)
df_val = df.drop(index=df_train.index)


  df = pd.read_csv(filename)


In [9]:
# Row counts of the training and validation splits.
df_train.shape[0], df_val.shape[0]

(63278, 15819)

In [10]:
# Combine pickup and drop-off location IDs into one "<PU>_<DO>" feature
# on both splits.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [11]:
# Feature columns fed to the vectorizer.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Fit the vectorizer on the training records only, then reuse it unchanged
# for the validation records.
dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [12]:
# Regression target (trip duration) as arrays for both splits.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

In [13]:
# Baseline: ordinary linear regression on the vectorised features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# The displayed value is the mean squared error on the validation split
# (not its square root).
mean_squared_error(y_true=y_val, y_pred=y_pred)

71.4023870743136

In [14]:
# Persist the fitted vectorizer together with the linear model.
# Create the output directory first so open() does not fail when "models/"
# does not exist yet (e.g. on a fresh checkout).
Path("models").mkdir(parents=True, exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [15]:
# Track one Lasso run in MLflow: tag, data paths, alpha, validation RMSE and
# a pickled-model artifact.
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    # Train and validation rows both come from this one CSV.
    mlflow.log_param("train-data-path", "./data/taxi_tripdata.csv")
    mlflow.log_param("valid-data-path", "./data/taxi_tripdata.csv")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # mean_squared_error returns the MSE; take the square root so the value
    # logged under "rmse" really is a root-mean-squared error.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # NOTE(review): models/lin_reg.bin is written by the earlier
    # LinearRegression cell, so this artifact holds that model rather than
    # the Lasso fitted above — confirm this is intended.
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [16]:
import xgboost as xgb

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [18]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [19]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Uses the notebook-level ``train`` / ``valid`` DMatrix objects and the
    ``y_val`` array.

    Args:
        params: XGBoost parameter dict for this trial; it is logged as-is and
            passed to ``xgb.train``.

    Returns:
        dict with ``loss`` (validation RMSE of the trained booster) and
        ``status`` (``STATUS_OK``), as expected by ``hyperopt.fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # mean_squared_error returns the MSE; take the square root so the
        # logged metric and the optimised loss are on the same scale as the
        # "validation-rmse" values XGBoost prints during training.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [20]:
# Hyperopt search space for the XGBoost parameters.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so e.g.
# learning_rate ranges over [exp(-3), exp(0)].
search_space = {
    # quniform yields floats; scope.int casts the sampled depth to an integer.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:squarederror' is the current name of the squared-loss regression
    # objective; the old alias 'reg:linear' is deprecated.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run up to 50 TPE-guided trials; each trial calls objective() once.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:9.93637                           
[1]	validation-rmse:8.38430                           
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:7.58081                           
[3]	validation-rmse:7.18776                           
[4]	validation-rmse:6.99963                           
[5]	validation-rmse:6.90851                           
[6]	validation-rmse:6.85690                           
[7]	validation-rmse:6.82932                           
[8]	validation-rmse:6.81449                           
[9]	validation-rmse:6.80399                           
[10]	validation-rmse:6.79359                          
[11]	validation-rmse:6.77817                          
[12]	validation-rmse:6.76671                          
[13]	validation-rmse:6.76040                          
[14]	validation-rmse:6.75089                          
[15]	validation-rmse:6.74901                          
[16]	validation-rmse:6.74301                          
[17]	validation-rmse:6.73797                          
[18]	validation-rmse:6.73656                          
[19]	validation-rmse:6.73355                          
[20]	valid

  self.starting_round = model.num_boosted_rounds()



[17]	validation-rmse:6.69531                                                   
[18]	validation-rmse:6.68932                                                   
[19]	validation-rmse:6.68702                                                   
[20]	validation-rmse:6.68705                                                   
[21]	validation-rmse:6.68387                                                   
[22]	validation-rmse:6.67821                                                   
[23]	validation-rmse:6.67637                                                   
[24]	validation-rmse:6.67386                                                   
[25]	validation-rmse:6.67249                                                   
[26]	validation-rmse:6.66533                                                   
[27]	validation-rmse:6.66232                                                   
[28]	validation-rmse:6.65948                                                   
[29]	validation-rmse:6.65732            

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:10.94658                                                   
[2]	validation-rmse:10.24364                                                   
[3]	validation-rmse:9.64503                                                    
[4]	validation-rmse:9.13797                                                    
[5]	validation-rmse:8.71137                                                    
[6]	validation-rmse:8.35389                                                    
[7]	validation-rmse:8.05652                                                    
[8]	validation-rmse:7.81081                                                    
[9]	validation-rmse:7.60962                                                    
[10]	validation-rmse:7.44405                                                   
[11]	validation-rmse:7.30889                                                   
[12]	validation-rmse:7.19880                                                   
[13]	validation-rmse:7.10890            

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:6.79511                                                    
[3]	validation-rmse:6.76467                                                    
[4]	validation-rmse:6.74659                                                    
[5]	validation-rmse:6.72712                                                    
[6]	validation-rmse:6.70958                                                    
[7]	validation-rmse:6.70248                                                    
[8]	validation-rmse:6.68955                                                    
[9]	validation-rmse:6.68659                                                    
[10]	validation-rmse:6.68416                                                   
[11]	validation-rmse:6.68044                                                   
[12]	validation-rmse:6.67805                                                   
[13]	validation-rmse:6.67647                                                   
[14]	validation-rmse:6.67379            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.91613                                                    
[1]	validation-rmse:6.85162                                                    
[2]	validation-rmse:6.85134                                                    
[3]	validation-rmse:6.85213                                                    
[4]	validation-rmse:6.84856                                                    
[5]	validation-rmse:6.84959                                                    
[6]	validation-rmse:6.84378                                                    
[7]	validation-rmse:6.83523                                                    
[8]	validation-rmse:6.83582                                                    
[9]	validation-rmse:6.82976                                                    
[10]	validation-rmse:6.83497                                                   
[11]	validation-rmse:6.83262                                                   
[12]	validation-rmse:6.84039            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.67366                                                    
[1]	validation-rmse:6.75416                                                    
[2]	validation-rmse:6.61796                                                    
[3]	validation-rmse:6.58956                                                    
[4]	validation-rmse:6.58348                                                    
[5]	validation-rmse:6.57446                                                    
[6]	validation-rmse:6.57516                                                    
[7]	validation-rmse:6.57116                                                    
[8]	validation-rmse:6.56415                                                    
[9]	validation-rmse:6.56207                                                    
[10]	validation-rmse:6.55928                                                   
[11]	validation-rmse:6.55139                                                   
[12]	validation-rmse:6.54815            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.90574                                                   
[1]	validation-rmse:11.18612                                                   
[2]	validation-rmse:10.55218                                                   
[3]	validation-rmse:9.99500                                                    
[4]	validation-rmse:9.50816                                                    
[5]	validation-rmse:9.08333                                                    
[6]	validation-rmse:8.71526                                                    
[7]	validation-rmse:8.39755                                                    
[8]	validation-rmse:8.12424                                                    
[9]	validation-rmse:7.88901                                                    
[10]	validation-rmse:7.68987                                                   
[11]	validation-rmse:7.52045                                                   
[12]	validation-rmse:7.37683            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.50116                                                    
[1]	validation-rmse:7.88859                                                    
[2]	validation-rmse:7.14432                                                    
[3]	validation-rmse:6.81758                                                    
[4]	validation-rmse:6.68436                                                    
[5]	validation-rmse:6.62183                                                    
[6]	validation-rmse:6.58799                                                    
[7]	validation-rmse:6.56946                                                    
[8]	validation-rmse:6.55893                                                    
[9]	validation-rmse:6.55605                                                    
[10]	validation-rmse:6.55140                                                   
[11]	validation-rmse:6.54697                                                   
[12]	validation-rmse:6.54580            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.39940                                                    
[1]	validation-rmse:7.05847                                                    
[2]	validation-rmse:6.70322                                                    
[3]	validation-rmse:6.61674                                                    
[4]	validation-rmse:6.58221                                                    
[5]	validation-rmse:6.57008                                                    
[6]	validation-rmse:6.56356                                                    
[7]	validation-rmse:6.55769                                                    
[8]	validation-rmse:6.56026                                                    
[9]	validation-rmse:6.55455                                                    
[10]	validation-rmse:6.55573                                                   
[11]	validation-rmse:6.55229                                                   
[12]	validation-rmse:6.54808            

  self.starting_round = model.num_boosted_rounds()



[11]	validation-rmse:6.68624                                                   
[12]	validation-rmse:6.68557                                                   
[13]	validation-rmse:6.68222                                                   
[14]	validation-rmse:6.67930                                                   
[15]	validation-rmse:6.67850                                                   
[16]	validation-rmse:6.67514                                                   
[17]	validation-rmse:6.67213                                                   
[18]	validation-rmse:6.67086                                                   
[19]	validation-rmse:6.67026                                                   
[20]	validation-rmse:6.66683                                                   
[21]	validation-rmse:6.66487                                                   
[22]	validation-rmse:6.66401                                                   
[23]	validation-rmse:6.66313            

  self.starting_round = model.num_boosted_rounds()



[4]	validation-rmse:9.46178                                                     
[5]	validation-rmse:9.04809                                                     
[6]	validation-rmse:8.69410                                                     
[7]	validation-rmse:8.39283                                                     
[8]	validation-rmse:8.13779                                                     
[9]	validation-rmse:7.92259                                                     
[10]	validation-rmse:7.74167                                                    
[11]	validation-rmse:7.58991                                                    
[12]	validation-rmse:7.46291                                                    
[13]	validation-rmse:7.35709                                                    
[14]	validation-rmse:7.26889                                                    
[15]	validation-rmse:7.19559                                                    
[16]	validation-rmse:7.13520

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:10.00494                                                    
[3]	validation-rmse:9.38060                                                     
[4]	validation-rmse:8.86695                                                     
[5]	validation-rmse:8.44677                                                     
[6]	validation-rmse:8.10593                                                     
[7]	validation-rmse:7.83149                                                     
[8]	validation-rmse:7.61228                                                     
[9]	validation-rmse:7.43746                                                     
[10]	validation-rmse:7.29845                                                    
[11]	validation-rmse:7.18810                                                    
[12]	validation-rmse:7.10131                                                    
[13]	validation-rmse:7.03270                                                    
[14]	validation-rmse:6.97680

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:7.83950                                                     
[2]	validation-rmse:7.16109                                                     
[3]	validation-rmse:6.88892                                                     
[4]	validation-rmse:6.76822                                                     
[5]	validation-rmse:6.71676                                                     
[6]	validation-rmse:6.68374                                                     
[7]	validation-rmse:6.66536                                                     
[8]	validation-rmse:6.65400                                                     
[9]	validation-rmse:6.64179                                                     
[10]	validation-rmse:6.62861                                                    
[11]	validation-rmse:6.61393                                                    
[12]	validation-rmse:6.61085                                                    
[13]	validation-rmse:6.60603

  self.starting_round = model.num_boosted_rounds()



[8]	validation-rmse:9.54737                                                     
[9]	validation-rmse:9.31354                                                     
[10]	validation-rmse:9.09854                                                    
[11]	validation-rmse:8.90105                                                    
[12]	validation-rmse:8.71985                                                    
[13]	validation-rmse:8.55390                                                    
[14]	validation-rmse:8.40197                                                    
[15]	validation-rmse:8.26303                                                    
[16]	validation-rmse:8.13631                                                    
[17]	validation-rmse:8.02069                                                    
[18]	validation-rmse:7.91526                                                    
[19]	validation-rmse:7.81928                                                    
[20]	validation-rmse:7.73184

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:10.47971                                                    
[3]	validation-rmse:9.91265                                                     
[4]	validation-rmse:9.42502                                                     
[5]	validation-rmse:9.00556                                                     
[6]	validation-rmse:8.64270                                                     
[7]	validation-rmse:8.33184                                                     
[8]	validation-rmse:8.06770                                                     
[9]	validation-rmse:7.84742                                                     
[10]	validation-rmse:7.65504                                                    
[11]	validation-rmse:7.49594                                                    
[12]	validation-rmse:7.36390                                                    
[13]	validation-rmse:7.25059                                                    
[14]	validation-rmse:7.15590

  self.starting_round = model.num_boosted_rounds()



[24]	validation-rmse:6.89293                                                    
[25]	validation-rmse:6.88295                                                    
[26]	validation-rmse:6.87498                                                    
[27]	validation-rmse:6.86908                                                    
[28]	validation-rmse:6.86347                                                    
[29]	validation-rmse:6.85977                                                    
[30]	validation-rmse:6.85469                                                    
[31]	validation-rmse:6.85202                                                    
[32]	validation-rmse:6.84634                                                    
[33]	validation-rmse:6.84356                                                    
[34]	validation-rmse:6.84076                                                    
[35]	validation-rmse:6.83920                                                    
[36]	validation-rmse:6.83669

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:11.02931                                                    
[2]	validation-rmse:10.34780                                                    
[3]	validation-rmse:9.76031                                                     
[4]	validation-rmse:9.25465                                                     
[5]	validation-rmse:8.82806                                                     
[6]	validation-rmse:8.46236                                                     
[7]	validation-rmse:8.15514                                                     
[8]	validation-rmse:7.89466                                                     
[9]	validation-rmse:7.67751                                                     
[10]	validation-rmse:7.49774                                                    
[11]	validation-rmse:7.34472                                                    
[12]	validation-rmse:7.21959                                                    
[13]	validation-rmse:7.11480

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:9.35817                                                     
[3]	validation-rmse:8.69703                                                     
[4]	validation-rmse:8.19663                                                     
[5]	validation-rmse:7.82251                                                     
[6]	validation-rmse:7.54606                                                     
[7]	validation-rmse:7.34375                                                     
[8]	validation-rmse:7.19540                                                     
[9]	validation-rmse:7.08727                                                     
[10]	validation-rmse:7.00870                                                    
[11]	validation-rmse:6.95203                                                    
[12]	validation-rmse:6.91101                                                    
[13]	validation-rmse:6.87928                                                    
[14]	validation-rmse:6.85396

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.21596                                                     
[1]	validation-rmse:7.63378                                                     
[2]	validation-rmse:7.00165                                                     
[3]	validation-rmse:6.75687                                                     
[4]	validation-rmse:6.66111                                                     
[5]	validation-rmse:6.61416                                                     
[6]	validation-rmse:6.59401                                                     
[7]	validation-rmse:6.58290                                                     
[8]	validation-rmse:6.57443                                                     
[9]	validation-rmse:6.56756                                                     
[10]	validation-rmse:6.55560                                                    
[11]	validation-rmse:6.55239                                                    
[12]	validation-rmse:6.54957

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.36511                                                     
[1]	validation-rmse:6.80156                                                     
[2]	validation-rmse:6.73403                                                     
[3]	validation-rmse:6.72214                                                     
[4]	validation-rmse:6.70935                                                     
[5]	validation-rmse:6.70255                                                     
[6]	validation-rmse:6.69956                                                     
[7]	validation-rmse:6.69476                                                     
[8]	validation-rmse:6.69937                                                     
[9]	validation-rmse:6.70344                                                     
[10]	validation-rmse:6.70440                                                    
[11]	validation-rmse:6.70563                                                    
[12]	validation-rmse:6.70997

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:9.53762                                                     
[2]	validation-rmse:8.58752                                                     
[3]	validation-rmse:7.92553                                                     
[4]	validation-rmse:7.47757                                                     
[5]	validation-rmse:7.17616                                                     
[6]	validation-rmse:6.97753                                                     
[7]	validation-rmse:6.84720                                                     
[8]	validation-rmse:6.75988                                                     
[9]	validation-rmse:6.70254                                                     
[10]	validation-rmse:6.66495                                                    
[11]	validation-rmse:6.63724                                                    
[12]	validation-rmse:6.62015                                                    
[13]	validation-rmse:6.60640

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:9.43476                                                     
[2]	validation-rmse:8.48644                                                     
[3]	validation-rmse:7.84314                                                     
[4]	validation-rmse:7.41994                                                     
[5]	validation-rmse:7.14563                                                     
[6]	validation-rmse:6.96853                                                     
[7]	validation-rmse:6.85319                                                     
[8]	validation-rmse:6.78030                                                     
[9]	validation-rmse:6.72881                                                     
[10]	validation-rmse:6.69562                                                    
[11]	validation-rmse:6.67240                                                    
[12]	validation-rmse:6.65410                                                    
[13]	validation-rmse:6.64205

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:11.82084                                                    
[2]	validation-rmse:11.41677                                                    
[3]	validation-rmse:11.03766                                                    
[4]	validation-rmse:10.68624                                                    
[5]	validation-rmse:10.36028                                                    
[6]	validation-rmse:10.05710                                                    
[7]	validation-rmse:9.77619                                                     
[8]	validation-rmse:9.51543                                                     
[9]	validation-rmse:9.27500                                                     
[10]	validation-rmse:9.05365                                                    
[11]	validation-rmse:8.84726                                                    
[12]	validation-rmse:8.65950                                                    
[13]	validation-rmse:8.48429

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:11.54952                                                    
[2]	validation-rmse:11.04125                                                    
[3]	validation-rmse:10.57873                                                    
[4]	validation-rmse:10.15948                                                    
[5]	validation-rmse:9.78043                                                     
[6]	validation-rmse:9.43725                                                     
[7]	validation-rmse:9.12826                                                     
[8]	validation-rmse:8.84975                                                     
[9]	validation-rmse:8.60033                                                     
[10]	validation-rmse:8.37588                                                    
[11]	validation-rmse:8.17584                                                    
[12]	validation-rmse:7.99724                                                    
[13]	validation-rmse:7.83762

KeyboardInterrupt: 

In [21]:
# Turn MLflow's XGBoost autologging off; the following run logs its
# parameters, metric and model explicitly.
mlflow.xgboost.autolog(disable=True)

In [23]:
# Train one XGBoost model with a fixed parameter set and log everything
# needed to reuse it: parameters, validation RMSE, the fitted DictVectorizer
# and the booster itself.
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:squarederror' is the current name of the squared-loss
        # regression objective; the old alias 'reg:linear' is deprecated.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=330,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # mean_squared_error returns the MSE; take the square root so the value
    # logged under "rmse" matches the "validation-rmse" scale printed above.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Store the fitted vectorizer next to the model so raw records can be
    # transformed the same way at prediction time.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:11.85278


  self.starting_round = model.num_boosted_rounds()


[1]	validation-rmse:11.09323
[2]	validation-rmse:10.43034
[3]	validation-rmse:9.85894
[4]	validation-rmse:9.36512
[5]	validation-rmse:8.93988
[6]	validation-rmse:8.57922
[7]	validation-rmse:8.26839
[8]	validation-rmse:8.00591
[9]	validation-rmse:7.78299
[10]	validation-rmse:7.59600
[11]	validation-rmse:7.43683
[12]	validation-rmse:7.30452
[13]	validation-rmse:7.19303
[14]	validation-rmse:7.10180
[15]	validation-rmse:7.02518
[16]	validation-rmse:6.95948
[17]	validation-rmse:6.90571
[18]	validation-rmse:6.86069
[19]	validation-rmse:6.82202
[20]	validation-rmse:6.78987
[21]	validation-rmse:6.76190
[22]	validation-rmse:6.73973
[23]	validation-rmse:6.71976
[24]	validation-rmse:6.70385
[25]	validation-rmse:6.69016
[26]	validation-rmse:6.67770
[27]	validation-rmse:6.66733
[28]	validation-rmse:6.65707
[29]	validation-rmse:6.64983
[30]	validation-rmse:6.64264
[31]	validation-rmse:6.63693
[32]	validation-rmse:6.63106
[33]	validation-rmse:6.62646
[34]	validation-rmse:6.62134
[35]	validation-rmse:

  xgb_model.save_model(model_data_path)


In [24]:
# NOTE(review): the run id is hard-coded, so this cell only works against a
# tracking database that contains this exact run — confirm or replace it with
# the id of the run you want to register.
run_id='f5b4bd2300ec4943955d253603ec3e29'
# URI of the model logged under "models_mlflow" inside that run.
model_uri = f"runs:/{run_id}/models_mlflow"
# Register the model under the name "nyc-taxi-xgboost".
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-xgboost")

2025/07/01 14:39:20 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/01 14:39:20 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-taxi-xgboost'.


<ModelVersion: aliases=[], creation_timestamp=1751351960220, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1751351960220, metrics=None, model_id=None, name='nyc-taxi-xgboost', params=None, run_id='f5b4bd2300ec4943955d253603ec3e29', run_link=None, source='models:/m-a36aed9b6cff45a4863f07e38ef31f0d', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Let MLflow autolog the scikit-learn estimators fitted below.
mlflow.sklearn.autolog()

# One MLflow run per estimator class, each fitted with default settings and
# scored on the validation split.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/taxi_tripdata.csv")
        mlflow.log_param("valid-data-path", "./data/taxi_tripdata.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        # mean_squared_error returns the MSE; take the square root so the
        # value logged under "rmse" really is a root-mean-squared error.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)
        