In [25]:
!python -V

Python 3.13.5


In [26]:
import pandas as pd

%pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org mlflow==3.1.1

Note: you may need to restart the kernel to use updated packages.


In [27]:
import pickle

In [28]:
%pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt

Note: you may need to restart the kernel to use updated packages.


In [29]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [30]:
import mlflow
import os

os.environ["AWS_PROFILE"] = "dev" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "3.87.201.140" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")


# 强制初始化数据库（创建实验）

mlflow.set_experiment("nyc-taxi-experiment")
print("实验创建成功，数据库已初始化")


实验创建成功，数据库已初始化


In [31]:
def read_dataframe(filename):
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [32]:
df=read_dataframe('./data/taxi_tripdata.csv')
#get df_train and df_val from df
#by randomly splitting the data into 80% train and 20% validation
df_train = df.sample(frac=0.8, random_state=42)
df_val = df.drop(df_train.index)


  df = pd.read_csv(filename)


In [33]:
len(df_train), len(df_val)

(63278, 15819)

In [34]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [35]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [36]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [37]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred)

2025/07/02 19:49:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '56e9487396234a8793a2e6135bb97be5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run polite-boar-713 at: http://3.87.201.140:5000/#/experiments/2/runs/56e9487396234a8793a2e6135bb97be5
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2


71.4023870743136

In [14]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
from sklearn.pipeline import Pipeline
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    mlflow.log_param("train-data-path", "./data/taxi_tripdata.csv")
    mlflow.log_param("valid-data-path", "./data/taxi_tripdata.csv")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    pipeline = Pipeline([
    ('dv', dv),
    ('model', lr)
    ])
    mlflow.sklearn.log_model(pipeline, 'lr_dv_model')
    



🏃 View run delicate-slug-848 at: http://3.87.201.140:5000/#/experiments/2/runs/247fd94a7bd948c5ab72b9f7a35d502c
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2


In [16]:
import xgboost as xgb

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [18]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [19]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [20]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

In [21]:
mlflow.xgboost.autolog(disable=True)

In [None]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=330,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")


    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:11.85278


  self.starting_round = model.num_boosted_rounds()


[1]	validation-rmse:11.09323
[2]	validation-rmse:10.43034
[3]	validation-rmse:9.85894
[4]	validation-rmse:9.36512
[5]	validation-rmse:8.93988
[6]	validation-rmse:8.57922
[7]	validation-rmse:8.26839
[8]	validation-rmse:8.00591
[9]	validation-rmse:7.78299
[10]	validation-rmse:7.59600
[11]	validation-rmse:7.43683
[12]	validation-rmse:7.30452
[13]	validation-rmse:7.19303
[14]	validation-rmse:7.10180
[15]	validation-rmse:7.02518
[16]	validation-rmse:6.95948
[17]	validation-rmse:6.90571
[18]	validation-rmse:6.86069
[19]	validation-rmse:6.82202
[20]	validation-rmse:6.78987
[21]	validation-rmse:6.76190
[22]	validation-rmse:6.73973
[23]	validation-rmse:6.71976
[24]	validation-rmse:6.70385
[25]	validation-rmse:6.69016
[26]	validation-rmse:6.67770
[27]	validation-rmse:6.66733
[28]	validation-rmse:6.65707
[29]	validation-rmse:6.64983
[30]	validation-rmse:6.64264
[31]	validation-rmse:6.63693
[32]	validation-rmse:6.63106
[33]	validation-rmse:6.62646
[34]	validation-rmse:6.62134
[35]	validation-rmse:

  xgb_model.save_model(model_data_path)


🏃 View run rebellious-asp-73 at: http://3.87.201.140:5000/#/experiments/2/runs/d103a0b71ff94140818515cfe9a6ae68
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2


In [None]:
run_id='f5b4bd2300ec4943955d253603ec3e29'
model_uri = f"runs:/{run_id}/models_mlflow"
#register model
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-xgboost")

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...


RestException: RESOURCE_DOES_NOT_EXIST: Run with id=f5b4bd2300ec4943955d253603ec3e29 not found

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/taxi_tripdata.csv")
        mlflow.log_param("valid-data-path", "./data/taxi_tripdata.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

        

🏃 View run marvelous-cat-788 at: http://3.87.201.140:5000/#/experiments/2/runs/7d205f67e80d4e6087c52b221a670e22
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2
🏃 View run funny-lark-228 at: http://3.87.201.140:5000/#/experiments/2/runs/e09800d791794ea387823ea83b03ad6c
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2
🏃 View run spiffy-cub-400 at: http://3.87.201.140:5000/#/experiments/2/runs/390e28d605194f50af1bf2a95954ade5
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2




🏃 View run magnificent-lamb-544 at: http://3.87.201.140:5000/#/experiments/2/runs/ac6aa6fa88a24327b2f89ddd4962b246
🧪 View experiment at: http://3.87.201.140:5000/#/experiments/2
