In [1]:
!python -V

Python 3.9.19


In [2]:
import pandas as pd
import numpy
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error

In [13]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5001")

#mlflow.set_tracking_uri("sqlite:///mlflow.db") # artifact store
mlflow.set_experiment("nyc-taxi-experiment") # creates new one if does not exist

2024/05/27 07:06:33 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1716793593311, experiment_id='1', last_update_time=1716793593311, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
#df_train = read_dataframe('./data/green_tripdata_2021-01.csv')
#df_val = read_dataframe('./data/green_tripdata_2021-02.csv')

df_train = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
df_val   = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

In [6]:
len(df_train), len(df_val)

(73908, 61921)

In [7]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [8]:

categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)



7.758715207559742

In [11]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

- Track experiments:

In [14]:
with mlflow.start_run():
    # log info about run for specific person
    mlflow.set_tag("developer", "cristian")

    # log info about data set
    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")

    # log model params
    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = numpy.sqrt(mse)
    mlflow.log_metric("rmse", rmse)

    # log model instance
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

 - Parameter tuning:

In [15]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [16]:
train = xgb.DMatrix(X_train, label=y_train) # optimize for training speed and memory efficiency
valid = xgb.DMatrix(X_val, label=y_val)

In [17]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = numpy.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [20]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:7.48229                           
[1]	validation-rmse:6.80354                           
[2]	validation-rmse:6.67602                           
[3]	validation-rmse:6.64127                           
[4]	validation-rmse:6.62223                           
[5]	validation-rmse:6.61347                           
[6]	validation-rmse:6.60792                           
[7]	validation-rmse:6.59781                           
[8]	validation-rmse:6.59185                           
[9]	validation-rmse:6.58558                           
[10]	validation-rmse:6.58148                          
[11]	validation-rmse:6.57829                          
[12]	validation-rmse:6.57501                          
[13]	validation-rmse:6.57284                          
[14]	validation-rmse:6.57038                          
[15]	validation-rmse:6.56870                          
[16]	validation-rmse:6.56762                          
[17]	validation-rmse:6.56562                          
[18]	valid




[0]	validation-rmse:9.93058                                                    
[1]	validation-rmse:8.54137                                                    
[2]	validation-rmse:7.73001                                                    
[3]	validation-rmse:7.26870                                                    
[4]	validation-rmse:7.00686                                                    
[5]	validation-rmse:6.85723                                                    
[6]	validation-rmse:6.76351                                                    
[7]	validation-rmse:6.70251                                                    
[8]	validation-rmse:6.66386                                                    
[9]	validation-rmse:6.64016                                                    
[10]	validation-rmse:6.62266                                                   
[11]	validation-rmse:6.61009                                                   
[12]	validation-rmse:6.60091            




[0]	validation-rmse:11.51543                                                   
[1]	validation-rmse:10.89700                                                   
[2]	validation-rmse:10.35021                                                   
[3]	validation-rmse:9.86864                                                    
[4]	validation-rmse:9.44420                                                    
[5]	validation-rmse:9.07410                                                    
[6]	validation-rmse:8.75017                                                    
[7]	validation-rmse:8.46885                                                    
[8]	validation-rmse:8.22268                                                    
[9]	validation-rmse:8.00976                                                    
[10]	validation-rmse:7.82575                                                   
[11]	validation-rmse:7.66650                                                   
[12]	validation-rmse:7.52802            




[0]	validation-rmse:10.98036                                                   
[1]	validation-rmse:9.99632                                                    
[2]	validation-rmse:9.21548                                                    
[3]	validation-rmse:8.60325                                                    
[4]	validation-rmse:8.12752                                                    
[5]	validation-rmse:7.76108                                                    
[6]	validation-rmse:7.48064                                                    
[7]	validation-rmse:7.26700                                                    
[8]	validation-rmse:7.09863                                                    
[9]	validation-rmse:6.97262                                                    
[10]	validation-rmse:6.87267                                                   
[11]	validation-rmse:6.79635                                                   
[12]	validation-rmse:6.73758            




[0]	validation-rmse:6.64168                                                      
[1]	validation-rmse:6.56629                                                      
[2]	validation-rmse:6.55793                                                      
[3]	validation-rmse:6.54879                                                      
[4]	validation-rmse:6.54132                                                      
[5]	validation-rmse:6.53382                                                      
[6]	validation-rmse:6.53167                                                      
[7]	validation-rmse:6.52796                                                      
[8]	validation-rmse:6.52180                                                      
[9]	validation-rmse:6.51693                                                      
[10]	validation-rmse:6.51136                                                     
[11]	validation-rmse:6.50638                                                     
[12]	validation-




[0]	validation-rmse:10.04330                                                   
[1]	validation-rmse:8.68846                                                    
[2]	validation-rmse:7.87463                                                    
[3]	validation-rmse:7.39620                                                    
[4]	validation-rmse:7.11752                                                    
[5]	validation-rmse:6.95591                                                    
[6]	validation-rmse:6.85703                                                    
[7]	validation-rmse:6.79282                                                    
[8]	validation-rmse:6.75115                                                    
[9]	validation-rmse:6.72518                                                    
[10]	validation-rmse:6.70623                                                   
[11]	validation-rmse:6.68837                                                   
[12]	validation-rmse:6.67658            




[0]	validation-rmse:11.60293                                                   
[1]	validation-rmse:11.05376                                                   
[2]	validation-rmse:10.56079                                                   
[3]	validation-rmse:10.11962                                                   
[4]	validation-rmse:9.72529                                                    
[5]	validation-rmse:9.37371                                                    
[6]	validation-rmse:9.06102                                                    
[7]	validation-rmse:8.78359                                                    
[8]	validation-rmse:8.53794                                                    
[9]	validation-rmse:8.32053                                                    
[10]	validation-rmse:8.12886                                                   
[11]	validation-rmse:7.96011                                                   
[12]	validation-rmse:7.81173            




[0]	validation-rmse:11.54320                                                   
[1]	validation-rmse:10.94666                                                   
[2]	validation-rmse:10.41681                                                   
[3]	validation-rmse:9.94498                                                    
[4]	validation-rmse:9.52597                                                    
[5]	validation-rmse:9.16257                                                    
[6]	validation-rmse:8.84165                                                    
[7]	validation-rmse:8.55566                                                    
[8]	validation-rmse:8.30661                                                    
[9]	validation-rmse:8.09274                                                    
[10]	validation-rmse:7.89809                                                   
[11]	validation-rmse:7.73685                                                   
[12]	validation-rmse:7.59175            




[3]	validation-rmse:10.68105                                                      
[4]	validation-rmse:10.36512                                                      
[5]	validation-rmse:10.07276                                                      
[6]	validation-rmse:9.80303                                                       
[7]	validation-rmse:9.55407                                                       
[8]	validation-rmse:9.32380                                                       
[9]	validation-rmse:9.11184                                                       
[10]	validation-rmse:8.91736                                                      
[11]	validation-rmse:8.73761                                                      
[12]	validation-rmse:8.57320                                                      
[13]	validation-rmse:8.42214                                                      
[14]	validation-rmse:8.28373                                                      
[15]




[0]	validation-rmse:8.45701                                                       
[1]	validation-rmse:7.13578                                                       
[2]	validation-rmse:6.70942                                                       
[3]	validation-rmse:6.56439                                                       
[4]	validation-rmse:6.50062                                                       
[5]	validation-rmse:6.46655                                                       
[6]	validation-rmse:6.45303                                                       
[7]	validation-rmse:6.44220                                                       
[8]	validation-rmse:6.43777                                                       
[9]	validation-rmse:6.43261                                                       
[10]	validation-rmse:6.42720                                                      
[11]	validation-rmse:6.42246                                                      
[12]




[0]	validation-rmse:11.65962                                                    
[1]	validation-rmse:11.15433                                                    
[2]	validation-rmse:10.69386                                                    
[3]	validation-rmse:10.27495                                                    
[4]	validation-rmse:9.89356                                                     
[5]	validation-rmse:9.54834                                                     
[6]	validation-rmse:9.23569                                                     
[7]	validation-rmse:8.95249                                                     
[8]	validation-rmse:8.69690                                                     
[9]	validation-rmse:8.46767                                                     
[10]	validation-rmse:8.26014                                                    
[11]	validation-rmse:8.07486                                                    
[12]	validation-rmse:7.90851

KeyboardInterrupt: 

- Autologging: https://mlflow.org/docs/latest/tracking/autolog.html


mlflow.sklearn.autolog() is a function in the MLflow library that automatically logs various machine learning model training and evaluation artifacts for Scikit-Learn models. When enabled, it simplifies the process of tracking and storing essential information during the model development lifecycle.

***Key Features:***
- Automatic Logging of Parameters: Logs the hyperparameters used for training the model.
- Automatic Logging of Metrics: Logs common metrics like accuracy, mean squared error, etc., evaluated on the training and validation datasets.
- Automatic Logging of Artifacts: Saves the trained model and other useful artifacts.
- Simplified Integration: Eliminates the need for manually writing logging code, making it easier to keep track of experiments and results.

-> *`mlflow.sklearn.autolog() abstracts away the manual effort of explicitly coding the logging of parameters, metrics, and artifacts!`*

In [19]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
import numpy as np

mlflow.sklearn.autolog() # log all models and params

regressors = [RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR]

for model_class in regressors:

    with mlflow.start_run(): # start new run for each model

        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor") # uploads a local file or directory to the artifact store.

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)



### ***`Better way to save models in a way that can be easily run:`***
---

In [26]:
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.095,
        'max_depth': 30,
        'min_child_weight': 1.06,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.0180,
        'reg_lambda': 0.0116,
        'seed': 42
    }
    
    mlflow.log_params(best_params)
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )
    y_pred = booster.predict(valid)
    rmse = numpy.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric('rmse', rmse)
    
    with open('models/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out) # dv = dictvectorizer
    
    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')
    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

[0]	validation-rmse:11.45155
[1]	validation-rmse:10.78383
[2]	validation-rmse:10.19903
[3]	validation-rmse:9.69205
[4]	validation-rmse:9.25092
[5]	validation-rmse:8.86731
[6]	validation-rmse:8.53887
[7]	validation-rmse:8.25458
[8]	validation-rmse:8.01162
[9]	validation-rmse:7.80559
[10]	validation-rmse:7.62738
[11]	validation-rmse:7.47628
[12]	validation-rmse:7.34533
[13]	validation-rmse:7.23521
[14]	validation-rmse:7.13971
[15]	validation-rmse:7.05922
[16]	validation-rmse:6.99016
[17]	validation-rmse:6.93036
[18]	validation-rmse:6.87921
[19]	validation-rmse:6.83534
[20]	validation-rmse:6.79681
[21]	validation-rmse:6.76449
[22]	validation-rmse:6.73577
[23]	validation-rmse:6.70980
[24]	validation-rmse:6.68833
[25]	validation-rmse:6.66932
[26]	validation-rmse:6.65226
[27]	validation-rmse:6.63732
[28]	validation-rmse:6.62436
[29]	validation-rmse:6.61211
[30]	validation-rmse:6.60104
[31]	validation-rmse:6.59163
[32]	validation-rmse:6.58380
[33]	validation-rmse:6.57582
[34]	validation-rmse:



In [32]:
# Print the artifact URI for verification
artifact_uri = mlflow.get_artifact_uri()
print(f"Artifacts logged in: {artifact_uri}")

Artifacts logged in: /home/basti/MLOps-Zoomcamp/week2/mlruns/1/dbe6cee64d5a4e498823ba0e15afcc4f/artifacts


In [28]:
import mlflow
logged_model = 'runs:/c433c1fa572149ee86f714f9f900748f/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: c433c1fa572149ee86f714f9f900748f

In [30]:
xgboost_model = mlflow.xgboost.load_model(logged_model)
xgboost_model

<xgboost.core.Booster at 0x7fe765d59ee0>

In [31]:
xgboost_model.predict(valid)

array([14.609752 ,  7.1668797, 15.909137 , ..., 13.501514 ,  6.652708 ,
        8.171159 ], dtype=float32)