# DataTalks.Club MLOps Zoomcamp - Homework 2

## Q1
I installed MLflow inside a virtual environment; the installed version is 2.22.0.

Solution:

In [4]:
!mlflow --version

mlflow, version 2.22.0


## Q2
4 files are saved to the output folder.

Solution:

In [55]:
from preprocess_data import run_data_prep
# Source folder with the raw trip data and destination folder for the prepared files.
# NOTE(review): "prepocessed" is misspelled, but later cells reference this exact name.
raw_data_path = '/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_raw'
prepocessed_data_path = '/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_preprocessed'
# NOTE(review): absolute, machine-specific paths -- the cell only runs as-is on the author's machine.
run_data_prep(
    raw_data_path=raw_data_path,
    dest_path=prepocessed_data_path ,
    dataset="green"
)

In [2]:
!ls /home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_preprocessed |wc -l

4


In [3]:
!ls /home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_preprocessed 

dv.pkl	test.pkl  train.pkl  val.pkl


## Q3
Value of the min_samples_split parameter is: 2

Solution:

In [56]:
import os
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error


def load_pickle(filename: str):
    """Read the file at `filename` in binary mode and return the unpickled object.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

def run_train(data_path: str):
    """Fit a random forest on the pickled training split and report RMSE.

    Loads ``train.pkl`` and ``val.pkl`` from `data_path` (each unpacked as a
    features/target pair), fits a ``RandomForestRegressor(max_depth=10,
    random_state=0)`` on the training pair and prints the RMSE on both splits.

    Returns:
        Tuple ``(rmse_train, rmse_val, rf)`` with the two RMSE values and the
        fitted regressor.
    """
    train_file = os.path.join(data_path, "train.pkl")
    val_file = os.path.join(data_path, "val.pkl")
    X_train, y_train = load_pickle(train_file)
    X_val, y_val = load_pickle(val_file)

    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)

    # Error on the data the model was fitted on.
    train_predictions = rf.predict(X_train)
    rmse_train = root_mean_squared_error(y_train, train_predictions)
    print (f'RMSE training: {rmse_train}')

    # Error on the held-out validation split.
    val_predictions = rf.predict(X_val)
    rmse_val = root_mean_squared_error(y_val, val_predictions)
    print (f'RMSE validation: {rmse_val}')

    return rmse_train, rmse_val, rf


In [None]:
import mlflow

# Track into the local SQLite backend and let autologging capture the
# estimator parameters and training metrics of the fitted model.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-exp-2")
mlflow.autolog()


raw_data_path = '/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_raw'
prepocessed_data_path = '/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_preprocessed'
# Save the preprocessor as an artifact in the artifacts folder
with open(os.path.join(prepocessed_data_path, "dv.pkl"), "rb") as f_in:
    dv = pickle.load(f_in)
artifacts_preprocessor_path = '/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/artifacts_preprocessor'
# The folder is not created anywhere else in the notebook; without this the
# open(..., 'wb') below raises FileNotFoundError on a fresh checkout.
os.makedirs(artifacts_preprocessor_path, exist_ok=True)
preprocessor_fpath = os.path.join(artifacts_preprocessor_path, 'preprocesssor_dv.b')
with open(preprocessor_fpath, 'wb') as f_out:
    pickle.dump(dv, f_out)

# The context manager ends the run when the block exits (also on error),
# so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():

    mlflow.set_tag("developer", "onur")
    mlflow.log_param("train_data_path", os.path.join(raw_data_path, "green_tripdata_2023-01.parquet"))
    mlflow.log_param("val_data_path", os.path.join(raw_data_path, "green_tripdata_2023-02.parquet"))
    mlflow.log_param("test_data_path", os.path.join(raw_data_path, "green_tripdata_2023-03.parquet"))

    rmse_train, rmse_val, rf = run_train(prepocessed_data_path)

    mlflow.log_metric("rmse_train", rmse_train)
    mlflow.log_metric("rmse_val", rmse_val)

    # save and log the model (actually this will result in a duplicate artifact, as the autologging is also saving the model)
    mlflow.sklearn.log_model(rf, artifact_path='models_mlflow')
    # log the preprocessor
    mlflow.log_artifact(preprocessor_fpath, artifact_path='preprocessor')

print('done')

2025/05/26 17:35:27 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-exp-2' does not exist. Creating a new experiment.


2025/05/26 17:35:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/26 17:35:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


RMSE training: 5.204138209108148
RMSE validation: 5.431162180141208




done


In [69]:
from mlflow.tracking import MlflowClient
# Client used to query experiments and runs.
# NOTE(review): presumably it uses the tracking URI configured by the earlier
# mlflow.set_tracking_uri call -- confirm.
mlflow_client = MlflowClient()
# List the experiments known to the tracking store.
mlflow_client.search_experiments()

[<Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/mlruns/2', creation_time=1748273727650, experiment_id='2', last_update_time=1748273727650, lifecycle_stage='active', name='nyc-taxi-exp-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1748264367241, experiment_id='0', last_update_time=1748264367241, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
# Resolve the experiment by name: its numeric id ('2' in the recorded output)
# depends on how many experiments were created before, so hard-coding it makes
# the cell fail or read the wrong experiment on another tracking store.
q3_experiment = mlflow_client.get_experiment_by_name("nyc-taxi-exp-2")
runs = mlflow_client.search_runs(experiment_ids=[q3_experiment.experiment_id])
# Take the first run returned and read the value autologging recorded for it.
run = mlflow_client.get_run(run_id=runs[0].info.run_id)
run.data.params['min_samples_split']

'2'

In [71]:
# Dump everything recorded for the run: first the parameters, then the metrics.
for heading, recorded in (('Run parameters:', run.data.params),
                          ('Run metrics:', run.data.metrics)):
    print(heading)
    for key, value in recorded.items():
        print(f'  {key}: {value}')

Run parameters:
  train_data_path: /home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_raw/green_tripdata_2023-01.parquet
  val_data_path: /home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_raw/green_tripdata_2023-02.parquet
  test_data_path: /home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/data_raw/green_tripdata_2023-03.parquet
  bootstrap: True
  ccp_alpha: 0.0
  criterion: squared_error
  max_depth: 10
  max_features: 1.0
  max_leaf_nodes: None
  max_samples: None
  min_impurity_decrease: 0.0
  min_samples_leaf: 1
  min_samples_split: 2
  min_weight_fraction_leaf: 0.0
  monotonic_cst: None
  n_estimators: 100
  n_jobs: None
  oob_score: False
  random_state: 0
  verbose: 0
  warm_start: False
Run metrics:
  training_mean_squared_error: 27.083054499499358
  training_mean_absolute_error: 3.4244701942312354
  training_r2_score: 0.6673983775155525
  training_root_mean_squared_error: 5.204138209108148
  training_score: 0.6673983

## Q4

To configure the server properly, we need to pass the `--default-artifact-root` parameter in addition to `--backend-store-uri`, as follows:

`mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts`


## Q5

The best validation RMSE I got is: 5.335

Solution:

In [76]:
import os
import pickle
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error


def load_pickle(filename: str):
    """Return the object stored in the pickle file `filename`.

    The file is opened in binary mode and closed again before returning.
    Unpickling runs arbitrary code, so only load files you trust.
    """
    with open(filename, "rb") as source:
        contents = pickle.load(source)
    return contents


def run_optimization(
        data_path:str,
        num_trials:int=15):
    """Tune a RandomForestRegressor with hyperopt, logging each trial to MLflow.

    Args:
        data_path: Folder containing ``train.pkl`` and ``val.pkl``; each file is
            unpacked as a features/target pair.
        num_trials: Number of hyperopt evaluations (``max_evals``).

    Each trial is recorded as its own MLflow run holding the sampled
    parameters and the validation RMSE under the metric name ``rmse``.
    The function returns nothing; results are read back from the tracking store.
    """

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        """Fit one forest with `params` and return its validation RMSE as the loss."""
        # The context manager ends the run on exit (also when fit/predict
        # raises), so no explicit mlflow.end_run() is needed afterwards.
        with mlflow.start_run():
            mlflow.log_params(params)
            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)

            mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    # quniform samples floats; scope.int casts them to the integers sklearn expects.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

In [77]:
# Switch from the local SQLite store to the tracking server on port 5000
# (presumably the one started with the command shown under Q4 -- confirm).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nyc-taxi-RF-hyperopt")

# Relies on `prepocessed_data_path` defined in an earlier cell; uses the
# default of 15 trials.
run_optimization(
    data_path=prepocessed_data_path
)


  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]

🏃 View run intrigued-sheep-845 at: http://127.0.0.1:5000/#/experiments/3/runs/22b143deda554c4aadcee4aa069afd66

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3

🏃 View run gifted-kit-810 at: http://127.0.0.1:5000/#/experiments/3/runs/37dfe3186419435e825c69f8e80d15dc

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

🏃 View run upbeat-hen-972 at: http://127.0.0.1:5000/#/experiments/3/runs/4a6da5823fff4be9803da2b14753ea19

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

🏃 View run luxuriant-shark-875 at: http://127.0.0.1:5000/#/experiments/3/runs/f2c3def4a9ad4e0bac2b93ffbc01db09

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

🏃 View run silent-owl-881 at: http://127.0.0.1:5000/#/experiments/3/runs/488442cbe3a0417d9e7ae5ce264e5177

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

🏃 View run unruly-pig-303 at: http://127.0.0.1:5000/#/experiments/3/runs/e

In [78]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
# Re-create the client after the tracking URI was switched to the HTTP server.
# NOTE(review): presumably a client keeps the URI it was constructed with -- confirm.
mlflow_client = MlflowClient()
# List the experiments known to the tracking store.
mlflow_client.search_experiments()

[<Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/artifacts/3', creation_time=1748280191640, experiment_id='3', last_update_time=1748280191640, lifecycle_stage='active', name='nyc-taxi-RF-hyperopt', tags={}>,
 <Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/mlruns/2', creation_time=1748273727650, experiment_id='2', last_update_time=1748273727650, lifecycle_stage='active', name='nyc-taxi-exp-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1748264367241, experiment_id='0', last_update_time=1748264367241, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
# Fetch the best hyperopt trials, lowest validation RMSE first.
# The experiment is resolved by name because its numeric id differs between
# tracking stores; max_results is 5 to match the five rows listed below and
# the top_n=5 used in Q6.
hpo_experiment = mlflow_client.get_experiment_by_name("nyc-taxi-RF-hyperopt")
runs = mlflow_client.search_runs(
    experiment_ids=[hpo_experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
    )

In [84]:
# Print the id and validation RMSE (3 decimals) of each run returned by the
# search above; the first line is the best trial.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.3f}")

run id: 260b82c646e14dd98efcd3482f7b8898, rmse: 5.335
run id: ef8950ee1c5d428ba83c6bfe5e137c80, rmse: 5.355
run id: 83e4f608920c474d9c1840f7401cebf7, rmse: 5.355
run id: f2c3def4a9ad4e0bac2b93ffbc01db09, rmse: 5.357
run id: 5e610742a707422cb2f53d342948fa6f, rmse: 5.363


## Q6

The test RMSE of the best model is: 5.567

Solution:

In [85]:
import os
import pickle

import click
import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error


# Experiment whose runs are searched for the best hyperparameter sets.
HPO_EXPERIMENT_NAME = "nyc-taxi-RF-hyperopt"
# Run parameters that are read back, cast to int and passed to RandomForestRegressor.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

def load_pickle(filename):
    """Unpickle and return the object stored at `filename`.

    Only load trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as handle:
        unpickled = pickle.load(handle)
    return unpickled


def train_and_log_model(data_path, params):
    """Retrain a random forest with `params` and log validation and test RMSE.

    Args:
        data_path: Folder containing ``train.pkl``, ``val.pkl`` and ``test.pkl``;
            each file is unpacked as a features/target pair.
        params: Mapping with at least the keys listed in ``RF_PARAMS``; values
            are converted with ``int()`` (the caller passes ``run.data.params``).

    Everything is recorded in a new MLflow run: the integer parameters plus
    the metrics ``val_rmse`` and ``test_rmse``. Nothing is returned.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        # Keep only the forest hyperparameters and cast them to int for sklearn.
        new_params = {name: int(params[name]) for name in RF_PARAMS}

        mlflow.log_params(new_params)
        rf = RandomForestRegressor(**new_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
        mlflow.log_metric("test_rmse", test_rmse)


def run_register_model(data_path: str, top_n: int):
    """Retrain the `top_n` best hyperopt trials and log their test performance.

    The trials are taken from the experiment named ``HPO_EXPERIMENT_NAME``,
    ordered by ascending ``rmse``. Each one is retrained via
    ``train_and_log_model`` with the parameters stored on the run. Despite
    its name, this function does not register anything in the model registry.
    """
    tracking_client = MlflowClient()

    # Locate the hyperopt experiment and pull its best active runs.
    hpo_experiment = tracking_client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    best_trials = tracking_client.search_runs(
        experiment_ids=hpo_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"],
    )

    for trial in best_trials:
        train_and_log_model(data_path=data_path, params=trial.data.params)

In [87]:
# Log the retrained models to a dedicated experiment on the tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
EXPERIMENT_NAME = "nyc-taxi-register-best-models"
mlflow.set_experiment(EXPERIMENT_NAME)
# NOTE(review): the registration cells below load "runs:/<run_id>/model";
# presumably that artifact is written by this autologging call -- confirm.
mlflow.sklearn.autolog()

# Relies on `prepocessed_data_path` defined in an earlier cell.
run_register_model(
    data_path=prepocessed_data_path,
    top_n=5
)

🏃 View run intrigued-tern-598 at: http://127.0.0.1:5000/#/experiments/4/runs/2e1d7f7fa84c4d93991acd5a372768ab
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4
🏃 View run nebulous-lynx-189 at: http://127.0.0.1:5000/#/experiments/4/runs/c75ea79a82a84710a126a2efac846736
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4
🏃 View run salty-fowl-284 at: http://127.0.0.1:5000/#/experiments/4/runs/0c85244770974913bdfb387760812f4d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4
🏃 View run placid-penguin-712 at: http://127.0.0.1:5000/#/experiments/4/runs/9f1e1fbc70ff476cb8e4e28762a799fb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4
🏃 View run enchanting-robin-165 at: http://127.0.0.1:5000/#/experiments/4/runs/1136644bb73e445f9382a8680441ef02
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


In [88]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
# Fresh client for the queries and registry calls in the cells below.
mlflow_client = MlflowClient()
# List the experiments known to the tracking store.
mlflow_client.search_experiments()

[<Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/artifacts/4', creation_time=1748283823272, experiment_id='4', last_update_time=1748283823272, lifecycle_stage='active', name='nyc-taxi-register-best-models', tags={}>,
 <Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/artifacts/3', creation_time=1748280191640, experiment_id='3', last_update_time=1748280191640, lifecycle_stage='active', name='nyc-taxi-RF-hyperopt', tags={}>,
 <Experiment: artifact_location='/home/onur/WORK/DS/repos/MLOps/nytaxi_mlops/02-experiment-tracking/mlruns/2', creation_time=1748273727650, experiment_id='2', last_update_time=1748273727650, lifecycle_stage='active', name='nyc-taxi-exp-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1748264367241, experiment_id='0', last_update_time=1748264367241, lifecycle_stage='active', name='Default', tags={}>]

In [89]:
# Resolve the experiment by the name set in the driver cell above instead of a numeric id.
experiment = mlflow_client.get_experiment_by_name(EXPERIMENT_NAME)

In [91]:
# Retrieve up to five active runs of the experiment, lowest test RMSE first,
# so that runs[0] is the best model and runs[1] the runner-up.
runs = mlflow_client.search_runs(
    experiment_ids=experiment.experiment_id,
    # filter_string='',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.test_rmse ASC"]
    )

In [None]:
# list the test-rmse's
for run in runs:
    print(f"run id: {run.info.run_id}, test-rmse: {run.data.metrics['test_rmse']:.3f}")

run id: 2e1d7f7fa84c4d93991acd5a372768ab, test-rmse: 5.567
run id: c75ea79a82a84710a126a2efac846736, test-rmse: 5.585
run id: 9f1e1fbc70ff476cb8e4e28762a799fb, test-rmse: 5.589
run id: 0c85244770974913bdfb387760812f4d, test-rmse: 5.592
run id: 1136644bb73e445f9382a8680441ef02, test-rmse: 5.594


In [136]:
# Register the best model (lowest test RMSE) and mark it as the approved champion.
model_name = "nyc-taxi-regressor-candidates"
best_run_id = runs[0].info.run_id
best_model_uri = f"runs:/{best_run_id}/model"
# Use the version number the registry actually assigned instead of assuming
# it is 1: on a re-run, or when the model name already exists, a higher
# version is created and a hard-coded 1 would alias/tag the wrong one.
champion_version = mlflow.register_model(
    model_uri=best_model_uri,
    name=model_name
).version
mlflow_client.set_registered_model_alias(
    model_name,
    "champion",
    champion_version)
mlflow_client.set_model_version_tag(
    model_name,
    champion_version,
    "validation_status",
    "approved")


Successfully registered model 'nyc-taxi-regressor-candidates'.
2025/05/27 11:02:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor-candidates, version 1
Created version '1' of model 'nyc-taxi-regressor-candidates'.


In [None]:
# Register the runner-up (second-lowest test RMSE) as the pending challenger.
# NOTE: best_run_id / best_model_uri are reused here and now point at runs[1].
best_run_id = runs[1].info.run_id
best_model_uri = f"runs:/{best_run_id}/model"
# Use the version number the registry actually assigned instead of assuming
# it is 2, so the alias and tag always land on the version created here.
challenger_version = mlflow.register_model(
    model_uri=best_model_uri,
    name=model_name
).version
mlflow_client.set_registered_model_alias(
    model_name,
    "challenger",
    challenger_version)
mlflow_client.set_model_version_tag(
    model_name,
    challenger_version,
    "validation_status",
    "pending")

Registered model 'nyc-taxi-regressor-candidates' already exists. Creating a new version of this model...
2025/05/27 11:02:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor-candidates, version 2
Created version '2' of model 'nyc-taxi-regressor-candidates'.


In [138]:
# Show every registered model together with its alias -> version mapping.
models = mlflow_client.search_registered_models()
for model in models:
    print(f"name: {model.name}, aliases: {model.aliases}")
    

name: nyc-taxi-regressor-candidates, aliases: {'challenger': '2', 'champion': '1'}


In [139]:
# Fetch the candidates model by name and display its alias -> version mapping.
model = mlflow_client.get_registered_model(model_name)
model.aliases

{'challenger': '2', 'champion': '1'}

In [None]:
# Stage the model: copy the current champion into the staging registered model.
model_name_stag = "nyc-taxi-regressor-stag"
staged_model_version = mlflow_client.copy_model_version(
    src_model_uri=f"models:/{model_name}@champion",
    dst_name=model_name_stag,
)
# Point the alias at the version the copy actually created rather than a
# hard-coded 1, which is only right the first time this cell is run.
mlflow_client.set_registered_model_alias(
    model_name_stag,
    "staging_champion",
    staged_model_version.version)

Successfully registered model 'nyc-taxi-regressor-stag'.
Copied version '1' of model 'nyc-taxi-regressor-candidates' to version '1' of model 'nyc-taxi-regressor-stag'.


<ModelVersion: aliases=[], creation_timestamp=1748336852967, current_stage='None', description='', last_updated_timestamp=1748336852967, name='nyc-taxi-regressor-stag', run_id='2e1d7f7fa84c4d93991acd5a372768ab', run_link='', source='models:/nyc-taxi-regressor-candidates/1', status='READY', status_message=None, tags={'validation_status': 'approved'}, user_id='', version='1'>

In [None]:
# Productionize the model: copy the staging champion into the production registered model.
model_name_prod = "nyc-taxi-regressor-prod"
prod_model_version = mlflow_client.copy_model_version(
    src_model_uri=f"models:/{model_name_stag}@staging_champion",
    dst_name=model_name_prod
)
# Point the alias at the version the copy actually created rather than a
# hard-coded 1, which is only right the first time this cell is run.
mlflow_client.set_registered_model_alias(
    model_name_prod,
    "theone",
    prod_model_version.version)

Successfully registered model 'nyc-taxi-regressor-prod'.
Copied version '1' of model 'nyc-taxi-regressor-stag' to version '1' of model 'nyc-taxi-regressor-prod'.


<ModelVersion: aliases=[], creation_timestamp=1748337501170, current_stage='None', description='', last_updated_timestamp=1748337501170, name='nyc-taxi-regressor-prod', run_id='2e1d7f7fa84c4d93991acd5a372768ab', run_link='', source='models:/nyc-taxi-regressor-stag/1', status='READY', status_message=None, tags={'validation_status': 'approved'}, user_id='', version='1'>