In [1]:
import mlflow
import os
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [2]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing of the raw data directory is stable across runs and platforms.
sorted(os.listdir("data/raw"))

['green_tripdata_2021-01.parquet',
 'green_tripdata_2021-02.parquet',
 'green_tripdata_2021-03.parquet']

# Q1. Install MLflow

In [3]:
# Recorded answer for Q1, kept as a hand-entered string (it is not queried
# from the installed package in this cell).
Q1 = "1.26.0"
version_banner = f"mlflow, version {Q1}"
print(version_banner)

mlflow, version 1.26.0


# Q2. Download and preprocess the data

In [11]:
# Shell escape: run preprocess_data.py, passing "data/raw" as the raw-data path
# and "data/preprocessed" as the destination path.
!python preprocess_data.py --raw_data_path "data/raw" --dest_path "data/preprocessed"

In [4]:
# List the output directory once so the printed names and the count come from
# the same snapshot; sort because os.listdir returns entries in arbitrary order.
preprocessed_files = sorted(os.listdir("data/preprocessed"))
print(preprocessed_files)

# Answer for Q2: number of entries found in data/preprocessed.
Q2 = len(preprocessed_files)

['dv.pkl', 'test.pkl', 'train.pkl', 'valid.pkl']


# Q3. Train a model with autolog

In [12]:
# Shell escape: run train.py against the preprocessed data directory. The
# captured output reports that MLflow autologging was enabled for sklearn.
!python train.py --data_path "data/preprocessed"

2022/05/26 13:19:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


![image.png](attachment:image.png)

In [5]:
# Recorded answer for Q3, entered by hand.
# NOTE(review): presumably counted from the run shown in the screenshot above - confirm.
Q3 = 17
autolog_label = "Autolog parameters"
print(autolog_label, Q3)

Autolog parameters 17


# Q4. Launch the tracking server locally

In [13]:
# Command line kept for reference only; it is not executed by this cell.
tracking_server_command = "mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root artifacts"

# Recorded answer for Q4: the option name taken from the command above.
Q4 = "default-artifact-root"
print("CLI argument for specifying artifacts folder: ", Q4)

CLI argument for specifying artifacts folder:  default-artifact-root


# Q5. Tune the hyperparameters of the model

In [48]:
# Shell escape: run hpo.py against the preprocessed data directory. The captured
# output is a progress bar over 50 trials that reports the best loss so far.
!python hpo.py --data_path "data/preprocessed"


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]
  2%|▏         | 1/50 [00:09<07:39,  9.37s/trial, best loss: 6.658956269343007]
  4%|▍         | 2/50 [00:09<03:20,  4.17s/trial, best loss: 6.658956269343007]
  6%|▌         | 3/50 [00:10<02:02,  2.62s/trial, best loss: 6.658956269343007]
  8%|▊         | 4/50 [00:17<03:11,  4.16s/trial, best loss: 6.651438559376775]
 10%|█         | 5/50 [00:20<02:49,  3.76s/trial, best loss: 6.651438559376775]
 12%|█▏        | 6/50 [00:30<04:21,  5.95s/trial, best loss: 6.651438559376775]
 14%|█▍        | 7/50 [00:40<05:11,  7.24s/trial, best loss: 6.651438559376775]
 16%|█▌        | 8/50 [00:41<03:42,  5.30s/trial, best loss: 6.651438559376775]
 18%|█▊        | 9/50 [00:47<03:50,  5.62s/trial, best loss: 6.651438559376775]
 20%|██        | 10/50 [00:52<03:38,  5.46s/trial, best loss: 6.651438559376775]
 22%|██▏       | 11/50 [00:56<03:14,  5.00s/trial, best loss: 6.642137287429206]
 24%|██▍       | 12/50 [01:00<02:53,  4.57s/trial, best loss: 

![image.png](attachment:image.png)

In [37]:
# Experiment names used by this notebook: the first is searched below for the
# best validation score, the second is set as the active experiment.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"

# Point MLflow at the locally running tracking server before any client call.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

# Create the client BEFORE its first use. Previously it was instantiated after
# get_experiment_by_name was called on it, which raises NameError on a fresh
# kernel (Restart & Run All).
client = MlflowClient()
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)

# Fetch only the single active run with the lowest Validation_RMSE.
# experiment_ids is passed as a list, which is the documented argument type.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.Validation_RMSE ASC"]
)

# Answer for Q5: the Validation_RMSE metric of that best run.
Q5 = runs[0].data.metrics["Validation_RMSE"]
print("Minimal Validation loss is", Q5)

Minimal Validation loss is 6.6284257482044735


# Q6. Promote the best model to the model registry

In [49]:
# Shell escape: run register_model.py against the preprocessed data directory.
# The captured output shows a new version of the registered model
# 'random-forest-reg' being created.
!python register_model.py --data_path "data/preprocessed"

Registered model 'random-forest-reg' already exists. Creating a new version of this model...
2022/05/26 16:31:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-reg, version 2
Created version '2' of model 'random-forest-reg'.


In [54]:
# Look up the experiment named by EXPERIMENT_NAME (relies on `client` and
# EXPERIMENT_NAME from the earlier cell).
last_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

# Ask the tracking server for just the top active run, ordered by ascending
# Test_RMSE, then take that single result.
top_runs_by_test_rmse = client.search_runs(
    experiment_ids=last_experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.Test_RMSE ASC"],
)
best_run = top_runs_by_test_rmse[0]

# Answer for Q6: the Test_RMSE metric of that run.
Q6 = best_run.data.metrics["Test_RMSE"]
print("Test RMSE", Q6)

Test RMSE 6.547886185595423


In [55]:
# Summary of every recorded answer, printed one per line as "<label> <value>".
print("Results:")
recorded_answers = (
    ("Q1", Q1),
    ("Q2", Q2),
    ("Q3", Q3),
    ("Q4", Q4),
    ("Q5", Q5),
    ("Q6", Q6),
)
for question_label, answer in recorded_answers:
    print(question_label, answer)

Results:
Q1 1.26.0
Q2 4
Q3 17
Q4 default-artifact-root
Q5 6.6284257482044735
Q6 6.547886185595423
