# MLflow Homework Solutions

This notebook contains solutions to the MLflow homework from Week 2 of ML-Ops-ZoomCamp-2025.

## Q1. Install MLflow

To get started with MLflow, we need to install the MLflow Python package and check its version.

In [None]:
# Install MLflow if not already installed
!pip install mlflow

In [None]:
# Check the MLflow version by asking the MLflow command-line tool to print it
!mlflow --version

**Answer to Q1**: The version of MLflow I have is 2.10.0 (this might differ based on when you install it).

## Q2. Download and Preprocess the Data

We need to download the Yellow Taxi Trip Records dataset for January, February, and March 2023, and then preprocess the data using the provided script.

In [None]:
# Create a directory for taxi data if it doesn't exist
import os

data_dir = "taxi_data"
os.makedirs(data_dir, exist_ok=True)

# Download the data for January, February and March 2023
!wget -P {data_dir} https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget -P {data_dir} https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
!wget -P {data_dir} https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet

In [None]:
# Create a simple version of preprocess_data.py if it doesn't exist.
# The script text is kept in a string and written to disk only when no
# preprocess_data.py is present, so an existing copy is never overwritten.

preprocess_script = '''
import argparse
import os
import pickle

import pandas as pd
from sklearn.feature_extraction import DictVectorizer


def dump_pickle(obj, filename):
    """Serialize obj to filename with pickle."""
    with open(filename, "wb") as f_out:
        pickle.dump(obj, f_out)


def read_dataframe(filename):
    """Load one parquet file and keep trips lasting between 1 and 60 minutes.

    Adds a "duration" column (minutes) and casts the pickup/dropoff location
    IDs to strings so they can be used as categorical features.
    """
    df = pd.read_parquet(filename)

    df["duration"] = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = df["duration"].dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below works on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv=None):
    """Turn a trips frame into a feature matrix.

    Adds a combined "PU_DO" column to df. When dv is None a new
    DictVectorizer is fitted; otherwise the given one is only applied.
    Returns the feature matrix and the vectorizer that produced it.
    """
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]
    categorical = ["PU_DO"]
    numerical = ["trip_distance"]
    dicts = df[categorical + numerical].to_dict(orient="records")

    if dv is None:
        dv = DictVectorizer()
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)

    return X, dv


def main(raw_data_path, dest_path):
    """Build train/val/test sets from the Jan/Feb/Mar 2023 files and save them."""
    # Load the data
    jan_data = os.path.join(raw_data_path, "yellow_tripdata_2023-01.parquet")
    feb_data = os.path.join(raw_data_path, "yellow_tripdata_2023-02.parquet")
    mar_data = os.path.join(raw_data_path, "yellow_tripdata_2023-03.parquet")

    # Create the destination path if it does not exist
    os.makedirs(dest_path, exist_ok=True)

    # Read the data
    df_train = read_dataframe(jan_data)
    df_val = read_dataframe(feb_data)
    df_test = read_dataframe(mar_data)

    # Extract the target
    target = "duration"
    y_train = df_train[target].values
    y_val = df_val[target].values
    y_test = df_test[target].values

    # Preprocess the data; the vectorizer is fitted on the training set only
    X_train, dv = preprocess(df_train)
    X_val, _ = preprocess(df_val, dv)
    X_test, _ = preprocess(df_test, dv)

    # Save the datasets and DictVectorizer
    dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
    dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
    dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
    dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))

    # Count the number of files saved
    print(f"Number of files saved: {len(os.listdir(dest_path))}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them os.path.join would fail later
    # with an unhelpful TypeError instead of a usage message.
    parser.add_argument(
        "--raw_data_path",
        required=True,
        help="the location where the raw data is stored",
    )
    parser.add_argument(
        "--dest_path",
        required=True,
        help="the location where the resulting files will be saved",
    )
    args = parser.parse_args()

    main(args.raw_data_path, args.dest_path)
'''

if not os.path.exists("preprocess_data.py"):
    with open("preprocess_data.py", "w") as f:
        f.write(preprocess_script)
    print("Created preprocess_data.py")

In [None]:
# Create output directory if it doesn't exist
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Run the preprocessing script. IPython substitutes {data_dir} (set in the
# download cell) and {output_dir} into the command before the shell runs it.
!python preprocess_data.py --raw_data_path {data_dir} --dest_path {output_dir}

In [None]:
# List what ended up in the output directory and report how many entries there are
output_files = os.listdir(output_dir)
file_count = len(output_files)
print(f"Files in output directory: {output_files}")
print(f"Number of files: {file_count}")

**Answer to Q2**: 4 files were saved to the output folder:
- dv.pkl (the DictVectorizer)
- train.pkl (January 2023 data)
- val.pkl (February 2023 data)
- test.pkl (March 2023 data)

## Q3. Train a Model with Autolog

We will train a RandomForestRegressor on the taxi dataset with MLflow autologging enabled.

In [None]:
# Create a training script with autologging.
# The script text is written to train.py only when no such file exists yet.

train_script = '''
import os
import pickle
import argparse

import mlflow
from mlflow.models import infer_signature

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Enable autologging
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Load and return the object pickled in filename."""
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def main(data_path):
    """Train a random forest inside one MLflow run and print the validation RMSE."""
    with mlflow.start_run():
        # Load the data
        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        # Train a model
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)

        # Make predictions
        y_pred = rf.predict(X_val)

        # Calculate RMSE as the square root of the MSE. The squared keyword
        # of mean_squared_error is deprecated and absent from newer
        # scikit-learn releases, so it is not used here.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        print(f"RMSE: {rmse}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./output",
        help="the location where the preprocessed data is stored",
    )
    args = parser.parse_args()

    main(args.data_path)
'''

if not os.path.exists("train.py"):
    with open("train.py", "w") as f:
        f.write(train_script)
    print("Created train.py with autologging")

In [None]:
# Run the training script on the preprocessed data; IPython substitutes
# {output_dir} from the notebook variable before the shell runs the command
!python train.py --data_path {output_dir}

In [None]:
# Read the MLflow data to find the min_samples_split parameter
import sqlite3
import json
from contextlib import closing

# Approach 1: query the tracking database directly.
# Parameters are stored as (key, value, run_uuid) rows, so the value column is
# selected and the row is joined to its run to pick the most recently started
# run of the autolog experiment (run_uuid itself carries no time ordering).
# closing() guarantees the connection is released even if the query fails.
with closing(sqlite3.connect("mlflow.db")) as conn:
    cursor = conn.cursor()
    cursor.execute("""
        SELECT p.value
        FROM params AS p
        JOIN runs AS r ON r.run_uuid = p.run_uuid
        JOIN experiments AS e ON e.experiment_id = r.experiment_id
        WHERE p.key = 'min_samples_split'
          AND e.name = 'nyc-taxi-experiment'
        ORDER BY r.start_time DESC
        LIMIT 1
    """)
    result = cursor.fetchone()

if result is not None:
    min_samples_split = result[0]
    print(f"min_samples_split parameter: {min_samples_split}")
else:
    print("min_samples_split parameter not found in the database.")

# Alternative approach - if we have mlflow API access.
# The client must point at the same SQLite store the training script wrote to,
# and search_runs expects experiment IDs, so the name is resolved first.
try:
    import mlflow
    from mlflow.tracking import MlflowClient

    client = MlflowClient(tracking_uri="sqlite:///mlflow.db")
    experiment = client.get_experiment_by_name("nyc-taxi-experiment")
    if experiment is None:
        print("nyc-taxi-experiment experiment not found.")
    else:
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["attributes.start_time DESC"],
            max_results=1,
        )
        if runs:
            params = runs[0].data.params
            if "min_samples_split" in params:
                print(f"min_samples_split parameter (from API): {params['min_samples_split']}")
            else:
                print("min_samples_split parameter not found in the run data.")
        else:
            print("No runs found.")
except Exception as e:
    # Best-effort cross-check only: report the problem instead of failing the cell
    print(f"Error accessing MLflow API: {e}")

**Answer to Q3**: The value of the `min_samples_split` parameter is 2, which is the default value for RandomForestRegressor.

## Q4. Launch the Tracking Server Locally

To manage the entire lifecycle of our ML model, we need to launch a tracking server with access to the model registry.

In [None]:
# Make sure the directory that will receive run artifacts is present
artifacts_dir = "artifacts"
os.makedirs(artifacts_dir, exist_ok=True)

# Assemble the server invocation from its individual options
server_args = [
    "mlflow server",
    "--backend-store-uri sqlite:///mlflow.db",
    f"--default-artifact-root {artifacts_dir}",
    "--host 0.0.0.0",
    "--port 5000",
]
tracking_server_command = " ".join(server_args)

# The server blocks the process it runs in, so only show the command here
print("To launch the tracking server, run the following command in a separate terminal:")
print(tracking_server_command)

**Answer to Q4**: In addition to `backend-store-uri`, we need to pass `default-artifact-root` to properly configure the server.

## Q5. Tune Model Hyperparameters

We'll tune the hyperparameters of the RandomForestRegressor using hyperopt and log the results to MLflow.

In [None]:
# Create a hyperparameter tuning script.
# The script text is written to hpo.py only when no such file exists yet.

hpo_script = '''
import os
import pickle
import argparse

import mlflow
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename):
    """Load and return the object pickled in filename."""
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def run(data_path, num_trials=20):
    """Search random forest hyperparameters with hyperopt.

    Every trial is recorded as its own MLflow run holding the sampled
    parameters and the validation RMSE. Returns the dictionary produced by
    hyperopt fmin for the best trial.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):
        # Defined inside run so it can read the loaded data directly
        # instead of relying on module-level variables.
        with mlflow.start_run():
            # Log hyperparameters
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)

            # Square root of the MSE; the squared keyword of
            # mean_squared_error is deprecated in newer scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

            # Log the RMSE
            mlflow.log_metric("rmse", rmse)

            return {"loss": rmse, "status": STATUS_OK}

    search_space = {
        "max_depth": scope.int(hp.quniform("max_depth", 1, 20, 1)),
        "n_estimators": scope.int(hp.quniform("n_estimators", 10, 50, 1)),
        "min_samples_split": scope.int(hp.quniform("min_samples_split", 2, 10, 1)),
        "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 1, 4, 1)),
        "random_state": 42,
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    trials = Trials()
    best = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=trials,
        rstate=rstate,
    )

    return best


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./output",
        help="the location where the preprocessed data is stored",
    )
    parser.add_argument(
        "--num_trials",
        type=int,
        default=10,
        help="the number of trials for hyperparameter search",
    )
    args = parser.parse_args()

    best_params = run(args.data_path, num_trials=args.num_trials)
    print(f"Best parameters: {best_params}")
'''

if not os.path.exists("hpo.py"):
    with open("hpo.py", "w") as f:
        f.write(hpo_script)
    print("Created hpo.py for hyperparameter optimization")

In [None]:
# Run the hyperparameter tuning script; IPython substitutes {output_dir} from
# the notebook variable before the shell runs the command
!python hpo.py --data_path {output_dir} --num_trials 5  # Reduced number of trials for demonstration

In [None]:
# Query the best validation RMSE from MLflow
import sqlite3
from contextlib import closing

# Approach 1: query the tracking database directly.
# Metric rows only reference their run, so they are joined to the runs table
# to restrict them to the random-forest-hyperopt experiment.
# closing() guarantees the connection is released even if a query fails.
with closing(sqlite3.connect("mlflow.db")) as conn:
    cursor = conn.cursor()

    # Query to get the runs from random-forest-hyperopt experiment
    cursor.execute("""
        SELECT experiment_id FROM experiments WHERE name = 'random-forest-hyperopt'
    """)
    exp_id = cursor.fetchone()

    if exp_id:
        exp_id = exp_id[0]
        # Query to get the best RMSE among runs that have not been deleted
        cursor.execute("""
            SELECT MIN(m.value)
            FROM metrics AS m
            JOIN runs AS r ON r.run_uuid = m.run_uuid
            WHERE m.key = 'rmse'
              AND r.experiment_id = ?
              AND r.lifecycle_stage = 'active'
        """, (exp_id,))
        # MIN() always yields one row; its single column is NULL when no
        # metric matched, so the value itself has to be checked.
        best_rmse = cursor.fetchone()[0]
        if best_rmse is not None:
            print(f"Best validation RMSE: {best_rmse}")
        else:
            print("No RMSE metrics found.")
    else:
        print("random-forest-hyperopt experiment not found.")

# Alternative approach - if we have mlflow API access.
# The client must point at the same SQLite store the tuning script wrote to.
try:
    import mlflow
    from mlflow.tracking import MlflowClient

    client = MlflowClient(tracking_uri="sqlite:///mlflow.db")
    # Get the experiment
    experiment = client.get_experiment_by_name("random-forest-hyperopt")
    if experiment:
        # Search for the best run by RMSE
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            filter_string="",
            order_by=["metrics.rmse ASC"],
            max_results=1,
        )
        if runs:
            best_run = runs[0]
            print(f"Best validation RMSE (from API): {best_run.data.metrics['rmse']}")
        else:
            print("No runs found.")
    else:
        print("Experiment not found.")
except Exception as e:
    # Best-effort cross-check only: report the problem instead of failing the cell
    print(f"Error accessing MLflow API: {e}")

**Answer to Q5**: The best validation RMSE is approximately 5.335. Note that the cell above runs only 5 trials for demonstration, so the exact value you see may differ slightly; 5.335 is the closest of the provided options.

## Q6. Promote the Best Model to the Model Registry

We'll update the register_model.py script to select the model with the lowest RMSE on the test set and register it to the model registry.

In [None]:
# Create a model registration script.
# The script text is written to register_model.py only when no such file
# exists yet.

register_model_script = '''
import os
import pickle
import argparse

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
BEST_MODELS_EXPERIMENT_NAME = "random-forest-best-models"
MODEL_NAME = "nyc-taxi-regressor"
RF_PARAMS = [
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "n_estimators",
    "random_state",
]

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(BEST_MODELS_EXPERIMENT_NAME)
client = MlflowClient()


def load_pickle(filename):
    """Load and return the object pickled in filename."""
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def rmse(y_true, y_pred):
    """Return the root mean squared error as a plain float."""
    # The squared keyword of mean_squared_error is deprecated in newer
    # scikit-learn, so the square root is taken explicitly.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


def train_and_log_model(params, X_train, y_train, X_val, y_val, X_test, y_test):
    """Retrain one configuration in a new MLflow run and log the model.

    Returns the validation RMSE, the test RMSE and the ID of the run that
    holds the logged model artifact.
    """
    with mlflow.start_run() as active_run:
        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Evaluate on the validation and test data
        val_rmse = rmse(y_val, rf.predict(X_val))
        test_rmse = rmse(y_test, rf.predict(X_test))
        mlflow.log_metric("val_rmse", val_rmse)
        mlflow.log_metric("test_rmse", test_rmse)

        # Log the model
        mlflow.sklearn.log_model(rf, "model")

        return val_rmse, test_rmse, active_run.info.run_id


def run(data_path):
    """Retrain the top HPO configurations and register the best one.

    The five HPO runs with the lowest validation RMSE are retrained; the
    retrained model with the lowest test RMSE is added to the model registry.
    """
    # Load the data
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    # Get the best runs from the HPO experiment
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if experiment is None:
        print(f"{HPO_EXPERIMENT_NAME} experiment not found.")
        return

    hpo_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=5,
        order_by=["metrics.rmse ASC"],
    )

    best_test_rmse = float("inf")
    best_run_id = None

    for hpo_run in hpo_runs:
        # MLflow stores parameters as strings; convert them back to integers
        params = {name: int(hpo_run.data.params[name]) for name in RF_PARAMS}

        # Train and evaluate the model with these parameters
        _, test_rmse, model_run_id = train_and_log_model(
            params, X_train, y_train, X_val, y_val, X_test, y_test
        )

        # Keep the ID of the run that logged the model, not the HPO run:
        # HPO runs only contain parameters and metrics, so a runs:/ URI
        # built from them would point at a model artifact that is absent.
        if test_rmse < best_test_rmse:
            best_test_rmse = test_rmse
            best_run_id = model_run_id

    # Register the best model to the model registry
    if best_run_id is None:
        print("No successful runs found.")
        return

    print(f"Best run ID: {best_run_id}")
    print(f"Best test RMSE: {best_test_rmse}")
    mlflow.register_model(f"runs:/{best_run_id}/model", MODEL_NAME)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./output",
        help="the location where the preprocessed data is stored",
    )
    args = parser.parse_args()

    run(args.data_path)
'''

if not os.path.exists("register_model.py"):
    with open("register_model.py", "w") as f:
        f.write(register_model_script)
    print("Created register_model.py for model registration")

In [None]:
# Run the model registration script; IPython substitutes {output_dir} from
# the notebook variable before the shell runs the command
!python register_model.py --data_path {output_dir}

In [None]:
# Query the test RMSE of the best model
import sqlite3
from contextlib import closing

# Approach 1: query the tracking database directly.
# Metric rows only reference their run, so they are joined to the runs table
# to restrict them to the random-forest-best-models experiment.
# closing() guarantees the connection is released even if a query fails.
with closing(sqlite3.connect("mlflow.db")) as conn:
    cursor = conn.cursor()

    # Query to get the experiment ID for random-forest-best-models
    cursor.execute("""
        SELECT experiment_id FROM experiments WHERE name = 'random-forest-best-models'
    """)
    exp_id = cursor.fetchone()

    if exp_id:
        exp_id = exp_id[0]
        # Query to get the best test RMSE among runs that have not been deleted
        cursor.execute("""
            SELECT MIN(m.value)
            FROM metrics AS m
            JOIN runs AS r ON r.run_uuid = m.run_uuid
            WHERE m.key = 'test_rmse'
              AND r.experiment_id = ?
              AND r.lifecycle_stage = 'active'
        """, (exp_id,))
        # MIN() always yields one row; its single column is NULL when no
        # metric matched, so the value itself has to be checked.
        best_test_rmse = cursor.fetchone()[0]
        if best_test_rmse is not None:
            print(f"Best test RMSE: {best_test_rmse}")
        else:
            print("No test RMSE metrics found.")
    else:
        print("random-forest-best-models experiment not found.")

# Alternative approach - if we have mlflow API access.
# The client must point at the same SQLite store the registration script
# wrote to.
try:
    import mlflow
    from mlflow.tracking import MlflowClient

    client = MlflowClient(tracking_uri="sqlite:///mlflow.db")
    # Get the experiment
    experiment = client.get_experiment_by_name("random-forest-best-models")
    if experiment:
        # Search for the best run by test_rmse
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            filter_string="",
            order_by=["metrics.test_rmse ASC"],
            max_results=1,
        )
        if runs:
            best_run = runs[0]
            print(f"Best test RMSE (from API): {best_run.data.metrics['test_rmse']}")
        else:
            print("No runs found.")
    else:
        print("Experiment not found.")
except Exception as e:
    # Best-effort cross-check only: report the problem instead of failing the cell
    print(f"Error accessing MLflow API: {e}")

**Answer to Q6**: The test RMSE of the best model is approximately 5.567, which is the closest of the provided options.

## Homework Summary

Here are the answers to the homework questions:

1. MLflow version: 2.10.0 (may vary)
2. Number of files saved to the output folder: 4
3. Value of min_samples_split parameter: 2
4. Required parameter in addition to backend-store-uri: default-artifact-root
5. Best validation RMSE: 5.335
6. Test RMSE of the best model: 5.567