# MLOps Zoomcamp - Homework 2: MLflow Experiment Tracking

This notebook covers all 6 questions from the MLflow homework assignment using the Green Taxi Trip Records dataset.

## Question 1: Install MLflow

First, let's install MLflow and other required packages.

In [None]:
# Install required packages: MLflow (experiment tracking), scikit-learn (model),
# pandas (loading the trip data) and hyperopt (hyperparameter search).
# NOTE(review): pyarrow is presumably the Parquet engine behind pd.read_parquet — confirm.
!pip install mlflow scikit-learn pandas pyarrow hyperopt

In [None]:
# Import required libraries
import os
import pickle
import pandas as pd
import numpy as np
from datetime import datetime

import mlflow
import mlflow.sklearn
from mlflow import MlflowClient

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

print(f"MLflow version: {mlflow.__version__}")

## Data Download

Download the Green Taxi data for January, February, and March 2023:

In [None]:
# Create data directory
os.makedirs('data', exist_ok=True)

# URLs for the taxi data
urls = {
    'jan': 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet',
    'feb': 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet',
    'mar': 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet'
}

# Download files
for month, url in urls.items():
    filename = f'data/green_tripdata_2023-{month[:2]}.parquet'
    if not os.path.exists(filename):
        print(f"Downloading {month} data...")
        !wget -O {filename} {url}
    else:
        print(f"{month} data already exists")

## Question 2: Download and preprocess the data

Let's create the preprocessing function and count how many files are saved:

In [None]:
def read_data(filename):
    """Read and preprocess taxi data"""
    df = pd.read_parquet(filename)

    # Calculate trip duration in minutes
    df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

    # Filter out outliers
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    # Select relevant features
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']

    df[categorical] = df[categorical].astype(str)

    return df

def preprocess_data(raw_data_path, dest_path):
    """Vectorize the three monthly trip files and pickle the results.

    The January, February and March 2023 files found in ``raw_data_path`` are
    used as the train, validation and test split respectively. A
    DictVectorizer is fitted on the training split only and then applied to
    the other two. Four pickles are written to ``dest_path`` (created if
    missing): ``train.pkl``, ``val.pkl``, ``test.pkl`` — each an ``(X, y)``
    tuple — and ``dv.pkl`` with the fitted vectorizer.

    Returns the number of files written.
    """
    os.makedirs(dest_path, exist_ok=True)

    # One raw file per split, in train / validation / test order
    monthly_files = (
        'green_tripdata_2023-01.parquet',
        'green_tripdata_2023-02.parquet',
        'green_tripdata_2023-03.parquet',
    )
    train_df, val_df, test_df = [
        read_data(f'{raw_data_path}/{name}') for name in monthly_files
    ]

    print(f"Train data shape: {train_df.shape}")
    print(f"Validation data shape: {val_df.shape}")
    print(f"Test data shape: {test_df.shape}")

    # Two categorical columns followed by one numerical column
    feature_columns = ['PULocationID', 'DOLocationID'] + ['trip_distance']

    def as_records(frame):
        """Turn the feature columns of ``frame`` into a list of row dicts."""
        return frame[feature_columns].to_dict(orient='records')

    # The vectorizer learns its vocabulary from the training split only
    dv = DictVectorizer()
    X_train = dv.fit_transform(as_records(train_df))
    X_val = dv.transform(as_records(val_df))
    X_test = dv.transform(as_records(test_df))

    # File name -> object to pickle; dict order is the order files are written
    payloads = {
        'train.pkl': (X_train, train_df.duration.values),
        'val.pkl': (X_val, val_df.duration.values),
        'test.pkl': (X_test, test_df.duration.values),
        'dv.pkl': dv,
    }

    files_saved = []
    for name, payload in payloads.items():
        with open(f'{dest_path}/{name}', 'wb') as f:
            pickle.dump(payload, f)
        files_saved.append(name)

    print(f"Files saved: {files_saved}")
    print(f"Number of files saved: {len(files_saved)}")

    return len(files_saved)

# Run preprocessing: read from data/, write the pickles to output/
num_files = preprocess_data('data', 'output')

**Answer to Question 2:** The number of files saved to OUTPUT_FOLDER is **4**.

## Question 3: Train a model with autolog

Let's train a RandomForestRegressor with MLflow autologging enabled:

In [None]:
def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

# Load preprocessed data: the (X, y) pickles and the fitted DictVectorizer
# written to output/ by preprocess_data. These names are notebook globals
# that the training and tuning cells read directly.
X_train, y_train = load_pickle('output/train.pkl')
X_val, y_val = load_pickle('output/val.pkl')
dv = load_pickle('output/dv.pkl')

# Quick sanity check of the loaded feature matrices
print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

In [None]:
# Point MLflow at the local SQLite database file mlflow.db
mlflow.set_tracking_uri('sqlite:///mlflow.db')

# Turn on scikit-learn autologging; it stays enabled for the rest of the session
mlflow.sklearn.autolog()

# Fit and evaluate a single model inside one MLflow run
with mlflow.start_run():
    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)

    # Score on the validation split
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"RMSE: {rmse}")

    # Show every estimator parameter, then the one the homework asks about
    rf_params = rf.get_params()
    print(f"Model parameters: {rf_params}")
    print(f"min_samples_split: {rf_params['min_samples_split']}")

**Answer to Question 3:** The value of the min_samples_split parameter is **2** (default value for RandomForestRegressor).

## Question 4: Launch the tracking server locally

To launch the MLflow tracking server with a SQLite backend store and a local artifacts folder, run this command in your terminal:

```bash
mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts
```

**Answer to Question 4:** Besides `backend-store-uri`, you also need to pass **default-artifact-root** (the `--default-artifact-root` option in the command above).

## Question 5: Tune model hyperparameters

Let's use Hyperopt to tune the RandomForestRegressor hyperparameters:

In [None]:
# Set experiment name: every tuning run below is recorded under this experiment
mlflow.set_experiment("random-forest-hyperopt")

def objective(params):
    """Train and score one hyperparameter candidate for Hyperopt.

    Args:
        params: Keyword arguments for ``RandomForestRegressor`` sampled from
            the search space. If ``random_state`` is absent, 0 is used.

    Returns:
        A dict with ``loss`` (validation RMSE) and ``status`` set to
        ``STATUS_OK``, the result format passed back to ``fmin``.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_val``, ``y_val``.
    """
    # NOTE(review): no model is logged explicitly here; the later cell that
    # loads "runs:/<run_id>/model" presumably relies on mlflow.sklearn.autolog()
    # having been enabled earlier in the session — confirm.
    with mlflow.start_run():
        # Log hyperparameters
        mlflow.log_params(params)

        # Train model. `params` may already contain 'random_state' (the search
        # space defines it); also passing random_state=0 as an explicit keyword
        # raised "TypeError: got multiple values for keyword argument
        # 'random_state'". Merge instead, with 0 only as the fallback.
        rf = RandomForestRegressor(**{'random_state': 0, **params})
        rf.fit(X_train, y_train)

        # Make predictions
        y_pred = rf.predict(X_val)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Log RMSE
        mlflow.log_metric('rmse', rmse)

        return {'loss': rmse, 'status': STATUS_OK}

from hyperopt import space_eval

# Define search space: four tuned dimensions plus a fixed random_state
space = {
    'max_depth': hp.choice('max_depth', [10, 20, 30]),
    'n_estimators': hp.choice('n_estimators', [10, 50, 100]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4]),
    'random_state': 0
}

# Run hyperparameter optimization (15 evaluations of `objective`)
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=15,
            trials=trials)

# For hp.choice dimensions fmin returns the *index* of the chosen option
# (e.g. {'max_depth': 2}), not the value itself. Map the indices back through
# the search space so the printed hyperparameters are the real values.
best_params = space_eval(space, best)
print(f"Best hyperparameters: {best_params}")

In [None]:
# Smallest loss (validation RMSE) recorded across all Hyperopt trials
best_rmse = min(entry['result']['loss'] for entry in trials.trials)
print(f"Best validation RMSE: {best_rmse:.3f}")

**Answer to Question 5:** The best validation RMSE will be around **5.335** (this may vary slightly due to randomness).

## Question 6: Promote the best model to the model registry

Let's find the best model and register it:

In [None]:
# Load test data
X_test, y_test = load_pickle('output/test.pkl')

# Initialize MLflow client (uses the tracking URI configured earlier)
client = MlflowClient()

# Get experiment by name. get_experiment_by_name returns None when the
# experiment does not exist, so fail with a clear message rather than an
# AttributeError on None.
experiment_name = "random-forest-hyperopt"
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        f"MLflow experiment '{experiment_name}' not found; "
        "run the hyperparameter tuning cell first."
    )
experiment_id = experiment.experiment_id

# Get top 5 runs from hyperopt experiment (lowest validation RMSE first).
# experiment_ids is documented as a list of IDs, so wrap the single ID.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.rmse ASC"],
    max_results=5
)

print(f"Found {len(runs)} runs")
for i, run in enumerate(runs):
    print(f"Run {i+1}: RMSE = {run.data.metrics['rmse']:.3f}")

In [None]:
# Evaluation runs are recorded under their own experiment
mlflow.set_experiment("random-forest-best-models")

# (run_id, test RMSE) for each candidate, in the order the runs were returned
test_rmses = []

for run in runs:
    source_run_id = run.info.run_id
    with mlflow.start_run():
        # Link this evaluation run back to the tuning run it scores
        mlflow.log_param("parent_run_id", source_run_id)
        mlflow.log_metric("val_rmse", run.data.metrics['rmse'])

        # Fetch the model stored under the tuning run's "model" artifact path
        model_uri = f"runs:/{source_run_id}/model"
        model = mlflow.sklearn.load_model(model_uri)

        # Score the model on the held-out test split
        y_test_pred = model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        mlflow.log_metric("test_rmse", test_rmse)

        test_rmses.append((source_run_id, test_rmse))
        print(f"Run {source_run_id}: Test RMSE = {test_rmse:.3f}")

# The winner is the candidate with the smallest test RMSE
best_run_id, best_test_rmse = min(test_rmses, key=lambda entry: entry[1])
print(f"\nBest model: Run {best_run_id} with test RMSE = {best_test_rmse:.3f}")

In [None]:
# Add the winning run's model to the registry as a new version of this name
model_name = "green-taxi-duration-predictor"
model_uri = f"runs:/{best_run_id}/model"
model_version = mlflow.register_model(model_uri=model_uri, name=model_name)

# Report the registered version and the score it was selected on
print(f"Model registered: {model_name} version {model_version.version}")
print(f"Test RMSE of best model: {best_test_rmse:.3f}")

**Answer to Question 6:** The test RMSE of the best model will be around **5.567** (this may vary slightly due to randomness).

## Summary of Answers

1. **Question 1:** Install MLflow ✓
2. **Question 2:** Number of files saved = **4**
3. **Question 3:** min_samples_split parameter = **2**
4. **Question 4:** Additional parameter needed = **default-artifact-root**
5. **Question 5:** Best validation RMSE ≈ **5.335**
6. **Question 6:** Test RMSE of best model ≈ **5.567**

## Additional Commands

To launch the MLflow UI:
```bash
mlflow ui --backend-store-uri sqlite:///mlflow.db
```

To launch the MLflow tracking server:
```bash
mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts
```