**Homework 3: Training Pipelines for MLOps Zoomcamp 2025**

**1 & 2. Tool you use and Version**

In [1]:
!prefect version

Version:             3.4.4
API version:         0.8.4
Python version:      3.10.16
Git commit:          0367d7aa
Built:               Thu, May 29, 2025 09:37 PM
OS/Arch:             linux/x86_64
Profile:             ephemeral
Server type:         ephemeral
Pydantic version:    2.11.5
Server:
  Database:          sqlite
  SQLite version:    3.45.3


**Import the packages and set config**

In [2]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os

**3: Load March 2023 Yellow Taxi dataset**

In [3]:
# Load the parquet file
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
df = pd.read_parquet(url)

print(f"✅ Loaded {len(df):,} records")

✅ Loaded 3,403,766 records


**4:Data preparation function**

In [4]:
def prepare_data(df):
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df['duration'].dt.total_seconds() / 60
    
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()  # 👈 Add .copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# 👇 Assign df_clean BEFORE printing
df_clean = prepare_data(df).copy()
print(f"✅ Data after preparation: {len(df_clean):,} rows")

✅ Data after preparation: 3,316,216 rows


**Feature Engineering and Split**

In [5]:
from sklearn.model_selection import train_test_split

categorical = ['PULocationID', 'DOLocationID']
df_clean['target'] = df_clean['duration']

train_df, val_df = train_test_split(df_clean, test_size=0.2, random_state=42)

def df_to_dict(df):
    return df[categorical].to_dict(orient='records')

X_train_dict = df_to_dict(train_df)
X_val_dict = df_to_dict(val_df)

dv = DictVectorizer()
X_train = dv.fit_transform(X_train_dict)
X_val = dv.transform(X_val_dict)

y_train = train_df['target'].values
y_val = val_df['target'].values

**5:Train model and log with MLflow**

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import numpy as np

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("homework3-nyc-taxi")

with mlflow.start_run():
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)

    signature = infer_signature(X_val, y_pred)
    input_example = X_val[:1]

    mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="models",
        signature=signature,
        input_example=input_example
    )

    print(f"✅ RMSE: {rmse:.2f}")
    print(f"✅ Intercept: {lr.intercept_:.2f}")

✅ RMSE: 8.15
✅ Intercept: 24.75


**Save DictVectorizer and Register the Model**

In [8]:
import pickle
import pathlib

output_dir = pathlib.Path("artifacts")
output_dir.mkdir(exist_ok=True)

# Save dict vectorizer
with open(output_dir / "dv.pkl", "wb") as f_out:
    pickle.dump(dv, f_out)

mlflow.log_artifact(str(output_dir / "dv.pkl"))

print("✅ DictVectorizer saved and logged.")

✅ DictVectorizer saved and logged.


**6: Find model size in MLmodel**

In [9]:
import os

mlflow_dir = "mlruns"
for root, dirs, files in os.walk(mlflow_dir):
    for file in files:
        if file == "MLmodel":
            full_path = os.path.join(root, file)
            print("✅ Found MLmodel at:", full_path)

✅ Found MLmodel at: mlruns/1/21748e5db3444097b591a8c3598ae73f/artifacts/models/MLmodel
✅ Found MLmodel at: mlruns/1/0327e7e71ab54a13a8730da99c84f02f/artifacts/models/MLmodel


**Summary of the answers**

In [10]:
print(f"✅ Final Answers Summary")
print(f"- Records loaded: {len(df):,}")
print(f"- Records after prep: {len(df_clean):,}")
print(f"- Intercept: {lr.intercept_:.2f}")
print(f"- RMSE: {rmse:.2f}")

✅ Final Answers Summary
- Records loaded: 3,403,766
- Records after prep: 3,316,216
- Intercept: 24.75
- RMSE: 8.15
