In [0]:
# Gina Wang
# Dr. Morales & Nana
# QMSSGR5069 Applied Data Sciences - Take Home Exercise #2
# March 24, 2025
# Collaborator: Jay Jun (We went through the assignment together and talked about ways to approach each question)

#### Question 1: [20 pts] Build any model of your choice with tunable hyperparameters

In [0]:
# Install required packages to read from S3
%pip install s3fs

In [0]:
# Restart the Python kernel to activate s3fs
%restart_python

In [0]:
# Import libraries
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tempfile

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [0]:
# Read the six raw CSV tables from the columbia-gr5069-main S3 bucket
# (relies on the s3fs package installed in the first cell).
results, races, drivers, lap_times, pit_stops, qualifying = (
    pd.read_csv(csv_uri)
    for csv_uri in (
        "s3://columbia-gr5069-main/raw/results.csv",
        "s3://columbia-gr5069-main/raw/races.csv",
        "s3://columbia-gr5069-main/raw/drivers.csv",
        "s3://columbia-gr5069-main/raw/lap_times.csv",
        "s3://columbia-gr5069-main/raw/pit_stops.csv",
        "s3://columbia-gr5069-main/raw/qualifying.csv",
    )
)

In [0]:
# Base table: every result row joined with its race and its driver.
# Colliding column names from the right-hand tables get a suffix.
results_merged = (
    results
    .merge(races, on="raceId", suffixes=("", "_race"))
    .merge(drivers, on="driverId", suffixes=("", "_driver"))
)

# All engineered features are keyed by (race, driver)
race_driver_keys = ['raceId', 'driverId']

# Feature: mean lap time in milliseconds per driver per race
lap_avg = (
    lap_times
    .groupby(race_driver_keys)['milliseconds']
    .mean()
    .reset_index()
    .rename(columns={'milliseconds': 'avg_lap_time_ms'})
)

# Feature: count of pit-stop rows per driver per race
pit_count = (
    pit_stops
    .groupby(race_driver_keys)
    .size()
    .reset_index(name='num_pit_stops')
)

# Feature: best (minimum) qualifying position per driver per race
qualifying_agg = (
    qualifying
    .groupby(race_driver_keys)['position']
    .min()
    .reset_index()
    .rename(columns={'position': 'qualifying_position'})
)

# Left-join each feature table so result rows without a match are kept (as NaN)
for feature_table in (lap_avg, pit_count, qualifying_agg):
    results_merged = results_merged.merge(feature_table, on=race_driver_keys, how='left')

# Columns carried into the modeling table
feature_columns = [
    'raceId', 'driverId', 'grid', 'positionOrder', 'points',
    'avg_lap_time_ms', 'num_pit_stops', 'qualifying_position'
]

# A row is only usable if the target and all engineered features are present
required_columns = [
    'positionOrder', 'avg_lap_time_ms', 'num_pit_stops', 'qualifying_position'
]

model_data = results_merged[feature_columns].dropna(subset=required_columns)

# Quick visual check of the first rows
display(model_data.head())

#### Question 2: [20 pts] Create an experiment setup where - for each run - you log: the hyperparameters used in the model, the model itself, every possible metric from the model you chose, at least two artifacts (plots, or csv files)

In [0]:
# Clean and prep for ML (work on a copy so model_data stays untouched)
df = model_data.copy()

# Keep numeric columns only; anything non-numeric cannot be fed to the forest
df = df.select_dtypes(include=[np.number])

# Define features and target
# NOTE(review): X still contains the raceId/driverId identifiers and `points`.
# `points` is presumably derived from the finishing position, so it may leak
# the target into the features — confirm before trusting the reported metrics.
X = df.drop(columns=["positionOrder"])
y = df["positionOrder"]

# Train/test split (seeded so every run is evaluated on the same rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Hyperparameters of the baseline run. The forest is seeded so the baseline is
# reproducible, matching the seed used for the split and the later runs.
baseline_params = {"random_state": 42}

# Train model and log with MLflow
with mlflow.start_run(run_name="Basic RF Experiment") as run:
    rf = RandomForestRegressor(**baseline_params)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)

    # Log model
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Log the explicitly chosen hyperparameters alongside the model
    for param, value in baseline_params.items():
        mlflow.log_param(param, value)

    # Metrics
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)
    print(f"  mse: {mse}")

    # Log metadata; experimentID is reused by the later runs in this notebook.
    # run_id replaces the deprecated run_uuid attribute (same value).
    runID = run.info.run_id
    experimentID = run.info.experiment_id
    print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))

#### Question 3: [20 pts] Track your MLFlow experiment and run at least 10 experiments with different parameters each

In [0]:
# Define function to train, evaluate, and log
def log_rf_run(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Train a random forest, evaluate it, and log everything to one MLflow run.

    Logged per run: the fitted model ("rf-model"), every entry of ``params``,
    the MSE / MAE / R² on the test split, a feature-importance CSV (artifact
    folder "feature-importance") and a residual plot PNG (artifact folder
    "residual-plot").

    Args:
        experimentID: ID of the MLflow experiment the run is created in.
        run_name: Display name of the MLflow run.
        params: Dict of keyword arguments passed to ``RandomForestRegressor``;
            each key/value pair is also logged as an MLflow parameter.
        X_train: Training features; must expose ``.columns`` (presumably a
            pandas DataFrame — verify against caller).
        X_test: Test features used for the predictions.
        y_train: Training target.
        y_test: Test target the predictions are scored against.

    Returns:
        The ID of the MLflow run that was created.
    """
    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        predictions = rf.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(rf, "rf-model")

        # Log hyperparameters
        for param, value in params.items():
            mlflow.log_param(param, value)

        # Metrics
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)

        # Log metrics
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        print(f"[{run_name}] MSE: {mse:.2f} | MAE: {mae:.2f} | R²: {r2:.3f}")

        # Feature importance, most important feature first
        importance = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': rf.feature_importances_
        }).sort_values(by='Importance', ascending=False)

        # Stage artifacts in a temporary directory that is removed when the
        # block exits, so repeated runs do not leave open handles or stray
        # files behind. Fixed file names keep the artifacts recognisable.
        with tempfile.TemporaryDirectory() as artifact_dir:
            csv_path = os.path.join(artifact_dir, "feature_importance.csv")
            importance.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path, "feature-importance")

            # Residual plot
            fig, ax = plt.subplots()
            try:
                # NOTE(review): seaborn's residplot regresses y on x and plots
                # the residuals of that fit, which is not necessarily
                # y_test - predictions — confirm this is the intended plot.
                sns.residplot(x=predictions, y=y_test, lowess=True, ax=ax)
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Residuals")
                ax.set_title("Model Residual Plot")

                img_path = os.path.join(artifact_dir, "residual_plot.png")
                fig.savefig(img_path)
                mlflow.log_artifact(img_path, "residual-plot")
            finally:
                # The plot lives on as an MLflow artifact; close the figure so
                # figures do not pile up in memory when this is called in a loop.
                plt.close(fig)

        # run_id replaces the deprecated run_uuid attribute (same value)
        return run.info.run_id


In [0]:

# Run 10 experiments with different parameters:
# a 3 x 3 grid (depth-major, so depth 5 comes first) plus one larger, deeper forest
params_list = [
    {"n_estimators": n_trees, "max_depth": depth, "random_state": 42}
    for depth in (5, 10, 15)
    for n_trees in (120, 240, 360)
]
params_list.append({"n_estimators": 500, "max_depth": 20, "random_state": 42})

# One MLflow run per configuration, numbered from 1 in the run name
for i, params in enumerate(params_list, start=1):
    run_name = "Run {}: {} Estimators, Depth {}".format(
        i, params['n_estimators'], params['max_depth']
    )
    log_rf_run(experimentID, run_name, params, X_train, X_test, y_train, y_test)


#### Question 4: [20 pts] Select your best model run and explain why

After carefully evaluating the performance metrics from all ten Random Forest model runs, I selected Run 5 - which used 240 estimators and a maximum tree depth of 10 - as the best model. This choice is grounded in a close examination of the three primary evaluation metrics: Mean Squared Error (MSE), Mean Absolute Error (MAE), and R² score.

Run 5 produced an MSE of 3.96, MAE of 1.16, and an R² value of 0.889. What’s notable is that these values are identical or nearly identical to those of several other runs (specifically Runs 6, 8, and 9), but Run 5 achieves this level of performance with fewer trees, which makes it computationally more efficient, and with a moderate tree depth, which likely makes it less prone to overfitting. For instance, Run 6 used 360 estimators with the same depth of 10 and achieved the same MSE and R², but at a greater computational cost. Similarly, Run 8 used deeper trees (depth 15) with the same 240 estimators to match the MSE and R² of Run 5, introducing more complexity into the model without delivering better performance.

While all models reported the same MAE of 1.16 - which suggests that on average, the absolute error per prediction was consistent across the board - the marginal gains in R² and MSE plateaued after a certain level of complexity. Run 5 hit the sweet spot in that trade-off. It reflects a balance between bias and variance: it's deep and wide enough to capture the signal in the data, but not so complex that it risks overfitting to noise. From a model selection perspective, parsimony, or choosing the simplest model that performs well, is a valuable guiding principle, and Run 5 aligns well with this.

Therefore, although several models yielded nearly identical predictive accuracy, Run 5 offers the most efficient path to strong performance, which makes it the best choice from both a statistical and practical standpoint. In a real-world deployment setting, this kind of model would likely generalize better while also consuming fewer computational resources during training and inference.
