In [0]:
# Root S3 prefix that holds the raw CSV extracts.
base_path = "s3://columbia-gr5069-main/raw/"


def _read_raw_csv(file_name):
    """Read one CSV under ``base_path`` into a Spark DataFrame.

    The first line of the file is treated as the header row; no schema
    inference is requested here.
    """
    return spark.read.option("header", True).csv(base_path + file_name)


# Spark DataFrames, one per source file.
spark_results = _read_raw_csv("results.csv")
spark_drivers = _read_raw_csv("drivers.csv")
spark_races = _read_raw_csv("races.csv")
spark_status_map = _read_raw_csv("status.csv")

# Collect each table to the driver as a pandas DataFrame for scikit-learn.
results = spark_results.toPandas()
drivers = spark_drivers.toPandas()
races = spark_races.toPandas()
status_map = spark_status_map.toPandas()

 (1) Build any model with tunable hyperparameters

In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Attach the human-readable status text to every result row.
# A 'status' column left behind by a previous execution of this cell is
# dropped first: without that, re-running the cell would merge a second copy,
# pandas would suffix the clash as 'status_x' / 'status_y', and the
# df['status'] lookup below would raise KeyError.
results = results.drop(columns=["status"], errors="ignore").merge(
    status_map, on="statusId", how="left"
)

# Join driver attributes and the race year, then turn the '\N' placeholder
# strings into real missing values. Built as one chain so `df` is always
# derived from scratch when the cell runs.
df = (
    results
    .merge(drivers, on="driverId")
    .merge(races[["raceId", "year"]], on="raceId")
    .replace("\\N", np.nan)
)

# Convert necessary columns to numeric (unparseable entries become NaN)
for numeric_col in ('fastestLap', 'rank'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Binary label: 1 when the status text is 'finished' (case/whitespace
# insensitive), otherwise 0. Missing statuses become the string 'nan' and
# therefore map to 0.
df['status'] = df['status'].astype(str)
df['finished'] = df['status'].str.strip().str.lower().eq('finished').astype(int)

# Keep only rows where every model input is present. The explicit copy makes
# `df` an independent frame so the column assignment below is unambiguous.
required_cols = ['grid', 'laps', 'fastestLap', 'rank', 'year', 'nationality']
df = df.dropna(subset=required_cols).copy()

# Encode categorical: nationality -> integer code
le = LabelEncoder()
df['nationality_encoded'] = le.fit_transform(df['nationality'])

# Feature matrix (cast to float) and label vector
features = ['grid', 'laps', 'fastestLap', 'rank', 'year', 'nationality_encoded']
X = df[features].astype(float)
y = df['finished']

(2) Log hyperparameters, model, metrics, artifacts with MLflow

In [0]:
# Install the tracking / modelling / plotting packages from inside the notebook.
# NOTE(review): this runs mid-notebook, unpinned, after scikit-learn has
# already been imported by an earlier cell, and the captured output asks for a
# kernel restart. The training cell's output then reports an mlflow version
# mismatch (current 2.21.3 vs required 2.11.4), which looks like a consequence.
# Consider a pinned `%pip install` in the very first cell instead -- TODO
# confirm against the cluster runtime before changing.
!pip install mlflow scikit-learn matplotlib seaborn

import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Select the MLflow experiment that subsequent runs are recorded under.
mlflow.set_experiment("/Users/hh3110@columbia.edu/F1 Finished Prediction")
# Local folder where the training cell writes plot / CSV files before logging them.
os.makedirs("artifacts", exist_ok=True)

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


 (3) Track 10+ experiments

In [0]:
# Hold out 20% of the rows for evaluation (fixed seed => reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the MLflow experiment and make sure the local output folder exists
mlflow.set_experiment("/Users/hh3110@columbia.edu/F1 Finished Prediction")
os.makedirs("artifacts", exist_ok=True)

# Sweep the forest size from 50 to 500 trees in steps of 50, one MLflow run each
for n in range(50, 501, 50):
    with mlflow.start_run():
        model = RandomForestClassifier(n_estimators=n, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Score the held-out predictions
        scores = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1": f1_score(y_test, y_pred, zero_division=0),
        }
        cm = confusion_matrix(y_test, y_pred)

        # Record the hyperparameter, then each metric under its own key
        mlflow.log_param("n_estimators", n)
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # Persist the fitted estimator with the run
        mlflow.sklearn.log_model(model, "model")

        # Confusion-matrix heatmap, saved locally and attached as an artifact
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f"Confusion Matrix (n={n})")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        cm_png = f"artifacts/conf_matrix_{n}.png"
        fig.savefig(cm_png)
        plt.close(fig)  # release the figure so the loop does not accumulate them
        mlflow.log_artifact(cm_png)

        # Test features alongside true and predicted labels, attached as CSV
        pred_df = X_test.assign(actual=y_test.values, predicted=y_pred)
        preds_csv = f"artifacts/predictions_{n}.csv"
        pred_df.to_csv(preds_csv, index=False)
        mlflow.log_artifact(preds_csv)

 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflo

(4) Best Model Run: Explanation

After running 10 experiments with different values of `n_estimators`, I selected the best model based on the **highest F1 score** on the held-out 20% test split. The F1 score balances precision and recall.

**Best Run Summary:**
- `n_estimators`: **250**
- `Accuracy`: **0.8536**
- `Precision`: **0.8679**
- `Recall`: **0.8494**
- `F1 Score`: **0.8586**

This model achieved the best balance between correctly identifying drivers who finish (recall) and avoiding false "finished" predictions for drivers who do not (precision). Since our data may have some class imbalance, the F1 score is a more appropriate metric than accuracy alone.

In addition, this run also generated and logged:
- A confusion matrix plot
- A CSV file of prediction results

All metrics, parameters, the model, and the artifacts were logged to MLflow and can be used for further evaluation or deployment.


(5) MLflow Screenshot Evidence

As required, here are the screenshots of:

- The MLflow Experiment Homepage
- The 10 Individual Model Run Detail Pages

MLflow Experiment Homepage
This page shows that at least 10 runs were logged successfully using MLflow, including run names, parameters, durations, and accuracy metrics.

![Homepage](screenshots/homepage.jpg)

---

MLflow Run Detail Pages
The following images are detailed views of each individual run, including logged hyperparameters, model accuracy, precision, recall, F1 score, and artifacts.

![Run Detail 1](screenshots/1.jpg)
![Run Detail 2](screenshots/2.jpg)
![Run Detail 3](screenshots/3.jpg)
![Run Detail 4](screenshots/4.jpg)
![Run Detail 5](screenshots/5.jpg)
![Run Detail 6](screenshots/6.jpg)
![Run Detail 7](screenshots/7.jpg)
![Run Detail 8](screenshots/8.jpg)
![Run Detail 9](screenshots/9.jpg)
![Run Detail 10](screenshots/10.jpg)
