In [0]:
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
# Read the raw results CSV from S3 with Spark, treating the first row as the
# column header, then collect the result into a pandas DataFrame.
df_results = spark.read.option("header", True).csv('s3://columbia-gr5069-main/raw/results.csv')
df_results_pd = df_results.toPandas()

In [0]:
# Preview the raw Spark DataFrame that was read from the results CSV
display(df_results)

1. Build any model of your choice with tunable hyperparameters

2. Create an experiment setup where - for each run - you log:

- the hyperparameters used in the model
- the model itself
- every possible metric from the model you chose
- at least two artifacts (plots, or csv files)

3. Track your MLFlow experiment and run at least 10 experiments with different parameters each

In [0]:
# Binary target "scored": 1 if the driver earned any points, otherwise 0.
# "points" is cast to float first and then compared against zero.
df_results_pd["scored"] = df_results_pd["points"].astype(float).gt(0).astype(int)

In [0]:
# Columns used as model inputs.
# NOTE(review): "positionOrder" (and possibly "rank") look like race outcomes
# that are closely tied to whether points were earned — confirm they do not
# leak the "scored" target before trusting the reported scores.
features = ["grid", "laps", "milliseconds", "constructorId", "rank", "positionOrder"]

# Missing values appear as the literal string '\N'; convert them to NaN everywhere
df_results_pd = df_results_pd.replace('\\N', np.nan)

# Coerce every feature column to a numeric dtype (values that cannot be parsed become NaN)
df_results_pd[features] = df_results_pd[features].apply(pd.to_numeric, errors='coerce')

# Keep only the rows where every feature and the target are present
required_columns = features + ["scored"]
df_model = df_results_pd.dropna(subset=required_columns)

# Feature matrix and target vector
X, y = df_model[features], df_model["scored"]

In [0]:
# Train one random forest per hyperparameter combination and track each fit as
# its own MLflow run: parameters, metrics, two artifacts and the fitted model.

# Split data first so the held-out rows never influence the scaler's statistics
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize feature values using the mean/variance of the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled feature matrix, kept under the name this cell has always exposed
X_scaled = scaler.transform(X)

# Define hyperparameter ranges (3 x 4 = 12 runs)
n_estimators_list = [50, 100, 150]
max_depth_list = [3, 5, 10, None]

# Run experiments with different hyperparameter combinations
for n in n_estimators_list:
    for depth in max_depth_list:
        with mlflow.start_run():
            clf = RandomForestClassifier(n_estimators=n, max_depth=depth, random_state=42)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            f1 = f1_score(y_test, y_pred)

            # Log hyperparameters
            mlflow.log_param("n_estimators", n)
            mlflow.log_param("max_depth", depth)

            # Log metrics; "f1_score" keeps its key because the best-run
            # search orders by metrics.f1_score
            mlflow.log_metrics({
                "f1_score": f1,
                "accuracy": accuracy_score(y_test, y_pred),
                "precision": precision_score(y_test, y_pred, zero_division=0),
                "recall": recall_score(y_test, y_pred, zero_division=0),
            })

            # Artifact 1: confusion matrix plot, drawn on an explicit figure so
            # nothing depends on pyplot's "current figure" state
            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(cm)
            fig, ax = plt.subplots()
            disp.plot(ax=ax)
            ax.set_title(f"Confusion matrix (n_estimators={n}, max_depth={depth})")
            artifact_path = f"cm_{n}_{depth}.png"
            mlflow.log_figure(fig, artifact_path)
            plt.close(fig)  # prevent memory buildup

            # Artifact 2: feature importances as CSV, most important first
            importances = pd.DataFrame(
                {"feature": list(X.columns), "importance": clf.feature_importances_}
            ).sort_values("importance", ascending=False)
            mlflow.log_text(
                importances.to_csv(index=False),
                f"feature_importances_{n}_{depth}.csv",
            )

            # Log the model
            mlflow.sklearn.log_model(clf, "model")

            print(f"Run: n_estimators={n}, max_depth={depth}, F1={f1:.4f}")

4. Select your best model run and explain why

In [0]:
# Look up the logged runs and select the one with the highest F1 score.
client = MlflowClient()
# NOTE(review): hard-coded experiment ID — presumably specific to the workspace
# where the runs above were logged; confirm or replace it when running elsewhere.
experiment_id = "2163228893731700"

# Search for best F1: runs come back sorted by the logged "f1_score", best first.
# experiment_ids is documented as a list, so the single ID is wrapped in one.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.f1_score DESC"],
)

# Fail with a clear message instead of a bare IndexError on runs[0]
if not runs:
    raise RuntimeError(
        f"No MLflow runs found in experiment {experiment_id}; "
        "run the training cell first or check the experiment ID."
    )

best_run = runs[0]
print("Best run ID:", best_run.info.run_id)
print("F1 score:", best_run.data.metrics["f1_score"])
print("Params:", best_run.data.params)

**WHY BEST:**
This run had the highest F1 score, 0.9801, using 150 trees and no maximum depth. It likely performed better than the others because it had more trees to capture patterns and was not limited in how deep each tree could grow. Together, these let it make more accurate predictions than the runs with fewer trees or a limited depth.