In [0]:
pip install mlflow

Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/72/f1/be329ef23da1933135afc5e493cbe1b7ecf5302890f2a7ac2124b64c01bd/mlflow-2.21.3-py3-none-any.whl.metadata
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Obtaining dependency information for mlflow-skinny==2.21.3 from https://files.pythonhosted.org/packages/2d/f8/b71f88ca373f248fd7fdf3751f74c7b36a71b7ee2b5f4b803ee053ac963a/mlflow_skinny-2.21.3-py3-none-any.whl.metadata
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Obtaining dependency information for Flask<4 from https://files.pythonhosted.org/packages/af/47/93213ee66ef8fae3b93b3e29206f6b251e65c97bd91d8e1c5596ef15af0a/flask-3.1.0-py3-none-any.whl.metadata
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Jinja2<4,>=2.11 (from mlflow)
  Obtaining dependency information for Jinja2<4,>=2.11

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler
import mlflow
import mlflow.spark
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import f1_score

In [0]:
drivers_path = "s3://columbia-gr5069-main/raw/drivers.csv"
results_path = "s3://columbia-gr5069-main/raw/results.csv"
drivers_df = spark.read.csv(drivers_path, header=True, inferSchema=True)
results_df = spark.read.csv(results_path, header=True, inferSchema=True)
display(drivers_df.limit(5))
display(results_df.limit(5))

driverId,driverRef,number,code,forename,surname,dob,nationality,url
1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [0]:
df = results_df.join(drivers_df, on="driverId")
df = df.withColumn("label", (col("positionOrder") == 1).cast("double"))
df = df.withColumn("milliseconds", col("milliseconds").cast("double"))
feature_cols = ["grid", "laps", "milliseconds"]
df = df.dropna(subset=feature_cols + ["label"])
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [0]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
def run_exp(numTrees, maxDepth):
    with mlflow.start_run():
        mlflow.log_param("numTrees", numTrees)
        mlflow.log_param("maxDepth", maxDepth)
        rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="label", numTrees=numTrees, maxDepth=maxDepth)
        model = rf.fit(train_df)

        predictions = model.transform(test_df)
        evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
        auc = evaluator.evaluate(predictions)
        mlflow.log_metric("AUC", auc)

        pred_df = predictions.select("label", "prediction", "probability").toPandas()
        f1 = f1_score(pred_df["label"], pred_df["prediction"])
        mlflow.log_metric("F1_score", f1)

        mlflow.spark.log_model(model, "random-forest-model")

        pred_df.to_csv("predictions.csv", index=False)
        mlflow.log_artifact("predictions.csv")

        cm = pd.crosstab(pred_df["label"], pred_df["prediction"])
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title("Confusion Matrix")
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        os.remove("predictions.csv")
        os.remove("confusion_matrix.png")


In [0]:
for numTrees in [20, 50, 100, 120]:
    for maxDepth in [5, 10, 15]:
        run_exp(numTrees, maxDepth)

2025/04/05 23:01:55 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2025/04/05 23:02:44 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
 - mlflow (current: 2.21.3, required: mlflow==2.11.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2025/04/05 23:03:34 INFO mlf

**Best Parameters:**
After training 12 random forest models with different hyperparameters (numTrees, maxDepth), I compared their performance based on 2 key evaluation metircs: AUC and F1 score (I gave more weight on F1 score). The best performing model is the one with hyperparameters (numTrees = 120, maxDepth = 5), which obtained the highest F1 score (0.5288) showing a good balance between precision and recall, while had the good AUC score as well (0.8616). Therefore, I chose this combination of hyperparameters as my best model.