In [0]:
pip install mlflow

In [0]:
import mlflow

# MLflow experiment that every run started later in this notebook is recorded under.
EXPERIMENT_PATH = "/Users/yq2397@columbia.edu/take-home-exercise-3-yq2397"
mlflow.set_experiment(experiment_name=EXPERIMENT_PATH)

In [0]:
import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

In [0]:
def _read_raw_csv(path):
    """Read one CSV file with a header row; no schema or schema inference is requested."""
    return spark.read.option("header", True).csv(path)


# Raw results, drivers and races tables used to build the modelling dataset.
df_results = _read_raw_csv('s3://columbia-gr5069-main/raw/results.csv')
df_drivers = _read_raw_csv('s3://columbia-gr5069-main/raw/drivers.csv')
df_races = _read_raw_csv('s3://columbia-gr5069-main/raw/races.csv')

In [0]:
# Attach race columns (keyed on raceId) and then driver columns (keyed on driverId)
# to each result row. Left joins keep every result row even when a key has no match.
merged_df = (
    df_results
    .join(df_races, on="raceId", how="left")
    .join(df_drivers, on="driverId", how="left")
)
display(merged_df.limit(10))

In [0]:
# Cast the two numeric features and the label to doubles.
for numeric_col in ("grid", "positionOrder", "laps"):
    merged_df = merged_df.withColumn(numeric_col, F.col(numeric_col).cast(DoubleType()))

# Parse the driver's date of birth and the race date, then derive the driver's
# age in years on race day (day difference divided by 365.25).
for date_col in ("dob", "date"):
    merged_df = merged_df.withColumn(date_col, F.to_date(F.col(date_col)))
age_in_years = F.datediff(F.col("date"), F.col("dob")) / 365.25
merged_df = merged_df.withColumn("driver_age", age_in_years)

# Keep only the three features plus the label, dropping rows with any null.
model_df = merged_df.select("grid", "driver_age", "laps", "positionOrder").dropna()

print(f"Total records for modeling: {model_df.count()}")
display(model_df.limit(10))

In [0]:
# Cache the modelling table before splitting. Without a cache, every action on
# train_df / test_df (the counts below and each of the later training runs)
# re-executes the full lineage: the CSV reads, both joins and the casts.
# randomSplit also derives each split by sampling the parent DataFrame
# independently, so evaluating against a stable, cached parent guards against the
# two splits drifting apart between evaluations.
model_df = model_df.cache()

# 80/20 train/test split with a fixed seed for reproducibility.
train_df, test_df = model_df.randomSplit([0.8, 0.2], seed=42)

# The splits are reused by every experiment, so keep them cached as well.
train_df = train_df.cache()
test_df = test_df.cache()

print(f"Training set count: {train_df.count()}")
print(f"Testing set count: {test_df.count()}")

# Columns combined into the single "features" vector the regressors consume.
feature_cols = ["grid", "driver_age", "laps"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [0]:
# Maps the short model_type codes accepted by train_model_simple to estimator classes.
_REGRESSOR_CLASSES = {
    'rf': RandomForestRegressor,
    'gbt': GBTRegressor,
}


def _log_feature_importance(trained_model):
    """Log the fitted model's feature importances to the active MLflow run as a PNG and a CSV."""
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': trained_model.featureImportances.toArray()
    }).sort_values('Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(feature_importance['Feature'], feature_importance['Importance'])
    ax.set_title('Feature Importance')
    fig.tight_layout()

    # Write into a throwaway directory instead of the current working directory so
    # runs never leave files behind or collide on a shared file name. The artifact
    # names are unchanged because log_artifact stores each file under its base name.
    with tempfile.TemporaryDirectory() as artifact_dir:
        plot_path = os.path.join(artifact_dir, "feature_importance.png")
        csv_path = os.path.join(artifact_dir, "feature_importance.csv")

        try:
            fig.savefig(plot_path)
        finally:
            # Always release the figure, even if saving fails.
            plt.close(fig)
        mlflow.log_artifact(plot_path)

        feature_importance.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path)


def train_model_simple(model_type, params=None):
    """Train, evaluate and log one tree-ensemble regressor as a single MLflow run.

    The function reads the notebook-level ``assembler``, ``train_df``, ``test_df``
    and ``feature_cols`` objects; they must be defined before it is called.

    Args:
        model_type: ``'rf'`` for RandomForestRegressor or ``'gbt'`` for GBTRegressor.
        params: Optional dict of keyword arguments passed to the estimator
            constructor and logged as MLflow parameters.

    Returns:
        Tuple ``(trained_model, rmse, r2, acc_within_1, acc_within_3)`` where the
        two accuracies are the percentage of test rows whose rounded prediction is
        within 1 and within 3 of ``positionOrder``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported codes.
    """
    # Validate before opening a run so a typo does not leave a half-logged run behind
    # (previously this surfaced later as an UnboundLocalError on `model`).
    if model_type not in _REGRESSOR_CLASSES:
        supported = sorted(_REGRESSOR_CLASSES)
        raise ValueError(f"Unsupported model_type {model_type!r}; expected one of {supported}")

    hyperparams = dict(params) if params else {}

    with mlflow.start_run():
        mlflow.log_param("model_type", model_type)
        if hyperparams:
            mlflow.log_params(hyperparams)

        train_vector = assembler.transform(train_df)
        test_vector = assembler.transform(test_df)

        model = _REGRESSOR_CLASSES[model_type](
            featuresCol="features", labelCol="positionOrder", **hyperparams
        )
        trained_model = model.fit(train_vector)

        predictions = trained_model.transform(test_vector)

        evaluator = RegressionEvaluator(labelCol="positionOrder", predictionCol="prediction")
        rmse = evaluator.setMetricName("rmse").evaluate(predictions)
        r2 = evaluator.setMetricName("r2").evaluate(predictions)

        # Collect label/prediction pairs to the driver to compute tolerance-based accuracy.
        pred_df = predictions.select("positionOrder", "prediction").toPandas()
        abs_error = np.abs(pred_df["positionOrder"] - np.round(pred_df["prediction"]))
        acc_within_1 = float(np.mean(abs_error <= 1) * 100)
        acc_within_3 = float(np.mean(abs_error <= 3) * 100)

        mlflow.log_metrics({
            "rmse": rmse,
            "r2": r2,
            "accuracy_within_1": acc_within_1,
            "accuracy_within_3": acc_within_3,
        })

        _log_feature_importance(trained_model)

        mlflow.spark.log_model(trained_model, f"{model_type}_model")

        return trained_model, rmse, r2, acc_within_1, acc_within_3

In [0]:
# Hyperparameter grids: five configurations per model family, each with seed 42.
rf_params = [
    {'numTrees': num_trees, 'maxDepth': max_depth, 'seed': 42}
    for num_trees, max_depth in [
        (100, 10),
        (200, 15),
        (300, 5),
        (100, 20),
        (200, 10),
    ]
]

gbt_params = [
    {'maxIter': max_iter, 'stepSize': step_size, 'maxDepth': max_depth, 'seed': 42}
    for max_iter, step_size, max_depth in [
        (100, 0.1, 3),
        (200, 0.05, 5),
        (100, 0.01, 7),
        (200, 0.1, 5),
        (300, 0.05, 3),
    ]
]

results = []

# Run every Random Forest configuration first, then every Gradient Boosting one,
# appending one summary row per run. Each entry is
# (progress label, model_type code, display name, parameter grid).
experiment_plan = [
    ('RF', 'rf', 'Random Forest', rf_params),
    ('GBT', 'gbt', 'Gradient Boosting', gbt_params),
]

for tag, model_code, model_label, param_grid in experiment_plan:
    for run_number, params in enumerate(param_grid, start=1):
        print(f"{tag} Experiment {run_number}/{len(param_grid)}")
        _, rmse, r2, acc1, acc3 = train_model_simple(model_code, params)
        results.append({
            'Model Type': model_label,
            'Parameters': params,
            'RMSE': rmse,
            'R²': r2,
            'Accuracy within ±1': acc1,
            'Accuracy within ±3': acc3
        })


results_df = pd.DataFrame(results)

# The run with the lowest RMSE is reported as the best model.
best_model = results_df.loc[results_df['RMSE'].idxmin()]

summary_columns = ['Model Type', 'RMSE', 'R²', 'Accuracy within ±1', 'Accuracy within ±3']
print("\n===== Experiment Results =====")
print(results_df[summary_columns])

print("\n===== Best Model =====")
best_model_lines = [
    f"Model Type: {best_model['Model Type']}",
    f"Parameters: {best_model['Parameters']}",
    f"RMSE: {best_model['RMSE']:.4f}",
    f"R²: {best_model['R²']:.4f}",
    f"Accuracy within ±1 position: {best_model['Accuracy within ±1']:.2f}%",
    f"Accuracy within ±3 positions: {best_model['Accuracy within ±3']:.2f}%",
]
print("\n".join(best_model_lines))

**Best Model**

For this F1 position prediction project, we tested 10 different models: 5 Random Forest and 5 Gradient Boosting models with different hyperparameters. The best performing model was a Gradient Boosting Regressor with 300 trees (boosting iterations), a learning rate of 0.05, and a maximum depth of 3. This model had an RMSE of 4.24 and an R² of about 0.69, meaning it explains about 69% of the variance in drivers' finishing positions. In simpler terms, our model could predict a driver's finishing position within 1 spot of their actual position 30% of the time, and within 3 spots 64% of the time. This beat all our Random Forest models. We think the model worked well because we didn't make the trees too deep or the learning too aggressive, which helped it avoid memorizing the training data instead of learning real patterns.