In [0]:
pip install mlflow

In [0]:
import boto3
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Name of the bucket that load_s3_csv reads from.
bucket = "ne-gr5069"

# Shared S3 client reused for every object download in this notebook.
s3 = boto3.client("s3")

def load_s3_csv(key):
    """Download one CSV object from S3 and parse it into a DataFrame.

    Args:
        key: Object key inside the module-level ``bucket``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the object's body.
    """
    response = s3.get_object(Bucket=bucket, Key=key)
    body_stream = response['Body']
    return pd.read_csv(body_stream)

# Object keys of the raw tables, listed in the same order as the frames unpacked below.
raw_keys = ("raw/pit_stops.csv", "raw/lap_times.csv", "raw/results.csv")
pitstops_df, laptimes_df, results_df = [load_s3_csv(raw_key) for raw_key in raw_keys]


In [0]:
# Number of pit stop rows per (raceId, driverId) pair, one output row per pair.
join_keys = ['raceId', 'driverId']
pitstop_counts = (
    pitstops_df
    .groupby(join_keys)
    .size()
    .reset_index(name='pitstop_count')
)

# Attach the counts to the results table, which supplies positionOrder (the label source).
# The inner join keeps only result rows that have a matching pit stop count.
df = results_df.merge(pitstop_counts, on=join_keys, how='inner')


In [0]:
# Preview the first rows only; rendering the whole merged frame bloats the notebook output.
display(df.head(10))

In [0]:
# Predictors: the pit stop count plus fields carried over from the results table.
feature_cols = ['pitstop_count', 'grid', 'laps', 'fastestLap', 'rank']
label_col = 'positionOrder'

# Keep only the modelling columns and discard rows holding NaN in any of them.
# NOTE(review): this removes true NaN values only; any non-numeric placeholder
# strings in the raw CSV survive here and are coerced to NaN in a later cell — confirm intent.
df_model = df.loc[:, feature_cols + [label_col]].dropna()

# Feature matrix and binary target: 1 when positionOrder is 3 or lower, 0 otherwise.
X = df_model[feature_cols]
y = df_model[label_col].le(3).astype('int64')



In [0]:
from sklearn.model_selection import train_test_split

# Force every feature to a numeric dtype; values that cannot be parsed become NaN
# and are filled in by the imputer below.
X = X.apply(pd.to_numeric, errors='coerce')

# Single seeded hold-out split, performed once on the numeric frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fill missing values with column means learned from the training fold only,
# then apply those same means to the test fold.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [0]:
def log_pitstop_model(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Train a random forest on the given split and record the run in MLflow.

    The features passed in are mean-imputed (imputer fitted on ``X_train``
    only) before fitting, so the function trains on the data it is handed
    rather than on notebook-level globals. Input that contains no missing
    values passes through the imputer with its values unchanged.

    Args:
        experimentID: Id of the MLflow experiment the run is created under.
        run_name: Name given to the MLflow run.
        params: Keyword arguments forwarded to ``RandomForestClassifier``;
            also logged as the run's parameters.
        X_train: Numeric training features; NaN values are allowed.
        X_test: Numeric test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels used for the metrics and the confusion matrix.
            The precision/recall/F1 calls use scikit-learn's defaults, which
            expect a binary target.

    Returns:
        The id of the MLflow run that was created.
    """
    import os
    import tempfile

    import matplotlib.pyplot as plt
    import mlflow.sklearn
    import seaborn as sns
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    # Learn the fill values from the training fold only and reuse them for the test fold.
    imputer = SimpleImputer(strategy='mean')
    X_train_ready = imputer.fit_transform(X_train)
    X_test_ready = imputer.transform(X_test)

    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        model = RandomForestClassifier(**params)
        model.fit(X_train_ready, y_train)
        predictions = model.predict(X_test_ready)

        # Log model & parameters
        mlflow.sklearn.log_model(model, "pitstop-model")
        mlflow.log_params(params)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))
        mlflow.log_metric("precision", precision_score(y_test, predictions))
        mlflow.log_metric("recall", recall_score(y_test, predictions))
        mlflow.log_metric("f1_score", f1_score(y_test, predictions))

        # Confusion matrix plot
        cm = confusion_matrix(y_test, predictions)
        fig, ax = plt.subplots()
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=ax)
            ax.set_title("Confusion Matrix")

            # Write the image under its final file name inside a throwaway directory so
            # the artifact is stored as "confusion_matrix.png" and nothing is left on disk.
            with tempfile.TemporaryDirectory() as tmp_dir:
                cm_path = os.path.join(tmp_dir, "confusion_matrix.png")
                fig.savefig(cm_path)
                mlflow.log_artifact(cm_path)

            display(fig)
        finally:
            # Release the figure; otherwise one open figure accumulates per run.
            plt.close(fig)

        return run.info.run_id


In [0]:
import mlflow
# Select the experiment as the active one and keep its id for the runs logged below.
experiment = mlflow.set_experiment("/Users/yp2728@columbia.edu/F1_prediction_model")
experimentID = experiment.experiment_id

# Hyperparameter configurations to try, one row per run:
# (n_estimators, max_depth, random_state). max_depth=None leaves the depth unset.
# Note that the first row uses random_state 12 while every other row uses 42.
_grid_rows = [
    (60, 4, 12),
    (120, 6, 42),
    (180, 8, 42),
    (110, None, 42),
    (220, 6, 42),
    (320, 12, 42),
    (90, 5, 42),
    (130, 7, 42),
    (270, 9, 42),
    (110, 14, 42),
]

param_grid = [
    {"n_estimators": n_trees, "max_depth": depth, "random_state": seed}
    for n_trees, depth, seed in _grid_rows
]

# Train and log one MLflow run per configuration. The imputed arrays are passed
# explicitly so the arguments match the data the model is fitted on.
for run_number, params in enumerate(param_grid, start=1):
    log_pitstop_model(
        experimentID,
        f"Pitstop Run {run_number}",
        params,
        X_train_imputed,
        X_test_imputed,
        y_train,
        y_test,
    )

For this assignment, I selected Pitstop Run 5 as the best model based on its overall performance across the key evaluation metrics. While several runs had similar accuracy scores, Pitstop Run 5 achieved an F1 score of 0.674967, the highest among the runs other than Pitstop Run 1, indicating a strong balance between precision and recall. This balance is especially important in our task of predicting podium finishes, where both false positives and false negatives can be problematic. The model's precision and recall were also 0.674967 each (when precision and recall are equal, the F1 score equals both), reinforcing its consistency. Although Pitstop Run 1 had a slightly higher F1 score, it used a different random state (12 rather than 42) and had lower precision, so its advantage may reflect seed-to-seed variability rather than better hyperparameters. Pitstop Run 5, with hyperparameters of 220 estimators and a maximum depth of 6, showed stable and reliable performance, making it the most suitable model for this classification task.