In [0]:
# Install MLflow (only needed if restarting the kernel or running for the first time)
%pip install mlflow==1.14.0
import mlflow
import boto3
import pandas as pd

In [0]:
# Connect to S3 and name the bucket that holds the raw CSV files
s3 = boto3.client('s3')
bucket = "columbia-gr5069-main"

# Logical table name -> object key inside the bucket
keys = dict(
    drivers="raw/drivers.csv",
    races="raw/races.csv",
    results="raw/results.csv",
    constructors="raw/constructors.csv",
)

# Every table is fetched and parsed into a DataFrame, stored under its logical name
dataframes = {}
for table_name, object_key in keys.items():
    response = s3.get_object(Bucket=bucket, Key=object_key)
    table = pd.read_csv(response['Body'])
    dataframes[table_name] = table
    n_rows, n_cols = table.shape
    print(f"Loaded {table_name} ({n_rows} rows, {n_cols} columns)")

# Convenience handles for the individual tables
drivers_df = dataframes['drivers']
races_df = dataframes['races']
results_df = dataframes['results']
constructors_df = dataframes['constructors']

# Preview each table, in the same order as they were declared above
for preview_frame in (drivers_df, races_df, results_df, constructors_df):
    display(preview_frame)

In [0]:
# Merge the four dataframes into one, starting from the results table.
# Left joins keep every row of results_df; when a right-hand table has a column
# name that already exists, its column receives the given suffix so the columns
# already present keep their original names.
merged_df = pd.merge(results_df, races_df, on='raceId', how='left', suffixes=('', '_race'))
merged_df = pd.merge(merged_df, drivers_df, on='driverId', how='left', suffixes=('', '_driver'))
merged_df = pd.merge(merged_df, constructors_df, on='constructorId', how='left', suffixes=('', '_constructor'))

# Drop rows without a finishing order first, and take an explicit copy so that
# later column assignments write to an independent frame instead of a filtered
# selection (this avoids pandas' SettingWithCopyWarning in downstream cells).
merged_df = merged_df[merged_df['positionOrder'].notnull()].copy()

# Target label: True when positionOrder is 10 or lower
merged_df['top_10'] = merged_df['positionOrder'] <= 10
merged_df.head()

In [0]:
# Parse both date columns; values that cannot be parsed become NaT instead of raising
birth_dates = pd.to_datetime(merged_df['dob'], errors='coerce')
race_dates = pd.to_datetime(merged_df['date'], errors='coerce')
merged_df['dob'] = birth_dates
merged_df['date'] = race_dates

# Driver age at race time: elapsed days floor-divided by 365
elapsed = race_dates - birth_dates
merged_df['driver_age'] = elapsed.dt.days // 365

# Columns used as model inputs
features = ['grid', 'constructorRef', 'nationality', 'driver_age', 'year', 'round']

# Keep the inputs plus the target, discarding any row with a missing value among them
model_columns = features + ['top_10']
model_df = merged_df[model_columns].dropna()

In [0]:
from sklearn.preprocessing import OneHotEncoder

# Expand the categorical columns into indicator columns.
# drop_first=True omits the first level of each category.
categorical = ['constructorRef', 'nationality']
model_df = pd.get_dummies(
    data=model_df,
    columns=categorical,
    drop_first=True,
)
model_df.head()

In [0]:
# Baseline experiment: a single decision tree with hand-picked hyperparameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column
y = model_df['top_10']
X = model_df.drop(columns='top_10')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Tunable hyperparameters for the baseline tree
params = dict(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    criterion='gini',
)

# Fit on the training split and predict the held-out split
clf = DecisionTreeClassifier(random_state=42, **params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report headline metrics followed by the per-class breakdown
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_f1 = f1_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print("F1 Score:", baseline_f1)
print(classification_report(y_test, y_pred))

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import os
import itertools
import random
import tempfile

# Convert target to int so the positive class appears under the key '1'
# in the classification_report dictionary used below
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'criterion': ['gini', 'entropy']
}

# Ordinal names for model runs; one run is launched per name
ordinals = ["first", "second", "third", "fourth", "fifth",
            "sixth", "seventh", "eighth", "ninth", "tenth"]

# Enumerate the full grid, then draw a seeded sample of unique combinations.
# The sample size is tied to the number of ordinal names so the two cannot drift apart.
param_combinations = list(itertools.product(
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf'],
    param_grid['criterion']
))
random.seed(42)
sampled_combinations = random.sample(param_combinations, len(ordinals))

# Loop through the experiments, one MLflow run per sampled combination
for ordinal, (max_depth, min_split, min_leaf, criterion) in zip(ordinals, sampled_combinations):
    params = {
        'max_depth': max_depth,
        'min_samples_split': min_split,
        'min_samples_leaf': min_leaf,
        'criterion': criterion
    }
    run_label = ordinal.capitalize()

    with mlflow.start_run(run_name=f"{run_label} Run"):
        # Train model
        clf = DecisionTreeClassifier(**params, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Log params & metrics
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision", report['1']['precision'])
        mlflow.log_metric("recall", report['1']['recall'])

        # Log model with ordinal name
        mlflow.sklearn.log_model(clf, f"{ordinal}_run_model")

        # Artifact files are staged in a temporary directory, which is removed
        # automatically even if saving or logging raises part-way through.
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Artifact 1: Confusion Matrix
            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm)
            disp.plot()
            try:
                disp.ax_.set_title(f"{run_label} Run - Confusion Matrix")
                cm_path = os.path.join(artifact_dir, f"{ordinal}_confusion_matrix.png")
                disp.figure_.savefig(cm_path)
                mlflow.log_artifact(cm_path)
                # Render the figure in the notebook before releasing it
                plt.show()
            finally:
                # Each disp.plot() opens a new figure; close it so figures
                # do not accumulate across iterations
                plt.close(disp.figure_)

            # Artifact 2: Predictions CSV
            preds_df = X_test.copy()
            preds_df["actual"] = y_test
            preds_df["predicted"] = y_pred
            csv_path = os.path.join(artifact_dir, f"{ordinal}_predictions.csv")
            preds_df.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path)

        print(f"{run_label} run completed — accuracy: {acc:.4f}, f1: {f1:.4f}")


##### Select your best model run and explain why

Among the 10 decision tree models I trained, the best-performing run was the Fourth Run, which achieved an accuracy of 0.689, an F1 score of 0.617, precision of 0.637, and recall of 0.597.

I selected this run as the best model because it had the highest F1 score, which reflects a strong balance between precision and recall — an important consideration when predicting top-10 finishes, where false positives and false negatives both matter. It also tied for the best accuracy, making it a strong overall performer. The best model used the following hyperparameters:

- criterion: "gini"
- max_depth: 5
- min_samples_split: 2
- min_samples_leaf: 1

While my best model achieved only moderate-to-weak performance (F1 score ≈ 0.62), this was expected given the limited features available. With additional data — such as qualifying times, lap performance, or historical driver stats — I believe the model could be significantly improved. In practice, I would also try several other models, such as Random Forest or XGBoost, and compare the results. Nonetheless, this process demonstrates a complete ML experimentation workflow, from feature selection to experiment tracking and model comparison.