In [0]:

pip install --pre mlflow

Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/52/dd/c7142789e712744be92c02776fc13fa7e1f0dc54ff7a309250c1259b5630/mlflow-3.0.0rc0-py3-none-any.whl.metadata
  Downloading mlflow-3.0.0rc0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.0.0rc0 (from mlflow)
  Obtaining dependency information for mlflow-skinny==3.0.0rc0 from https://files.pythonhosted.org/packages/99/00/a3f3a004c067226b971881f19e41361ef7eccbf6f9f6a9927d5d73af738a/mlflow_skinny-3.0.0rc0-py3-none-any.whl.metadata
  Downloading mlflow_skinny-3.0.0rc0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Obtaining dependency information for Flask<4 from https://files.pythonhosted.org/packages/af/47/93213ee66ef8fae3b93b3e29206f6b251e65c97bd91d8e1c5596ef15af0a/flask-3.1.0-py3-none-any.whl.metadata
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Jinja2<4,>=2.11 (from mlflow)
  Obtaining dependency information for Jin

In [0]:
## Step 1: Imports and MLflow Setup

# COMMAND ----------
import pandas as pd
import numpy as np

import mlflow
import mlflow.spark
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             roc_auc_score, confusion_matrix, roc_curve)
from sklearn.model_selection import train_test_split

import os

import warnings

warnings.filterwarnings('ignore')

# Select the MLflow experiment that subsequent runs in this notebook are logged under.
# NOTE(review): per the MLflow docs, set_experiment creates the experiment only if it
# does not exist yet; otherwise it re-activates the existing one - confirm for the
# installed version. main() calls set_experiment again with the same path.
mlflow.set_experiment("/Users/sc5558@columbia.edu/f1_top3_prediction")


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3507016641691734', creation_time=1744171799422, experiment_id='3507016641691734', last_update_time=1744174151263, lifecycle_stage='active', name='/Users/sc5558@columbia.edu/f1_top3_prediction', tags={'mlflow.experiment.sourceName': '/Users/sc5558@columbia.edu/f1_top3_prediction',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'sc5558@columbia.edu',
 'mlflow.ownerId': '5453346838374231',
 'mlflow.sharedViewState.12f288d35f0de0c5ef50898a0fddef952aa42b51bba390e3fa1d0797fde9a2ec': 'deflate;eJxdUl1v2zAM/CuFnoOie/Wbm6ZrsaQonCwoEAyJIjE2AVkyRKqLW/S/j/5onfWRxzseddS7ItDRVPfoGKLKlJqpEC3E2/YXtFJr5ojHxEDXxDrynrGGiZSTUdlJO4KZ6vubrp2pfLkUksMTmNY4+BqfG8bXTm81awKmz87uz0zVwYLbQiQMflI4d1UkT6IhcGAY7Dy4VAuS7S7XO6xDigYOQrxEV91Q+o7eDfYHJbZRpi/OjfYWrMrePwbkGb3v6t3IeEBrwU/1FgmP6JDblW4m2UDrTGX3+8divdn/uNkXv5/WssErwt+VPmONb93oMTeRLZH402BEkXJjJGXJooA+7QtJIvgZQ2rAbrVLQI9+Xkn4EgnHJP0KLSzqhtv/4bLT3MpZfXJuLL89XScOBZwiULXw+uguTEsXjt

In [0]:
## Step 2: Load and Join Datasets

# Read the four raw F1 tables as Spark DataFrames (header row, inferred schema).
# NOTE(review): none of these Spark DataFrames is referenced by the code visible in
# this notebook; preprocess_data() re-reads the same files with pandas.
results, qual, races, drivers = [
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (
        "s3://columbia-gr5069-main/raw/results.csv",
        "s3://columbia-gr5069-main/raw/qualifying.csv",
        "s3://columbia-gr5069-main/raw/races.csv",
        "s3://columbia-gr5069-main/raw/drivers.csv",
    )
]
def preprocess_data(base_path="s3://columbia-gr5069-main/raw", include_points=True):
    """
    Load and merge the F1 CSV files, build the target, and return the modelling frame.

    Parameters
    ----------
    base_path : str, optional
        Directory or URL prefix that contains ``results.csv``, ``qualifying.csv``,
        ``races.csv`` and ``drivers.csv``. Defaults to the original S3 location.
    include_points : bool, optional
        Whether to keep ``points`` as a feature (default ``True``, the original
        behaviour).
        NOTE(review): ``points`` is described below as "points earned", which is
        awarded from the finishing position that also defines ``finish_top3`` -
        this looks like target leakage; pass ``False`` to exclude it and confirm.

    Returns
    -------
    pandas.DataFrame
        Rows without missing values, containing the selected feature columns, the
        binary ``finish_top3`` target, and (when present) one-hot encoded
        ``nationality_*`` columns with the first category dropped.
    """
    # Normalise the prefix so both "dir" and "dir/" produce the same file paths
    root = str(base_path).rstrip('/')

    # Load the datasets
    results = pd.read_csv(f"{root}/results.csv")
    qualifying = pd.read_csv(f"{root}/qualifying.csv")
    races = pd.read_csv(f"{root}/races.csv")
    drivers = pd.read_csv(f"{root}/drivers.csv")

    # Merge results with race info (using raceId)
    df = results.merge(races[['raceId', 'year', 'round']], on='raceId', how='left')

    # Merge driver info (using driverId); nationality is optional
    driver_cols = ['driverId']
    if 'nationality' in drivers.columns:
        driver_cols.append('nationality')
    df = df.merge(drivers[driver_cols], on='driverId', how='left')

    # Merge qualifying data, renaming 'position' so it is recognisable as the
    # qualifying position once joined onto the results
    if 'position' in qualifying.columns:
        qualifying = qualifying.rename(columns={'position': 'qualifying_position'})
        df = df.merge(qualifying[['raceId', 'driverId', 'qualifying_position']],
                      on=['raceId', 'driverId'], how='left')

    # Create target variable: finish_top3 is 1 if positionOrder <= 3, else 0
    df['finish_top3'] = np.where(df['positionOrder'] <= 3, 1, 0)

    # Select features to use - these choices are illustrative
    # Features: starting grid, points earned (optional), year, round,
    # qualifying position, and nationality (if available)
    if include_points:
        feature_cols = ['grid', 'points', 'year', 'round']
    else:
        feature_cols = ['grid', 'year', 'round']
    if 'qualifying_position' in df.columns:
        feature_cols.append('qualifying_position')
    if 'nationality' in df.columns:
        feature_cols.append('nationality')

    df = df[feature_cols + ['finish_top3']]

    # Drop rows with missing values
    df = df.dropna()

    # One-hot encode 'nationality' whenever it is not already numeric. Checking for
    # "not numeric" rather than dtype == 'object' also covers pandas string dtypes,
    # which would otherwise reach the model as raw text.
    if 'nationality' in df.columns and not pd.api.types.is_numeric_dtype(df['nationality']):
        df = pd.get_dummies(df, columns=['nationality'], drop_first=True)

    return df

def plot_confusion_matrix(cm, labels, filename):
    """
    Draw a confusion matrix as an annotated heatmap and save it to ``filename``.

    Uses matplotlib only: the notebook never imports seaborn, so the previous
    ``sns.heatmap`` call raised ``NameError`` on a fresh kernel.

    Parameters
    ----------
    cm : array-like of int, shape (n_classes, n_classes)
        Confusion matrix; rows are drawn as "Actual", columns as "Predicted".
    labels : sequence of str
        Tick labels, used for both axes.
    filename : str
        Path the figure is written to.
    """
    cm = np.asarray(cm)
    tick_positions = np.arange(len(labels))

    fig, ax = plt.subplots(figsize=(6, 4))
    image = ax.imshow(cm, cmap='Blues')
    fig.colorbar(image, ax=ax)

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    # Write each count into its cell; switch to white text on dark cells so the
    # number stays readable.
    threshold = cm.max() / 2 if cm.size else 0
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            ax.text(col, row, format(cm[row, col], 'd'),
                    ha='center', va='center',
                    color='white' if cm[row, col] > threshold else 'black')

    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    fig.savefig(filename)
    # Close this specific figure so repeated calls in a loop do not accumulate figures
    plt.close(fig)

def plot_feature_importances(importances, feature_names, filename):
    """
    Draw a horizontal bar chart of feature importances and save it to ``filename``.

    Bars are ordered by ascending importance, so the largest bar is at the top.

    Parameters
    ----------
    importances : array-like of float
        One importance value per feature. Lists are accepted as well as NumPy
        arrays (the value is coerced with ``np.asarray`` before fancy indexing).
    feature_names : sequence of str
        Names aligned position-by-position with ``importances``.
    filename : str
        Path the figure is written to.
    """
    importances = np.asarray(importances)
    order = np.argsort(importances)
    bar_positions = np.arange(len(importances))

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(bar_positions, importances[order])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([feature_names[i] for i in order])
    ax.set_xlabel("Feature Importance")
    ax.set_title("Feature Importances from Random Forest")
    fig.tight_layout()
    fig.savefig(filename)
    # Close this specific figure so repeated calls in a loop do not accumulate figures
    plt.close(fig)


In [0]:
%pip install fsspec s3fs


In [0]:
def main():
    """
    Train ten Random Forest configurations and track each one as an MLflow run.

    For every hyperparameter set this logs the parameters, five test-set metrics
    (accuracy, F1, precision, recall, AUC), the fitted model, and three artifacts
    (confusion-matrix plot, feature-importance plot, predictions CSV). The run with
    the highest F1 score is printed at the end.

    Artifact files are written to a per-run temporary directory instead of the
    working directory, so nothing is left behind or overwritten between runs; the
    artifact names stored in MLflow are unchanged.

    NOTE(review): the "best" run is chosen by F1 on the same test split that is
    reported, so the reported score of the winner is optimistically biased.
    """
    import tempfile  # local import: not part of the notebook's import cell

    # Set the MLflow experiment (this is your designated experiment path)
    mlflow.set_experiment("/Users/sc5558@columbia.edu/f1_top3_prediction")

    # Preprocess and merge the data
    df = preprocess_data()

    # Separate features and target variable
    X = df.drop('finish_top3', axis=1)
    y = df['finish_top3']

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define hyperparameter combinations for Random Forest (10 experiments)
    experiment_params = [
        {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'},
        {'n_estimators': 150, 'max_depth': 10,   'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'},
        {'n_estimators': 100, 'max_depth': 20,   'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'},
        {'n_estimators': 200, 'max_depth': 15,   'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'},
        {'n_estimators': 200, 'max_depth': 10,   'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'},
        {'n_estimators': 250, 'max_depth': 20,   'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2'},
        {'n_estimators': 150, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None},
        {'n_estimators': 300, 'max_depth': 15,   'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'},
        {'n_estimators': 100, 'max_depth': 30,   'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None},
        {'n_estimators': 200, 'max_depth': 25,   'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
    ]

    best_f1 = -1
    best_run_id = None
    best_params = None

    # Loop over parameter sets and run experiments with MLflow tracking
    for params in experiment_params:
        # The temporary directory is entered after the run and therefore removed
        # before the run is closed, once every artifact has been logged.
        with mlflow.start_run() as run, tempfile.TemporaryDirectory() as artifact_dir:
            # Log hyperparameters
            mlflow.log_params(params)

            # Build and train the Random Forest model
            model = RandomForestClassifier(**params, random_state=42)
            model.fit(X_train, y_train)

            # Generate predictions and positive-class probabilities
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1]

            # Calculate and log metrics in a single call
            f1 = f1_score(y_test, y_pred)
            mlflow.log_metrics({
                "accuracy": accuracy_score(y_test, y_pred),
                "f1_score": f1,
                "precision": precision_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "auc": roc_auc_score(y_test, y_proba),
            })

            # Log the trained model
            mlflow.sklearn.log_model(model, "model")

            # Artifact 1: Confusion Matrix Plot
            cm = confusion_matrix(y_test, y_pred)
            cm_filename = os.path.join(artifact_dir, "confusion_matrix.png")
            plot_confusion_matrix(cm, labels=["0", "1"], filename=cm_filename)
            mlflow.log_artifact(cm_filename)

            # Artifact 2: Feature Importance Plot
            importance = model.feature_importances_
            fi_filename = os.path.join(artifact_dir, "feature_importances.png")
            plot_feature_importances(importance, X.columns.tolist(), filename=fi_filename)
            mlflow.log_artifact(fi_filename)

            # Optional: Log predictions CSV as an additional artifact
            preds_df = X_test.copy()
            preds_df["y_true"] = y_test
            preds_df["y_pred"] = y_pred
            preds_filename = os.path.join(artifact_dir, "predictions.csv")
            preds_df.to_csv(preds_filename, index=False)
            mlflow.log_artifact(preds_filename)

            # Update best run based on F1 score
            if f1 > best_f1:
                best_f1 = f1
                best_run_id = run.info.run_id
                best_params = params

            print(f"Completed Run {run.info.run_id} - F1 Score: {f1}")

    # Print the best model run details
    print("\nBest Run Details:")
    print("Run ID:", best_run_id)
    print("Hyperparameters:", best_params)
    print("F1 Score:", best_f1)

if __name__ == '__main__':
    main()

Selected Best Run
After reviewing the MLflow logs from all 10 experiments, colorful-ray-398 is the best run. 

And here are the key performance metrics logged for this run:

F1 Score: 0.998

Accuracy: 1.0

Precision: 0.997

Recall: 1.0

AUC: 1.0


Its F1 score is the highest among the runs we consider realistic (that is, excluding the run that reports perfect scores on every metric), meaning it strikes the best balance between precision and recall.

An AUC of 1.0 shows perfect ranking capability, meaning it can fully distinguish Top 3 finishers from the rest.

Precision of 0.997 means almost all predicted Top 3s are correct.

Recall of 1.0 means it finds every real Top 3 driver.

Accuracy of 1.0 (a rounded value, since a precision below 1 implies at least one false positive) confirms strong performance across both classes.


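Although "suave-elk-377" reports perfect performance across all metrics, such results are unrealistic and indicate potential overfitting due to unlimited depth and full feature usage. In contrast, "colorful-ray-398" achieved near-perfect metrics with more reasonable parameters, suggesting better generalization and robustness. Therefore, we select "colorful-ray-398" as our best model. One caveat: because these metrics are computed on the held-out test set, near-perfect scores across runs may instead point to target leakage — the `points` feature is awarded based on finishing position, which also defines the Top 3 target — so the comparison should be re-checked with `points` removed from the feature set.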