In [0]:
# Import MLflow and echo the installed version as this cell's output.
import mlflow
mlflow.__version__


In [0]:
# Databricks notebook source
# MAGIC %md-sandbox
# MAGIC <div><img src="https://files.training.databricks.com/images/eLearning/ML-Part-4/mlflow-tracking.png" style="height: 400px; margin: 20px"/></div>

# COMMAND ----------

# MAGIC %md
# MAGIC Import the F1 race results dataset (`raw/results.csv`) from S3 and featurize the data.  We'll use this to train a model.

# COMMAND ----------

import boto3
import pandas as pd

# COMMAND ----------
# Where the raw results CSV lives in S3.
bucket, key = "columbia-gr5069-main", "raw/results.csv"

# Fetch the object and let pandas parse the response body directly.
# The literal token \N in the file is read as a missing value.
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)
df = pd.read_csv(obj["Body"], na_values='\\N')

display(df)

In [0]:
from sklearn.model_selection import train_test_split

# Numeric columns kept for modelling; "positionOrder" is the prediction target.
selected_cols = ["grid", "laps", "number", "statusId", "positionOrder"]

# Restrict to those columns and discard any row with a missing value.
df = df.loc[:, selected_cols].dropna()

# Target vector and feature matrix (every selected column except the target).
y = df["positionOrder"]
X = df.drop(columns="positionOrder")

# Train/test partition with a fixed seed (train_test_split's default split sizes).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [0]:
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Train a random forest regressor and record it as one MLflow run.

    Inside a single ``mlflow.start_run`` context this fits a
    ``RandomForestRegressor(**params)`` on the training split, predicts on the
    test split, and logs:

    * the fitted model under ``random-forest-model``,
    * every entry of ``params`` as a run parameter,
    * the test-split metrics ``mae``, ``mse`` and ``r2``,
    * ``residuals.png`` - a 30-bin histogram of ``y_test - predictions``,
    * ``feature-importance.csv`` - feature importances, largest first.

    Args:
        experimentID: MLflow experiment ID the run is created under.
        run_name: Display name given to the run.
        params: Keyword arguments for ``RandomForestRegressor``; also logged
            as the run's parameters.
        X_train: Training features. Must expose ``.columns`` (for example a
            pandas DataFrame); the column labels name the features in the
            importance table.
        X_test: Test features passed to ``model.predict``.
        y_train: Training target.
        y_test: Test target used for the metrics and the residuals.

    Returns:
        The ID of the MLflow run that was created.
    """
    import os
    import tempfile

    import matplotlib.pyplot as plt
    import mlflow.sklearn
    import pandas as pd
    import seaborn as sns
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Model, hyper-parameters and test-split metrics.
        mlflow.sklearn.log_model(model, "random-forest-model")
        mlflow.log_params(params)
        mlflow.log_metrics({
            "mae": mean_absolute_error(y_test, predictions),
            "mse": mean_squared_error(y_test, predictions),
            "r2": r2_score(y_test, predictions),
        })

        # log_artifact() keeps the local file's base name, and its second
        # argument is a destination *directory*. So each artifact is written
        # under its intended name inside a scratch directory, which is removed
        # on exit even if logging raises.
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Artifact 1: histogram of the residuals.
            residuals = y_test - predictions
            residuals_path = os.path.join(artifact_dir, "residuals.png")
            fig, ax = plt.subplots()
            try:
                sns.histplot(residuals, bins=30, ax=ax)
                ax.set_title("Residuals")
                fig.savefig(residuals_path)
            finally:
                # Close the figure so repeated runs do not accumulate open figures.
                plt.close(fig)
            mlflow.log_artifact(residuals_path)

            # Artifact 2: feature importances. Names come from the X_train
            # argument rather than from a notebook-level variable.
            importance = pd.DataFrame({
                "feature": X_train.columns,
                "importance": model.feature_importances_,
            }).sort_values(by="importance", ascending=False)
            importance_path = os.path.join(artifact_dir, "feature-importance.csv")
            importance.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        return run.info.run_id


In [0]:
import mlflow
# Reuse the experiment of an already-active run if there is one; otherwise
# select the named experiment and take its ID.
active_run = mlflow.active_run()
if active_run:
    experimentID = active_run.info.experiment_id
else:
    experimentID = mlflow.set_experiment("/Users/ht2668@columbia.edu/F1-Prediction").experiment_id

# 10 param experiments: n_estimators 100..1000 in steps of 100, paired with
# max_depth 3..12, all sharing the same seed.
param_list = [
    {"n_estimators": 100 * step, "max_depth": step + 2, "random_state": 42}
    for step in range(1, 11)
]

# One MLflow run per configuration, named "Run 1" through "Run 10".
for run_number, params in enumerate(param_list, start=1):
    log_rf(experimentID, f"Run {run_number}", params, X_train, X_test, y_train, y_test)



Best Model Selection
Among the 10 experiments, **Run 10** achieved the best overall performance with:

- **R² = 0.827**, the highest proportion of variance explained among the 10 runs
- **MSE = 10.24**, the lowest mean squared error
- **MAE = 2.45**, also the lowest mean absolute error

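This suggests that the model trained with `n_estimators=1000` and `max_depth=12` produced the most accurate predictions on the held-out test set. Therefore, Run 10 is selected as the best model. Note that Run 10 is also the largest configuration tried, so larger or deeper forests may improve these metrics further.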
