In [0]:
pip install mlflow==2.11.4

In [0]:
import pandas as pd
import numpy as np
import boto3
import io
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
from sklearn.model_selection import train_test_split


# Bucket that holds the raw CSV files, and the client used to fetch them
bucket = "columbia-gr5069-main"
s3 = boto3.client('s3')

# Define a helper to read S3 CSV
def read_s3_csv(key):
    """Download the object stored under `key` in the module-level `bucket` and parse it as CSV.

    Args:
        key: Object key inside `bucket`, e.g. "raw/results.csv".

    Returns:
        A pandas DataFrame built from the object's bytes.
    """
    response = s3.get_object(Bucket=bucket, Key=key)
    payload = response['Body'].read()
    buffer = io.BytesIO(payload)
    return pd.read_csv(buffer)

# Load datasets from S3
# rank, fastest lap time and speed
results = read_s3_csv(key="raw/results.csv")
# dates and names of all the races (contains a lot of empty data)
races = read_s3_csv(key="raw/races.csv")
# driver names and nationalities
drivers = read_s3_csv(key="raw/drivers.csv")
# per race and driver: lap time, number of laps and time of the fastest lap
lap_times = read_s3_csv(key="raw/lap_times.csv")
# time spent at the pit stop in each lap
pit_stops = read_s3_csv(key="raw/pit_stops.csv")
# time taken in each round; drivers who do not finish or are disqualified show up as \N
qualifying = read_s3_csv(key="raw/qualifying.csv")

### Inspect each dataset to see what it contains

In [0]:
# Preview the results table (rank, fastest lap time and speed)
display(results)

In [0]:
# Preview the races table (race dates and names)
display(races)

In [0]:
# Preview the drivers table (driver names and nationalities)
display(drivers)

In [0]:
# Preview the lap_times table (lap times per race and driver)
display(lap_times)


In [0]:
# Preview the pit_stops table (time spent at each pit stop)
display(pit_stops)

In [0]:
# Preview the qualifying table (time taken in each qualifying round)
display(qualifying)

## Preparing dataset by combining results with pitstops

In [0]:
%python
# Attach to every pit stop the race result of the same driver in the same race.
#
# A plain `merge` on the shared key columns replaces the former
# reset_index() / set_index() / join() chain. That chain matched the same rows
# but also carried a meaningless `index` column (the old row numbers of
# `results`) into the output. Non-key columns that exist in both frames
# (e.g. `milliseconds`) receive the `_pitstop` / `_result` suffixes.
# The merged frame gets a fresh 0..n-1 index; row order follows `pit_stops`.
pitstop_results_df = pit_stops.merge(
    results,
    on=['raceId', 'driverId'],
    how='inner',
    suffixes=('_pitstop', '_result'),
)

display(pitstop_results_df)

In [0]:
# List every column available in the joined dataset.
# Stored under its own name so that `df` is only ever used for DataFrames.
joined_columns = pitstop_results_df.columns
print(joined_columns)

positionOrder: Final race classification (e.g., 1 = winner).

rank: Ranking of the driver's fastest lap in the race (e.g., 1 = fastest lap overall).

fastestLap: Lap number where the driver set their fastest lap.

## Objective: predict the final position order (`positionOrder`) using `pitstop_results_df`

# Feature Selection
## Using relevant columns from the dataset:

### Pre-Race Features:
grid (starting position), constructorId (team), driverId (driver skill).

### In-Race Features:
laps (completed laps), statusId (DNF flag), fastestLapSpeed, fastestLapTime.

### Pit Stop Features:
stop (number of pit stops), milliseconds_pitstop (total pit time).

In [0]:
# Select features for modeling
# Select features (plus the target, positionOrder) for modeling.
# `.copy()` makes model_data an independent frame: later cells add and overwrite
# columns on it, which on a plain slice of pitstop_results_df raises
# SettingWithCopyWarning and may not behave as intended.
model_data = pitstop_results_df[['grid','constructorId','raceId','driverId','laps','statusId','fastestLapTime', 'fastestLapSpeed','stop','milliseconds_pitstop','rank','positionOrder']].copy()
                            

In [0]:
# Preview the columns selected for modeling
display(model_data)

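I want to distinguish drivers who finished the race from those who did not. `statusId = 1` is the status I treat as "finished", so every other status is flagged as a DNF (did not finish) in a new `is_DNF` column.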

In [0]:
# is_DNF = 1 when the driver did NOT finish (statusId != 1), 0 when the driver finished.
# `assign` returns a new frame, so nothing is written into a slice of pitstop_results_df.
model_data = model_data.assign(is_DNF=(model_data['statusId'] != 1).astype(int))

In [0]:
# Look at the frame again now that the is_DNF column has been added
display(model_data)

In [0]:
# Inspect the dtype of every column and how many NaN values each one holds.
# Note: '\N' placeholder strings are not NaN, so they are not counted here.
column_types = model_data.dtypes
missing_per_column = model_data.isna().sum()
print(column_types)
print(missing_per_column)

# Random Forest

In [0]:
%python
# Step 1: Columns that may contain the '\N' missing-value placeholder
cols_with_N = [
    'grid','constructorId','raceId','driverId','laps','statusId','fastestLapTime', 'fastestLapSpeed','stop','milliseconds_pitstop','rank','positionOrder'
]
cols_existing = [col for col in cols_with_N if col in model_data.columns]


def _numeric_if_possible(column):
    """Return `column` with a numeric dtype, or unchanged when some value is not a number."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Step 2: Replace the placeholder with NaN on a copy, so model_data itself is
# left untouched and re-running this cell always starts from the same input.
# Replacing alone leaves string-typed columns as `object`, so columns holding
# numbers-as-text would be silently thrown away by the numeric filter in Step 3.
# Converting them here keeps them as features.
model_data_clean = model_data.copy()
for col in cols_existing:
    model_data_clean[col] = _numeric_if_possible(
        model_data_clean[col].replace('\\N', np.nan)
    )

# Step 3: Drop the columns that are still non-numeric after conversion.
# NOTE(review): fastestLapTime presumably holds "m:ss.mmm" strings and is therefore
# still dropped here; parse it to seconds if it should be a feature. TODO confirm format.
df = model_data_clean.select_dtypes(include=[np.number])

# Step 4: Drop remaining rows with missing data
df = df.dropna()

In [0]:

# Model prep

# Step 5: Define features and target
X = df.drop(columns=["positionOrder"])
y = df["positionOrder"]

# Step 6: Train-test split (seeded so the split is reproducible)
# NOTE(review): rows appear to be one per pit stop, so several rows can share the
# same driver, race and target; a random row split may put them on both sides.
# Consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 7: Train a baseline model and log it with MLflow.
# train_test_split, RandomForestRegressor, mean_squared_error and mlflow are
# already imported in the notebook's import cell, so they are not re-imported here.
with mlflow.start_run(run_name="Basic RF Experiment") as run:
    # Seed the forest as well, otherwise every execution logs a different mse
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)

    # Log model
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Metrics
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)
    print(f"  mse: {mse}")

    # run_id replaces the deprecated run_uuid attribute (same value).
    # experimentID is reused by the later parameterised runs.
    runID = run.info.run_id
    experimentID = run.info.experiment_id

    print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))

In [0]:
# NOTE(review): this cell repeats the baseline experiment of the previous cell and
# logs a second MLflow run under the same name; consider deleting one of the two.
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

with mlflow.start_run(run_name="Basic RF Experiment") as run:
  # Create model, train it, and create predictions.
  # The forest is seeded so repeated executions log the same mse.
  rf = RandomForestRegressor(random_state=42)
  rf.fit(X_train, y_train)
  predictions = rf.predict(X_test)

  # Log model
  mlflow.sklearn.log_model(rf, "random-forest-model")

  # Create metrics
  mse = mean_squared_error(y_test, predictions)
  print("  mse: {}".format(mse))

  # Log metrics
  mlflow.log_metric("mse", mse)

  # run_id replaces the deprecated run_uuid attribute (same value)
  runID = run.info.run_id
  experimentID = run.info.experiment_id

  print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))

In [0]:
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
  """Train a RandomForestRegressor and record it in a single MLflow run.

  The run stores the fitted model, the hyperparameters in `params`, the test-set
  mse / mae / r2, a CSV of feature importances and a residual plot.

  Args:
    experimentID: MLflow experiment id the run is created under.
    run_name: Display name of the MLflow run.
    params: Keyword arguments forwarded to RandomForestRegressor; each one is
      also logged as an MLflow param.
    X_train, y_train: Training features and target.
    X_test, y_test: Held-out features and target used for the metrics and plot.

  Returns:
    The id of the MLflow run that was created.
  """
  import matplotlib.pyplot as plt
  import mlflow.sklearn
  import pandas as pd
  import seaborn as sns
  from sklearn.ensemble import RandomForestRegressor
  from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
  import tempfile

  with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
    # Create model, train it, and create predictions
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)

    # Log model
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Log params
    for param, value in params.items():
      mlflow.log_param(param, value)

    # Create metrics
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print("  mse: {}".format(mse))
    print("  mae: {}".format(mae))
    print("  R2: {}".format(r2))

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Create feature importance.
    # The names must come from the data the model was fitted on. Pairing the
    # importances with the columns of the global `df` (which still contains the
    # target) shifts the labels and attributes importance to the wrong feature.
    feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
      # X_train is not a DataFrame (e.g. a NumPy array): fall back to positional names
      feature_names = ["feature_{}".format(i) for i in range(len(rf.feature_importances_))]
    importance = pd.DataFrame(
      {"Feature": list(feature_names), "Importance": rf.feature_importances_}
    ).sort_values("Importance", ascending=False)

    # Log importances using a temporary file (removed when the `with` block exits)
    with tempfile.NamedTemporaryFile(prefix="feature-importance-", suffix=".csv") as temp:
      importance.to_csv(temp.name, index=False)
      mlflow.log_artifact(temp.name, "feature-importance.csv")

    # Create plot
    fig, ax = plt.subplots()
    sns.residplot(x=predictions, y=y_test, lowess=True, ax=ax)
    ax.set_xlabel("Predicted values for PositionOrder")
    ax.set_ylabel("Residual")
    ax.set_title("Residual Plot")

    # Log residuals using a temporary file (removed when the `with` block exits)
    with tempfile.NamedTemporaryFile(prefix="residuals-", suffix=".png") as temp:
      fig.savefig(temp.name)
      mlflow.log_artifact(temp.name, "residuals.png")

    display(fig)
    # Release the figure: this function is called many times and open figures accumulate
    plt.close(fig)

    # run_id replaces the deprecated run_uuid attribute (same value)
    return run.info.run_id

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC Run with new parameters.

# COMMAND ----------

# (n_estimators, max_depth) for runs 1..10, in execution order.
# One loop replaces ten copy-pasted cells; run names and parameters are the same
# as before, and every run uses random_state=42.
rf_param_grid = [
  (100, 5), (200, 5), (300, 5),
  (100, 10), (200, 10), (300, 10),
  (100, 15), (200, 15), (300, 15),
  (500, 20),
]

# Maps each run name to the MLflow run id returned by log_rf
run_ids = {}
for run_number, (n_estimators, max_depth) in enumerate(rf_param_grid, start=1):
  params = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "random_state": 42
  }
  run_name = "Run {}: {} Estimators, Depth {}".format(run_number, n_estimators, max_depth)
  run_ids[run_name] = log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test)

run_ids

# COMMAND ----------


# Interpretation:

The best model appears to be the last one (Run 10). Its R² is about the same as that of Runs 8 and 9 (roughly 0.9, i.e. about 90% of the variance explained), but it has the lowest MSE and MAE of all the runs.

