In [0]:
pip install mlflow

In [0]:
# import relevant functions
import pandas as pd
from sklearn.model_selection import train_test_split
import boto3
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from mlflow import search_runs

In [0]:
# import data
s3 = boto3.client('s3')
bucket = "columbia-gr5069-main"
drivers_data = "raw/drivers.csv"
races_data = "raw/races.csv"
results_data = "raw/results.csv"

drivers = s3.get_object(Bucket= bucket, Key= drivers_data) 
races = s3.get_object(Bucket= bucket, Key= races_data) 
results = s3.get_object(Bucket= bucket, Key= results_data) 

df_drivers = pd.read_csv(drivers["Body"])
df_races = pd.read_csv(races["Body"])
df_results = pd.read_csv(results["Body"])

In [0]:
display(df_drivers)

In [0]:
display(df_races)

In [0]:
display(df_results)

In [0]:
# merge dataframes 
drivers_results = pd.merge(df_results, df_drivers, on="driverId", how="left")
drivers_results = pd.merge(drivers_results, df_races, on="raceId", how="left")


In [0]:
# calculate age of drivers at time of race
drivers_results['dob'] = pd.to_datetime(drivers_results['dob'])
drivers_results['date'] = pd.to_datetime(drivers_results['date'])
drivers_results['age'] = (drivers_results['date'].dt.year - drivers_results['dob'].dt.year) - ((
    drivers_results['date'].dt.year - drivers_results['dob'].dt.month) < 0)

In [0]:
# one-hot encode nationality
drivers_results = pd.get_dummies(drivers_results, columns=["nationality"])
display(drivers_results)

In [0]:
# display final dataframe
df = drivers_results[["raceId", "driverId", "positionOrder", "age", 
                         "nationality_American", 
                         "nationality_American-Italian", 
                         "nationality_Argentine", 
                         "nationality_Argentine-Italian", 
                         "nationality_Australian", 
                         "nationality_Austrian", 
                         "nationality_Belgian", 
                         "nationality_Brazilian", 
                         "nationality_British", 
                         "nationality_Canadian", 
                         "nationality_Chilean", 
                         "nationality_Chinese", 
                         "nationality_Colombian", 
                         "nationality_Czech", 
                         "nationality_Danish", 
                         "nationality_Dutch", 
                         "nationality_East German", 
                         "nationality_Finnish", 
                         "nationality_French", 
                         "nationality_German", 
                         "nationality_Hungarian", 
                         "nationality_Indian", 
                         "nationality_Indonesian", 
                         "nationality_Irish", 
                         "nationality_Italian", 
                         "nationality_Japanese", 
                         "nationality_Liechtensteiner", 
                         "nationality_Malaysian", 
                         "nationality_Mexican", 
                         "nationality_Monegasque", 
                         "nationality_New Zealander", 
                         "nationality_Polish", 
                         "nationality_Portuguese", 
                         "nationality_Rhodesian", 
                         "nationality_Russian", 
                         "nationality_South African", 
                         "nationality_Spanish", 
                         "nationality_Swedish", 
                         "nationality_Swiss", 
                         "nationality_Thai", 
                         "nationality_Uruguayan", 
                         "nationality_Venezuelan"]]
display(df)

In [0]:
# split data into training and test subsets
y = df["positionOrder"]
X = df.loc[:,["age",
                "nationality_American", 
                "nationality_American-Italian", 
                "nationality_Argentine", 
                "nationality_Argentine-Italian", 
                "nationality_Australian", 
                "nationality_Austrian", 
                "nationality_Belgian", 
                "nationality_Brazilian", 
                "nationality_British", 
                "nationality_Canadian", 
                "nationality_Chilean", 
                "nationality_Chinese", 
                "nationality_Colombian", 
                "nationality_Czech", 
                "nationality_Danish", 
                "nationality_Dutch", 
                "nationality_East German", 
                "nationality_Finnish", 
                "nationality_French", 
                "nationality_German", 
                "nationality_Hungarian", 
                "nationality_Indian", 
                "nationality_Indonesian", 
                "nationality_Irish", 
                "nationality_Italian", 
                "nationality_Japanese", 
                "nationality_Liechtensteiner", 
                "nationality_Malaysian", 
                "nationality_Mexican", 
                "nationality_Monegasque", 
                "nationality_New Zealander", 
                "nationality_Polish", 
                "nationality_Portuguese", 
                "nationality_Rhodesian", 
                "nationality_Russian", 
                "nationality_South African", 
                "nationality_Spanish", 
                "nationality_Swedish", 
                "nationality_Swiss", 
                "nationality_Thai", 
                "nationality_Uruguayan", 
                "nationality_Venezuelan"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=15)

In [0]:
with mlflow.start_run(run_name="Basic RF Experiment") as run:
  # Create model, train it, and create predictions
  rf = RandomForestRegressor()
  rf.fit(X_train, y_train)
  predictions = rf.predict(X_test)
  
  # Log model
  mlflow.sklearn.log_model(rf, "random-forest-model")
  
  # Create metrics
  mse = mean_squared_error(y_test, predictions)
  print("  mse: {}".format(mse))
  
  # Log metrics
  mlflow.log_metric("mse", mse)
  
  runID = run.info.run_uuid
  experimentID = run.info.experiment_id
  
  print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))

In [0]:
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
  import os
  import matplotlib.pyplot as plt
  import mlflow.sklearn
  import seaborn as sns
  from sklearn.ensemble import RandomForestRegressor
  from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, mean_absolute_percentage_error, mean_squared_log_error
  import tempfile

  with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
    # Create model, train it, and create predictions
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)

    # Log model
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Log params
    [mlflow.log_param(param, value) for param, value in params.items()]

    # Create metrics
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)
    evs = explained_variance_score(y_test, predictions)
    msle = mean_squared_log_error(y_test, predictions)
    print("  mse: {}".format(mse))
    print("  mae: {}".format(mae))
    print("  rmse: {}".format(rmse))
    print("  R2: {}".format(r2))
    print("  mape: {}".format(mape))
    print("  evs: {}".format(evs))
    print("  msle: {}".format(msle))

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)  
    mlflow.log_metric("r2", r2) 
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mape", mape)  
    mlflow.log_metric("evs", evs)
    mlflow.log_metric("msle", msle)  
    
    # Create feature importance
    importance = pd.DataFrame(list(zip(df.columns, rf.feature_importances_)), 
                                columns=["Feature", "Importance"]
                              ).sort_values("Importance", ascending=False)
    
    # Log importances using a temporary file
    temp = tempfile.NamedTemporaryFile(prefix="feature-importance-", suffix=".csv")
    temp_name = temp.name
    try:
      importance.to_csv(temp_name, index=False)
      mlflow.log_artifact(temp_name, "feature-importance.csv")
    finally:
      temp.close() # Delete the temp file
    
    # Create plot
    fig, ax = plt.subplots()

    sns.residplot(x=predictions, y=y_test, lowess=True)
    plt.xlabel("Predicted values position order")
    plt.ylabel("Residual")
    plt.title("Residual Plot")

    # Log residuals using a temporary file
    temp = tempfile.NamedTemporaryFile(prefix="residuals-", suffix=".png")
    temp_name = temp.name
    try:
      fig.savefig(temp_name)
      mlflow.log_artifact(temp_name, "residuals.png")
    finally:
      temp.close() # Delete the temp file

    # Log predictions  
    pred = pd.DataFrame({"Predictions": predictions, "Actual": y_test})
    pred_path = "predictions.csv"
    pred.to_csv(pred_path, index=False)
    mlflow.log_artifact(pred_path)
    
    return run.info.run_uuid

In [0]:
params = {
  "n_estimators": 100,
  "max_depth": 5,
  "random_state": 42
}

log_rf(experimentID, "exp1", params, X_train, X_test, y_train, y_test)

In [0]:
params_10_depth = {
  "n_estimators": 100,
  "max_depth": 10,
  "random_state": 42
}

log_rf(experimentID, "exp2", params_10_depth, X_train, X_test, y_train, y_test)

In [0]:
params_15_depth = {
  "n_estimators": 100,
  "max_depth": 15,
  "random_state": 42
}

log_rf(experimentID, "exp3", params_15_depth, X_train, X_test, y_train, y_test)

In [0]:
params_1000_trees = {
  "n_estimators": 1000,
  "max_depth": 5,
  "random_state": 42
}

log_rf(experimentID, "exp4", params_1000_trees, X_train, X_test, y_train, y_test)

In [0]:
params_1000_trees_10_depth = {
  "n_estimators": 1000,
  "max_depth": 10,
  "random_state": 42
}

log_rf(experimentID, "exp5", params_1000_trees_10_depth, X_train, X_test, y_train, y_test)

In [0]:
params_1000_trees_15_depth = {
  "n_estimators": 1000,
  "max_depth": 15,
  "random_state": 42
}

log_rf(experimentID, "exp6", params_1000_trees_15_depth, X_train, X_test, y_train, y_test)

In [0]:
params_2000_trees = {
  "n_estimators": 2000,
  "max_depth": 5,
  "random_state": 42
}

log_rf(experimentID, "exp7", params_2000_trees, X_train, X_test, y_train, y_test)

In [0]:
params_2000_trees_10_depth = {
  "n_estimators": 2000,
  "max_depth": 10,
  "random_state": 42
}

log_rf(experimentID, "exp8", params_2000_trees_10_depth, X_train, X_test, y_train, y_test)

In [0]:
params_2000_trees_15_depth = {
  "n_estimators": 2000,
  "max_depth": 15,
  "random_state": 42
}

log_rf(experimentID, "exp9", params_2000_trees_15_depth, X_train, X_test, y_train, y_test)

In [0]:
params_3000_trees = {
  "n_estimators": 3000,
  "max_depth": 5,
  "random_state": 42
}

log_rf(experimentID, "exp10", params_3000_trees, X_train, X_test, y_train, y_test)

In [0]:
runs = search_runs()
best_run = runs.loc[runs['metrics.r2'].idxmax()]

print(f"Best run name: {best_run['tags.mlflow.runName']}")
print(f"Best run ID: {best_run['run_id']}")
print(f"Best R2: {best_run['metrics.r2']}")

In [0]:
best_run2 = runs.loc[runs['metrics.rmse'].idxmin()]

print(f"Best run name: {best_run2['tags.mlflow.runName']}")
print(f"Best run ID: {best_run2['run_id']}")
print(f"Best MSE: {best_run2['metrics.rmse']}")

Experiment 9 is the the best model run because it has the best R^2 value of 0.077 which means 7.7% of the variance in the position order is explained by the drivers' nationality and age. It also has the lowest value for root mean squared error with a value of around 7 positions which shows that on average predicted values and actual values for positions had a difference of around 7 positions. While experiment 9 is the best run out of the 10 experiments, the models themselves are generally not very good predictors of position, showing that drivers' age and nationality are not great predictors of position.