In [0]:
%pip install mlflow pandas scikit-learn matplotlib seaborn
import os
import tempfile

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [0]:
# Databricks notebook source
# MAGIC %md-sandbox
# MAGIC <div><img src="https://files.training.databricks.com/images/eLearning/ML-Part-4/mlflow-tracking.png" style="height: 400px; margin: 20px"/></div>


# COMMAND ----------

# MAGIC %md
# MAGIC Import the `results.csv` dataset and featurize the data.  We'll use this to train a model.

# COMMAND ----------

import boto3
import pandas as pd

# COMMAND ----------

# S3 client handle (not used by the Spark read below, which goes through the s3:// URI).
s3 = boto3.client('s3')

# COMMAND ----------
# Read the CSV with its first line as column names; no schema is supplied or
# inferred here, so Spark loads every column as a string.
df_results = (
    spark.read
    .option("header", True)
    .csv('s3://columbia-gr5069-main/raw/results.csv')
)

display(df_results)


In [0]:
# Keep only the modelling columns, drop rows Spark considers null, and collect
# the result to the driver as a pandas DataFrame.
df_clean = (
    df_results
    .select("grid", "laps", "number", "resultId", "statusId", "positionOrder")
    .dropna()
    .toPandas()
)

# Target stays a single-column DataFrame; every other column is a feature.
y = df_clean[["positionOrder"]]
X = df_clean.drop(columns=["positionOrder"])

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [0]:
%python
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Clean the dataset: the literal string '\N' is treated as the missing-value
# marker, so convert it to NaN and drop the affected rows from each split.
X_train = X_train.replace('\\N', np.nan).dropna()
y_train = y_train.replace('\\N', np.nan).dropna()
X_test = X_test.replace('\\N', np.nan).dropna()
y_test = y_test.replace('\\N', np.nan).dropna()

# Ensure consistent lengths: features and target were filtered independently,
# so keep only the row labels that survived in both.
X_train, y_train = X_train.align(y_train, join='inner', axis=0)
X_test, y_test = X_test.align(y_test, join='inner', axis=0)

with mlflow.start_run(run_name="Basic RF Experiment") as run:
    # Create model, train it, and create predictions.
    # Seeded like the train/test split so the baseline metric is reproducible.
    rf = RandomForestRegressor(random_state=42)
    # y_train is a single-column DataFrame; pass it as 1-D so scikit-learn does
    # not have to reshape it (and warn) itself.
    rf.fit(X_train, np.ravel(y_train))
    predictions = rf.predict(X_test)

    # Log model
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Create metrics
    mse = mean_squared_error(y_test, predictions)
    print("  mse: {}".format(mse))

    # Log metrics
    mlflow.log_metric("mse", mse)

    # run_id is the current name of the deprecated run_uuid attribute.
    runID = run.info.run_id
    experimentID = run.info.experiment_id

    print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))

In [0]:
%python
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Train a random forest inside an MLflow run and log the model, params, metrics and artifacts.

    Args:
        experimentID: ID of the MLflow experiment the run is created under.
        run_name: Display name given to the MLflow run.
        params: Keyword arguments forwarded to ``RandomForestRegressor``; the
            same mapping is logged as the run's parameters.
        X_train: Training features. Must expose ``.columns``, which labels the
            bars of the feature-importance plot.
        X_test: Held-out features the predictions are made on.
        y_train: Training target.
        y_test: Held-out target the predictions are scored against.

    Returns:
        Tuple ``(run_id, r2)``: the MLflow run ID and the R2 score on the
        held-out data.
    """
    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        # Initialize the model with hyperparameters
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Log the model
        mlflow.sklearn.log_model(model, "random-forest-model")

        # Log hyperparameters
        mlflow.log_params(params)

        # Create metrics
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)

        print(f"  mse: {mse}")
        print(f"  mae: {mae}")
        print(f"  R2: {r2}")

        # Log metrics
        mlflow.log_metrics({"mse": mse, "mae": mae, "r2": r2})

        # Artifacts are staged in a throwaway directory so repeated runs do not
        # leave files behind in (or race on) the working directory. The file
        # names are unchanged, so the artifact names in MLflow are unchanged.
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Create and log a plot of feature importances
            plot_path = os.path.join(artifact_dir, "feature_importance_plot.png")
            fig, ax = plt.subplots(figsize=(6, 4))
            try:
                sns.barplot(x=X_train.columns, y=model.feature_importances_, ax=ax)
                ax.set_title('Feature Importance')
                fig.tight_layout()
                fig.savefig(plot_path)
            finally:
                # Close the figure so calling this in a loop does not accumulate
                # open figures.
                plt.close(fig)
            mlflow.log_artifact(plot_path)

            # Log predictions as CSV. Both columns are flattened to 1-D because
            # y_test may arrive as a single-column DataFrame, which cannot be
            # used directly as a column value.
            predictions_df = pd.DataFrame({
                'True Values': np.asarray(y_test).ravel(),
                'Predicted Values': np.asarray(predictions).ravel(),
            })
            csv_path = os.path.join(artifact_dir, "predictions.csv")
            predictions_df.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path)

        return run.info.run_id, r2

In [0]:
%python
# Example hyperparameters (you can tweak these for better tuning)
param_grid = [
    {"n_estimators": 30, "max_depth": 3, "random_state": 42},
    {"n_estimators": 80, "max_depth": 8, "random_state": 42},
    {"n_estimators": 120, "max_depth": 5, "random_state": 42},
    {"n_estimators": 170, "max_depth": 12, "random_state": 42},
    {"n_estimators": 60, "max_depth": 10, "random_state": 42},
    {"n_estimators": 150, "max_depth": 6, "random_state": 42},
    {"n_estimators": 90, "max_depth": 15, "random_state": 42},
    {"n_estimators": 110, "max_depth": 7, "random_state": 42},
    {"n_estimators": 40, "max_depth": 4, "random_state": 42},
    {"n_estimators": 100, "max_depth": 20, "random_state": 42}
]

# Assuming X_train, X_test, y_train, y_test are already defined.
# `experimentID` is intentionally not reassigned here: it already holds the
# experiment ID captured from the "Basic RF Experiment" run earlier in the
# notebook. Overwriting it with a hard-coded placeholder would point every
# run below at an experiment that does not exist.

best_run_uuid = None
best_r2 = -float('inf')  # Initialize the best R² value to negative infinity

# Define the log_rf function
# NOTE: this redefinition replaces any `log_rf` defined earlier in the notebook
# for every call made after this point.
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Train a random forest inside an MLflow run and log its params, R2 and predictions.

    Args:
        experimentID: ID of the MLflow experiment the run is created under.
        run_name: Display name given to the MLflow run.
        params: Keyword arguments forwarded to ``RandomForestRegressor``; all
            of them are logged as run parameters.
        X_train: Training features.
        X_test: Held-out features the predictions are made on.
        y_train: Training target.
        y_test: Held-out target the predictions are scored against.

    Returns:
        Tuple ``(run_id, r2)``: the MLflow run ID and the R2 score on the
        held-out data.
    """
    import os
    import tempfile

    import mlflow
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score

    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        # Log every hyperparameter that was actually used (not just two
        # hand-picked keys, which also raised KeyError when one was absent).
        mlflow.log_params(params)
        mlflow.log_metric("r2", r2)

        # Log predictions as CSV. pandas objects have no .flatten(), so go
        # through NumPy to get 1-D columns regardless of the input container.
        predictions_df = pd.DataFrame({
            'True Values': np.asarray(y_test).ravel(),
            'Predicted Values': np.asarray(y_pred).ravel(),
        })
        # Stage the file in a throwaway directory; the file name (and therefore
        # the artifact name in MLflow) is unchanged.
        with tempfile.TemporaryDirectory() as artifact_dir:
            csv_path = os.path.join(artifact_dir, "predictions.csv")
            predictions_df.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path)

        # run_id is the current name of the deprecated run_uuid attribute.
        return run.info.run_id, r2

# Run one experiment per hyperparameter set, remembering the best R² seen so far
for run_number, params in enumerate(param_grid, start=1):
    run_uuid, r2 = log_rf(
        experimentID, f"Run {run_number}", params, X_train, X_test, y_train, y_test
    )

    print(f"Run {run_number} finished with R²: {r2}. Run ID: {run_uuid}")

    # Strict '>' means the earliest run wins when scores tie
    if r2 > best_r2:
        best_r2, best_run_uuid = r2, run_uuid

print(f"\nBest model run UUID: {best_run_uuid} with R²: {best_r2}")