In [0]:
%pip install mlflow pandas scikit-learn matplotlib seaborn
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tempfile
import os

In [0]:
import boto3

# S3 client handle. Nothing in the visible cells uses it again; it is kept so
# that any later cell referring to `s3` keeps working.
s3 = boto3.client('s3')

# Read the raw results CSV from S3 via Spark. Only the header option is set
# (no schema is supplied and no schema inference is requested).
df_results = spark.read.csv(
    path='s3://columbia-gr5069-main/raw/results.csv',
    header=True,
)

# Show the loaded Spark DataFrame in the notebook output.
display(df_results)


In [0]:
# Feature/target preparation.
# Each modelling column is listed exactly once: repeating a name (e.g. "laps")
# in the selection would carry a duplicated feature column into X.
df_clean = df_results[["grid", "laps", "points", "statusId", "positionOrder"]]
# Drops rows containing real nulls only; literal "\N" placeholder strings are
# not nulls and are dealt with later by clean_data.
df_clean = df_clean.dropna()
# Collect the Spark DataFrame to the driver as a pandas DataFrame.
df_clean = df_clean.toPandas()

# Features are every selected column except the target.
X = df_clean.drop(["positionOrder"], axis=1)
# Target is kept as a one-column DataFrame (double brackets).
y = df_clean[["positionOrder"]]

# Train-test split (fixed seed for reproducibility; default split proportions)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [0]:
%python
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Replace the "\N" missing-value marker with NaN, then mean-impute every column.
def clean_data(df):
    """Return a cleaned copy of ``df`` with missing values mean-imputed.

    The literal string ``"\\N"`` is treated as a missing-value marker and
    replaced with NaN; a fresh ``SimpleImputer(strategy='mean')`` is then
    fitted on this frame and used to fill the gaps.

    Args:
        df: pandas DataFrame to clean. It is NOT modified; the replacement is
            done on a copy so the caller's frame (often a slice produced by
            train_test_split) stays intact.

    Returns:
        A new pandas DataFrame built from the imputer output, with the same
        column names as ``df`` and a fresh default index (the original index
        is not carried over).

    Note:
        The imputer is fitted on whatever frame is passed in, so every call
        uses that frame's own column means.
    """
    # Non-inplace replace: avoids mutating the caller's data and the
    # SettingWithCopy warnings that in-place edits on a slice can trigger.
    with_nan = df.replace("\\N", np.nan)
    imputer = SimpleImputer(strategy='mean')
    imputed_values = imputer.fit_transform(with_nan)
    return pd.DataFrame(imputed_values, columns=with_nan.columns)

# Requires X_train, X_test, y_train, y_test to exist already (train/test split).
# Each split is passed through clean_data in the order X_train, X_test,
# y_train, y_test and rebound to its cleaned version.
# NOTE(review): clean_data is invoked once per split, so every split is
# cleaned independently of the others.
X_train, X_test, y_train, y_test = (
    clean_data(split) for split in (X_train, X_test, y_train, y_test)
)

# Hyperparameter settings to try, one dict per run. Each entry pairs a
# number of trees with a maximum depth; every entry uses random_state 42.
param_grid = [
    {"n_estimators": n_trees, "max_depth": depth, "random_state": 42}
    for n_trees, depth in (
        (30, 3),
        (80, 8),
        (120, 5),
        (170, 12),
        (60, 10),
        (150, 6),
        (90, 15),
        (110, 7),
        (40, 4),
        (100, 20),
    )
]

# Train, evaluate and log one random-forest run; returns (run_id, R²).
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    """Fit a RandomForestRegressor and record the run in MLflow.

    Inside a single MLflow run this trains the model, scores it on the test
    split, and logs the hyperparameters, the metrics (MSE, R², MAE), the
    fitted model, a feature-importance bar plot and a CSV of true vs.
    predicted values.

    Args:
        experimentID: Passed to ``mlflow.start_run`` as ``experiment_id``.
        run_name: Passed to ``mlflow.start_run`` as ``run_name``.
        params: Mapping with the keys ``"n_estimators"``, ``"max_depth"`` and
            ``"random_state"``.
        X_train: Training features; must expose ``.columns`` (used as the
            labels of the feature-importance plot).
        X_test: Test features.
        y_train: Training target; a single-column frame or 1-D array.
        y_test: Test target; a single-column frame or 1-D array.

    Returns:
        tuple: ``(run_id, r2)`` — the MLflow run id and the R² on the test split.
    """
    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        # Initialize the model with parameters
        model = RandomForestRegressor(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=params["random_state"],
        )

        # Train the model. The target is flattened to 1-D so a one-column
        # frame does not trigger sklearn's column-vector conversion warning.
        model.fit(X_train, np.ravel(y_train))

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        # Log hyperparameters
        mlflow.log_param("n_estimators", params["n_estimators"])
        mlflow.log_param("max_depth", params["max_depth"])
        mlflow.log_param("random_state", params["random_state"])

        # Log metrics
        mlflow.log_metric("mean_squared_error", mse)
        mlflow.log_metric("r2_score", r2)
        mlflow.log_metric("mean_absolute_error", mae)

        # Log the model
        mlflow.sklearn.log_model(model, "model")

        # Artifacts are staged in a throwaway directory so repeated runs do
        # not overwrite each other's files or leave them in the working
        # directory. File names are unchanged, so the artifact names in
        # MLflow stay the same. (tempfile/os come from the notebook's first
        # import cell.)
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Create and log a plot of feature importances
            fig = plt.figure(figsize=(6, 4))
            try:
                sns.barplot(x=X_train.columns, y=model.feature_importances_)
                plt.title('Feature Importance')
                plt.tight_layout()
                plot_path = os.path.join(artifact_dir, "feature_importance_plot.png")
                fig.savefig(plot_path)
            finally:
                # Close the figure so one figure per run does not accumulate
                # in memory; the plot remains available as an MLflow artifact.
                plt.close(fig)
            mlflow.log_artifact(plot_path)

            # Log predictions as CSV
            predictions_df = pd.DataFrame({
                'True Values': np.ravel(y_test),
                'Predicted Values': np.ravel(y_pred),
            })
            csv_path = os.path.join(artifact_dir, "predictions.csv")
            predictions_df.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path)

        # Use the run handle from the context manager rather than a second
        # lookup through mlflow.active_run().
        return run.info.run_id, r2

# MLflow experiment to log into (placeholder value: replace with the actual
# experiment ID before running).
experimentID = "f7f3c067e6c04249ab931221fd117ba1"  # Replace with actual experiment ID

# Best result so far; R² starts at negative infinity so that any finite
# score replaces it.
best_run_uuid, best_r2 = None, float('-inf')

# One MLflow run per entry of the hyperparameter grid.
for i, params in enumerate(param_grid):
    run_name = f"Run {i+1}"
    print(f"Running {run_name} with params: {params}")

    run_uuid, r2 = log_rf(
        experimentID=experimentID,
        run_name=run_name,
        params=params,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

    print(f"Run {i+1} finished with R²: {r2}. Run ID: {run_uuid}")

    # Keep the first run that strictly beats the current best R².
    if r2 > best_r2:
        best_run_uuid, best_r2 = run_uuid, r2

# Report the winner, or say so when no run ever beat the initial -inf.
if best_run_uuid is not None:
    print(f"\nBest model run UUID: {best_run_uuid} with R²: {best_r2}")
else:
    print("No valid model found with R² greater than -inf")