In [0]:
# Import necessary libraries
import mlflow
import mlflow.sklearn
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, min, max, stddev
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os
from joblib import Parallel, delayed

# Set up Spark Session with optimized configuration
spark = SparkSession.builder \
    .appName("F1_MLflow_Homework") \
    .config("spark.sql.shuffle.partitions", 10) \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Enable Arrow optimization for faster Spark to Pandas conversion
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Get Databricks username for experiment path
username = spark.sql("SELECT current_user()").collect()[0][0]
experiment_path = f"/Users/{username}/F1_Lap_Time_Prediction"

# Set MLflow experiment with proper path format for Databricks
mlflow.set_experiment(experiment_path)
print(f"MLflow experiment set: {experiment_path}")

# Load data - only select needed columns to reduce data transfer
print("Loading data from S3...")
df_laptimes = spark.read.csv('s3://columbia-gr5069-main/raw/lap_times.csv', header=True)
df_drivers = spark.read.csv('s3://columbia-gr5069-main/raw/drivers.csv', header=True)
df_pitstops = spark.read.csv('s3://columbia-gr5069-main/raw/pit_stops.csv', header=True)
df_results = spark.read.csv('s3://columbia-gr5069-main/raw/results.csv', header=True)
df_races = spark.read.csv('s3://columbia-gr5069-main/raw/races.csv', header=True)

# Convert to Pandas for ease of use with scikit-learn
laptimes_pd = df_laptimes.toPandas()
drivers_pd = df_drivers.toPandas()
pitstops_pd = df_pitstops.toPandas()
results_pd = df_results.toPandas()
races_pd = df_races.toPandas()

print("Data loaded successfully")

# Data preprocessing - only keep needed columns to improve performance
print("Starting data preprocessing...")

# Pre-filter dataframes to include only needed columns before merging
drivers_keep_cols = ['driverId', 'driverRef', 'code', 'forename', 'surname', 'nationality']
drivers_pd = drivers_pd[[col for col in drivers_keep_cols if col in drivers_pd.columns]]

races_keep_cols = ['raceId', 'name', 'year', 'round', 'location', 'country']
races_pd = races_pd[[col for col in races_keep_cols if col in races_pd.columns]]

pitstops_keep_cols = ['raceId', 'driverId', 'milliseconds']
pitstops_pd = pitstops_pd[[col for col in pitstops_keep_cols if col in pitstops_pd.columns]]

# Join laptimes with drivers information
merged_data = laptimes_pd.merge(drivers_pd, on='driverId', how='left')
# Join with race information
merged_data = merged_data.merge(races_pd, on='raceId', how='left')

# Add average pit stop time per driver per race
avg_pitstops = pitstops_pd.groupby(['raceId', 'driverId']).agg({'milliseconds': 'mean'}).reset_index()
avg_pitstops.rename(columns={'milliseconds': 'avg_pitstop_time'}, inplace=True)
merged_data = merged_data.merge(avg_pitstops, on=['raceId', 'driverId'], how='left')

print(f"Merged data shape: {merged_data.shape}")

# Identify categorical and numerical features
categorical_features = []
for col in ['driverRef', 'code', 'forename', 'surname', 'nationality', 'name', 'location', 'country']:
    if col in merged_data.columns:
        categorical_features.append(col)

numerical_features = []
for col in ['lap', 'position', 'year', 'round', 'grid', 'altitude']:
    if col in merged_data.columns:
        numerical_features.append(col)
        merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

# Add pit stop time if available
if 'avg_pitstop_time' in merged_data.columns:
    merged_data['avg_pitstop_time'] = merged_data['avg_pitstop_time'].fillna(merged_data['avg_pitstop_time'].mean())
    numerical_features.append('avg_pitstop_time')

print(f"Selected {len(categorical_features)} categorical features: {categorical_features}")
print(f"Selected {len(numerical_features)} numerical features: {numerical_features}")

# Handle categorical features - one-hot encoding
print("Performing one-hot encoding...")
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(merged_data[categorical_features].fillna('Unknown'))

# Create a new DataFrame with string column names for the encoded features
encoded_cols = [f'encoded_{i}' for i in range(encoded_features.shape[1])]
    
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoded_cols,
    index=merged_data.index
)

# Prepare the final dataset
X_numerical = merged_data[numerical_features]
X = pd.concat([X_numerical.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
X.columns = X.columns.astype(str)
y = pd.to_numeric(merged_data['milliseconds'], errors='coerce')

# Remove rows with NaN in target variable
mask = ~y.isna()
X = X[mask]
y = y[mask]

print(f"Final dataset - X shape: {X.shape}, y shape: {y.shape}")

# Speed up by reducing dataset size if it's very large (optional)
if X.shape[0] > 50000:
    print(f"Reducing dataset size from {X.shape[0]} to 50000 rows for faster training")
    random_indices = np.random.choice(X.shape[0], 50000, replace=False)
    X = X.iloc[random_indices]
    y = y.iloc[random_indices]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Scale numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train.iloc[:, :len(numerical_features)])
X_test_numerical = scaler.transform(X_test.iloc[:, :len(numerical_features)])
X_train.iloc[:, :len(numerical_features)] = X_train_numerical
X_test.iloc[:, :len(numerical_features)] = X_test_numerical

# Create feature names list for artifacts (all as strings)
feature_names = numerical_features.copy() + encoded_cols

# Function to create and save artifacts - simplified to improve speed
def create_artifacts(model, X_test, y_test, y_pred, feature_names, run_id):
    # Feature importance plot - only plot top 10 features
    feature_importance = pd.DataFrame(
        model.feature_importances_,
        index=feature_names,
        columns=['importance']
    ).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(8, 5))
    feature_importance.head(10).plot(kind='bar')
    plt.title('Top 10 Feature Importance')
    plt.tight_layout()
    plt.savefig(f'feature_importance_{run_id}.png', dpi=100)
    plt.close()
    
    # Save feature importance as CSV
    feature_importance.to_csv(f'feature_importance_{run_id}.csv')
    
    # Residuals plot - sample fewer points for faster plotting
    if len(y_pred) > 1000:
        idx = np.random.choice(len(y_pred), 1000, replace=False)
        y_pred_sample = y_pred[idx]
        y_test_sample = y_test.iloc[idx]
    else:
        y_pred_sample = y_pred
        y_test_sample = y_test
    
    plt.figure(figsize=(8, 5))
    plt.scatter(y_pred_sample, y_test_sample - y_pred_sample, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot (Sample)')
    plt.tight_layout()
    plt.savefig(f'residuals_{run_id}.png', dpi=100)
    plt.close()
    
    # Save predictions in a smaller sample for faster saving
    if len(y_pred) > 5000:
        idx = np.random.choice(len(y_pred), 5000, replace=False)
        pd.DataFrame({
            'actual': y_test.iloc[idx], 
            'predicted': y_pred[idx]
        }).to_csv(f'predictions_{run_id}.csv')
    else:
        pd.DataFrame({
            'actual': y_test, 
            'predicted': y_pred
        }).to_csv(f'predictions_{run_id}.csv')
    
    return [
        f'feature_importance_{run_id}.png',
        f'feature_importance_{run_id}.csv',
        f'residuals_{run_id}.png',
        f'predictions_{run_id}.csv'
    ]

# Function to evaluate model and log metrics
def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return rmse, mae, r2, y_pred

# Function to run a single experiment
def run_experiment(run_id, max_depth, n_estimators, min_samples_split):
    run_name = f"Run {run_id + 1}"
    print(f"\nStarting {run_name} with parameters:")
    print(f"  max_depth: {max_depth}")
    print(f"  n_estimators: {n_estimators}")
    print(f"  min_samples_split: {min_samples_split}")
    
    # Start MLflow run
    with mlflow.start_run(run_name=run_name):
        # Log parameters
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("min_samples_split", min_samples_split)
        mlflow.log_param("random_state", 42)
        
        # Train model with warm_start=True to speed up sequential fits
        print("  Training model...")
        model = RandomForestRegressor(
            max_depth=max_depth,
            n_estimators=n_estimators,
            min_samples_split=min_samples_split,
            random_state=42,
            n_jobs=-1,  # Use all cores for faster training
            verbose=0
        )
        model.fit(X_train, y_train)
        
        # Evaluate model
        print("  Evaluating model...")
        rmse, mae, r2, y_pred = evaluate_model(model, X_test, y_test)
        
        # Log metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        # Create and log artifacts
        print("  Creating artifacts...")
        artifact_paths = create_artifacts(model, X_test, y_test, y_pred, feature_names, run_id)
        
        # Log artifacts
        for artifact_path in artifact_paths:
            mlflow.log_artifact(artifact_path)
        
        # Log the model itself
        mlflow.sklearn.log_model(model, "random-forest-model")
        
        print(f"  Completed {run_name}")
        print(f"  Metrics - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")
        print("-" * 50)
        
        # Clean up artifact files to save space
        for artifact_path in artifact_paths:
            if os.path.exists(artifact_path):
                os.remove(artifact_path)
                
        return {
            "run_id": run_id,
            "max_depth": max_depth,
            "n_estimators": n_estimators,
            "min_samples_split": min_samples_split,
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }

# Run experiments with different parameters
print("\n=== Starting MLflow experiment runs ===\n")

# Parameters to try
parameter_combinations = [
    # max_depth, n_estimators, min_samples_split
    (5, 100, 2),     # Run 1
    (10, 300, 2),    # Run 2  
    (15, 500, 2),    # Run 3
    (20, 800, 2),    # Run 4
    (None, 1000, 2), # Run 5
    (10, 100, 5),    # Run 6
    (10, 300, 10),   # Run 7
    (15, 300, 5),    # Run 8
    (20, 500, 5),    # Run 9
    (15, 800, 10)    # Run 10
]

# Run experiments in sequence (more reliable in Databricks)
results = []
for run_id, (max_depth, n_estimators, min_samples_split) in enumerate(parameter_combinations):
    result = run_experiment(run_id, max_depth, n_estimators, min_samples_split)
    results.append(result)

# Print summary of all runs
print("\n=== Summary of All Runs ===\n")
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='r2', ascending=False))

print("\n=== All experiment runs completed successfully ===")
print("You can now view the results in the MLflow UI")
print("Remember to take screenshots of:")
print("1. The MLflow homepage showing all your runs")
print("2. The detailed page of your best run")

MLflow experiment set: /Users/lq2242@columbia.edu/F1_Lap_Time_Prediction
Loading data from S3...
Data loaded successfully
Starting data preprocessing...
Merged data shape: (551742, 15)
Selected 6 categorical features: ['driverRef', 'code', 'forename', 'surname', 'nationality', 'name']
Selected 5 numerical features: ['lap', 'position', 'year', 'round', 'avg_pitstop_time']
Performing one-hot encoding...
Final dataset - X shape: (551742, 575), y shape: (551742,)
Reducing dataset size from 551742 to 50000 rows for faster training
Training set: (40000, 575), Test set: (10000, 575)

=== Starting MLflow experiment runs ===


Starting Run 1 with parameters:
  max_depth: 5
  n_estimators: 100
  min_samples_split: 2
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 1
  Metrics - RMSE: 62915.14, MAE: 12849.33, R2: -0.0246
--------------------------------------------------

Starting Run 2 with parameters:
  max_depth: 10
  n_estimators: 300
  min_samples_split: 2
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 2
  Metrics - RMSE: 63722.23, MAE: 11564.90, R2: -0.0511
--------------------------------------------------

Starting Run 3 with parameters:
  max_depth: 15
  n_estimators: 500
  min_samples_split: 2
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 3
  Metrics - RMSE: 70595.26, MAE: 10199.92, R2: -0.2901
--------------------------------------------------

Starting Run 4 with parameters:
  max_depth: 20
  n_estimators: 800
  min_samples_split: 2
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 4
  Metrics - RMSE: 72878.58, MAE: 8882.09, R2: -0.3749
--------------------------------------------------

Starting Run 5 with parameters:
  max_depth: None
  n_estimators: 1000
  min_samples_split: 2
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 5
  Metrics - RMSE: 72881.38, MAE: 7774.76, R2: -0.3750
--------------------------------------------------

Starting Run 6 with parameters:
  max_depth: 10
  n_estimators: 100
  min_samples_split: 5
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 6
  Metrics - RMSE: 62929.33, MAE: 11700.02, R2: -0.0251
--------------------------------------------------

Starting Run 7 with parameters:
  max_depth: 10
  n_estimators: 300
  min_samples_split: 10
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 7
  Metrics - RMSE: 61867.21, MAE: 11593.31, R2: 0.0092
--------------------------------------------------

Starting Run 8 with parameters:
  max_depth: 15
  n_estimators: 300
  min_samples_split: 5
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 8
  Metrics - RMSE: 65789.47, MAE: 10166.97, R2: -0.1204
--------------------------------------------------

Starting Run 9 with parameters:
  max_depth: 20
  n_estimators: 500
  min_samples_split: 5
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 9
  Metrics - RMSE: 67664.82, MAE: 8932.54, R2: -0.1852
--------------------------------------------------

Starting Run 10 with parameters:
  max_depth: 15
  n_estimators: 800
  min_samples_split: 10
  Training model...
  Evaluating model...
  Creating artifacts...




  Completed Run 10
  Metrics - RMSE: 63459.56, MAE: 10142.09, R2: -0.0425
--------------------------------------------------

=== Summary of All Runs ===

   run_id  max_depth  n_estimators  ...          rmse           mae        r2
6       6       10.0           300  ...  61867.206947  11593.305768  0.009206
0       0        5.0           100  ...  62915.144122  12849.331374 -0.024643
5       5       10.0           100  ...  62929.332190  11700.022640 -0.025105
9       9       15.0           800  ...  63459.556434  10142.085339 -0.042452
1       1       10.0           300  ...  63722.227824  11564.901245 -0.051100
7       7       15.0           300  ...  65789.466408  10166.966489 -0.120405
8       8       20.0           500  ...  67664.821977   8932.536956 -0.185190
2       2       15.0           500  ...  70595.262224  10199.924922 -0.290070
3       3       20.0           800  ...  72878.578314   8882.089589 -0.374871
4       4        NaN          1000  ...  72881.380441   7774.7570

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>