In [67]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, classification_report, r2_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [68]:
# Initial setup
np.random.seed(42)

# Data loading and preprocessing
df = pd.read_csv("f1_data/laps.csv")

df.head()

Unnamed: 0,meeting_key,session_key,driver_number,lap_number,date_start,duration_sector_1,duration_sector_2,duration_sector_3,i1_speed,i2_speed,is_pit_out_lap,lap_duration,segments_sector_1,segments_sector_2,segments_sector_3,st_speed
0,1259,10023,81,1,2025-05-02T16:30:04.816000+00:00,,43.201,35.596,164.0,107.0,True,,"[2064, 2064, 2064, 2049, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 2051]",219.0
1,1259,10023,10,1,2025-05-02T16:30:11.241000+00:00,,40.568,27.843,173.0,161.0,True,,"[2064, 2064, 2064, 2051, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 204...","[2049, 2051, 2051, 2051, 2051, 2051, 2051, 2051]",302.0
2,1259,10023,5,1,2025-05-02T16:30:14.320000+00:00,,40.195,34.242,197.0,173.0,True,,"[2064, 2064, 2064, 2049, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 204...","[2051, 2049, 2049, 2049, 2049, 2049, 2049, 2049]",157.0
3,1259,10023,16,1,2025-05-02T16:30:20.489000+00:00,,40.275,38.481,190.0,153.0,True,,"[2064, 2064, 2064, 2049, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 2049]",109.0
4,1259,10023,55,1,2025-05-02T16:30:21.956000+00:00,,41.446,36.064,186.0,98.0,True,,"[2064, 2064, 2064, 2049, 2049, 2049, 2049, 204...","[2049, 2051, 2051, 2051, 2049, 2049, 2049, 204...","[2049, 2049, 2049, 2049, 2049, 2049, 2049, 2051]",71.0


In [69]:
def train_position_models(X_train, X_test, y_train, y_test):
    models = {
        'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=8),
        'Random Forest': RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42)
    }
    
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        results[name] = {
            'accuracy': accuracy,
            'predictions': y_pred,
            'model': model
        }
        print(f"\n{name} Performance:")
        print(classification_report(y_test, y_pred, zero_division=0))
    
    return results

def train_laptime_models(X_train, X_test, y_train, y_test):
    models = {
        'Ridge': Ridge(alpha=1.0, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=12),
        'Random Forest': RandomForestRegressor(n_estimators=300, max_depth=15, random_state=42)
    }
    
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        results[name] = {
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2,
            'predictions': y_pred,
            'model': model
        }
        print(f"\n{name} Performance:")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Root Mean Squared Error: {rmse:.4f}")
        print(f"R² Score: {r2:.4f}")
    
    return results



In [70]:
df['date_start'] = pd.to_datetime(df['date_start'], format='ISO8601')

# Calculate total lap time
df['total_lap_time'] = df['lap_duration'].fillna(
    df['duration_sector_1'].fillna(0) + 
    df['duration_sector_2'].fillna(0) + 
    df['duration_sector_3'].fillna(0)
)

# Handle missing and infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Basic features
df['avg_speed'] = df[['i1_speed', 'i2_speed', 'st_speed']].mean(axis=1)
df['speed_variance'] = df[['i1_speed', 'i2_speed', 'st_speed']].var(axis=1)
df['sector_consistency'] = df[['duration_sector_1', 'duration_sector_2', 'duration_sector_3']].std(axis=1)

# Time-based features
df['hour'] = df['date_start'].dt.hour
df['minute'] = df['date_start'].dt.minute

# Performance features
df['rolling_avg_lap_time'] = df.groupby('driver_number')['total_lap_time'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)
df['lap_time_improvement'] = df.groupby('driver_number')['total_lap_time'].transform(
    lambda x: x.shift(1) - x
)
df['is_personal_best'] = df.groupby('driver_number')['total_lap_time'].transform(
    lambda x: x == x.min()
).astype(int)

# Position features
df['position'] = df.groupby('lap_number')['total_lap_time'].rank(method='min')
df['position_group'] = pd.qcut(df['position'], q=5, labels=['Top', 'Upper Mid', 'Mid', 'Lower Mid', 'Back'])
df['relative_position'] = df.groupby('lap_number')['total_lap_time'].transform(lambda x: x.rank(pct=True))
df['position_moving_avg'] = df.groupby('driver_number')['position'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)

# Tire degradation
df['estimated_tire_deg'] = df.groupby('driver_number')['total_lap_time'].transform(
    lambda x: ((x - x.min()) / x.min() * 100).clip(0, 100)
)

# Feature selection
feature_columns = [
    'avg_speed', 'speed_variance', 'sector_consistency',
    'rolling_avg_lap_time', 'lap_time_improvement',
    'is_personal_best', 'estimated_tire_deg',
    'relative_position', 'position_moving_avg'
]

# Clean data
df = df.dropna(subset=feature_columns + ['position_group', 'total_lap_time'])
df = df[df['total_lap_time'] > 0]  # Remove invalid lap times

# Prepare for modeling
X = df[feature_columns]
y_position = df['position_group']
y_lap_time = df['total_lap_time']

# Split data
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
    X, y_position, test_size=0.2, random_state=42
)
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(
    X, y_lap_time, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_pos_scaled = scaler.fit_transform(X_train_pos)
X_test_pos_scaled = scaler.transform(X_test_pos)
X_train_time_scaled = scaler.fit_transform(X_train_time)
X_test_time_scaled = scaler.transform(X_test_time)

# Train models
print("\nTraining Position Prediction Models:")
print("-" * 50)
position_results = train_position_models(X_train_pos_scaled, X_test_pos_scaled, y_train_pos, y_test_pos)

print("\nTraining Lap Time Prediction Models:")
print("-" * 50)
laptime_results = train_laptime_models(X_train_time_scaled, X_test_time_scaled, y_train_time, y_test_time)

# Model comparison plot
plt.figure(figsize=(15, 6))

plt.subplot(1, 2, 1)
accuracies = [results['accuracy'] for results in position_results.values()]
plt.bar(position_results.keys(), accuracies, color=['skyblue', 'lightgreen'])
plt.title('Position Group Prediction - Accuracy')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)
for i, v in enumerate(accuracies):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center')

plt.subplot(1, 2, 2)
r2_scores = [results['R2'] for results in laptime_results.values()]
plt.bar(laptime_results.keys(), r2_scores, color=['lightcoral', 'lightblue', 'lightgreen'])
plt.title('Lap Time Prediction - R² Score')
plt.ylabel('R² Score')
plt.ylim(0, 1)
for i, v in enumerate(r2_scores):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center')

plt.tight_layout()
plt.savefig('visualizations/model_performance_comparison.png')
plt.close()

# Performance distribution plots
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='position_group', y='total_lap_time')
plt.title('Lap Times by Position Group')
plt.xlabel('Position Group')
plt.ylabel('Lap Time (seconds)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('visualizations/position_group_lap_times.png')
plt.close()

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='avg_speed', y='total_lap_time', hue='position_group')
plt.title('Average Speed vs Lap Time by Position Group')
plt.xlabel('Average Speed')
plt.ylabel('Lap Time (seconds)')
plt.tight_layout()
plt.savefig('visualizations/speed_vs_laptime.png')
plt.close()

# Print best models
best_position_model = max(position_results.items(), key=lambda x: x[1]['accuracy'])[0]
best_laptime_model = max(laptime_results.items(), key=lambda x: x[1]['R2'])[0]
print(f"\nBest position group prediction model: {best_position_model}")
print(f"Best lap time prediction model: {best_laptime_model}")


Training Position Prediction Models:
--------------------------------------------------

Decision Tree Performance:
              precision    recall  f1-score   support

        Back       0.80      1.00      0.89         8
   Lower Mid       0.85      0.92      0.88        12
         Mid       0.94      0.80      0.86        20
         Top       1.00      0.91      0.95        23
   Upper Mid       0.90      1.00      0.95        19

    accuracy                           0.91        82
   macro avg       0.90      0.93      0.91        82
weighted avg       0.92      0.91      0.91        82


Random Forest Performance:
              precision    recall  f1-score   support

        Back       0.80      1.00      0.89         8
   Lower Mid       0.80      1.00      0.89        12
         Mid       0.94      0.80      0.86        20
         Top       1.00      0.91      0.95        23
   Upper Mid       1.00      1.00      1.00        19

    accuracy                           0