In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

In [20]:
# Read the raw F1 tables from CSV files in the current working directory.
import pandas as pd

# One read_csv call per file; the targets on the left are bound in the same
# order as the file names on the right.
lap_times, pit_stops, races, circuits, drivers, results = (
    pd.read_csv(csv_name)
    for csv_name in (
        'lap_times.csv',
        'pit_stops.csv',
        'races.csv',
        'circuits.csv',
        'drivers.csv',
        'results.csv',
    )
)


In [21]:
# Attach the matching results row (same raceId and driverId) to every lap.
# Columns present in both tables receive pandas' default _x / _y suffixes.
merged = lap_times.merge(results, on=['raceId', 'driverId'], how='left')

# Flag the laps on which the driver pitted. Only the key columns of pit_stops
# are needed, and duplicate keys are dropped first so that a repeated
# (raceId, driverId, lap) entry cannot multiply lap rows in the left join.
merged = merged.merge(
    pit_stops[['raceId', 'driverId', 'lap']].drop_duplicates(),
    on=['raceId', 'driverId', 'lap'],
    how='left',
    indicator='pit_status',
)
# indicator == 'both' means the lap key was also found in pit_stops.
merged['pitted'] = (merged['pit_status'] == 'both').astype(int)
# Reassign instead of inplace=True so the cell reads as a plain data flow.
merged = merged.drop(columns='pit_status')


In [22]:
# Add race-level columns (joined on raceId), then circuit-level columns
# (joined on the circuitId that the races table brings in).
merged = (
    merged
    .merge(races, on='raceId', how='left')
    .merge(circuits, on='circuitId', how='left')
)

In [27]:
# Inspect the column names of the merged frame. The _x / _y suffixes are added
# by pandas when merged tables share a column name.
# NOTE(review): the saved output lists 'strategy', which is only assigned in a
# later cell, so that output likely came from an out-of-order run. Re-run the
# notebook top to bottom to refresh it.
merged.columns


Index(['raceId', 'driverId', 'lap', 'position_x', 'time_x', 'milliseconds_x',
       'resultId', 'constructorId', 'number', 'grid', 'position_y',
       'positionText', 'positionOrder', 'points', 'laps', 'time_y',
       'milliseconds_y', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId', 'pitted', 'year', 'round', 'circuitId',
       'name_x', 'date', 'time', 'url_x', 'fp1_date', 'fp1_time', 'fp2_date',
       'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time',
       'sprint_date', 'sprint_time', 'circuitRef', 'name_y', 'location',
       'country', 'lat', 'lng', 'alt', 'url_y', 'strategy'],
      dtype='object')

In [32]:
# Preview the lap-to-lap change in lap time (milliseconds) per race and driver.
# diff() subtracts the previous row within each group, in the frame's current
# row order; the first row of every group has no predecessor and is set to 0.
(
    merged
    .groupby(['raceId', 'driverId'])['milliseconds_x']
    .diff()
    .fillna(0)
)

Unnamed: 0,milliseconds_x
342881,0.0
342882,-15348.0
342883,-2140.0
342884,-533.0
342885,1062.0
...,...
588935,135.0
588936,-178.0
588937,108.0
588938,-526.0


In [34]:
# Feature engineering: order the laps, then derive the model inputs in one chain.
merged = (
    merged
    # Sort so that consecutive rows within a (raceId, driverId) group are consecutive laps.
    .sort_values(by=['driverId', 'raceId', 'lap'])
    .assign(
        # Change in lap time versus the previous lap; 0 for a group's first lap.
        lap_delta=lambda laps: (
            laps.groupby(['raceId', 'driverId'])['milliseconds_x'].diff().fillna(0)
        ),
        # Plain copy of the lap counter under a feature-style name.
        lap_number=lambda laps: laps['lap'],
        # Weekday of the race date, 0 = Monday.
        day_of_week=lambda laps: pd.to_datetime(laps['date']).dt.dayofweek,
    )
)

In [35]:
# Show the column names of the raw lap_times table, i.e. before any merge
# adds _x / _y suffixes to them.
print(lap_times.columns)


Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds'], dtype='object')


In [36]:
# Strategy label (simplified): a lap flagged pitted == 1 is labelled 'Pit Now',
# every other lap 'Hold Position'.
merged['strategy'] = merged['pitted'].eq(1).map({True: 'Pit Now', False: 'Hold Position'})


In [37]:
# Model inputs and label.
# target: the per-lap strategy label.
target = merged['strategy']
# features: four numeric columns, with missing values replaced by 0.
# NOTE(review): positionOrder comes from the results table, so it presumably
# describes the finishing order rather than the position on this lap. Confirm
# that this is intended as a model input.
features = merged.loc[
    :, ['lap_delta', 'lap_number', 'positionOrder', 'day_of_week']
].fillna(value=0)

In [38]:
# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
# Seed the forest as well: it draws random bootstrap samples and feature
# subsets, so without random_state the fitted model and its reported accuracy
# change from run to run even though the split is fixed.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [41]:
import joblib
# accuracy_score is not imported anywhere else in this notebook; without this
# import the cell raises NameError on a fresh kernel.
from sklearn.metrics import accuracy_score

# Evaluation: predict the held-out rows and report the share labelled correctly.
# NOTE(review): accuracy is the only metric reported. If 'Pit Now' rows are
# rare, a majority-class baseline may score similarly, so consider per-class
# metrics as well.
y_pred = model.predict(X_test)
print("Strategy Prediction Accuracy:", accuracy_score(y_test, y_pred))

Strategy Prediction Accuracy: 0.9789419183988728


In [43]:
# Persist the trained model to models/strategy_model.pkl.
import os

# Create the output directory if it is missing; exist_ok=True keeps this cell
# safe to re-run when the directory already exists.
os.makedirs('models', exist_ok=True)
# joblib.dump returns the list of file names it wrote, which the notebook
# displays as this cell's output.
joblib.dump(model, 'models/strategy_model.pkl')



['models/strategy_model.pkl']