In [None]:
%pip install fastf1 xgboost pandas numpy scikit-learn matplotlib

# Installing libraries

In [3]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import os

# Extrating data from the Fastf1 api

In [4]:
# Step 1: Set up FastF1 cache
cache_dir = '/content/f1_cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
fastf1.Cache.enable_cache(cache_dir)

In [None]:
# Step 2: Collect data for Saudi Arabian GPs (2021–2024)
data = []
for year in range(2021, 2025):
    try:
        # Load race session
        race_session = fastf1.get_session(year, 'Saudi Arabia', 'R')
        race_session.load()
        results = race_session.results[['DriverNumber', 'TeamName', 'Position', 'Points']]
        laps = race_session.laps[['DriverNumber', 'LapTime', 'Compound', 'Stint', 'PitOutTime', 'PitInTime']]
        results['Year'] = year

        # Load qualifying session
        quali_session = fastf1.get_session(year, 'Saudi Arabia', 'Q')
        quali_session.load(telemetry=False)
        if quali_session.results is not None:
            quali = quali_session.results[['DriverNumber', 'Position']].rename(columns={'Position': 'QualifyingPosition'})
        else:
            print(f"Warning: Qualifying data for {year} not available. Using race starting grid as proxy.")
            quali = results[['DriverNumber']].copy()
            quali['QualifyingPosition'] = range(1, len(quali) + 1)

        # Merge and append data
        merged = results.merge(laps, on='DriverNumber').merge(quali, on='DriverNumber')
        data.append(merged)
        print(f"Successfully loaded data for {year}")
    except Exception as e:
        print(f"Error loading data for {year}: {str(e)}")
        continue

# Combine data
if data:
    df = pd.concat(data, ignore_index=True)
    print("Data collection complete. Shape:", df.shape)
else:
    raise ValueError("No data collected. Check API or session availability.")

In [6]:
df.head()

Unnamed: 0,DriverNumber,TeamName,Position,Points,Year,LapTime,Compound,Stint,PitOutTime,PitInTime,QualifyingPosition
0,44,Mercedes,1.0,26.0,2021,0 days 00:01:35.234000,MEDIUM,1.0,NaT,NaT,1.0
1,44,Mercedes,1.0,26.0,2021,0 days 00:01:34.020000,MEDIUM,1.0,NaT,NaT,1.0
2,44,Mercedes,1.0,26.0,2021,0 days 00:01:33.782000,MEDIUM,1.0,NaT,NaT,1.0
3,44,Mercedes,1.0,26.0,2021,0 days 00:01:33.792000,MEDIUM,1.0,NaT,NaT,1.0
4,44,Mercedes,1.0,26.0,2021,0 days 00:01:33.397000,MEDIUM,1.0,NaT,NaT,1.0


In [None]:
df.info()

# Feature engineering

In [8]:
# Convert LapTime to seconds
df['LapTime'] = df['LapTime'].dt.total_seconds()

# Calculate pit stops (count non-NaT PitOutTime per driver)
df['PitStops'] = df.groupby(['Year', 'DriverNumber'])['PitOutTime'].transform(lambda x: x.notna().sum())

# Aggregate to driver-level features
features = df.groupby(['Year', 'DriverNumber']).agg({
    'LapTime': lambda x: x.mean(skipna=True),  # Mean lap time, ignoring NaN
    'QualifyingPosition': 'mean',  # Should be constant per driver
    'Points': 'sum',  # Total points
    'TeamName': 'last',  # Most recent team
    'PitStops': 'mean',  # Average pit stops (should be constant)
    'Position': 'mean'  # Target: final position
}).reset_index()


In [None]:
# Handle missing values
features['LapTime'].fillna(features['LapTime'].mean(), inplace=True)
features['QualifyingPosition'].fillna(features['QualifyingPosition'].mean(), inplace=True)
features['PitStops'].fillna(0, inplace=True)  # Assume 0 pit stops if missing

In [10]:
# One-hot encode team names
features = pd.get_dummies(features, columns=['TeamName'], prefix='Team')

In [None]:
features.head()

In [12]:
# Normalize numerical features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_cols = ['LapTime', 'QualifyingPosition', 'Points', 'PitStops']
features[numerical_cols] = scaler.fit_transform(features[numerical_cols])

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Prepare features and target
X = features.drop(columns=['Year', 'DriverNumber', 'Position'] + [col for col in features.columns if col.startswith('Team')])
y = features['Position']

# Split data
X_train = X[features['Year'] < 2024]
y_train = y[features['Year'] < 2024]
X_test = X[features['Year'] == 2024]
y_test = y[features['Year'] == 2024]

# Verify test set
print("Test set shape:", X_test.shape)
print("Test set indices:", X_test.index.tolist())
print("Test set drivers:", features.loc[X_test.index, 'DriverNumber'].unique())

# Train model
model = GradientBoostingRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Print evaluation metrics
print("\nEvaluation Metrics for 2024 Test Data:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"Adjusted R-squared: {adjusted_r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Convert to ranks, preserving test set indices
predicted_ranks = pd.Series(y_pred, index=y_test.index).rank().astype(int)
actual_ranks = pd.Series(y_test, index=y_test.index).rank().astype(int)

# Create DataFrame with aligned indices
rank_df = pd.DataFrame({
    'DriverNumber': features.loc[y_test.index, 'DriverNumber'],
    'Predicted': predicted_ranks,
    'Actual': actual_ranks
}).reset_index(drop=True)

print("\nPredicted vs Actual Ranks (2024):")
print(rank_df)

# Additional check: Verify rank counts
print("\nRank counts:")
print("Predicted ranks:", rank_df['Predicted'].value_counts().sort_index())
print("Actual ranks:", rank_df['Actual'].value_counts().sort_index())

Test set shape: (20, 4)
Test set indices: [58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77]
Test set drivers: ['1' '10' '11' '14' '16' '18' '2' '20' '22' '23' '24' '27' '3' '31' '38'
 '4' '44' '63' '77' '81']

Evaluation Metrics for 2024 Test Data:
Mean Absolute Error (MAE): 1.35
Mean Squared Error (MSE): 4.66
Root Mean Squared Error (RMSE): 2.16
R-squared (R²): 0.8598
Adjusted R-squared: 0.8224
Mean Absolute Percentage Error (MAPE): 9.96%

Predicted vs Actual Ranks (2024):
   DriverNumber  Predicted  Actual
0             1          1       1
1            10         18      20
2            11          2       2
3            14          5       5
4            16          3       3
5            18         17      19
6             2         20      14
7            20         12      12
8            22         14      15
9            23         13      11
10           24         19      18
11           27         10      10
12            3         15      16


# Please note these predictions are apart from the rookies

In [18]:
import pandas as pd

# Define driver mapping for 2024 F1 grid (based on DriverNumber from test set)
driver_mapping = {
    '1': 'Max Verstappen',
    '10': 'Pierre Gasly',
    '11': 'Sergio Perez',
    '14': 'Fernando Alonso',
    '16': 'Charles Leclerc',
    '18': 'Lance Stroll',
    '2': 'Logan Sargeant',
    '20': 'Kevin Magnussen',
    '22': 'Yuki Tsunoda',
    '23': 'Alexander Albon',
    '24': 'Zhou Guanyu',
    '27': 'Nico Hulkenberg',
    '3': 'Daniel Ricciardo',
    '31': 'Esteban Ocon',
    '38': 'Oliver Bearman',  # Substitute in 2024
    '4': 'Lando Norris',
    '44': 'Lewis Hamilton',
    '63': 'George Russell',
    '77': 'Valtteri Bottas',
    '81': 'Oscar Piastri'
}

# Define current 2025 drivers (excluding likely dropped drivers and substitutes)
current_2025_drivers = [
    '1', '10', '14', '16', '18', '20', '22', '23', '27', '31', '4', '44', '63', '81'
]

# Predict for 2025 using 2024 data as proxy, filtered for current drivers
# Get indices of 2024 test set drivers that are in current_2025_drivers
mask = features.loc[X_test.index, 'DriverNumber'].isin(current_2025_drivers)
X_2025 = X_test[mask].copy()  # Filter test set for 2025 drivers
drivers_2025 = features.loc[X_test.index, 'DriverNumber'][mask].values.tolist()

# Verify 2025 data
print("\n2025 Prediction Set Shape:", X_2025.shape)
print("2025 Driver Numbers:", drivers_2025)

# Predict positions
predictions_2025 = model.predict(X_2025)

# Convert to ranks
ranks_2025 = pd.Series(predictions_2025, index=range(len(drivers_2025))).rank(method='dense').astype(int)

# Create output DataFrame with driver names
output_2025 = pd.DataFrame({
    'DriverName': [driver_mapping.get(d, f"Unknown Driver {d}") for d in drivers_2025],
    'PredictedPosition': ranks_2025
})

# Sort by predicted position
output_2025 = output_2025.sort_values('PredictedPosition').reset_index(drop=True)

print("\nPredicted 2025 Saudi Arabian GP Rankings (Current Drivers Only):")
print(output_2025)



2025 Prediction Set Shape: (14, 4)
2025 Driver Numbers: ['1', '10', '14', '16', '18', '20', '22', '23', '27', '31', '4', '44', '63', '81']

Predicted 2025 Saudi Arabian GP Rankings (Current Drivers Only):
         DriverName  PredictedPosition
0    Max Verstappen                  1
1   Charles Leclerc                  2
2     Oscar Piastri                  3
3   Fernando Alonso                  4
4    George Russell                  5
5      Lando Norris                  6
6    Lewis Hamilton                  7
7   Nico Hulkenberg                  8
8   Kevin Magnussen                  9
9   Alexander Albon                 10
10     Yuki Tsunoda                 11
11     Esteban Ocon                 12
12     Lance Stroll                 13
13     Pierre Gasly                 14
