In [59]:
import requests as requests
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss

In [60]:
# Get race results for 2023 season
url = "https://ergast.com/api/f1/2023/results.json?limit=1000"
response = requests.get(url)
data = response.json()

In [61]:
def fetch_f1_race_results(season=2023):
    url = f"https://ergast.com/api/f1/{season}/results.json?limit=1000"
    response = requests.get(url)
    data = response.json()
    
    races = data['MRData']['RaceTable']['Races']
    race_list = []

    for race in races:
        for result in race['Results']:
            driver = result['Driver']
            constructor = result['Constructor']
            
            race_list.append({
                'Race': race['raceName'],
                'Date': pd.to_datetime(race['date']),
                'Circuit': race['Circuit']['circuitName'],
                'Driver': f"{driver['givenName']} {driver['familyName']}",
                'Driver_Nationality': driver['nationality'],
                'Constructor': constructor['name'],
                'Grid': int(result['grid']),
                'Finish_Position': int(result['position']),
                'Status': result['status'],
                'DNF': 0 if result['status'] == "Finished" else 1
            })
    
    df = pd.DataFrame(race_list)
    return df

# Fetch and show improved dataframe
f1_df = fetch_f1_race_results(2023)
print(f1_df.head(10))

                 Race       Date                        Circuit  \
0  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
1  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
2  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
3  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
4  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
5  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
6  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
7  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
8  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   
9  Bahrain Grand Prix 2023-03-05  Bahrain International Circuit   

            Driver Driver_Nationality     Constructor  Grid  Finish_Position  \
0   Max Verstappen              Dutch        Red Bull     1                1   
1     Sergio Pérez            Mexican        Red Bull     2                2   
2  Fernando Alonso    

In [62]:
#df = f1_df.drop(columns=['Circuit'], axis=1)

In [63]:
df

Unnamed: 0,Race,Date,Driver,Driver_Nationality,Constructor,Grid,Finish_Position,Status,DNF,Driver_Encoded,Constructor_Encoded,Race_Encoded,Finished,is_winner
0,Bahrain Grand Prix,2023-03-05,Max Verstappen,Dutch,Red Bull,1,1,Finished,0,12,8,2,1,1
1,Bahrain Grand Prix,2023-03-05,Sergio Pérez,Mexican,Red Bull,2,2,Finished,0,17,8,2,1,0
2,Bahrain Grand Prix,2023-03-05,Fernando Alonso,Spanish,Aston Martin,5,3,Finished,0,4,3,2,1,0
3,Bahrain Grand Prix,2023-03-05,Carlos Sainz,Spanish,Ferrari,4,4,Finished,0,1,4,2,1,0
4,Bahrain Grand Prix,2023-03-05,Lewis Hamilton,British,Mercedes,7,5,Finished,0,10,7,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Miami Grand Prix,2023-05-07,Guanyu Zhou,Chinese,Alfa Romeo,14,16,Finished,0,6,0,3,1,0
96,Miami Grand Prix,2023-05-07,Lando Norris,British,McLaren,16,17,Finished,0,9,6,3,1,0
97,Miami Grand Prix,2023-05-07,Nyck de Vries,Dutch,AlphaTauri,15,18,Finished,0,14,1,3,1,0
98,Miami Grand Prix,2023-05-07,Oscar Piastri,Australian,McLaren,19,19,+1 Lap,1,15,6,3,0,0


In [64]:
df['Date'] = pd.to_datetime(df['Date'])

# Encode categorical variables: Driver, Constructor, and Race
le_driver = LabelEncoder()
df['Driver_Encoded'] = le_driver.fit_transform(df['Driver'])

le_constructor = LabelEncoder()
df['Constructor_Encoded'] = le_constructor.fit_transform(df['Constructor'])

le_race = LabelEncoder()
df['Race_Encoded'] = le_race.fit_transform(df['Race'])

# Convert 'Status' to a binary (Finished = 1, DNF = 0)
df['Finished'] = df['Status'].apply(lambda x: 1 if x == 'Finished' else 0)

# Create the target: winner is defined as Finish_Position == 1
df['is_winner'] = df['Finish_Position'].apply(lambda x: 1 if x == 1 else 0)

# Feature engineering: For simplicity, let's use just these features, but feel free to add more.
features = ['Grid', 'Driver_Encoded', 'Constructor_Encoded', 'Race_Encoded', 'Finished']
X = df[features]
y = df['is_winner']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# Train the model
model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, model.predict_proba(X_test))

print(f"Accuracy: {accuracy:.4f}")
print(f"Log Loss: {logloss:.4f}")

Accuracy: 0.8500
Log Loss: 0.3442


In [66]:
# Example with actual names

# Assume 'Lewis Hamilton', 'Mercedes', and 'Monza' are present in your label encoders
driver_name = 'Lando Norris'
constructor_name = 'McLaren'
race_name = 'Bahrain Grand Prix'

# Now transform these using the fitted encoders
driver_encoded = le_driver.transform([driver_name])[0]
constructor_encoded = le_constructor.transform([constructor_name])[0]
race_encoded = le_race.transform([race_name])[0]
# Add the new label to the encoder if it's not already present
#

# Create new race data
new_race_data = {
    'Grid': [1],  # Starting from pole position
    'Driver_Encoded': [driver_encoded],
    'Constructor_Encoded': [constructor_encoded],
    'Race_Encoded': [race_encoded],
    'Finished': [1]  # Assuming he finishes
}

# Create DataFrame
new_race_df = pd.DataFrame(new_race_data)

# Make the prediction
winner_prob = model.predict_proba(new_race_df[features])

# Output the probability of winning
print(f"Predicted winning probability for {driver_name}: {winner_prob[0][1]:.4f}")


Predicted winning probability for Lando Norris: 0.0121


In [67]:
# Full list of 20 drivers and their constructors
all_drivers = [
    'Max Verstappen', 'Sergio Pérez',           # Red Bull
    'Lewis Hamilton', 'George Russell',          # Mercedes
    'Charles Leclerc', 'Carlos Sainz',            # Ferrari
    'Lando Norris', 'Oscar Piastri',              # McLaren
    'Fernando Alonso', 'Lance Stroll',            # Aston Martin
    'Esteban Ocon', 'Pierre Gasly',               # Alpine F1 Team
    'Yuki Tsunoda', 'Nyck de Vries',           # RB (formerly AlphaTauri)
    'Valtteri Bottas', 'Guanyu Zhou',             # Sauber (formerly Alfa Romeo)
    'Kevin Magnussen', 'Nico Hülkenberg',         # Haas
    'Alexander Albon', 'Logan Sargeant'           # Williams
]

# Matching constructors
constructor_map = {
    'Max Verstappen': 'Red Bull',
    'Sergio Pérez': 'Red Bull',
    'Lewis Hamilton': 'Mercedes',
    'George Russell': 'Mercedes',
    'Charles Leclerc': 'Ferrari',
    'Carlos Sainz': 'Ferrari',
    'Lando Norris': 'McLaren',
    'Oscar Piastri': 'McLaren',
    'Fernando Alonso': 'Aston Martin',
    'Lance Stroll': 'Aston Martin',
    'Esteban Ocon': 'Alpine F1 Team',
    'Pierre Gasly': 'Alpine F1 Team',
    'Yuki Tsunoda': 'AlphaTauri',  # Red Bull's sister team (formerly AlphaTauri)
    'Nyck de Vries': 'AlphaTauri',
    'Valtteri Bottas': 'Alfa Romeo',
    'Guanyu Zhou': 'Alfa Romeo',
    'Kevin Magnussen': 'Haas F1 Team',
    'Nico Hülkenberg': 'Haas F1 Team',
    'Alexander Albon': 'Williams',
    'Logan Sargeant': 'Williams'
}


In [68]:
# List of all drivers you want to predict for
#
# Race name
race_name = 'Bahrain Grand Prix'

# Empty list to collect data
new_race_data = []

# Fill the data for each driver
for driver_name in all_drivers:
    driver_encoded = le_driver.transform([driver_name])[0]
    constructor_encoded = le_constructor.transform([constructor_map[driver_name]])[0]
    race_encoded = le_race.transform([race_name])[0]
    
    new_race_data.append({
        'Driver': driver_name,
        'Grid': 1,  # assuming starting from pole; you can customize for real grid positions
        'Driver_Encoded': driver_encoded,
        'Constructor_Encoded': constructor_encoded,
        'Race_Encoded': race_encoded,
        'Finished': 1  # assuming all finish
    })

# Create DataFrame
new_race_df = pd.DataFrame(new_race_data)

# Predict probabilities
winner_prob = model.predict_proba(new_race_df[features])

# Add prediction column
new_race_df['Win_Probability'] = winner_prob[:, 1]  # Probability of finishing 1st

# Find the driver with the highest probability
predicted_winner = new_race_df.loc[new_race_df['Win_Probability'].idxmax()]

# Output
print(f"Predicted winner: {predicted_winner['Driver']} with probability {predicted_winner['Win_Probability']:.4f}")

# Optional: See all driver probabilities
print("\nAll drivers' winning probabilities:")
print(new_race_df[['Driver', 'Win_Probability']].sort_values(by='Win_Probability', ascending=False))


Predicted winner: Sergio Pérez with probability 0.3498

All drivers' winning probabilities:
             Driver  Win_Probability
1      Sergio Pérez         0.349778
0    Max Verstappen         0.123318
14  Valtteri Bottas         0.093009
12     Yuki Tsunoda         0.093009
18  Alexander Albon         0.060418
19   Logan Sargeant         0.060418
11     Pierre Gasly         0.036433
7     Oscar Piastri         0.026115
13    Nyck de Vries         0.026115
17  Nico Hülkenberg         0.026115
3    George Russell         0.019189
2    Lewis Hamilton         0.019189
6      Lando Norris         0.012110
8   Fernando Alonso         0.012110
9      Lance Stroll         0.012110
5      Carlos Sainz         0.012110
4   Charles Leclerc         0.012110
15      Guanyu Zhou         0.012110
16  Kevin Magnussen         0.012110
10     Esteban Ocon         0.012110


In [None]:
##
#driver_name = le_driver.inverse_transform([driver_encoded])[0]
#Driver_name = driver_name
# Output the probability of each driver finishing 1st
#print(f"Predicted winner probabilities: {winner_prob}")
#print(f"Driver name: {Driver_name}")


Predicted winner probabilities: [[0.87668204 0.12331796]
 [0.65022206 0.34977794]
 [0.98081136 0.01918866]
 [0.98081136 0.01918866]
 [0.98789024 0.01210978]
 [0.98789024 0.01210978]
 [0.98789024 0.01210978]
 [0.9738853  0.02611468]
 [0.98789024 0.01210978]
 [0.98789024 0.01210978]
 [0.98789024 0.01210978]
 [0.96356654 0.03643343]
 [0.90699077 0.09300926]
 [0.9738853  0.02611468]
 [0.90699077 0.09300926]
 [0.98789024 0.01210978]
 [0.98789024 0.01210978]
 [0.9738853  0.02611468]
 [0.93958163 0.06041835]
 [0.93958163 0.06041835]]
Driver name: Logan Sargeant


In [70]:
import pickle

# Save the LabelEncoders
with open('le_driver.pkl', 'wb') as f:
    pickle.dump(le_driver, f)

with open('le_constructor.pkl', 'wb') as f:
    pickle.dump(le_constructor, f)

with open('le_race.pkl', 'wb') as f:
    pickle.dump(le_race, f)

# Save the trained model
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# (Optional) Save the features list if you want to keep track
with open('features.pkl', 'wb') as f:
    pickle.dump(features, f)
