# Gradient Boosting Model inital attempt

In [35]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [36]:
import xgboost as xgb
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, classification_report, roc_auc_score

In [37]:
df= pd.read_parquet("final_data.parquet")
df

Unnamed: 0,constructorId,constructor_name,constructorStandingsId,raceId_x,constructor_pos,constructor_wins,constructorResultsId,constructor_points,round,circuitId,...,Points_Per_Entry,Years_Active,Champion,driverRef,code,driverStandingsId,raceId_y,points,position,driver_wins
0,1,McLaren,26936.0,989.0,4.0,0.0,15643.0,12.0,1,1,...,5.78273,19,True,alonso,ALO,4,18,5.0,4,0
1,1,McLaren,26936.0,989.0,4.0,0.0,15643.0,12.0,1,1,...,5.78273,19,True,alonso,ALO,12,19,6.0,7,0
2,1,McLaren,26936.0,989.0,4.0,0.0,15643.0,12.0,1,1,...,5.78273,19,True,alonso,ALO,30,20,6.0,9,0
3,1,McLaren,26936.0,989.0,4.0,0.0,15643.0,12.0,1,1,...,5.78273,19,True,alonso,ALO,51,21,6.0,10,0
4,1,McLaren,26936.0,989.0,4.0,0.0,15643.0,12.0,1,1,...,5.78273,19,True,alonso,ALO,72,22,9.0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355645,214,Alpine F1 Team,28708.0,1120.0,6.0,0.0,16886.0,0.0,22,24,...,3.06422,6,False,gasly,GAS,73126,1140,9.0,16,0
355646,214,Alpine F1 Team,28708.0,1120.0,6.0,0.0,16886.0,0.0,22,24,...,3.06422,6,False,gasly,GAS,73172,1141,26.0,12,0
355647,214,Alpine F1 Team,28708.0,1120.0,6.0,0.0,16886.0,0.0,22,24,...,3.06422,6,False,gasly,GAS,73218,1142,26.0,12,0
355648,214,Alpine F1 Team,28708.0,1120.0,6.0,0.0,16886.0,0.0,22,24,...,3.06422,6,False,gasly,GAS,73241,1143,36.0,11,0


In [38]:
#create binary variable for classifier
df['win']= (df['position']== 1).astype(int)

#features to keep for driver win predictions
features= [
    'grid', 'laps', 'results_points', 'fastestLapSpeed',  #race and performance info
    'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed',  #weather/track conditions
    'Race_Entries', 'Race_Starts', 'Pole_Positions', 'Race_Wins', 'Podiums', #rest is for driver performance
    'Fastest_Laps', 'Points', 'Pole_Rate', 'Start_Rate', 'Win_Rate', 'Podium_Rate',
    'FastLap_Rate', 'Points_Per_Entry', 'Years_Active'
]

#convert to numerical
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

#only use rows will full data
df= df.dropna(subset=features+ ['win']+ ['race_name'])

X= df[features]
y= df['win']

In [39]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state= 33)

In [40]:
model= xgb.XGBClassifier(eval_metric='logloss', random_state=22, base_score= 0.5,
                         learning_rate= 0.1, n_estimators= 100, max_depth= 3,
                         subsample= 0.8, colsample_bytree= 0.8)

In [41]:
model.fit(X_train, y_train)

In [42]:
xgbYPred= model.predict(X_test)
xgbYPred_probability= model.predict_proba(X_test)[:, 1]

In [70]:
print("Accuracy: ", accuracy_score(y_test, xgbYPred))
print("AUC Score: ", roc_auc_score(y_test, xgbYPred_probability))
print("\nClassification Report \n", classification_report(y_test, xgbYPred))

Accuracy:  0.9129886506935687
AUC Score:  0.8992882687265593

Classification Report 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     60092
           1       0.00      0.00      0.00      5727

    accuracy                           0.91     65819
   macro avg       0.46      0.50      0.48     65819
weighted avg       0.83      0.91      0.87     65819



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
#Putting predictions in the dataframe
race_winners= df.loc[X_test.index].copy()
race_winners['predicted_winner']= xgbYPred
race_winners['predicted_winner_probability']= xgbYPred_probability

In [69]:
#get the names of all the races
unique_race_names = df['race_name'].unique()
print(unique_race_names)

['Australian Grand Prix' 'Bahrain Grand Prix' 'Chinese Grand Prix'
 'Azerbaijan Grand Prix' 'Spanish Grand Prix' 'Monaco Grand Prix'
 'Canadian Grand Prix' 'French Grand Prix' 'Austrian Grand Prix'
 'British Grand Prix' 'German Grand Prix' 'Hungarian Grand Prix'
 'Italian Grand Prix' 'Singapore Grand Prix' 'Russian Grand Prix'
 'Japanese Grand Prix' 'United States Grand Prix' 'Mexican Grand Prix'
 'Brazilian Grand Prix' 'Abu Dhabi Grand Prix' 'Belgian Grand Prix'
 'Tuscan Grand Prix' 'Eifel Grand Prix' 'Portuguese Grand Prix'
 'Emilia Romagna Grand Prix' 'Turkish Grand Prix' 'Qatar Grand Prix'
 'Styrian Grand Prix' 'Dutch Grand Prix' 'Mexico City Grand Prix'
 'São Paulo Grand Prix' 'Saudi Arabian Grand Prix' 'Miami Grand Prix'
 'Las Vegas Grand Prix']


In [59]:
def get_prediction(race_name):
        filtered= race_winners[race_winners['race_name'] == race_name]
        
        if len(filtered) == 0:
            return f"No data for race: {race_name}"

        filtered= filtered.drop_duplicates(subset=['driverId'])
        
        return filtered.sort_values('predicted_winner_probability', ascending=False)

In [68]:
country= "Bahrain"
race= f"{country} Grand Prix"
race_pred= get_prediction(race)

# print(f"\nPredictions for {race}:")
# print(race_pred[['forename', 'surname', 'predicted_winner_probability']].head(10))

#looked online for how to make it look nicer
print(f"\n--- Predictions for {race} ---")
print(f"{'Position':<8} {'Driver':<25} {'Win Probability':<15}")
print("-" * 50)

for i, (_, row) in enumerate(race_pred.head(5).iterrows(), 1):
    driver_name = f"{row['forename']} {row['surname']}"
    probability = f"{row['predicted_winner_probability']*100:.2f}%"
    print(f"{i:<8} {driver_name:<25} {probability:<15}")

#no lie i stole this code just so it looks nice. Please dont judge me


--- Predictions for Bahrain Grand Prix ---
Position Driver                    Win Probability
--------------------------------------------------
1        Max Verstappen            38.49%         
2        Lewis Hamilton            35.86%         
3        Fernando Alonso           13.29%         
4        Charles Leclerc           3.37%          
5        Valtteri Bottas           2.15%          
