<a href="https://www.kaggle.com/code/vesilehan/f1-winner-predict-model?scriptVersionId=243115855" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Formula-1 2025 Winner Prediction 

## Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [2]:
drivers_df=pd.read_csv('/kaggle/input/f1-races-results-dataset-1950-to-2024/drivers_updated.csv')
drivers_df.head()

Unnamed: 0,Pos,Driver,Nationality,Car,PTS,year,Code
0,1,Nino Farina,ITA,Alfa Romeo,30.0,1950,FAR
1,2,Juan Manuel Fangio,ARG,Alfa Romeo,27.0,1950,FAN
2,3,Luigi Fagioli,ITA,Alfa Romeo,24.0,1950,FAG
3,4,Louis Rosier,FRA,Talbot-Lago,13.0,1950,ROS
4,5,Alberto Ascari,ITA,Ferrari,11.0,1950,ASC


In [3]:
drivers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1661 entries, 0 to 1660
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pos          1661 non-null   object 
 1   Driver       1661 non-null   object 
 2   Nationality  1661 non-null   object 
 3   Car          1650 non-null   object 
 4   PTS          1661 non-null   float64
 5   year         1661 non-null   int64  
 6   Code         1661 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 91.0+ KB


In [4]:
fastest_laps_df=pd.read_csv('/kaggle/input/f1-races-results-dataset-1950-to-2024/fastest_laps_updated.csv')
fastest_laps_df.head()

Unnamed: 0,Grand Prix,Driver,Car,Time,year,Code
0,Great Britain,Nino Farina,Alfa Romeo,1:50.600,1950,FAR
1,Monaco,Juan Manuel Fangio,Alfa Romeo,1:51.000,1950,FAN
2,Indianapolis 500,Johnnie Parsons,Kurtis Kraft Offenhauser,,1950,PAR
3,Switzerland,Nino Farina,Alfa Romeo,2:41.600,1950,FAR
4,Belgium,Nino Farina,Alfa Romeo,4:34.100,1950,FAR


In [5]:
fastest_laps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Grand Prix  1108 non-null   object
 1   Driver      1108 non-null   object
 2   Car         1108 non-null   object
 3   Time        1107 non-null   object
 4   year        1108 non-null   int64 
 5   Code        1108 non-null   object
dtypes: int64(1), object(5)
memory usage: 52.1+ KB


In [6]:
teams_df=pd.read_csv('/kaggle/input/f1-races-results-dataset-1950-to-2024/teams_updated.csv')
teams_df.head()

Unnamed: 0,Pos,Team,PTS,year
0,1,Vanwall,48.0,1958
1,2,Ferrari,40.0,1958
2,3,Cooper Climax,31.0,1958
3,4,BRM,18.0,1958
4,5,Maserati,6.0,1958


In [7]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pos     695 non-null    object 
 1   Team    695 non-null    object 
 2   PTS     695 non-null    float64
 3   year    695 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 21.8+ KB


In [8]:
winners_df=pd.read_csv('/kaggle/input/f1-races-results-dataset-1950-to-2024/winners.csv')
winners_df.head()

Unnamed: 0,Grand Prix,Date,Winner,Car,Laps,Time,Name Code
0,Great Britain,1950-05-13,Nino Farina,Alfa Romeo,70.0,2:13:23.600,FAR
1,Monaco,1950-05-21,Juan Manuel Fangio,Alfa Romeo,100.0,3:13:18.700,FAN
2,Indianapolis 500,1950-05-30,Johnnie Parsons,Kurtis Kraft Offenhauser,138.0,2:46:55.970,PAR
3,Switzerland,1950-06-04,Nino Farina,Alfa Romeo,42.0,2:02:53.700,FAR
4,Belgium,1950-06-18,Juan Manuel Fangio,Alfa Romeo,35.0,2:47:26.000,FAN


In [9]:
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1110 entries, 0 to 1109
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Grand Prix  1110 non-null   object 
 1   Date        1110 non-null   object 
 2   Winner      1110 non-null   object 
 3   Car         1110 non-null   object 
 4   Laps        1107 non-null   float64
 5   Time        1107 non-null   object 
 6   Name Code   1110 non-null   object 
dtypes: float64(1), object(6)
memory usage: 60.8+ KB


## Data Preprocessing and Feature Engineering

In [10]:
winners_df['Date']=pd.to_datetime(winners_df['Date'])
winners_df.head()

Unnamed: 0,Grand Prix,Date,Winner,Car,Laps,Time,Name Code
0,Great Britain,1950-05-13,Nino Farina,Alfa Romeo,70.0,2:13:23.600,FAR
1,Monaco,1950-05-21,Juan Manuel Fangio,Alfa Romeo,100.0,3:13:18.700,FAN
2,Indianapolis 500,1950-05-30,Johnnie Parsons,Kurtis Kraft Offenhauser,138.0,2:46:55.970,PAR
3,Switzerland,1950-06-04,Nino Farina,Alfa Romeo,42.0,2:02:53.700,FAR
4,Belgium,1950-06-18,Juan Manuel Fangio,Alfa Romeo,35.0,2:47:26.000,FAN


In [11]:
winners_df['year']=winners_df['Date'].dt.year
winners_df.head()

Unnamed: 0,Grand Prix,Date,Winner,Car,Laps,Time,Name Code,year
0,Great Britain,1950-05-13,Nino Farina,Alfa Romeo,70.0,2:13:23.600,FAR,1950
1,Monaco,1950-05-21,Juan Manuel Fangio,Alfa Romeo,100.0,3:13:18.700,FAN,1950
2,Indianapolis 500,1950-05-30,Johnnie Parsons,Kurtis Kraft Offenhauser,138.0,2:46:55.970,PAR,1950
3,Switzerland,1950-06-04,Nino Farina,Alfa Romeo,42.0,2:02:53.700,FAR,1950
4,Belgium,1950-06-18,Juan Manuel Fangio,Alfa Romeo,35.0,2:47:26.000,FAN,1950


In [12]:
winners_df['year']=winners_df['year'].astype('int64')
winners_df.head()

Unnamed: 0,Grand Prix,Date,Winner,Car,Laps,Time,Name Code,year
0,Great Britain,1950-05-13,Nino Farina,Alfa Romeo,70.0,2:13:23.600,FAR,1950
1,Monaco,1950-05-21,Juan Manuel Fangio,Alfa Romeo,100.0,3:13:18.700,FAN,1950
2,Indianapolis 500,1950-05-30,Johnnie Parsons,Kurtis Kraft Offenhauser,138.0,2:46:55.970,PAR,1950
3,Switzerland,1950-06-04,Nino Farina,Alfa Romeo,42.0,2:02:53.700,FAR,1950
4,Belgium,1950-06-18,Juan Manuel Fangio,Alfa Romeo,35.0,2:47:26.000,FAN,1950


In [13]:
winners_df.rename(columns={'Name Code':'driver_code','Car':'Team'},inplace=True)
winners_df.columns

Index(['Grand Prix', 'Date', 'Winner', 'Team', 'Laps', 'Time', 'driver_code',
       'year'],
      dtype='object')

In [14]:
winners_df.isna().sum()

Grand Prix     0
Date           0
Winner         0
Team           0
Laps           3
Time           3
driver_code    0
year           0
dtype: int64

In [15]:
winners_df.dropna(axis=0,inplace=True)
winners_df.isna().sum()

Grand Prix     0
Date           0
Winner         0
Team           0
Laps           0
Time           0
driver_code    0
year           0
dtype: int64

In [16]:
teams_df.rename(columns={'Team':'Team_name'},inplace=True)
teams_df.columns

Index(['Pos', 'Team_name', 'PTS', 'year'], dtype='object')

In [17]:
teams_df.isna().sum()

Pos          0
Team_name    0
PTS          0
year         0
dtype: int64

In [18]:
teams_df.dropna(axis=0,inplace=True)
teams_df.isna().sum()

Pos          0
Team_name    0
PTS          0
year         0
dtype: int64

In [19]:
fastest_laps_df.rename(columns={'Car':'Team'},inplace=True)
fastest_laps_df.columns

Index(['Grand Prix', 'Driver', 'Team', 'Time', 'year', 'Code'], dtype='object')

In [20]:
fastest_laps_df.isna().sum()

Grand Prix    0
Driver        0
Team          0
Time          1
year          0
Code          0
dtype: int64

In [21]:
fastest_laps_df.dropna(axis=0,inplace=True)
fastest_laps_df.isna().sum()

Grand Prix    0
Driver        0
Team          0
Time          0
year          0
Code          0
dtype: int64

In [22]:
drivers_df.rename(columns={'Car':'Team'},inplace=True)
drivers_df.columns

Index(['Pos', 'Driver', 'Nationality', 'Team', 'PTS', 'year', 'Code'], dtype='object')

In [23]:
drivers_df.isna().sum()

Pos             0
Driver          0
Nationality     0
Team           11
PTS             0
year            0
Code            0
dtype: int64

In [24]:
drivers_df.dropna(axis=0,inplace=True)
drivers_df.isna().sum()

Pos            0
Driver         0
Nationality    0
Team           0
PTS            0
year           0
Code           0
dtype: int64

In [25]:
#Process 'Time'columns to seconds
def time_to_seconds(time_str):
    if pd.isna(time_str):
        return np.nan
    parts = str(time_str).split(':')
    if len(parts) == 3: # h:m:s
        return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
    elif len(parts) == 2: # m:s
        return float(parts[0]) * 60 + float(parts[1])
    else: # just seconds, or invalid format
        return float(time_str)

In [26]:
winners_df['Time_seconds']=winners_df['Time'].apply(time_to_seconds)
winners_df.drop(columns='Time',inplace=True)
winners_df.head()

Unnamed: 0,Grand Prix,Date,Winner,Team,Laps,driver_code,year,Time_seconds
0,Great Britain,1950-05-13,Nino Farina,Alfa Romeo,70.0,FAR,1950,8003.6
1,Monaco,1950-05-21,Juan Manuel Fangio,Alfa Romeo,100.0,FAN,1950,11598.7
2,Indianapolis 500,1950-05-30,Johnnie Parsons,Kurtis Kraft Offenhauser,138.0,PAR,1950,10015.97
3,Switzerland,1950-06-04,Nino Farina,Alfa Romeo,42.0,FAR,1950,7373.7
4,Belgium,1950-06-18,Juan Manuel Fangio,Alfa Romeo,35.0,FAN,1950,10046.0


In [27]:
fastest_laps_df['Time_seconds']=fastest_laps_df['Time'].apply(time_to_seconds)
fastest_laps_df.drop(columns='Time',inplace=True)
fastest_laps_df.head()

Unnamed: 0,Grand Prix,Driver,Team,year,Code,Time_seconds
0,Great Britain,Nino Farina,Alfa Romeo,1950,FAR,110.6
1,Monaco,Juan Manuel Fangio,Alfa Romeo,1950,FAN,111.0
3,Switzerland,Nino Farina,Alfa Romeo,1950,FAR,161.6
4,Belgium,Nino Farina,Alfa Romeo,1950,FAR,274.1
5,France,Juan Manuel Fangio,Alfa Romeo,1950,FAN,155.6


In [28]:
races=winners_df[['Grand Prix','Date','year']].drop_duplicates().sort_values(['year','Date']).reset_index(drop=True)
races.head()

Unnamed: 0,Grand Prix,Date,year
0,Great Britain,1950-05-13,1950
1,Monaco,1950-05-21,1950
2,Indianapolis 500,1950-05-30,1950
3,Switzerland,1950-06-04,1950
4,Belgium,1950-06-18,1950


In [29]:
# Merge all drivers and their teams for each year.This creates (race, driver, team) combinations for ALL active drivers in that year
base_df = races.merge(drivers_df[['Driver', 'Team', 'Nationality', 'year']], on='year', how='left')
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality
0,Great Britain,1950-05-13,1950,Nino Farina,Alfa Romeo,ITA
1,Great Britain,1950-05-13,1950,Juan Manuel Fangio,Alfa Romeo,ARG
2,Great Britain,1950-05-13,1950,Luigi Fagioli,Alfa Romeo,ITA
3,Great Britain,1950-05-13,1950,Louis Rosier,Talbot-Lago,FRA
4,Great Britain,1950-05-13,1950,Alberto Ascari,Ferrari,ITA


In [30]:
#add teams_stats
base_df=base_df.merge(teams_df[['Team_name','PTS','Pos','year']],
                     left_on=['Team','year'],
                     right_on=['Team_name','year'],
                     how='left',
                     suffixes=('_driver_car','_team_stats'))
base_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS,Pos
0,Great Britain,1950-05-13,1950,Nino Farina,Alfa Romeo,ITA,,,
1,Great Britain,1950-05-13,1950,Juan Manuel Fangio,Alfa Romeo,ARG,,,
2,Great Britain,1950-05-13,1950,Luigi Fagioli,Alfa Romeo,ITA,,,
3,Great Britain,1950-05-13,1950,Louis Rosier,Talbot-Lago,FRA,,,
4,Great Britain,1950-05-13,1950,Alberto Ascari,Ferrari,ITA,,,


In [31]:
# Add driver points (Pos, PTS for that year)
base_df = base_df.merge( drivers_df[['Driver', 'PTS', 'Pos', 'year']],
                   left_on=['Driver', 'year'],
                   right_on=['Driver', 'year'],
                   how='left',
                   suffixes=('_overall', '_driver_stats'))
base_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats
0,Great Britain,1950-05-13,1950,Nino Farina,Alfa Romeo,ITA,,,,30.0,1
1,Great Britain,1950-05-13,1950,Juan Manuel Fangio,Alfa Romeo,ARG,,,,27.0,2
2,Great Britain,1950-05-13,1950,Luigi Fagioli,Alfa Romeo,ITA,,,,24.0,3
3,Great Britain,1950-05-13,1950,Louis Rosier,Talbot-Lago,FRA,,,,13.0,4
4,Great Britain,1950-05-13,1950,Alberto Ascari,Ferrari,ITA,,,,11.0,5


In [32]:
# Merge the two DataFrames on 'Grand Prix' and 'Date'
merged_df = base_df.merge(winners_df[['Grand Prix', 'Date', 'Winner']], 
                           on=['Grand Prix', 'Date'], 
                           how='left')

# Create the 'Did_Win' column by checking if Driver matches Winner
merged_df['Did_win'] = merged_df['Driver'] == merged_df['Winner']

In [33]:
# Mark the actual winner for each race-driver combination
base_df['Did_win'] = merged_df['Did_win']
base_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win
0,Great Britain,1950-05-13,1950,Nino Farina,Alfa Romeo,ITA,,,,30.0,1,True
1,Great Britain,1950-05-13,1950,Juan Manuel Fangio,Alfa Romeo,ARG,,,,27.0,2,False
2,Great Britain,1950-05-13,1950,Luigi Fagioli,Alfa Romeo,ITA,,,,24.0,3,False
3,Great Britain,1950-05-13,1950,Louis Rosier,Talbot-Lago,FRA,,,,13.0,4,False
4,Great Britain,1950-05-13,1950,Alberto Ascari,Ferrari,ITA,,,,11.0,5,False


In [34]:
base_df['Did_win']=base_df['Did_win'].astype('int64')
base_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win
0,Great Britain,1950-05-13,1950,Nino Farina,Alfa Romeo,ITA,,,,30.0,1,1
1,Great Britain,1950-05-13,1950,Juan Manuel Fangio,Alfa Romeo,ARG,,,,27.0,2,0
2,Great Britain,1950-05-13,1950,Luigi Fagioli,Alfa Romeo,ITA,,,,24.0,3,0
3,Great Britain,1950-05-13,1950,Louis Rosier,Talbot-Lago,FRA,,,,13.0,4,0
4,Great Britain,1950-05-13,1950,Alberto Ascari,Ferrari,ITA,,,,11.0,5,0


### Handling Missing Values

In [35]:
base_df.isna().sum()

Grand Prix             0
Date                   0
year                   0
Driver                 0
Team                   0
Nationality            0
Team_name           1949
PTS_overall         1949
Pos_overall         1949
PTS_driver_stats       0
Pos_driver_stats       0
Did_win                0
dtype: int64

In [36]:
base_df.dropna(axis=0,inplace=True)
base_df.isna().sum()

Grand Prix          0
Date                0
year                0
Driver              0
Team                0
Nationality         0
Team_name           0
PTS_overall         0
Pos_overall         0
PTS_driver_stats    0
Pos_driver_stats    0
Did_win             0
dtype: int64

In [37]:
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win
1417,Argentina,1958-01-19,1958,Mike Hawthorn,Ferrari,GBR,Ferrari,40.0,2,42.0,1,0
1418,Argentina,1958-01-19,1958,Stirling Moss,Vanwall,GBR,Vanwall,48.0,1,41.0,2,1
1419,Argentina,1958-01-19,1958,Tony Brooks,Vanwall,GBR,Vanwall,48.0,1,24.0,3,0
1420,Argentina,1958-01-19,1958,Roy Salvadori,Cooper Climax,GBR,Cooper Climax,31.0,3,15.0,4,0
1421,Argentina,1958-01-19,1958,Peter Collins,Ferrari,GBR,Ferrari,40.0,2,14.0,5,0


In [38]:
base_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22484 entries, 1417 to 24432
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Grand Prix        22484 non-null  object        
 1   Date              22484 non-null  datetime64[ns]
 2   year              22484 non-null  int64         
 3   Driver            22484 non-null  object        
 4   Team              22484 non-null  object        
 5   Nationality       22484 non-null  object        
 6   Team_name         22484 non-null  object        
 7   PTS_overall       22484 non-null  float64       
 8   Pos_overall       22484 non-null  object        
 9   PTS_driver_stats  22484 non-null  float64       
 10  Pos_driver_stats  22484 non-null  object        
 11  Did_win           22484 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(7)
memory usage: 2.2+ MB


### Feature Engineering

In [39]:
categorical_col = ['Grand Prix', 'Driver', 'Team', 'Team_name', 'Nationality']
numerical_col = ['Pos_overall', 'Pos_driver_stats']

for col in base_df.columns:
    if col in categorical_col:
        base_df[col] = base_df[col].astype('category')
    elif col in numerical_col:
        # Convert and handle non-numeric values
        base_df[col] = pd.to_numeric(base_df[col], errors='coerce').astype('Int64') 

# Check the DataFrame info
base_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22484 entries, 1417 to 24432
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Grand Prix        22484 non-null  category      
 1   Date              22484 non-null  datetime64[ns]
 2   year              22484 non-null  int64         
 3   Driver            22484 non-null  category      
 4   Team              22484 non-null  category      
 5   Nationality       22484 non-null  category      
 6   Team_name         22484 non-null  category      
 7   PTS_overall       22484 non-null  float64       
 8   Pos_overall       22450 non-null  Int64         
 9   PTS_driver_stats  22484 non-null  float64       
 10  Pos_driver_stats  22467 non-null  Int64         
 11  Did_win           22484 non-null  int64         
dtypes: Int64(2), category(5), datetime64[ns](1), float64(2), int64(2)
memory usage: 1.6 MB


In [40]:
base_df.dropna(inplace=True)
base_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22433 entries, 1417 to 24432
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Grand Prix        22433 non-null  category      
 1   Date              22433 non-null  datetime64[ns]
 2   year              22433 non-null  int64         
 3   Driver            22433 non-null  category      
 4   Team              22433 non-null  category      
 5   Nationality       22433 non-null  category      
 6   Team_name         22433 non-null  category      
 7   PTS_overall       22433 non-null  float64       
 8   Pos_overall       22433 non-null  Int64         
 9   PTS_driver_stats  22433 non-null  float64       
 10  Pos_driver_stats  22433 non-null  Int64         
 11  Did_win           22433 non-null  int64         
dtypes: Int64(2), category(5), datetime64[ns](1), float64(2), int64(2)
memory usage: 1.6 MB


In [41]:
base_df['race_month']=base_df['Date'].dt.month
base_df['race_weekofyear']=base_df['Date'].dt.isocalendar().week
base_df['race_dayofweek']=base_df['Date'].dt.weekday         # 0=Mon, …, 6=Sun
base_df['race_is_weekend']=(base_df['race_dayofweek'] >= 5)
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win,race_month,race_weekofyear,race_dayofweek,race_is_weekend
1417,Argentina,1958-01-19,1958,Mike Hawthorn,Ferrari,GBR,Ferrari,40.0,2,42.0,1,0,1,3,6,True
1418,Argentina,1958-01-19,1958,Stirling Moss,Vanwall,GBR,Vanwall,48.0,1,41.0,2,1,1,3,6,True
1419,Argentina,1958-01-19,1958,Tony Brooks,Vanwall,GBR,Vanwall,48.0,1,24.0,3,0,1,3,6,True
1420,Argentina,1958-01-19,1958,Roy Salvadori,Cooper Climax,GBR,Cooper Climax,31.0,3,15.0,4,0,1,3,6,True
1421,Argentina,1958-01-19,1958,Peter Collins,Ferrari,GBR,Ferrari,40.0,2,14.0,5,0,1,3,6,True


In [42]:
#Compute race_number within each season
base_df = base_df.sort_values(['year', 'Date']).reset_index(drop=True)
base_df['race_number'] = base_df.groupby('year').cumcount() + 1
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win,race_month,race_weekofyear,race_dayofweek,race_is_weekend,race_number
0,Argentina,1958-01-19,1958,Mike Hawthorn,Ferrari,GBR,Ferrari,40.0,2,42.0,1,0,1,3,6,True,1
1,Argentina,1958-01-19,1958,Stirling Moss,Vanwall,GBR,Vanwall,48.0,1,41.0,2,1,1,3,6,True,2
2,Argentina,1958-01-19,1958,Tony Brooks,Vanwall,GBR,Vanwall,48.0,1,24.0,3,0,1,3,6,True,3
3,Argentina,1958-01-19,1958,Roy Salvadori,Cooper Climax,GBR,Cooper Climax,31.0,3,15.0,4,0,1,3,6,True,4
4,Argentina,1958-01-19,1958,Peter Collins,Ferrari,GBR,Ferrari,40.0,2,14.0,5,0,1,3,6,True,5


In [43]:
# Compute days_since_last_race for each driver
base_df = base_df.sort_values(['Driver', 'Date']).reset_index(drop=True)
base_df['prev_race_date']= base_df.groupby('Driver')['Date'].shift(1)
base_df['days_since_last_race'] = (base_df['Date'] - base_df['prev_race_date']).dt.days.fillna(0).astype(int)
base_df.head()

  base_df['prev_race_date']= base_df.groupby('Driver')['Date'].shift(1)


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,Pos_driver_stats,Did_win,race_month,race_weekofyear,race_dayofweek,race_is_weekend,race_number,prev_race_date,days_since_last_race
0,Australia,2007-03-18,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,19,0,3,11,6,True,17,NaT,0
1,Malaysia,2007-04-08,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,19,0,4,14,6,True,40,2007-03-18,21
2,Bahrain,2007-04-15,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,19,0,4,15,6,True,63,2007-04-08,7
3,Spain,2007-05-13,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,19,0,5,19,6,True,86,2007-04-15,28
4,Monaco,2007-05-27,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,19,0,5,21,6,True,109,2007-05-13,14


In [44]:
gp_dummies = pd.get_dummies(base_df['Grand Prix'], prefix='GP')
base_df = pd.concat([base_df, gp_dummies], axis=1)

In [45]:
base_df=base_df.sort_values(['Driver', 'Date']).reset_index(drop=True)

In [46]:
# Label-encode driver names
label_encoder= LabelEncoder()
base_df['driver_id'] = label_encoder.fit_transform(base_df['Driver'])
base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22433 entries, 0 to 22432
Data columns (total 72 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Grand Prix            22433 non-null  category      
 1   Date                  22433 non-null  datetime64[ns]
 2   year                  22433 non-null  int64         
 3   Driver                22433 non-null  category      
 4   Team                  22433 non-null  category      
 5   Nationality           22433 non-null  category      
 6   Team_name             22433 non-null  category      
 7   PTS_overall           22433 non-null  float64       
 8   Pos_overall           22433 non-null  Int64         
 9   PTS_driver_stats      22433 non-null  float64       
 10  Pos_driver_stats      22433 non-null  Int64         
 11  Did_win               22433 non-null  int64         
 12  race_month            22433 non-null  int32         
 13  race_weekofyear 

### Driver-Specific Historical Performance

In [47]:
# driver_total_races_before
base_df['driver_races_before'] = base_df.groupby('Driver').cumcount()

# driver_wins_cum (cumulative sum of Did_win, shifted by 1 to avoid leakage)
base_df['driver_wins_cum'] = base_df.groupby('Driver')['Did_win'].cumsum().shift(1).fillna(0)
base_df.head()

  base_df['driver_races_before'] = base_df.groupby('Driver').cumcount()
  base_df['driver_wins_cum'] = base_df.groupby('Driver')['Did_win'].cumsum().shift(1).fillna(0)


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,GP_Sweden,GP_Switzerland,GP_Turkey,GP_Tuscany,GP_USA East,GP_USA West,GP_United States,driver_id,driver_races_before,driver_wins_cum
0,Australia,2007-03-18,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,False,0,0,0.0
1,Malaysia,2007-04-08,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,False,0,1,0.0
2,Bahrain,2007-04-15,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,False,0,2,0.0
3,Spain,2007-05-13,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,False,0,3,0.0
4,Monaco,2007-05-27,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,False,0,4,0.0


In [48]:
# driver_win_rate_before
base_df['driver_win_rate_before'] = (
    base_df['driver_wins_cum'] / 
    base_df['driver_races_before'].replace({0: np.nan})
).fillna(0)
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,GP_Switzerland,GP_Turkey,GP_Tuscany,GP_USA East,GP_USA West,GP_United States,driver_id,driver_races_before,driver_wins_cum,driver_win_rate_before
0,Australia,2007-03-18,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,0,0,0.0,0.0
1,Malaysia,2007-04-08,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,0,1,0.0,0.0
2,Bahrain,2007-04-15,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,0,2,0.0,0.0
3,Spain,2007-05-13,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,0,3,0.0,0.0
4,Monaco,2007-05-27,2007,Adrian Sutil,Spyker Ferrari,GER,Spyker Ferrari,1.0,10,1.0,...,False,False,False,False,False,False,0,4,0.0,0.0


In [49]:
# is_podium flag (assuming Pos_driver_stats ≤ 3 means podium)
base_df['is_podium'] = (base_df['Pos_driver_stats'].astype(int) <= 3).astype(int)

# driver_podiums_cum (cumulative sum of is_podium, shifted by 1)
base_df['driver_podiums_cum'] = (
    base_df.groupby('Driver')['is_podium']
    .cumsum().shift(1).fillna(0)
)

  base_df.groupby('Driver')['is_podium']


In [50]:
# driver_podium_rate_before
base_df['driver_podium_rate_before'] = (
    base_df['driver_podiums_cum'] / 
    base_df['driver_races_before'].replace({0: np.nan})
).fillna(0)

# driver_wins_last5 (rolling window of size 5, shifted by 1)
base_df['driver_wins_last5'] = (
    base_df.groupby('Driver')['Did_win']
           .apply(lambda x: x.shift(1).rolling(window=5, min_periods=1).sum())
           .reset_index(level=0, drop=True)
)

# driver_avg_finish_last5 (rolling mean of Pos_driver_stats, shifted by 1)
base_df['driver_avg_finish_last5'] = (
    base_df.groupby('Driver')['Pos_driver_stats']
           .apply(lambda x: x.astype(int).shift(1).rolling(window=5, min_periods=1).mean())
           .reset_index(level=0, drop=True)
)

  base_df.groupby('Driver')['Did_win']
  base_df.groupby('Driver')['Pos_driver_stats']


### Team-Specific Historical Performance

In [51]:
base_df = base_df.sort_values(['Team', 'Date']).reset_index(drop=True)

# Label-encode team names
team_le        = LabelEncoder()
base_df['team_id'] = team_le.fit_transform(base_df['Team'])

# team_total_races_before
base_df['team_races_before'] = base_df.groupby('Team').cumcount()


  base_df['team_races_before'] = base_df.groupby('Team').cumcount()


In [52]:
# team_wins_cum (shifted by 1)
base_df['team_wins_cum'] = (
    base_df.groupby('Team')['Did_win']
    .cumsum().shift(1).fillna(0)
)

# team_win_rate_before
base_df['team_win_rate_before'] = (
    base_df['team_wins_cum'] / 
    base_df['team_races_before'].replace({0: np.nan})
).fillna(0)

  base_df.groupby('Team')['Did_win']


In [53]:
# team_podiums_last5 (rolling sum of is_podium, shifted by 1)
base_df['team_podiums_last5'] = (
    base_df.groupby('Team')['is_podium']
           .apply(lambda x: x.shift(1).rolling(window=5, min_periods=1).sum())
           .reset_index(level=0, drop=True)
)

  base_df.groupby('Team')['is_podium']


### Championship Points & Positions

In [54]:
scaler = MinMaxScaler()
base_df[['PTS_overall_scaled', 'PTS_driver_stats_scaled']] = scaler.fit_transform(
    base_df[['PTS_overall', 'PTS_driver_stats']]
)

In [55]:
# Convert Pos_overall and Pos_driver_stats to ordinal integers
base_df['pos_overall_ordinal']      = base_df['Pos_overall'].astype(int)
base_df['pos_driver_stats_ordinal'] = base_df['Pos_driver_stats'].astype(int)
base_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,driver_avg_finish_last5,team_id,team_races_before,team_wins_cum,team_win_rate_before,team_podiums_last5,PTS_overall_scaled,PTS_driver_stats_scaled,pos_overall_ordinal,pos_driver_stats_ordinal
0,Brazil,1987-04-12,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,,0,0,0.0,0.0,,0.001163,0.001739,13,21
1,San Marino,1987-05-03,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,21.0,0,1,0.0,0.0,0.0,0.001163,0.001739,13,21
2,Belgium,1987-05-17,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,21.0,0,2,0.0,0.0,0.0,0.001163,0.001739,13,21
3,Monaco,1987-05-31,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,21.0,0,3,0.0,0.0,0.0,0.001163,0.001739,13,21
4,Detroit,1987-06-21,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,21.0,0,4,0.0,0.0,0.0,0.001163,0.001739,13,21


In [56]:
#Nationality Encoding
nat_dummies = pd.get_dummies(base_df['Nationality'], prefix='Nat')
base_df = pd.concat([base_df, nat_dummies], axis=1)
base_df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,Nat_POR,Nat_RAF,Nat_RHO,Nat_RSA,Nat_RUS,Nat_SUI,Nat_SWE,Nat_THA,Nat_USA,Nat_VEN
0,Brazil,1987-04-12,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,False,False,False,False,False,False
1,San Marino,1987-05-03,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,False,False,False,False,False,False
2,Belgium,1987-05-17,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,False,False,False,False,False,False
3,Monaco,1987-05-31,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,False,False,False,False,False,False
4,Detroit,1987-06-21,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,False,False,False,False,False,False


In [57]:
# Driver-GP win rate before
base_df = base_df.sort_values(['Driver', 'Grand Prix', 'Date']).reset_index(drop=True)
base_df['driver_gp_races_before'] = base_df.groupby(['Driver', 'Grand Prix']).cumcount()
base_df['driver_gp_wins_cum']     = (
    base_df.groupby(['Driver', 'Grand Prix'])['Did_win']
    .cumsum().shift(1).fillna(0)
)
base_df['driver_gp_win_rate_before'] = (
    base_df['driver_gp_wins_cum'] / 
    base_df['driver_gp_races_before'].replace({0: np.nan})
).fillna(0)
base_df.head()

  base_df['driver_gp_races_before'] = base_df.groupby(['Driver', 'Grand Prix']).cumcount()
  base_df.groupby(['Driver', 'Grand Prix'])['Did_win']


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,Nat_RSA,Nat_RUS,Nat_SUI,Nat_SWE,Nat_THA,Nat_USA,Nat_VEN,driver_gp_races_before,driver_gp_wins_cum,driver_gp_win_rate_before
0,Abu Dhabi,2009-11-01,2009,Adrian Sutil,Force India Mercedes,GER,Force India Mercedes,13.0,9,5.0,...,False,False,False,False,False,False,False,0,0.0,0.0
1,Abu Dhabi,2010-11-14,2010,Adrian Sutil,Force India Mercedes,GER,Force India Mercedes,68.0,7,47.0,...,False,False,False,False,False,False,False,1,0.0,0.0
2,Abu Dhabi,2011-11-13,2011,Adrian Sutil,Force India Mercedes,GER,Force India Mercedes,69.0,6,42.0,...,False,False,False,False,False,False,False,2,0.0,0.0
3,Abu Dhabi,2013-11-03,2013,Adrian Sutil,Force India Mercedes,GER,Force India Mercedes,77.0,6,29.0,...,False,False,False,False,False,False,False,3,0.0,0.0
4,Abu Dhabi,2014-11-23,2014,Adrian Sutil,Sauber Ferrari,GER,Sauber Ferrari,0.0,10,0.0,...,False,False,False,False,False,False,False,4,0.0,0.0


In [58]:
# Team-GP win rate before
base_df = base_df.sort_values(['Team', 'Grand Prix', 'Date']).reset_index(drop=True)
base_df['team_gp_races_before'] = base_df.groupby(['Team', 'Grand Prix']).cumcount()
base_df['team_gp_wins_cum']     = (
    base_df.groupby(['Team', 'Grand Prix'])['Did_win']
    .cumsum().shift(1).fillna(0)
)
base_df['team_gp_win_rate_before'] = (
    base_df['team_gp_wins_cum'] / 
    base_df['team_gp_races_before'].replace({0: np.nan})
).fillna(0)
base_df.head()

  base_df['team_gp_races_before'] = base_df.groupby(['Team', 'Grand Prix']).cumcount()
  base_df.groupby(['Team', 'Grand Prix'])['Did_win']


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,Nat_SWE,Nat_THA,Nat_USA,Nat_VEN,driver_gp_races_before,driver_gp_wins_cum,driver_gp_win_rate_before,team_gp_races_before,team_gp_wins_cum,team_gp_win_rate_before
0,Australia,1987-11-15,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,0,0.0,0.0,0,0.0,0.0
1,Australia,1989-11-05,1989,Gabriele Tarquini,AGS Ford,ITA,AGS Ford,1.0,15,1.0,...,False,False,False,False,0,1.0,0.0,1,0.0,0.0
2,Austria,1987-08-16,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,0,0.0,0.0,0,0.0,0.0
3,Belgium,1987-05-17,1987,Roberto Moreno,AGS Ford,BRA,AGS Ford,1.0,13,1.0,...,False,False,False,False,0,0.0,0.0,0,0.0,0.0
4,Belgium,1989-08-27,1989,Gabriele Tarquini,AGS Ford,ITA,AGS Ford,1.0,15,1.0,...,False,False,False,False,0,0.0,0.0,1,0.0,0.0


In [59]:
#Driver-Team win rate before
base_df = base_df.sort_values(['Driver', 'Team', 'Date']).reset_index(drop=True)
base_df['driver_team_races_before'] = base_df.groupby(['Driver', 'Team']).cumcount()
base_df['driver_team_wins_cum']     = (
    base_df.groupby(['Driver', 'Team'])['Did_win']
    .cumsum().shift(1).fillna(0)
)
base_df['driver_team_win_rate_before'] = (
    base_df['driver_team_wins_cum'] / 
    base_df['driver_team_races_before'].replace({0: np.nan})
).fillna(0)
base_df.head()

  base_df['driver_team_races_before'] = base_df.groupby(['Driver', 'Team']).cumcount()
  base_df.groupby(['Driver', 'Team'])['Did_win']


Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,Nat_VEN,driver_gp_races_before,driver_gp_wins_cum,driver_gp_win_rate_before,team_gp_races_before,team_gp_wins_cum,team_gp_win_rate_before,driver_team_races_before,driver_team_wins_cum,driver_team_win_rate_before
0,Australia,2008-03-16,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,1,0.0,0.0,0,0.0,0.0,0,0.0,0.0
1,Malaysia,2008-03-23,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,1,0.0,0.0,0,0.0,0.0,1,0.0,0.0
2,Bahrain,2008-04-06,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,1,0.0,0.0,0,0.0,0.0,2,0.0,0.0
3,Spain,2008-04-27,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,1,0.0,0.0,0,0.0,0.0,3,0.0,0.0
4,Turkey,2008-05-11,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,1,0.0,0.0,0,0.0,0.0,4,0.0,0.0


In [60]:
drop = [
    'prev_race_date', 'is_podium',
    'driver_wins_cum', 'driver_podiums_cum',
    'team_wins_cum',
    'driver_gp_wins_cum', 'team_gp_wins_cum', 'driver_team_wins_cum'
]
df=base_df.drop(columns=drop)
df.head()

Unnamed: 0,Grand Prix,Date,year,Driver,Team,Nationality,Team_name,PTS_overall,Pos_overall,PTS_driver_stats,...,Nat_SWE,Nat_THA,Nat_USA,Nat_VEN,driver_gp_races_before,driver_gp_win_rate_before,team_gp_races_before,team_gp_win_rate_before,driver_team_races_before,driver_team_win_rate_before
0,Australia,2008-03-16,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,False,False,False,1,0.0,0,0.0,0,0.0
1,Malaysia,2008-03-23,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,False,False,False,1,0.0,0,0.0,1,0.0
2,Bahrain,2008-04-06,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,False,False,False,1,0.0,0,0.0,2,0.0
3,Spain,2008-04-27,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,False,False,False,1,0.0,0,0.0,3,0.0
4,Turkey,2008-05-11,2008,Adrian Sutil,Force India Ferrari,GER,Force India Ferrari,0.0,10,0.0,...,False,False,False,False,1,0.0,0,0.0,4,0.0


In [61]:
df.isna().sum().sort_values(ascending=False)

driver_wins_last5              303
driver_avg_finish_last5        303
team_podiums_last5             176
PTS_driver_stats_scaled          0
Nat_DEN                          0
                              ... 
GP_Great Britain                 0
GP_Germany                       0
GP_France                        0
GP_Europe                        0
driver_team_win_rate_before      0
Length: 127, dtype: int64

In [62]:
df.dropna(inplace=True)
df.isna().sum().sort_values(ascending=False)

Grand Prix                     0
PTS_overall_scaled             0
Nat_DEN                        0
Nat_COL                        0
Nat_CHN                        0
                              ..
GP_Great Britain               0
GP_Germany                     0
GP_France                      0
GP_Europe                      0
driver_team_win_rate_before    0
Length: 127, dtype: int64

## Visualization

In [63]:
# Count how many rows end in Did_win = 0 vs 1
win_counts = base_df['Did_win'].value_counts().reset_index()
win_counts.columns = ['Did_win', 'count']
win_counts['Did_win'] = win_counts['Did_win'].astype(str)  # for a cleaner legend

fig = px.bar(
    win_counts,
    x='Did_win',
    y='count',
    text='count',
    labels={'Did_win': 'Did Win (0 = No, 1 = Yes)', 'count': 'Number of Rows'},
    title='Target Distribution: Did_win'
)
fig.update_traces(textposition='outside')


In [64]:
# Group by driver to count total wins (sum of Did_win)
driver_wins = (
    base_df
    .groupby('Driver', as_index=False)['Did_win']
    .sum()
    .sort_values('Did_win', ascending=False)
)

#the top 10 winners for legibility
top10_drivers = driver_wins.head(10)

fig = px.bar(
    top10_drivers,
    x='Driver',
    y='Did_win',
    text='Did_win',
    labels={'Did_win': 'Total Wins', 'Driver': 'Driver'},
    title='Top 10 Drivers by Total Wins'
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_tickangle=-45)
fig.show()






In [65]:
team_wins = (
    base_df
    .groupby('Team', as_index=False)['Did_win']
    .sum()
    .sort_values('Did_win', ascending=False)
)

top10_teams = team_wins.head(10)

fig = px.bar(
    top10_teams,
    x='Team',
    y='Did_win',
    text='Did_win',
    labels={'Did_win': 'Total Wins', 'Team': 'Constructor'},
    title='Top 10 Teams by Total Wins'
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_tickangle=-30)
fig.show()






In [66]:
# For each race_number, compute: total rows and total wins
agg_by_round = (
    base_df
    .groupby('race_number', as_index=False)
    .agg(total_rows=('Did_win', 'count'),
         total_wins=('Did_win', 'sum'))
)
# fraction of driver–race rows that won
agg_by_round['win_rate'] = agg_by_round['total_wins'] / agg_by_round['total_rows']

fig = px.line(
    agg_by_round,
    x='race_number',
    y='win_rate',
    markers=True,
    labels={'race_number': 'Round Number in Season', 'win_rate': 'Win Rate (per row)'},
    title='Win‐Rate vs. Race Number'
)
fig.update_yaxes(tickformat='.0%')
fig.show()


In [67]:
fig = px.histogram(
    base_df,
    x='PTS_overall_scaled',
    color='Did_win',
    nbins=30,
    barmode='overlay',
    histnorm='probability density',
    labels={
        'PTS_overall_scaled': 'Scaled Championship Points',
        'Did_win': 'Did Win'
    },
    title='Distribution of Scaled Points (Winners vs. Non-Winners)'
)
fig.update_layout(legend=dict(title='Did Win'), bargap=0.1)
fig.show()

In [68]:
# Select a subset of numeric columns (drop raw categories, keep engineered floats/ints)
numeric_cols = [
    'Did_win',
    'race_month', 'race_weekofyear', 'race_dayofweek', 'race_number', 'days_since_last_race',
    'driver_win_rate_before', 'driver_podium_rate_before', 'driver_wins_last5', 'driver_avg_finish_last5',
    'team_win_rate_before', 'team_podiums_last5',
    'PTS_overall_scaled', 'pos_overall_ordinal', 'PTS_driver_stats_scaled', 'pos_driver_stats_ordinal',
    'driver_gp_win_rate_before', 'team_gp_win_rate_before', 'driver_team_win_rate_before'
]
corr_df = base_df[numeric_cols].corr().round(2)

# Plotly heatmap
fig = px.imshow(
    corr_df,
    text_auto=True,
    aspect='auto',
    labels={'x': 'Feature', 'y': 'Feature'},
    title='Correlation Matrix of Numeric Features'
)
fig.update_layout(xaxis={'side': 'bottom'})
fig.show()

## Models

In [69]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('Did_win')
X=df[numeric_cols]
y=df['Did_win']

In [70]:
# 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [71]:
#Logistic Regression
logreg_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

In [72]:
# Random Forest
rf_clf = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

In [73]:
# Gradient Boosting
gbm_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

In [74]:
# SVM
svm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42))
])

In [75]:
models = {
    'LogisticRegression': logreg_pipe,
    'RandomForest':       rf_clf,
    'GradientBoosting':   gbm_clf,
    'SVM':                svm_pipe
}

In [76]:
# Prepare a results DataFrame to collect (fold, model, metrics)
results = []

# 5-fold Stratified CV
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[test_idx]
    
    for model_name, model in models.items():
        model.fit(X_tr, y_tr)
        
        # Predict class (0/1) and probabilities on validation fold
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        y_pred       = (y_pred_proba >= 0.5).astype(int)
        
        acc   = accuracy_score(y_val,       y_pred)
        prec1 = precision_score(y_val,      y_pred, pos_label=1, zero_division=0)
        rec1  = recall_score(y_val,         y_pred, pos_label=1, zero_division=0)
        auc   = roc_auc_score(y_val,        y_pred_proba)
        
        # 4) Store
        results.append({
            'fold':        fold,
            'model':       model_name,
            'accuracy':    acc,
            'precision_1': prec1,
            'recall_1':    rec1,
            'roc_auc':     auc
        })

In [77]:
cm_results = pd.DataFrame(results)

# Group by model, compute mean and std of each metric
summary = cm_results.groupby('model').agg({
    'accuracy':    ['mean', 'std'],
    'precision_1': ['mean', 'std'],
    'recall_1':    ['mean', 'std'],
    'roc_auc':     ['mean', 'std']
})

# Clean up column names
summary.columns = [
    f"{metric}_{stat}"
    for metric, stat in summary.columns
]
summary = summary.reset_index()

print(summary)

                model  accuracy_mean  accuracy_std  precision_1_mean  \
0    GradientBoosting       0.954039      0.001927          0.521077   
1  LogisticRegression       0.955130      0.002019          0.574947   
2        RandomForest       0.953630      0.001594          0.501621   
3                 SVM       0.954858      0.001818          0.572398   

   precision_1_std  recall_1_mean  recall_1_std  roc_auc_mean  roc_auc_std  
0         0.080983       0.153255      0.008372      0.925976     0.008262  
1         0.099679       0.126765      0.027934      0.928521     0.008055  
2         0.053536       0.155226      0.012130      0.918024     0.007417  
3         0.110723       0.100217      0.018697      0.751966     0.005341  


In [78]:
# Pull only the AUC rows
auc_df = cm_results[['model', 'roc_auc']]

fig = px.box(
    auc_df,
    x='model',
    y='roc_auc',
    points='all',  # show all individual fold values
    title='Cross-Validated ROC AUC by Model'
)
fig.show()

In [79]:
#Pie Chart of Mean Accuracy Distribution Across Models
fig1 = px.pie(
    summary,
    names='model',
    values='accuracy_mean',
    title='Distribution of Mean Accuracy by Model'
)
fig1.show()

In [80]:
# Pie Chart of Mean Recall (class 1) by Model
fig2 = px.pie(
    summary,
    names='model',
    values='recall_1_mean',
    title='Distribution of Mean Recall (Winners) by Model'
)
fig2.show()

In [81]:
#Bar Chart of Mean ROC AUC with Error Bars
fig3 = px.bar(
    summary,
    x='model',
    y='roc_auc_mean',
    error_y='roc_auc_std',
    labels={'roc_auc_mean': 'Mean ROC AUC', 'model': 'Model'},
    title='Mean ROC AUC with Standard Deviation (5-Fold CV)'
)
fig3.update_layout(xaxis_tickangle=-45)
fig3.show()

In [82]:
#Pie Chart of Combined (Accuracy + Recall) Distribution by Model
summary['combined'] = summary['accuracy_mean'] + summary['recall_1_mean']
fig4 = px.pie(
    summary,
    names='model',
    values='combined',
    title='Combined (Accuracy + Recall) Distribution by Model'
)
fig4.show()