In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load the three raw tables used in this notebook: results, races and lap_times.
results_df, races_df, laptimes_df = [
    pd.read_csv(Path(csv_file))
    for csv_file in (
        '../Resources/Dataset/results.csv',
        '../Resources/Dataset/races.csv',
        '../Resources/Dataset/lap_times.csv',
    )
]

In [3]:
# Keep only the lap-1 rows of the lap-time table, i.e. each driver's position after the first lap.
firstlap_df = laptimes_df[laptimes_df["lap"].eq(1)]

In [4]:
# Rename "position" to "firstlap_position" so that, once merged into the results table,
# it cannot be confused with the finishing position column.
firstlap_df = firstlap_df.rename(columns={"position": "firstlap_position"})

In [5]:
# Give the results columns used below more descriptive names:
#   position -> ending_position, grid -> starting_position, statusId -> finishing_status.
results_df = results_df.rename(columns={"position": "ending_position",
                                        "grid": "starting_position",
                                        "statusId": "finishing_status" })
# Preview the first rows to confirm the new column names.
results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,starting_position,ending_position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,finishing_status
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [6]:
# Join race metadata onto every result row via raceId (default inner join), then keep
# only the result columns we need plus year, circuitId and name from races_df.
results_df = results_df.merge(races_df, on='raceId').reindex(
    columns=[
        'resultId', 'raceId', 'driverId', 'constructorId',
        'starting_position', 'ending_position', 'finishing_status',
        'year', 'circuitId', 'name',
    ]
)
print(results_df.shape)
results_df.head()

(25040, 10)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix


In [7]:
# Attach each driver's lap-1 record from firstlap_df, matched on (raceId, driverId).
# A left join keeps every result row; rows with no lap-1 record get NaN in
# "lap" and "firstlap_position". Only those two columns are brought in from firstlap_df.
results_df = results_df.merge(
    firstlap_df, how='left', on=['raceId', 'driverId']
).reindex(
    columns=[
        'resultId', 'raceId', 'driverId', 'constructorId',
        'starting_position', 'ending_position', 'finishing_status',
        'year', 'circuitId', 'name', 'lap', 'firstlap_position',
    ]
)
print(results_df.shape)
results_df.head()

(25040, 12)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0


In [8]:
# Result rows without a lap-1 record came out of the left merge as NaN:
# set their lap to 1.0 and use 0 as the placeholder first-lap position.
results_df = results_df.assign(
    lap=results_df['lap'].fillna(1.0),
    firstlap_position=results_df['firstlap_position'].fillna(0),
)

The status codes listed below all describe a driver who finished the race. Drivers can finish one or more laps down from the lead driver. For our purpose, we only care that the driver finished the race, so we will convert all of these statuses to 1.
1: Finished, 11: +1 Lap, 12: +2 Laps, 13: +3 Laps, 14: +4 Laps, 15: +5 Laps, 16: +6 Laps, 17: +7 Laps, 18: +8 Laps, 19: +9 Laps, 45: +11 Laps, 50: +17 Laps, 128: +42 Laps, 53: +13 Laps, 55: +12 Laps, 58: +26 Laps, 88: +10 Laps, 111: +14 Laps, 112: +15 Laps, 113: +25 Laps, 114: +18 Laps, 115: +22 Laps, 116: +16 Laps, 117: +24 Laps, 118: +29 Laps, 119: +23 Laps, 120: +21 Laps, 122: +44 Laps, 123: +30 Laps, 124: +19 Laps, 125: +46 Laps, 127: +20 Laps, 133: +49 Laps, 134: +38 Laps

In [9]:
# Collapse every status code that means "the driver finished the race"
# (Finished, or finished some number of laps down) into the single code 1.
replace = dict.fromkeys(
    (
        1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 45, 50, 128, 53, 55, 58,
        88, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124,
        125, 127, 133, 134,
    ),
    1,
)
results_df["finishing_status"] = results_df["finishing_status"].replace(replace)

In [10]:
# Binary finishing flag for the finishing_status column: 1 = finished, 2 = did not finish.
def change_status(x):
    """Return 1 if ``x`` equals 1 (driver finished), otherwise 2 (did not finish)."""
    return 1 if x == 1 else 2
    
# Recode finishing_status element by element: 1 stays 1, every other code becomes 2.
results_df["finishing_status"] = results_df["finishing_status"].map(change_status)
print(results_df.shape)
results_df.head()

(25040, 12)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0


In [11]:
# Replace every "\\N" ending_position (a literal backslash followed by N) with 0.
# NOTE(review): "\\N" is presumably the dataset's marker for "no finishing position" — confirm.
replace1 = {"\\N":0}
results_df["ending_position"] = results_df["ending_position"].replace(replace1)

In [12]:
# Parse ending_position as numbers; pd.to_numeric raises if any non-numeric value is left.
results_df['ending_position'] = pd.to_numeric(results_df['ending_position'])

In [13]:
# List the distinct ending_position values to check whether any unexpected
# values remain that would require dropping rows.
results_df["ending_position"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  0,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])

In [14]:
# Bucket ending_position into five ordinal bins labelled 1-5.
# pd.cut uses right-closed intervals by default, so the bins are
# (0, 3], (3, 6], (6, 10], (10, 15] and (15, 33].
# Values outside those intervals (ending_position 0, or anything above 33) become NaN.
bins = [0, 3, 6, 10, 15, 33]
labels = [1,2,3,4,5]
results_df['ending_bin'] = pd.cut(results_df['ending_position'], bins=bins, labels=labels)
print(results_df.shape)
results_df.head()

(25040, 13)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position,ending_bin
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0,1
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0,1
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0,1
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0,2
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0,2


In [15]:
# Inspect column dtypes; pd.cut with labels yields a categorical ending_bin column.
results_df.dtypes

resultId                int64
raceId                  int64
driverId                int64
constructorId           int64
starting_position       int64
ending_position         int64
finishing_status        int64
year                    int64
circuitId               int64
name                   object
lap                   float64
firstlap_position     float64
ending_bin           category
dtype: object

In [16]:
# ending_position 0 falls outside every bin, so its ending_bin is NaN, and the column
# is categorical rather than numeric. Convert it to numbers and fill the NaNs with 0.
results_df['ending_bin'] = pd.to_numeric(results_df['ending_bin']).fillna(0)

In [17]:
# Re-check dtypes after converting ending_bin from categorical to numeric.
results_df.dtypes

resultId               int64
raceId                 int64
driverId               int64
constructorId          int64
starting_position      int64
ending_position        int64
finishing_status       int64
year                   int64
circuitId              int64
name                  object
lap                  float64
firstlap_position    float64
ending_bin           float64
dtype: object

In [18]:
# The numeric conversion left ending_bin as float64; cast it to int so the
# bin labels are stored as whole numbers.
results_df["ending_bin"] = results_df["ending_bin"].astype(int)
results_df.dtypes

resultId               int64
raceId                 int64
driverId               int64
constructorId          int64
starting_position      int64
ending_position        int64
finishing_status       int64
year                   int64
circuitId              int64
name                  object
lap                  float64
firstlap_position    float64
ending_bin             int64
dtype: object

In [19]:
# Add a train_test column, seeded with the race year and recoded below:
# 2019 will be the test dataset and 2010 to 2018 will be the train dataset.

results_df['train_test'] = results_df['year']

def change_year(x, train_start=2010, test_year=2019):
    """Map a season year to a train/test split code.

    Parameters
    ----------
    x : number
        The season year to classify.
    train_start : int, default 2010
        First season that belongs to the training set.
    test_year : int, default 2019
        The single season held out as the test set.

    Returns
    -------
    int
        1 -- training season (``train_start`` up to, but not including, ``test_year``)
        2 -- test season (``x == test_year``)
        3 -- before ``train_start``
        4 -- after ``test_year``

    Notes
    -----
    With the default arguments the result is identical to the previous
    hard-coded 2010 / 2019 version. A value that fails all three comparisons
    (for example NaN) falls through to 1, as it did before.
    """
    if x < train_start:
        return 3
    if x == test_year:
        return 2
    if x > test_year:
        return 4
    return 1
# Recode each year held in train_test into its split code via change_year.
results_df["train_test"] = results_df["train_test"].map(change_year)

print(results_df.shape)
results_df.head()

(25040, 14)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position,ending_bin,train_test
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0,1,3
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0,1,3
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0,1,3
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0,2,3
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0,2,3


In [20]:
# Rearrange the columns: race and driver identifiers first, with
# ending_position and ending_bin last.
results_df = results_df.loc[:, [
    'resultId',
    'raceId',
    'name',
    'year',
    'circuitId',
    'driverId',
    'constructorId',
    'starting_position',
    'finishing_status',
    'firstlap_position',
    'lap',
    'train_test',
    'ending_position',
    'ending_bin',
]]

In [21]:
# Build the final dataframe from rows whose train_test code is 1 (train) or 2 (test),
# i.e. the data between 2010 and 2019.
fivebin_final = results_df[results_df["train_test"].le(2)]

In [22]:
# Check the size of the filtered table and preview its first rows.
print(fivebin_final.shape)
fivebin_final.head()

(4297, 14)


Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,firstlap_position,lap,train_test,ending_position,ending_bin
20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,2.0,1.0,1,1,1
20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,3.0,1.0,1,2,1
20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,5.0,1.0,1,3,1
20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1.0,1.0,1,4,2
20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,4.0,1.0,1,5,2


In [23]:
# Export the final dataframe to a CSV file.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if downstream readers do not expect that column — confirm with consumers.
fivebin_final.to_csv('../Resources/PythonExport/fivebin_final.csv')