In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load the three raw tables: results.csv, races.csv and lap_times.csv
results_df, races_df, laptimes_df = (
    pd.read_csv(Path(csv_path))
    for csv_path in (
        '../Resources/Dataset/results.csv',
        '../Resources/Dataset/races.csv',
        '../Resources/Dataset/lap_times.csv',
    )
)

In [3]:
# Keep only the lap-1 rows, i.e. each driver's position after the first lap
firstlap_df = laptimes_df[laptimes_df["lap"] == 1]

In [4]:
# "position" in the lap-1 table becomes "firstlap_position"
firstlap_df = firstlap_df.rename({"position": "firstlap_position"}, axis="columns")

In [5]:
# Preview the first rows of the first-lap table to confirm the renamed column
firstlap_df.head()

Unnamed: 0,raceId,driverId,lap,firstlap_position,time,milliseconds
0,841,20,1,1,1:38.109,98109
58,841,1,1,2,1:40.573,100573
116,841,17,1,3,1:41.467,101467
174,841,808,1,4,1:42.835,102835
232,841,13,1,5,1:44.196,104196


In [6]:
# Give the results columns descriptive names:
#   position -> ending_position, grid -> starting_position, statusId -> finishing_status
results_df = results_df.rename(
    {
        "position": "ending_position",
        "grid": "starting_position",
        "statusId": "finishing_status",
    },
    axis="columns",
)

In [7]:
# Join race metadata onto each result via raceId, then keep only the columns we need
# (year, circuitId and name are the columns contributed by races_df)
results_df = results_df.merge(races_df, on='raceId').reindex(
    columns=[
        'resultId', 'raceId', 'driverId', 'constructorId',
        'starting_position', 'ending_position', 'finishing_status',
        'year', 'circuitId', 'name',
    ]
)
print(results_df.shape)
results_df.head()

(25040, 10)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix


In [8]:
# Left-join the lap-1 table onto results_df using both raceId and driverId, so every
# result row is kept even when no lap-1 record exists. Only lap and firstlap_position
# are brought in from firstlap_df.
results_df = results_df.merge(firstlap_df, how='left', on=['raceId', 'driverId']).reindex(
    columns=[
        'resultId', 'raceId', 'driverId', 'constructorId',
        'starting_position', 'ending_position', 'finishing_status',
        'year', 'circuitId', 'name', 'lap', 'firstlap_position',
    ]
)
print(results_df.shape)
results_df.head()

(25040, 12)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0


In [9]:
# Rows without a lap-1 match come out of the left join as NaN:
# default lap to 1.0 and firstlap_position to 0
results_df = results_df.fillna({'lap': 1.0, 'firstlap_position': 0})

These represent finishing statuses. Drivers can finish multiple laps down from the lead driver. For our purposes, we only care that the driver finished the race, so we will convert all of these statuses to 1. 1: Finished, 11: +1 Lap, 12: +2 Laps, 13: +3 Laps, 14: +4 Laps, 15: +5 Laps, 16: +6 Laps, 17: +7 Laps, 18: +8 Laps, 19: +9 Laps, 45: +11 Laps, 50: +17 Laps, 128: +42 Laps, 53: +13 Laps, 55: +12 Laps, 58: +26 Laps, 88: +10 Laps, 111: +14 Laps, 112: +15 Laps, 113: +25 Laps, 114: +18 Laps, 115: +22 Laps, 116: +16 Laps, 117: +24 Laps, 118: +29 Laps, 119: +23 Laps, 120: +21 Laps, 122: +44 Laps, 123: +30 Laps, 124: +19 Laps, 125: +46 Laps, 127: +20 Laps, 133: +49 Laps, 134: +38 Laps

In [10]:
# Convert every status code in which the driver finished the race to 1
replace = dict.fromkeys(
    (
        1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 45, 50, 128, 53, 55, 58,
        88, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124,
        125, 127, 133, 134,
    ),
    1,
)
results_df["finishing_status"] = results_df["finishing_status"].replace(replace)

In [11]:
# Transform the finishing_status column: any value other than 1 (driver did not finish the race) becomes 2
def change_status(x):
    """Collapse a finishing-status code into a two-valued flag.

    Returns 1 when ``x`` equals 1 and 2 for every other value.
    """
    return 1 if x == 1 else 2
    
# Recode every finishing_status value element-wise, then inspect the result
results_df["finishing_status"] = results_df["finishing_status"].map(change_status)
print(results_df.shape)
results_df.head()

(25040, 12)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1.0,1.0
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1.0,5.0
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1.0,4.0
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1.0,9.0
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1.0,3.0


In [12]:
# Every "\\N" entry in ending_position is replaced with 0
replace1 = {"\\N": 0}
results_df["ending_position"] = results_df["ending_position"].replace(to_replace=replace1)

In [13]:
# Convert ending_position to a numeric dtype; any value that cannot be parsed raises
results_df['ending_position'] = pd.to_numeric(results_df['ending_position'], errors='raise')

In [14]:
# Cast "lap" and "firstlap_position" from float to integer in one pass,
# then confirm the result with df.dtypes
results_df = results_df.astype({"lap": int, "firstlap_position": int})
results_df.dtypes

resultId              int64
raceId                int64
driverId              int64
constructorId         int64
starting_position     int64
ending_position       int64
finishing_status      int64
year                  int64
circuitId             int64
name                 object
lap                   int64
firstlap_position     int64
dtype: object

In [15]:
# Add a train_test column, initialised as a copy of the year column.
# Intended split: 2019 will be the test set and 2010 to 2018 will be the training set.

results_df['train_test'] = results_df['year']

def change_year(x):
    """Map a season year to a dataset-split code.

    Returns:
        3 for years before 2010,
        1 for 2010 through 2018 (training data),
        2 for 2019 (test data),
        4 for years after 2019.
    """
    if x == 2019:
        return 2
    if x > 2019:
        return 4
    return 3 if x < 2010 else 1
# Recode each year held in train_test into its split code, element-wise
results_df["train_test"] = results_df["train_test"].map(change_year)

print(results_df.shape)
results_df.head()

(25040, 13)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,lap,firstlap_position,train_test
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,1,1,3
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,1,5,3
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,1,4,3
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,1,9,3
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,1,3,3


In [16]:
# Rearranging the columns of results_df; ending_position is moved to the last column
results_df = results_df.loc[:, [
    'resultId', 'raceId', 'name', 'year', 'circuitId',
    'driverId', 'constructorId', 'starting_position',
    'finishing_status', 'firstlap_position', 'lap',
    'train_test', 'ending_position',
]]

In [17]:
# Build the final dataframe from the rows whose train_test code is at most 2,
# i.e. the 2010 to 2019 seasons
race_final = results_df[results_df["train_test"] <= 2]
print(race_final.shape)
race_final.head()

(4297, 13)


Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,firstlap_position,lap,train_test,ending_position
20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,2,1,1,1
20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,3,1,1,2
20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,5,1,1,3
20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1,1,1,4
20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,4,1,1,5


In [18]:
# Export the final dataframe to CSV.
# With pandas' default index=True, the row index is written as the first, unnamed column.
race_final.to_csv(path_or_buf='../Resources/PythonExport/race_final.csv')