In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load every source table into its own DataFrame. The CSV paths are read in the
# order listed and unpacked onto the names on the left, one file per name.
# NOTE(review): `weather` is read from lap_times.csv, the same file as `laptimes`.
# This looks like a copy-paste slip -- confirm which file holds the weather data.
(results, races, drivers, constructors,
 circuit, laptimes, weather) = (
    pd.read_csv(Path(csv_path))
    for csv_path in (
        '../Resources/Dataset/results.csv',
        '../Resources/Dataset/races.csv',
        '../Resources/Dataset/drivers.csv',
        '../Resources/Dataset/constructors.csv',
        '../Resources/Dataset/circuits.csv',
        '../Resources/Dataset/lap_times.csv',
        '../Resources/Dataset/lap_times.csv',
    )
)

## Join Tables

In [3]:
# Merge results with races on raceId, bringing in only the race columns we need
# (year, round, circuitId).
#
# The races frame is trimmed *before* the merge so that any other column names the
# two tables might share cannot collide and pick up _x/_y suffixes. The final
# column selection uses [] instead of reindex(): reindex() silently creates an
# all-NaN column for any label that is missing, whereas [] raises KeyError.
# validate='many_to_one' makes the merge fail loudly if raceId is duplicated in
# races, which would otherwise multiply result rows.
data = results.merge(
    races[['raceId', 'year', 'round', 'circuitId']],
    on='raceId',
    validate='many_to_one',
)[['year', 'round', 'resultId', 'raceId', 'circuitId', 'driverId',
   'constructorId', 'grid', 'position', 'statusId']]
data

Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,statusId
0,2008,1,1,18,1,1,1,1,1,1
1,2008,1,2,18,1,2,2,5,2,1
2,2008,1,3,18,1,3,3,7,3,1
3,2008,1,4,18,1,4,4,11,4,1
4,2008,1,5,18,1,5,1,3,5,1
...,...,...,...,...,...,...,...,...,...,...
25035,2021,4,25041,1055,4,849,3,19,16,11
25036,2021,4,25042,1055,4,4,214,10,17,11
25037,2021,4,25043,1055,4,854,210,18,18,12
25038,2021,4,25044,1055,4,853,210,20,19,12


In [4]:
# Merge data with circuit on circuitId, bringing in only circuitRef.
#
# Only the key and circuitRef are taken from circuit, and the final selection uses
# [] rather than reindex(). If this cell is re-run on a frame that already has
# circuitRef, the merge yields circuitRef_x / circuitRef_y; reindex() would then
# silently return an all-NaN circuitRef column, while [] raises KeyError.
# validate='many_to_one' fails loudly if circuitId is duplicated in circuit.
data = data.merge(
    circuit[['circuitId', 'circuitRef']],
    on='circuitId',
    validate='many_to_one',
)[['year', 'round', 'resultId', 'raceId', 'circuitId', 'circuitRef',
   'driverId', 'constructorId', 'grid', 'position', 'statusId']]

In [5]:
# Merge data with drivers on driverId, bringing in only driverRef.
#
# Only the key and driverRef are taken from drivers, and the final selection uses
# [] rather than reindex(). If this cell is re-run on a frame that already has
# driverRef, the merge yields driverRef_x / driverRef_y; reindex() would then
# silently return an all-NaN driverRef column, while [] raises KeyError.
# validate='many_to_one' fails loudly if driverId is duplicated in drivers.
data = data.merge(
    drivers[['driverId', 'driverRef']],
    on='driverId',
    validate='many_to_one',
)[['year', 'round', 'resultId', 'raceId', 'circuitId', 'circuitRef',
   'driverId', 'driverRef', 'constructorId', 'grid', 'position', 'statusId']]

In [6]:
# Sanity check on the merged frame: print (rows, columns), then preview the first rows
print(data.shape)
data.head()

(25040, 12)


Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId
0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1
1,2007,1,371,36,1,albert_park,1,hamilton,1,4,3,1
2,2009,1,7573,1,1,albert_park,1,hamilton,1,18,\N,2
3,2010,2,20352,338,1,albert_park,1,hamilton,1,11,6,1
4,2011,1,20780,841,1,albert_park,1,hamilton,1,2,2,1


In [7]:
# 'position' is an object column because some rows hold the placeholder string "\N"
# instead of a number. Recode that placeholder as 0, then parse the whole column
# as numeric in a single assignment.
replace1 = {"\\N": 0}
data["position"] = pd.to_numeric(data["position"].replace(replace1))

In [8]:
# Inspect the column dtypes (checks that 'position' is now numeric rather than object)
data.dtypes

year              int64
round             int64
resultId          int64
raceId            int64
circuitId         int64
circuitRef       object
driverId          int64
driverRef        object
constructorId     int64
grid              int64
position          int64
statusId          int64
dtype: object

## Other Transformations

In [9]:
# Add a binary Win column: 1 where the finishing position is 1, otherwise 0.
#
# A vectorised comparison creates the column directly, so no placeholder column,
# attribute-style assignment or per-row lambda is needed. int64 is requested
# explicitly so the dtype is the same on every platform.
data['Win'] = (data['position'] == 1).astype('int64')
print(data.shape)
data.head()

(25040, 13)


Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,Win
0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1
1,2007,1,371,36,1,albert_park,1,hamilton,1,4,3,1,0
2,2009,1,7573,1,1,albert_park,1,hamilton,1,18,0,2,0
3,2010,2,20352,338,1,albert_park,1,hamilton,1,11,6,1,0
4,2011,1,20780,841,1,albert_park,1,hamilton,1,2,2,1,0


In [10]:
# Keep only rows whose year is greater than 2000 (i.e. 2001 onwards)
df = data.loc[data['year'].gt(2000)]

In [11]:
# Sort rows by resultId (ascending, the default) and renumber the index 0..n-1.
# reset_index(drop=True) discards the old index directly, instead of first
# materialising it as an 'index' column and then dropping that column.
df = df.sort_values(by="resultId").reset_index(drop=True)
df

Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,Win
0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1
1,2008,1,2,18,1,albert_park,2,heidfeld,2,5,2,1,0
2,2008,1,3,18,1,albert_park,3,rosberg,3,7,3,1,0
3,2008,1,4,18,1,albert_park,4,alonso,4,11,4,1,0
4,2008,1,5,18,1,albert_park,5,kovalainen,1,3,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7982,2021,4,25041,1055,4,catalunya,849,latifi,3,19,16,11,0
7983,2021,4,25042,1055,4,catalunya,4,alonso,214,10,17,11,0
7984,2021,4,25043,1055,4,catalunya,854,mick_schumacher,210,18,18,12,0
7985,2021,4,25044,1055,4,catalunya,853,mazepin,210,20,19,12,0


In [12]:
# Export the prepared frame to CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if downstream readers do not expect it -- confirm with consumers.
export_path = '../Resources/PythonExport/data_final.csv'
df.to_csv(export_path)