In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load every source table needed by the notebook. All paths are relative to
# the notebook's working directory and every file is read with pandas defaults.
# NOTE(review): `constructors` and `laptimes` are not referenced by the cells
# visible in this notebook section - confirm they are used further down.
results, races, drivers, constructors, circuit, laptimes = (
    pd.read_csv(Path(csv_file))
    for csv_file in (
        '../Resources/Dataset/results.csv',
        '../Resources/Dataset/races.csv',
        '../Resources/Dataset/drivers.csv',
        '../Resources/Dataset/constructors.csv',
        '../Resources/Dataset/circuits.csv',
        '../Resources/Dataset/lap_times.csv',
    )
)

# The weather export is restricted to the race key plus the weather columns
# listed here; any other column in that file is never loaded.
col_list = ["raceId", 'weather', 'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet','weather_cloudy']
weather = pd.read_csv(
    Path('../Resources/PythonExport/weather.csv'),
    usecols=col_list,
)

## Join Tables

In [3]:
# Attach race metadata (year, round, circuitId) to every result row.
# - Only the race columns that are actually needed are merged in.
# - merge() defaults to an inner join on the shared key 'raceId'.
# - validate='many_to_one' makes pandas raise MergeError if 'raceId' is not
#   unique in `races`, instead of silently multiplying result rows.
data = results.merge(
    races[['raceId', 'year', 'round', 'circuitId']],
    on='raceId',
    validate='many_to_one',
)

# Keep the working columns in a fixed order. Plain column selection raises
# KeyError for a missing or misspelled name, whereas reindex(columns=...)
# would silently create an all-NaN column. .copy() gives an independent frame
# so later column assignments do not trigger SettingWithCopyWarning.
data = data[['year', 'round', 'resultId', 'raceId', 'circuitId', 'driverId', 'constructorId', 'grid',
             'position', 'statusId']].copy()
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,statusId
25035,2021,4,25041,1055,4,849,3,19,16,11
25036,2021,4,25042,1055,4,4,214,10,17,11
25037,2021,4,25043,1055,4,854,210,18,18,12
25038,2021,4,25044,1055,4,853,210,20,19,12
25039,2021,4,25045,1055,4,852,213,16,\N,10


In [4]:
# Attach the circuit reference name to every row via 'circuitId'.
# - Only 'circuitRef' is brought in from `circuit`.
# - merge() defaults to an inner join.
# - validate='many_to_one' raises MergeError if 'circuitId' is duplicated in
#   `circuit`, which would otherwise duplicate result rows.
data = data.merge(
    circuit[['circuitId', 'circuitRef']],
    on='circuitId',
    validate='many_to_one',
)

# Select the working columns explicitly. If this cell is re-run on a frame that
# already has 'circuitRef', the merge yields circuitRef_x / circuitRef_y and
# this selection fails with KeyError; reindex(columns=...) would instead have
# returned a silent all-NaN 'circuitRef'.
data = data[['year', 'round', 'resultId', 'raceId', 'circuitId', 'circuitRef', 'driverId', 'constructorId', 'grid',
             'position', 'statusId']].copy()
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,constructorId,grid,position,statusId
25035,2021,3,25021,1054,75,portimao,847,3,11,16,11
25036,2021,3,25022,1054,75,portimao,854,210,19,17,12
25037,2021,3,25023,1054,75,portimao,849,3,18,18,12
25038,2021,3,25024,1054,75,portimao,853,210,20,19,12
25039,2021,3,25025,1054,75,portimao,8,51,15,\N,130


In [5]:
# Attach the driver reference name to every row via 'driverId'.
# - Only 'driverRef' is brought in from `drivers`.
# - merge() defaults to an inner join.
# - validate='many_to_one' raises MergeError if 'driverId' is duplicated in
#   `drivers`, which would otherwise duplicate result rows.
data = data.merge(
    drivers[['driverId', 'driverRef']],
    on='driverId',
    validate='many_to_one',
)

# Explicit column selection (rather than reindex) so that a missing or
# suffixed 'driverRef' raises KeyError instead of becoming an all-NaN column.
data = data[['year', 'round', 'resultId', 'raceId', 'circuitId', 'circuitRef', 'driverId', 'driverRef',
             'constructorId', 'grid', 'position', 'statusId']].copy()
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId
25035,1952,1,19656,817,66,bremgarten,725,abecassis,133,10,\N,86
25036,1951,1,19867,825,66,bremgarten,725,abecassis,133,20,\N,121
25037,1950,4,20106,836,66,bremgarten,800,pagani,105,15,7,13
25038,1951,8,20020,832,67,pedralbes,783,grignard,154,16,\N,5
25039,1951,8,20024,832,67,pedralbes,782,jover,105,18,\N,5


In [6]:
# Attach the per-race weather indicator columns via 'raceId'.
# - Only the five indicator columns are merged; the 'weather' text column
#   loaded from the CSV is not carried into `data`.
# - merge() defaults to an inner join, so any result whose raceId has no row
#   in `weather` is dropped.
# - validate='many_to_one' raises MergeError if `weather` holds more than one
#   row per raceId, which would otherwise duplicate result rows.
data = data.merge(
    weather[['raceId', 'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']],
    on='raceId',
    validate='many_to_one',
)

# Explicit column selection (rather than reindex) so that a missing or
# suffixed weather column raises KeyError instead of becoming all-NaN.
data = data[['year', 'round', 'resultId', 'raceId', 'circuitId', 'circuitRef', 'driverId', 'driverRef',
             'constructorId', 'grid', 'position', 'statusId', 'weather_warm', 'weather_cold',
             'weather_dry', 'weather_wet', 'weather_cloudy']].copy()
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
25035,1951,2,19885,826,19,indianapolis,768,rose,150,5,\N,3,1,0,0,0,0
25036,1951,2,19890,826,19,indianapolis,769,mackey,155,33,\N,8,1,0,0,0,0
25037,1951,2,19893,826,19,indianapolis,770,green,113,10,\N,5,1,0,0,0,0
25038,1951,2,19897,826,19,indianapolis,771,walt_brown,113,13,\N,121,1,0,0,0,0
25039,1951,2,19902,826,19,indianapolis,772,hellings,150,23,\N,5,1,0,0,0,0


In [7]:
# Post-join sanity check: print the (rows, columns) tuple of the combined
# frame, then show its first five rows as the cell output.
print(data.shape)
data.head(n=5)

(25040, 17)


Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1,0,0,0,0
1,2008,1,2,18,1,albert_park,2,heidfeld,2,5,2,1,1,0,0,0,0
2,2008,1,3,18,1,albert_park,3,rosberg,3,7,3,1,1,0,0,0,0
3,2008,1,4,18,1,albert_park,4,alonso,4,11,4,1,1,0,0,0,0
4,2008,1,5,18,1,albert_park,5,kovalainen,1,3,5,1,1,0,0,0,0


In [8]:
# 'position' is an object column because some rows hold the placeholder
# string \N instead of a number. Recode that placeholder to 0 and convert the
# whole column to a numeric dtype in a single expression.
# NOTE(review): 0 therefore acts as a "no recorded position" sentinel in the
# exported data - confirm downstream consumers do not treat it as a real rank.
data["position"] = pd.to_numeric(data["position"].replace("\\N", 0))

## Other Transformations

In [9]:
# Add a 0/1 'Win' flag: 1 where 'position' equals 1, otherwise 0.
# The comparison is vectorized, so no placeholder copy of 'position' and no
# per-row lambda are needed; astype("int64") keeps the flag an integer column.
data["Win"] = data["position"].eq(1).astype("int64")
print(data.shape)
data.head()

(25040, 18)


Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win
0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1,0,0,0,0,1
1,2008,1,2,18,1,albert_park,2,heidfeld,2,5,2,1,1,0,0,0,0,0
2,2008,1,3,18,1,albert_park,3,rosberg,3,7,3,1,1,0,0,0,0,0
3,2008,1,4,18,1,albert_park,4,alonso,4,11,4,1,1,0,0,0,0,0
4,2008,1,5,18,1,albert_park,5,kovalainen,1,3,5,1,1,0,0,0,0,0


In [10]:
# Order the rows by resultId and renumber the index from 0.
# reset_index(drop=True) discards the old index directly; the previous
# reset_index() + drop(["index"]) sequence first materialised it as a column
# and would have dropped a genuine column named "index" had one existed.
data = data.sort_values(by="resultId", ascending=True).reset_index(drop=True)
print(data.shape)
data.tail()

(25040, 18)


Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win
25035,2021,4,25041,1055,4,catalunya,849,latifi,3,19,16,11,0,0,0,0,1,0
25036,2021,4,25042,1055,4,catalunya,4,alonso,214,10,17,11,0,0,0,0,1,0
25037,2021,4,25043,1055,4,catalunya,854,mick_schumacher,210,18,18,12,0,0,0,0,1,0
25038,2021,4,25044,1055,4,catalunya,853,mazepin,210,20,19,12,0,0,0,0,1,0
25039,2021,4,25045,1055,4,catalunya,852,tsunoda,213,16,0,10,0,0,0,0,1,0


In [11]:
# Write the prepared table to CSV for downstream use.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; readers of this file need index_col=0 (or this call needs
# index=False) to avoid an extra "Unnamed: 0" column - confirm with consumers.
data.to_csv(Path('../Resources/PythonExport/data_final.csv'))