In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load the source tables. Every path is relative to the notebook's working directory.
# The names on the left line up, in order, with the CSV paths listed below.
results, races, drivers, constructors, circuit, laptimes = (
    pd.read_csv(Path(csv_file))
    for csv_file in (
        '../Resources/Dataset/results.csv',
        '../Resources/Dataset/races.csv',
        '../Resources/Dataset/drivers.csv',
        '../Resources/Dataset/constructors.csv',
        '../Resources/Dataset/circuits.csv',
        '../Resources/Dataset/lap_times.csv',
    )
)

# Weather export: read only the join key plus the weather fields listed here.
col_list = [
    "raceId",
    'weather',
    'weather_warm',
    'weather_cold',
    'weather_dry',
    'weather_wet',
    'weather_cloudy',
]
weather = pd.read_csv(Path('../Resources/PythonExport/weather.csv'), usecols=col_list)

## Join Tables

In [3]:
# Merge results with races on raceId and keep only the columns needed downstream.
result_columns = ['year', 'round', 'resultId', 'raceId', 'circuitId', 'driverId',
                  'constructorId', 'grid', 'position']

# validate='many_to_one' makes the merge raise if races ever contains a repeated
# raceId, which would otherwise silently duplicate result rows.
# .loc with an explicit label list raises KeyError on a missing (or suffixed)
# column, whereas reindex(columns=...) would quietly create an all-NaN column.
# .copy() gives later cells an independent frame to add columns to.
data = (
    results.merge(races, on='raceId', how='inner', validate='many_to_one')
    .loc[:, result_columns]
    .copy()
)
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position
25035,2021,4,25041,1055,4,849,3,19,16
25036,2021,4,25042,1055,4,4,214,10,17
25037,2021,4,25043,1055,4,854,210,18,18
25038,2021,4,25044,1055,4,853,210,20,19
25039,2021,4,25045,1055,4,852,213,16,\N


In [4]:
# Merge data with weather on raceId to bring in the weather indicator columns.
weather_flags = ['weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']
merged_columns = ['year', 'round', 'resultId', 'raceId', 'circuitId', 'driverId',
                  'constructorId', 'grid', 'position'] + weather_flags

# Re-running this cell used to merge weather into a frame that already had the
# weather columns, producing weather_*_x / weather_*_y; reindex then silently
# replaced every weather column with NaN. Dropping any existing flags first keeps
# the cell re-runnable, and .loc raises KeyError instead of inventing NaN columns.
# validate='many_to_one' raises if weather has more than one row per raceId.
data = (
    data.drop(columns=weather_flags, errors='ignore')
    .merge(weather[['raceId'] + weather_flags], on='raceId', how='inner',
           validate='many_to_one')
    .loc[:, merged_columns]
    .copy()
)
data.tail()

Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
25035,2021,4,25041,1055,4,849,3,19,16,0,0,0,0,1
25036,2021,4,25042,1055,4,4,214,10,17,0,0,0,0,1
25037,2021,4,25043,1055,4,854,210,18,18,0,0,0,0,1
25038,2021,4,25044,1055,4,853,210,20,19,0,0,0,0,1
25039,2021,4,25045,1055,4,852,213,16,\N,0,0,0,0,1


In [5]:
# Quick look at the merged frame: (rows, columns), then the first five rows.
print(f"{data.shape}")
data.head(n=5)

(25040, 14)


Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,2008,1,1,18,1,1,1,1,1,1,0,0,0,0
1,2008,1,2,18,1,2,2,5,2,1,0,0,0,0
2,2008,1,3,18,1,3,3,7,3,1,0,0,0,0
3,2008,1,4,18,1,4,4,11,4,1,0,0,0,0
4,2008,1,5,18,1,5,1,3,5,1,0,0,0,0


In [6]:
# Some rows carry the literal string "\N" in 'position' instead of a number.
# Swap that marker for 0, then parse the whole column from object to numeric.
data["position"] = pd.to_numeric(data["position"].replace("\\N", 0))

## Other Transformations

In [7]:
# Add a new column 'Win': 1 where the finishing position is 1, otherwise 0.
# A single vectorised comparison replaces the placeholder copy of 'position'
# that existed only so attribute-style assignment (data.Win = ...) would hit a
# column, and avoids calling a Python lambda once per row.
data['Win'] = (data['position'] == 1).astype('int64')
print(data.shape)
data.head()

(25040, 15)


Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win
0,2008,1,1,18,1,1,1,1,1,1,0,0,0,0,1
1,2008,1,2,18,1,2,2,5,2,1,0,0,0,0,0
2,2008,1,3,18,1,3,3,7,3,1,0,0,0,0,0
3,2008,1,4,18,1,4,4,11,4,1,0,0,0,0,0
4,2008,1,5,18,1,5,1,3,5,1,0,0,0,0,0


In [8]:
# Helper for the podium column: podium positions are kept, everything else becomes 0.

podium = [1,2,3]

def podium_order(x):
    """Return ``x`` unchanged when it is one of the values in ``podium``, else 0."""
    return x if x in podium else 0
# Build the 'podium' column by passing each finishing position through podium_order.
data["podium"] = data["position"].map(podium_order)

In [9]:
# Order rows by resultId and renumber the index from 0.
data = data.sort_values(by="resultId").reset_index(drop=True)

# Keep only seasons 2000 through 2019 (both bounds inclusive). The filter runs
# after the renumbering, so surviving rows keep their pre-filter index labels.
data = data[data['year'].between(2000, 2019)]
print(data.shape)
data.head()

(7940, 16)


Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win,podium
0,2008,1,1,18,1,1,1,1,1,1,0,0,0,0,1,1
1,2008,1,2,18,1,2,2,5,2,1,0,0,0,0,0,2
2,2008,1,3,18,1,3,3,7,3,1,0,0,0,0,0,3
3,2008,1,4,18,1,4,4,11,4,1,0,0,0,0,0,0
4,2008,1,5,18,1,5,1,3,5,1,0,0,0,0,0,0


In [10]:
# Write the prepared table to CSV.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# unnamed leading column; confirm downstream readers expect that before changing it.
export_path = Path('../Resources/PythonExport/data_final.csv')
data.to_csv(export_path)