# Data Pre-Processing Notebook
This notebook contains the data processing for the project's F1 Kaggle datasets.

In [1]:
#Import general Python packages
import pandas as pd
import numpy as np

#Modelling packages
from sklearn.model_selection import train_test_split

#Import Python packages prepared for this project
from workflowpackages.data import get_kaggle_f1_data
#-------------------------
#For complete code regarding the package see 'workflowpackages' .py files
#You can also type ? and ?? after the imported function to see documentation and full source code in-line
#-------------------------

In [2]:
#Fetch the Kaggle F1 datasets that the following cells read from 'datasets/f1/'
#NOTE(review): per the helper's own log output, force_download=True downloads every dataset again on each run - confirm this is wanted for routine re-runs
get_kaggle_f1_data(force_download=True)

Dataset URL: https://www.kaggle.com/datasets/jtrotman/formula-1-race-data
Force download was deemed True, therefore all the datasets have been downloaded
No missing datasets. The following datasets are available:
- circuits.csv
- constructors.csv
- lap_times.csv
- races.csv
- sprint_results.csv
- constructor_results.csv
- driver_standings.csv
- pit_stops.csv
- results.csv
- status.csv
- constructor_standings.csv
- drivers.csv
- qualifying.csv
- seasons.csv


In [3]:
#COLUMNS TO KEEP
#Only the fields needed for modelling are retained from each source table
#Results data
results_cols = [
    'resultId',      #Unique id for a race result per driver
    'raceId',
    'driverId',
    'grid',
    'position',      #Numeric, null if positionText is R, D, or W (car retired, disqualified, or withdrawn)
    'positionText',  #String, same as position, but holds R, D, or W instead of null
]
#Races data
races_cols = ['raceId', 'year']
#Drivers data
drivers_cols = [
    'driverId',
    'code',  #Driver code based on driver last name
]
#---------------------------------------------------

#LOAD AND TRIM
#Read each source CSV and immediately keep only the selected columns
df_results = pd.read_csv('datasets/f1/results.csv')[results_cols]
df_races = pd.read_csv('datasets/f1/races.csv')[races_cols]
df_drivers = pd.read_csv('datasets/f1/drivers.csv')[drivers_cols]
#---------------------------------------------------

#JOIN THE DATA
#Attach the race year, then the driver code, to every result row
#Both joins are inner joins: a result row without a matching raceId or driverId is dropped
df_model = (
    df_results
    .merge(df_races, on = 'raceId', how = 'inner')
    .merge(df_drivers, on = 'driverId', how = 'inner')
)
#---------------------------------------------------

In [4]:
#Keep only the seasons under study (limits the historical period)
mask = df_model['year'].isin([2022, 2023])
df_model = df_model.loc[mask]

In [5]:
#CREATE DUMMY VARIABLES

#Get dummy for cars that did not finish the race ("Car_Out")
#positionText is made only of digits when the car was classified, and holds a letter code otherwise,
#so an all-digit value means the car finished. astype(str) keeps the check safe for non-string values.
finished_flag = df_model['positionText'].astype(str).str.isnumeric()
#Declare both categories explicitly so that get_dummies always emits both columns, even when the
#selected races contain only finishers or only non-finishers. Without this the drop below, or a
#later lookup of 'Car_Out', would fail with a KeyError.
finish_status = finished_flag.map(
    {True: 'finished_race', False: 'Out'}
).astype(
    pd.CategoricalDtype(categories = ['finished_race', 'Out'])
)
dummies_finished = pd.get_dummies(
    finish_status,
    prefix = 'Car'
).drop(
    columns = ['Car_finished_race']
)
#Join dummies to model dataset (both frames share the same row index, so rows align one-to-one)
#positionText is no longer needed once Car_Out exists
df_model = df_model.join(dummies_finished).drop(columns = ['positionText'])

#One-hot encode the driver: one indicator column per distinct driverId, named Driver_<driverId>
dummies_drivers = pd.get_dummies(df_model['driverId'], prefix = 'Driver')
#Attach the indicator columns to the model dataset, matching rows on the shared index
#(the original driverId column is kept alongside them)
df_model = pd.merge(
    df_model,
    dummies_drivers,
    left_index = True,
    right_index = True,
    how = 'inner',
)
#---------------------------------------------------

#CLEAN DATA COLUMNS
#Clean nulls and change datatypes
#A missing finishing position appears as the '\N' marker (or as a real null); both are encoded as 0
#NOTE(review): 0 sorts ahead of 1st place on a numeric scale - confirm this encoding suits the model target
position_raw = df_model['position']
position_missing = position_raw.eq('\\N') | position_raw.isna()
#Blank out the missing entries first so the remaining values convert cleanly to float, then fill with 0
df_model['position'] = position_raw.where(~position_missing).astype('float').fillna(0.0)
df_model['grid'] = df_model['grid'].astype('float')

In [6]:
#MODELLING

#Feature Selection
#Model inputs: starting grid slot, the did-not-finish flag, and one indicator column per driver
#NOTE(review): Car_Out is derived from the race outcome (positionText) - confirm it is legitimately available as a predictor of 'position'
model_cols = ['grid', 'Car_Out', *dummies_drivers.columns]
#---------------------------------------------------

#Train and Test Split
#Target is the finishing position; 30% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    df_model[model_cols],
    df_model['position'],
    test_size = 0.3,
    random_state = 42, #Fixed seed so the split is reproducible between runs
)
#---------------------------------------------------