In [2]:
# Suppress warnings
import warnings

warnings.filterwarnings("ignore")

# Standard library imports
import sys
import os
import logging
import time
import json

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# from tensorflow.keras.layers import Dense
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


In [55]:
def _read_raw(csv_path):
    """Read one raw CSV file into a DataFrame, treating the backslash-N marker as missing."""
    return pd.read_csv(csv_path, na_values=["\\N"])


# One DataFrame per raw table, loaded in the same order as before.
races = _read_raw("../data/raw/races.csv")
circuits = _read_raw("../data/raw/circuits.csv")
results = _read_raw("../data/raw/results.csv")
drivers = _read_raw("../data/raw/drivers.csv")
qualifying = _read_raw("../data/raw/qualifying.csv")
sprints = _read_raw("../data/raw/sprint_results.csv")
pit_stops = _read_raw("../data/raw/pit_stops.csv")
lap_times = _read_raw("../data/raw/lap_times.csv")
constructors = _read_raw("../data/raw/constructors.csv")

In [68]:
# Display every row of `results` whose raceId equals 18.
results.loc[results["raceId"].eq(18)]

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22.0,1,1.0,1,1,10.0,58,1:34:50.616,5690616.0,39.0,2.0,1:27.452,218.3,1
1,2,18,2,2,3.0,5,2.0,2,2,8.0,58,+5.478,5696094.0,41.0,3.0,1:27.739,217.586,1
2,3,18,3,3,7.0,7,3.0,3,3,6.0,58,+8.163,5698779.0,41.0,5.0,1:28.090,216.719,1
3,4,18,4,4,5.0,11,4.0,4,4,5.0,58,+17.181,5707797.0,58.0,7.0,1:28.603,215.464,1
4,5,18,5,1,23.0,3,5.0,5,5,4.0,58,+18.014,5708630.0,43.0,1.0,1:27.418,218.385,1
5,6,18,6,3,8.0,13,6.0,6,6,3.0,57,,,50.0,14.0,1:29.639,212.974,11
6,7,18,7,5,14.0,17,7.0,7,7,2.0,55,,,54.0,8.0,1:29.534,213.224,5
7,8,18,8,6,1.0,15,8.0,8,8,1.0,53,,,20.0,4.0,1:27.903,217.18,5
8,9,18,9,2,4.0,2,,R,9,0.0,47,,,15.0,9.0,1:28.753,215.1,4
9,10,18,10,7,12.0,18,,R,10,0.0,43,,,23.0,13.0,1:29.558,213.166,3


What percentage of race results (rows in `results`) have a status other than finished (i.e., any `statusId` except 1)?

In [46]:
# Percentage of rows in `results` whose statusId is anything other than 1:
# count the matching rows, divide by the total row count, then scale to percent.
results["statusId"].ne(1).sum() / len(results) * 100

71.6128059127418

How do all-time points differ by driver? What about by constructor?

In [59]:
# All-time points per driver: total `points` for each driverId, sort from
# highest to lowest, then attach driver details to pick up `driverRef`.
(
    results.groupby("driverId", as_index=False)["points"]
    .sum()
    .sort_values("points", ascending=False)
    .join(drivers.set_index("driverId"), on="driverId")
    .loc[:, ["driverId", "points", "driverRef"]]
)

Unnamed: 0,driverId,points,driverRef
0,1,4713.5,hamilton
19,20,3098.0,vettel
828,830,2744.5,max_verstappen
3,4,2304.0,alonso
7,8,1873.0,raikkonen
...,...,...,...
458,459,0.0,marsh
457,458,0.0,slotemaker
456,457,0.0,pon
163,164,0.0,joachim_winkelhock


In [62]:
# All-time points per constructor: total `points` for each constructorId, sort
# from highest to lowest, then attach constructor details to pick up `name`.
(
    results.groupby("constructorId", as_index=False)["points"]
    .sum()
    .sort_values("points", ascending=False)
    .join(constructors.set_index("constructorId"), on="constructorId")
    .loc[:, ["constructorId", "points", "name"]]
)

Unnamed: 0,constructorId,points,name
5,6,10772.27,Ferrari
128,131,7502.64,Mercedes
8,9,7472.00,Red Bull
0,1,6687.50,McLaren
2,3,3628.00,Williams
...,...,...,...
134,137,0.00,Arzani-Volpini
133,136,0.00,Pankratz
86,89,0.00,LDS
87,90,0.00,Protos


In [66]:
# Display the row(s) of `races` whose raceId equals 18.
races.loc[races["raceId"].eq(18)]

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
17,18,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,,,,,,,,,,
