In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# These files contain data that needs to be put together
drivers = pd.read_csv('datasets/original/drivers.csv')
constructors = pd.read_csv('datasets/original/constructors.csv')
circuits = pd.read_csv('datasets/original/circuits.csv')
status = pd.read_csv('datasets/original/status.csv')
races = pd.read_csv('datasets/original/races.csv')

# The results file contains all the codes from above files and needs to be worked on
results = pd.read_csv('datasets/original/results.csv')

In [3]:
# Viewing the results dataframe before modifications
results.head(10)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1
5,6,18,6,3,8,13,6,6,6,3.0,57,\N,\N,50,14,1:29.639,212.974,11
6,7,18,7,5,14,17,7,7,7,2.0,55,\N,\N,22,12,1:29.534,213.224,5
7,8,18,8,6,1,15,8,8,8,1.0,53,\N,\N,20,4,1:27.903,217.18,5
8,9,18,9,2,4,2,\N,R,9,0.0,47,\N,\N,15,9,1:28.753,215.1,4
9,10,18,10,7,12,18,\N,R,10,0.0,43,\N,\N,23,13,1:29.558,213.166,3


In [4]:
results = results.drop(['time', 'milliseconds', 'points', 'fastestLapTime', 'fastestLapSpeed'], axis = 1)
results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,laps,fastestLap,rank,statusId
0,1,18,1,1,22,1,1,1,1,58,39,2,1
1,2,18,2,2,3,5,2,2,2,58,41,3,1
2,3,18,3,3,7,7,3,3,3,58,41,5,1
3,4,18,4,4,5,11,4,4,4,58,58,7,1
4,5,18,5,1,23,3,5,5,5,58,43,1,1


In [5]:
# Get full driver names in one column
drivers['driverName'] = drivers['forename'] + ' ' + drivers['surname']
drivers.head()

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url,driverName
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,Lewis Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,Nick Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,Nico Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,Fernando Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,Heikki Kovalainen


In [6]:
races = races.iloc[:, : 8]

races.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...


In [7]:
# Merge dataframes with the results to get all info in one place

all_results = pd.merge(results, races, on ='raceId', how ='left')
all_results = pd.merge(all_results, circuits, on ='circuitId', how ='left')
all_results = pd.merge(all_results, drivers, on ='driverId', how ='left')
all_results = pd.merge(all_results, status, on ='statusId', how ='left')
all_results = pd.merge(all_results, constructors, on ='constructorId', how ='left')
all_results['driverName'] = all_results['forename'] + ' ' + all_results['surname']
all_results.head()

  all_results = pd.merge(all_results, constructors, on ='constructorId', how ='left')


Unnamed: 0,resultId,raceId,driverId,constructorId,number_x,grid,position,positionText,positionOrder,laps,...,surname,dob,nationality_x,url_x,driverName,status,constructorRef,name,nationality_y,url_y
0,1,18,1,1,22,1,1,1,1,58,...,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,Lewis Hamilton,Finished,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,18,2,2,3,5,2,2,2,58,...,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,Nick Heidfeld,Finished,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,18,3,3,7,7,3,3,3,58,...,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,Nico Rosberg,Finished,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,18,4,4,5,11,4,4,4,58,...,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,Fernando Alonso,Finished,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,18,5,1,23,3,5,5,5,58,...,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,Heikki Kovalainen,Finished,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren


In [8]:
all_results['name']

0             McLaren
1          BMW Sauber
2            Williams
3             Renault
4             McLaren
             ...     
25655    Haas F1 Team
25656        Williams
25657        Williams
25658      AlphaTauri
25659      Alfa Romeo
Name: name, Length: 25660, dtype: object

In [9]:
# Drop unnecessary columns and rename other columns for convenience

all_results = all_results.drop(['positionText', 'grid', 'fastestLap', 'rank', 'date', 'time', 'lat', 'lng', 'alt', 'url_x', 'number_y', 'forename', 'surname', 'dob', 'url_y', 'round', 'number_x', 'driverRef', 'positionOrder','nationality_x', 'nationality_y'], axis = 1)


all_results.rename(columns = {'name_y': 'circuit_name', 'name': 'constructor_name'}, inplace=True)
all_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,position,laps,statusId,year,circuitId,name_x,circuitRef,circuit_name,location,country,code,driverName,status,constructorRef,constructor_name
0,1,18,1,1,1,58,1,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,HAM,Lewis Hamilton,Finished,mclaren,McLaren
1,2,18,2,2,2,58,1,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,HEI,Nick Heidfeld,Finished,bmw_sauber,BMW Sauber
2,3,18,3,3,3,58,1,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,ROS,Nico Rosberg,Finished,williams,Williams
3,4,18,4,4,4,58,1,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,ALO,Fernando Alonso,Finished,renault,Renault
4,5,18,5,1,5,58,1,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,KOV,Heikki Kovalainen,Finished,mclaren,McLaren


In [10]:
all_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25660 entries, 0 to 25659
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   resultId          25660 non-null  int64 
 1   raceId            25660 non-null  int64 
 2   driverId          25660 non-null  int64 
 3   constructorId     25660 non-null  int64 
 4   position          25660 non-null  object
 5   laps              25660 non-null  int64 
 6   statusId          25660 non-null  int64 
 7   year              25660 non-null  int64 
 8   circuitId         25660 non-null  int64 
 9   name_x            25660 non-null  object
 10  circuitRef        25660 non-null  object
 11  circuit_name      25660 non-null  object
 12  location          25660 non-null  object
 13  country           25660 non-null  object
 14  code              25660 non-null  object
 15  driverName        25660 non-null  object
 16  status            25660 non-null  object
 17  constructorR

In [11]:
# Create a dataframe which will be used to answer the last question

circuit_races = pd.merge(races, circuits, on ='circuitId', how ='left')
circuit_races = circuit_races[['raceId', 'year', 'circuitId', 'name_x', 'circuitRef', 'name_y', 'location', 'country']]

circuit_races

Unnamed: 0,raceId,year,circuitId,name_x,circuitRef,name_y,location,country
0,1,2009,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
1,2,2009,2,Malaysian Grand Prix,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia
2,3,2009,17,Chinese Grand Prix,shanghai,Shanghai International Circuit,Shanghai,China
3,4,2009,3,Bahrain Grand Prix,bahrain,Bahrain International Circuit,Sakhir,Bahrain
4,5,2009,4,Spanish Grand Prix,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain
...,...,...,...,...,...,...,...,...
1074,1092,2022,22,Japanese Grand Prix,suzuka,Suzuka Circuit,Suzuka,Japan
1075,1093,2022,69,United States Grand Prix,americas,Circuit of the Americas,Austin,USA
1076,1094,2022,32,Mexico City Grand Prix,rodriguez,Autódromo Hermanos Rodríguez,Mexico City,Mexico
1077,1095,2022,18,Brazilian Grand Prix,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil


Analysis
First - how many drivers finished a race that they started? To figure this out, we can look at the number of drivers across all races with the race status as 'Finished' or '+1/2/3 laps'. This is done because drivers can finish on the lead lap (i.e. they haven't been lapped by the race winner) or they can finish after being lapped. In some cases, drivers may retire in the last couple of laps of the race.

In modern F1, they are still classified in the results as having finished provided they have run 95% of the race distance - usually anything over +3 laps would count as a Did Not Finish (DNF). Historically, all drivers who completed the race (regardless of the number of laps they were behind the winning driver) have been classified. Due to this, all finishers have been considered.

In [12]:
# Dataframe of all drivers who finished a Grand Prix
finished_df = all_results.loc[all_results['statusId'].isin([1, 11, 12, 13, 14, 15, 16, 17, 18, 19])]

# Percentage of drivers finishing a race in all of F1 history
finish_pc = round((finished_df.shape[0] / all_results.shape[0]) * 100, 3)

print(f"In all of F1 history, only {finish_pc}% drivers have finished a Grand Prix that they started")

In all of F1 history, only 54.84% drivers have finished a Grand Prix that they started


In [13]:
# List of mechanical DNF status codes
m_dnf_lst = [5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 51, 56, 129, 121, 126, 131, 132, 135]
m_dnf_lst.extend(range(36, 45))
m_dnf_lst.extend(range(46, 50))
m_dnf_lst.extend(range(61, 111))


m_dnf_lst = [i for i in m_dnf_lst if i not in [62, 68, 73, 77, 78, 81, 82, 88, 89, 90, 92, 93, 96, 97, 100, 107]]
# m_dnf_lst

In [14]:
# Dataframe of all mechanical DNFs
mdnf_df = all_results.loc[all_results['statusId'].isin(m_dnf_lst)]

mdnf_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,position,laps,statusId,year,circuitId,name_x,circuitRef,circuit_name,location,country,code,driverName,status,constructorRef,constructor_name
6,7,18,7,5,7,55,5,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,BOU,Sébastien Bourdais,Engine,toro_rosso,Toro Rosso
7,8,18,8,6,8,53,5,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,RAI,Kimi Räikkönen,Engine,ferrari,Ferrari
10,11,18,11,8,\N,32,7,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,SAT,Takuma Sato,Transmission,super_aguri,Super Aguri
11,12,18,12,4,\N,30,8,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,PIQ,Nelson Piquet Jr.,Clutch,renault,Renault
12,13,18,13,6,\N,29,5,2008,1,Australian Grand Prix,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,MAS,Felipe Massa,Engine,ferrari,Ferrari


In [15]:
# Counts of different types of mechanical DNFs over the years
mdnf_df['status'].value_counts()

Engine            2005
Gearbox            804
Suspension         431
Transmission       321
Electrical         315
                  ... 
Crankshaft           1
Engine fire          1
CV joint             1
Launch control       1
Brake duct           1
Name: status, Length: 72, dtype: int64

In [16]:
dnfs_per_year = mdnf_df['year'].value_counts().reset_index().sort_values(by='index').rename(columns={'index': 'year', 'year': 'count'})

# Plotting the number of mechanical DNFs over the years
fig = px.bar(dnfs_per_year, x="year", y="count", title = 'Number of mechanical DNFs per year', color = 'count', 
             color_continuous_scale=px.colors.sequential.Viridis[::-1], width=1050, height=650)
# Change figure background
fig.update_layout(
#     margin=dict(l=20, r=20, t=20, b=20),
#     paper_bgcolor="lightsteelblue",
    plot_bgcolor='gainsboro')

# [::-1] to reverse colour scale

fig.show()

The above plot clearly shows that the 1980s and 1990s saw the most number of mechanical retirements from races. However, this may not just be because of poor reliability during that period - F1 had advanced a lot since its beginnings in 1950 and so technology was bound to be better. It is important to note that in the early days of F1, a season had 7 races. In the mid to late-1980s, there were 16 races per season. Additionally, more experimentation and increasing regulations meant the teams had to try out new approaches - often resulting in a DNF result.

Next, we need to consider how many drivers took part in each F1 season.

The current F1 driver grid consists of 10 teams with 2 main drivers each. Teams sometimes have to use reserve drivers due to factors such as driver injury, illness, etc. A good example of this is Nico Hulkenberg - who has filled in for Sergio Perez, Lance Stroll and Sebastian Vettel with Racing Point/Aston Martin Racing due to Covid 19 over 2020-2022.

However, F1 has had many more drivers in a single season in the past. 108 different drivers representing 41 teams took part in the 1953 F1 season - the most drivers in a single season ever. Due to thse factors, the following plot looks at the average number of mechanical DNFs in a season per driver.

In [17]:
drivers_per_year = all_results.groupby(['year'])['driverName'].nunique().reset_index().rename(columns={'driverName': 'numDrivers'})
dnfs_per_year['numDrivers'] = drivers_per_year['numDrivers']

dnfs_per_year['dnfs_per_driver'] = (dnfs_per_year['count']/dnfs_per_year['numDrivers']).round(3)
dnfs_per_year

Unnamed: 0,year,count,numDrivers,dnfs_per_driver
53,1950,62,24,2.583
43,1951,76,35,2.171
42,1952,77,37,2.081
25,1953,106,52,2.038
29,1954,93,36,2.583
...,...,...,...,...
62,2018,47,25,1.880
69,2019,30,20,1.500
70,2020,29,23,1.261
71,2021,26,21,1.238


In [18]:
fig = px.bar(dnfs_per_year, x = 'year', y = 'dnfs_per_driver', title = 'Number of mechanical DNFs per driver per year', 
             color = 'dnfs_per_driver', color_continuous_scale=px.colors.sequential.Viridis[::-1], 
             width=1150, height=650, labels = dict(year = 'Year', dnfs_per_driver = 'Mechanical DNFs per driver'))

fig.update_layout(plot_bgcolor='gainsboro')

fig.show()

This graph shows that 1978 was the season with the worst car reliability - 111 mechanical DNFs shared between 31 drivers. This is closely followed by 1994 - 115 mechanical DNFs shared between 33 drivers

Mechanical DNFs and accidents by driver
Now we can look at which drivers have had the most number of mechanical DNFs (i.e. the worst luck) and accidents/collisions. These figures will also be considered in relation to the total number of Grand Prix races they have started so as to get a good understanding.

This analysis makes use of the mdnf_df dataframe created earlier containing all mechanical DNF results in F1 history as well as additional data from the all_results dataframe to account for accidents

In [19]:
# Number of mechanical DNFs per driver

mdnf_drivers = mdnf_df['driverName'].value_counts().reset_index().sort_values(by='driverName', ascending = False).rename(columns={'index': 'driverName', 'driverName': 'mdnf_count'})

# Races entered per driver
races_per_driver = all_results['driverName'].value_counts().reset_index().rename(columns={'index': 'driverName', 'driverName': 'raceCount'})

# Accidents per driver - considering status IDs 3 (accident), 4 (collision) and 20 (spun off)
accidents = all_results.loc[all_results['statusId'].isin([3, 4, 20])]
accidents = accidents['driverName'].value_counts().reset_index().rename(columns={'index': 'driverName', 'driverName': 'accident_count'})
accidents['accident_count'] = accidents['accident_count'].astype('int')

# Merging dataframes
mdnf_drivers =  pd.merge(mdnf_drivers, races_per_driver, on ='driverName', how ='left')
mdnf_drivers = pd.merge(mdnf_drivers, accidents, on = 'driverName', how = 'left')

# Not every driver will have had mechanical DNFs as well as accidents. This creates NaNs in the accident_count column
# Fill NaN values and convert accident_count to int
mdnf_drivers['accident_count'] = mdnf_drivers['accident_count'].fillna(0)
mdnf_drivers['accident_count'] = mdnf_drivers['accident_count'].astype('int')

# Create columns
mdnf_drivers['races_per_mdnf'] = mdnf_drivers['raceCount']/mdnf_drivers['mdnf_count']
mdnf_drivers['races_per_accident'] = mdnf_drivers['raceCount']/mdnf_drivers['accident_count']

# Determine the total number of DNF results for each driver and arrange the dataframe in descending order based on this number
mdnf_drivers['total_DNFs'] = mdnf_drivers['mdnf_count'] + mdnf_drivers['accident_count']
mdnf_drivers = mdnf_drivers.sort_values(by='mdnf_count', ascending = False).reset_index()

mdnf_drivers['total_finishes'] = mdnf_drivers['raceCount'] - mdnf_drivers['total_DNFs']

# Determine total DNFs per race start and get average value
# mdnf_drivers['dnfs_per_start']

mdnf_drivers = mdnf_drivers[['driverName', 'raceCount', 'mdnf_count', 'accident_count', 'total_DNFs', 'total_finishes', 'races_per_mdnf', 'races_per_accident']].reset_index()

mdnf_drivers.head(10)

Unnamed: 0,index,driverName,raceCount,mdnf_count,accident_count,total_DNFs,total_finishes,races_per_mdnf,races_per_accident
0,0,Riccardo Patrese,257,103,36,139,118,2.495146,7.138889
1,1,Andrea de Cesaris,214,98,37,135,79,2.183673,5.783784
2,2,Michele Alboreto,215,82,20,102,113,2.621951,10.75
3,3,Gerhard Berger,210,68,26,94,116,3.088235,8.076923
4,4,Eddie Cheever,143,67,10,77,66,2.134328,14.3
5,5,Rubens Barrichello,326,66,32,98,228,4.939394,10.1875
6,6,Jacques Laffite,180,66,13,79,101,2.727273,13.846154
7,7,Jarno Trulli,256,63,29,92,164,4.063492,8.827586
8,8,Graham Hill,179,62,15,77,102,2.887097,11.933333
9,9,Nigel Mansell,192,61,28,89,103,3.147541,6.857143


The top 5 drivers in this list were all active in F1 in the mid to late-1980s - the period of F1 with the highest number of mechanical DNFs (lowest car reliability) as per the 'Number of mechanical DNFs per year' chart above.

In [20]:
fig = px.bar(mdnf_drivers.head(20), x="driverName", y=["mdnf_count", "accident_count", 'total_finishes'], 
             labels=dict(driverName="Driver Name", value="Count", variable='Result type'), 
             color_discrete_sequence=px.colors.qualitative.Pastel, 
             title="Results for top 20 drivers with most DNFs", width=1050, height=650)

fig.update_layout(plot_bgcolor='gainsboro')
fig.update_xaxes(tickangle = 45)
fig.update_layout(legend_traceorder="reversed")

# Update legend entries
newnames = {'mdnf_count':'Mechanical DNF', 'accident_count': 'Accident', 'total_finishes': 'Finish'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )

fig.show()

Next, we can determine the average percentage of finishing drivers per race event and then average it for the whole year to view how it has changed over the years - a higher number indicates better car reliability

In [21]:
prct_1 = all_results[['year', 'raceId', 'statusId']]

# Percentage of all status IDs at each race event through the years
prct_finish = prct_1.groupby(['year', 'raceId'])['statusId'].value_counts(normalize=True).reset_index(name="percentage")

# Extract percentages of drivers who FINISHED races (status IDs: 1, 11-19) and add up their percentages
prct_finish = prct_finish.loc[prct_finish['statusId'].isin([1, 11, 12, 13, 14, 15, 16, 17, 18, 19])]
prct_finish_res = prct_finish.groupby(['year', 'raceId'])['percentage'].sum().reset_index(name='percentage_finishers')

# Group by year and get the average to get the average percentage of finishers per race per year
prct_finish_res = prct_finish_res.groupby(['year'])['percentage_finishers'].mean().reset_index(name='avg_finish_prct')
prct_finish_res['avg_finish_prct'] = prct_finish_res['avg_finish_prct'] * 100
prct_finish_res['avg_finish_prct'] = prct_finish_res['avg_finish_prct'].round(3)
prct_finish_res

Unnamed: 0,year,avg_finish_prct
0,1950,46.927
1,1951,48.407
2,1952,49.202
3,1953,42.342
4,1954,44.764
...,...,...
68,2018,79.762
69,2019,85.714
70,2020,83.235
71,2021,86.591


In [22]:
fig = px.bar(prct_finish_res, x = 'year', y = 'avg_finish_prct', color = 'avg_finish_prct', 
              color_continuous_scale=px.colors.sequential.Viridis[::-1], width=1050, height=650,
              labels=dict(year="Year", avg_finish_prct="Percentage"), 
              title = 'Average percentage of drivers finishing each race per year')

fig.update_layout(plot_bgcolor='gainsboro')

fig.update_yaxes(showgrid=True, gridwidth=1,  range = [0,100])
# gridcolor='LightPink',
fig.show()

Mechanical DNFs by constructor
This section looks at the number of mechanical DNFs for each constructor. These numbers has been compared with the number of starts made by each constructor. In modern F1, each constructor has 2 starters per race. However, in the early days of F1, a team could enter more than 2 drivers in a race event.

Accidents per constructor have not been considered as accidents and collisions are mostly caused by driver error, weather conditions and other external factors. These numbers have been considered in the drivers section above.

Only constructors with over 100 starts in F1 have been considered so as to have a meaningful indication of reliability over the period of a few seasons.

The number of starts have been divided by number of mechanical DNFs. A higher number indicates higher reliability.

In [23]:
all_results['circuit_name']

0        Albert Park Grand Prix Circuit
1        Albert Park Grand Prix Circuit
2        Albert Park Grand Prix Circuit
3        Albert Park Grand Prix Circuit
4        Albert Park Grand Prix Circuit
                      ...              
25655                       Hungaroring
25656                       Hungaroring
25657                       Hungaroring
25658                       Hungaroring
25659                       Hungaroring
Name: circuit_name, Length: 25660, dtype: object

In [24]:
# Number of mechanical DNFs per constructor
mdnf_constructors = mdnf_df['constructor_name'].value_counts().reset_index().sort_values(by='constructor_name', ascending = False).rename(columns={'index': 'constructorName', 'constructor_name': 'mdnf_count'})

# Number of starts per constructor
constructor_starts = all_results['constructor_name'].value_counts().reset_index().sort_values(by='constructor_name', ascending = False).rename(columns={'index': 'constructorName', 'constructor_name': 'starts'})

# Merge dataframes
mdnf_constructors =  pd.merge(mdnf_constructors, constructor_starts, on ='constructorName', how ='left')

# Reorder columns
mdnf_constructors = mdnf_constructors[['constructorName', 'starts', 'mdnf_count']]

# Create column for number of starts per mdnf and sort values
mdnf_constructors['starts_per_mdnf'] = (mdnf_constructors['starts']/mdnf_constructors['mdnf_count']).round(3)
mdnf_constructors = mdnf_constructors.sort_values(by='starts_per_mdnf', ascending = False)

# Only include constructors with more than 100 starts
mdnf_constructors = mdnf_constructors[mdnf_constructors['starts'] >= 100].reset_index().drop('index', axis = 1)

mdnf_constructors.head()

Unnamed: 0,constructorName,starts,mdnf_count,starts_per_mdnf
0,Mercedes,542,37,14.649
1,Force India,424,34,12.471
2,BMW Sauber,140,12,11.667
3,Marussia,109,11,9.909
4,Red Bull,678,77,8.805


In [25]:
fig = px.bar(mdnf_constructors, x = 'constructorName', y = 'starts_per_mdnf', color = 'starts_per_mdnf', 
              color_continuous_scale=px.colors.sequential.Viridis[::-1], width=1050, height=650,
              labels=dict(constructorName="Constructor", starts_per_mdnf="Number of starts per mechanical DNF"), 
              title = 'Reliability by constructor')

fig.layout.coloraxis.colorbar.title = 'Number'
fig.update_layout(plot_bgcolor='gainsboro')
fig.update_xaxes(tickangle = 50)


fig.show()

Circuits with the most crashes
Finally, we can consider the circuits which have had the most accidents and collisions.

Accidents and collisions are more common when weather conditions change during a race. Accidents are also common at narrow street circuits such as Monaco, Baku, Jeddah and Singapore with many drivers touching the wall and compromsing their laps/races.

The number of accidents at each circuit will also be compared against the number of Grands Prix held at the circuit to get a better picture of the dangers a certain circuit poses. An example of why this is necessary is Jeddah - a street circuit which (as of 2022) is very new to F1 (only 2 races held so far) but has had multiple crashes.

In [26]:
circuit_races['name_y']

0       Albert Park Grand Prix Circuit
1         Sepang International Circuit
2       Shanghai International Circuit
3        Bahrain International Circuit
4       Circuit de Barcelona-Catalunya
                     ...              
1074                    Suzuka Circuit
1075           Circuit of the Americas
1076      Autódromo Hermanos Rodríguez
1077        Autódromo José Carlos Pace
1078                Yas Marina Circuit
Name: name_y, Length: 1079, dtype: object

In [27]:
# Determine the total number of crashes (accidents, collisions and spins) which have happened at each circuit
circuit_accidents = all_results.loc[all_results['statusId'].isin([3, 4, 20])]
circuit_accidents = circuit_accidents[['circuit_name', 'status']]
circuit_accidents = pd.crosstab(index=circuit_accidents['circuit_name'], columns=circuit_accidents['status'])
circuit_accidents = circuit_accidents.reset_index()

# Determine the number of races held at each circuit
num_races = circuit_races['name_y'].value_counts().reset_index().rename(columns={'index': 'circuit_name', 'name_y': 'num_of_races'})
circuit_accidents = pd.merge(circuit_accidents, num_races, on = 'circuit_name', how = 'left')

# Calculate the total number of accidents at each circuit and arrange the dataframe according to this
circuit_accidents['total_accidents'] = circuit_accidents['Accident'] + circuit_accidents['Collision'] + circuit_accidents['Spun off']
circuit_accidents = circuit_accidents.sort_values(by = 'total_accidents', ascending = False)

# Calculate the number of accidents per race
circuit_accidents['accidents_per_race'] = (circuit_accidents['total_accidents']/circuit_accidents['num_of_races']).round(3)

# Clean the dataframe by resetting the index
circuit_accidents = circuit_accidents.reset_index()
circuit_accidents = circuit_accidents.drop(['index'], axis = 1)

circuit_accidents.head(10)

Unnamed: 0,circuit_name,Accident,Collision,Spun off,num_of_races,total_accidents,accidents_per_race
0,Circuit de Monaco,118,72,66,68,256,3.765
1,Autodromo Nazionale di Monza,52,40,52,72,144,2.0
2,Circuit de Spa-Francorchamps,52,47,37,55,136,2.473
3,Circuit Gilles Villeneuve,44,43,46,41,133,3.244
4,Silverstone Circuit,39,42,46,57,127,2.228
5,Autódromo José Carlos Pace,38,40,31,39,109,2.795
6,Nürburgring,62,24,21,41,107,2.61
7,Autodromo Enzo e Dino Ferrari,12,21,55,30,88,2.933
8,Indianapolis Motor Speedway,60,9,18,19,87,4.579
9,Hockenheimring,25,34,26,37,85,2.297


In [28]:
fig = px.bar(circuit_accidents.head(20), x="circuit_name", y=["Accident", "Collision", 'Spun off'], 
             labels=dict(circuit_name="Circuit", value="Count", variable='Crash type'),
             color_discrete_sequence=px.colors.qualitative.Pastel, 
             title="Top 20 circuits with the most crashes", width=1050, height=650)

fig.update_layout(plot_bgcolor='gainsboro')
fig.update_xaxes(tickangle = 45)
fig.update_layout(legend_traceorder="reversed")


fig.update_layout(hovermode="x")


fig.show()


Finally, we can look at the number of crashes per race at different circuits. A higher number indicates that a circuit is more dangerous.

In [29]:
crashes_per_race = circuit_accidents.sort_values(by = 'accidents_per_race', ascending = False)
crashes_per_race

Unnamed: 0,circuit_name,Accident,Collision,Spun off,num_of_races,total_accidents,accidents_per_race
38,Fair Park,1,0,12,1,13,13.000
17,Long Beach,27,14,11,8,52,6.500
53,Autodromo Internazionale del Mugello,0,6,0,1,6,6.000
54,Donington Park,0,3,3,1,6,6.000
16,Adelaide Street Circuit,13,18,27,11,58,5.273
...,...,...,...,...,...,...,...
62,Charade Circuit,3,0,0,4,3,0.750
52,Circuit of the Americas,3,4,0,10,7,0.700
37,Bahrain International Circuit,3,10,0,19,13,0.684
68,Prince George Circuit,2,0,0,3,2,0.667


In [30]:
fig = px.bar(crashes_per_race.head(20), x = 'circuit_name', y = 'accidents_per_race', 
             title="Top 20 circuits with the most crashes per race", width=1050, height=650,
            labels=dict(circuit_name="Circuit", accidents_per_race='Accidents per race'), color = 'accidents_per_race', 
              color_continuous_scale=px.colors.sequential.Viridis[::-1])

fig.update_layout(plot_bgcolor='gainsboro')
fig.update_xaxes(tickangle = 45)

fig.show()
