In [111]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


READING ALL DATASETS

In [112]:
circuits = pd.read_csv('./data/circuits.csv')
constructor_results = pd.read_csv('./data/constructor_results.csv')
constructor_standings = pd.read_csv('./data/constructor_standings.csv')
constructors = pd.read_csv('./data/constructors.csv')
driver_standings = pd.read_csv('./data/driver_standings.csv')
drivers = pd.read_csv('./data/drivers.csv')
lap_times = pd.read_csv('./data/lap_times.csv')
pit_stops = pd.read_csv('./data/pit_stops.csv')
qualifying = pd.read_csv('./data/qualifying.csv')
races = pd.read_csv('./data/races.csv')
results = pd.read_csv('./data/results.csv')
seasons = pd.read_csv('./data/seasons.csv')
sprint_results = pd.read_csv('./data/sprint_results.csv')
status = pd.read_csv('./data/status.csv')
weather = pd.read_csv('./data/weather.csv')

DROPPING REDUNDANT COLUMNS

In [113]:
del circuits["lat"]
del circuits["lng"]
del circuits["country"]
del circuits["alt"]
del circuits["url"]
del circuits["name"]
del constructor_results['status']
del constructor_standings["positionText"]
del constructors["name"]
del constructors["url"]
driver_standings.drop(['positionText'], axis=1, inplace=True)
drivers.drop(['surname', 'url'], axis=1, inplace=True)
lap_times.drop(['time'], axis=1, inplace=True)
pit_stops.drop(['time', 'duration'], axis=1, inplace=True)
races = races.drop(['url','fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'], axis = 1)
results.drop(['time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'positionText', 'position'], axis = 1, inplace = True)
seasons.drop(['url'], axis = 1, inplace=True)
weather.drop(['weather'], axis = 1, inplace=True)



RENAMING

In [114]:
weather.rename(columns={'season': 'year'}, inplace = True)

MERGING DATA :(

In [115]:
df_1 = pd.merge(races, weather, on = ['year', 'round'], how = 'outer')
df_1.head()
df_1.shape

(1079, 13)

In [116]:
results.head()
results.shape

(25660, 10)

In [117]:
df_2 = pd.merge(df_1, results, on = 'raceId')
df_2.shape
df_2.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,circuit_id,weather_warm,weather_cold,...,weather_cloudy,resultId,driverId,constructorId,number,grid,positionOrder,points,laps,statusId
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,0.0,7554,18,23,22,1,1,10.0,58,1
1,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,0.0,7555,22,23,23,2,2,8.0,58,1
2,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,0.0,7556,15,7,9,20,3,6.0,58,1
3,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,0.0,7557,10,7,10,19,4,5.0,58,1
4,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,0.0,7558,4,4,7,10,5,4.0,58,1


In [118]:
status.shape

(139, 2)

In [119]:
df_3 = pd.merge(df_2, status, on = 'statusId')
df_3.loc[df_3['circuitId'] == 2]

Unnamed: 0,raceId,year,round,circuitId,name,date,time,circuit_id,weather_warm,weather_cold,...,resultId,driverId,constructorId,number,grid,positionOrder,points,laps,statusId,status
11,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,sepang,0.0,0.0,...,7574,18,23,22,1,1,5.0,31,1,Finished
12,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,sepang,0.0,0.0,...,7575,2,2,6,10,2,4.0,31,1,Finished
13,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,sepang,0.0,0.0,...,7576,10,7,10,3,3,3.0,31,1,Finished
14,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,sepang,0.0,0.0,...,7577,15,7,9,2,4,2.5,31,1,Finished
15,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,sepang,0.0,0.0,...,7578,22,23,23,8,5,2.0,31,1,Finished
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25406,927,2015,2,2,Malaysian Grand Prix,2015-03-29,07:00:00,sepang,0.0,0.0,...,22572,18,1,22,17,17,0.0,41,101,Turbo
25563,842,2011,2,2,Malaysian Grand Prix,2011-04-10,08:00:00,sepang,0.0,0.0,...,20823,39,164,22,24,23,0.0,14,128,+42 Laps
25564,842,2011,2,2,Malaysian Grand Prix,2011-04-10,08:00:00,sepang,0.0,0.0,...,20824,813,3,12,18,24,0.0,8,129,Engine misfire
25608,927,2015,2,2,Malaysian Grand Prix,2015-03-29,07:00:00,sepang,0.0,0.0,...,22573,4,1,14,18,18,0.0,21,132,ERS


In [120]:
circuits.head()
circuits.shape

(76, 3)

In [121]:
df_4 = pd.merge(df_3, circuits, on = 'circuitId')
df_4.drop(['circuitRef'], axis = 1, inplace=True)

df_4.shape

(25660, 24)

In [122]:
df_4

Unnamed: 0,raceId,year,round,circuitId,name,date,time,circuit_id,weather_warm,weather_cold,...,driverId,constructorId,number,grid,positionOrder,points,laps,statusId,status,location
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,18,23,22,1,1,10.0,58,1,Finished,Melbourne
1,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,22,23,23,2,2,8.0,58,1,Finished,Melbourne
2,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,15,7,9,20,3,6.0,58,1,Finished,Melbourne
3,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,10,7,10,19,4,5.0,58,1,Finished,Melbourne
4,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,4,4,7,10,5,4.0,58,1,Finished,Melbourne
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25655,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,20,117,5,0,17,0.0,54,4,Collision,Miami
25656,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,846,1,4,8,19,0.0,39,4,Collision,Miami
25657,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,842,213,10,7,18,0.0,45,22,Suspension,Miami
25658,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,825,210,20,16,16,0.0,56,33,Front wing,Miami


In [123]:
df_4.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 25660 entries, 0 to 25659
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   raceId          25660 non-null  int64  
 1   year            25660 non-null  int64  
 2   round           25660 non-null  int64  
 3   circuitId       25660 non-null  int64  
 4   name            25660 non-null  object 
 5   date            25660 non-null  object 
 6   time            25660 non-null  object 
 7   circuit_id      24620 non-null  object 
 8   weather_warm    24620 non-null  float64
 9   weather_cold    24620 non-null  float64
 10  weather_dry     24620 non-null  float64
 11  weather_wet     24620 non-null  float64
 12  weather_cloudy  24620 non-null  float64
 13  resultId        25660 non-null  int64  
 14  driverId        25660 non-null  int64  
 15  constructorId   25660 non-null  int64  
 16  number          25660 non-null  object 
 17  grid            25660 non-null 

In [124]:
constructor_results.info()
constructor_results.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12080 entries, 0 to 12079
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   constructorResultsId  12080 non-null  int64  
 1   raceId                12080 non-null  int64  
 2   constructorId         12080 non-null  int64  
 3   points                12080 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 377.6 KB


(12080, 4)

In [125]:
df_5=pd.merge(df_4,constructor_standings, on = ['raceId', 'constructorId'],how='left')
df_5                                                                                    

Unnamed: 0,raceId,year,round,circuitId,name,date,time,circuit_id,weather_warm,weather_cold,...,positionOrder,points_x,laps,statusId,status,location,constructorStandingsId,points_y,position,wins
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,1,10.0,58,1,Finished,Melbourne,4038.0,18.0,1.0,1.0
1,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,2,8.0,58,1,Finished,Melbourne,4038.0,18.0,1.0,1.0
2,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,3,6.0,58,1,Finished,Melbourne,4039.0,11.0,2.0,0.0
3,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,4,5.0,58,1,Finished,Melbourne,4039.0,11.0,2.0,0.0
4,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,albert_park,1.0,0.0,...,5,4.0,58,1,Finished,Melbourne,4040.0,4.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25655,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,17,0.0,54,4,Collision,Miami,28179.0,6.0,9.0,0.0
25656,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,19,0.0,39,4,Collision,Miami,28181.0,46.0,4.0,0.0
25657,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,18,0.0,45,22,Suspension,Miami,28178.0,16.0,7.0,0.0
25658,1078,2022,5,79,Miami Grand Prix,2022-05-08,19:30:00,,,,...,16,0.0,56,33,Front wing,Miami,28175.0,15.0,8.0,0.0


In [126]:
df_5.shape

(25660, 28)

In [127]:
constructor_standings.info()
constructor_standings.shape
constructor_results.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12841 entries, 0 to 12840
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   constructorStandingsId  12841 non-null  int64  
 1   raceId                  12841 non-null  int64  
 2   constructorId           12841 non-null  int64  
 3   points                  12841 non-null  float64
 4   position                12841 non-null  int64  
 5   wins                    12841 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 602.0 KB


(12080, 4)

In [128]:
constructor_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12080 entries, 0 to 12079
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   constructorResultsId  12080 non-null  int64  
 1   raceId                12080 non-null  int64  
 2   constructorId         12080 non-null  int64  
 3   points                12080 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 377.6 KB


In [129]:
constructor_results.head()

Unnamed: 0,constructorResultsId,raceId,constructorId,points
0,1,18,1,14.0
1,2,18,2,8.0
2,3,18,3,9.0
3,4,18,4,5.0
4,5,18,5,2.0


In [104]:
constructor_standings.head()

Unnamed: 0,constructorStandingsId,raceId,constructorId,points,position,wins
0,1,18,1,14.0,1,1
1,2,18,2,8.0,3,0
2,3,18,3,9.0,2,0
3,4,18,4,5.0,4,0
4,5,18,5,2.0,5,0


In [105]:
constructors.head()

Unnamed: 0,constructorId,constructorRef,nationality
0,1,mclaren,British
1,2,bmw_sauber,German
2,3,williams,British
3,4,renault,French
4,5,toro_rosso,Italian


In [106]:
constructors.shape

(211, 3)

In [132]:
df_6=pd.merge(df_5,constructors,on=['constructorId'],how='left')

In [108]:
df_6.shape

(25660, 30)

In [109]:
driver_standings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33686 entries, 0 to 33685
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   driverStandingsId  33686 non-null  int64  
 1   raceId             33686 non-null  int64  
 2   driverId           33686 non-null  int64  
 3   points             33686 non-null  float64
 4   position           33686 non-null  int64  
 5   wins               33686 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 1.5 MB


In [110]:
drivers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854 entries, 0 to 853
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   driverId     854 non-null    int64 
 1   driverRef    854 non-null    object
 2   number       854 non-null    object
 3   code         854 non-null    object
 4   forename     854 non-null    object
 5   dob          854 non-null    object
 6   nationality  854 non-null    object
dtypes: int64(1), object(6)
memory usage: 46.8+ KB


Unnamed: 0,driverId,driverRef,number,code,forename,dob,nationality
0,1,hamilton,44,HAM,Lewis,1985-01-07,British
1,2,heidfeld,\N,HEI,Nick,1977-05-10,German
2,3,rosberg,6,ROS,Nico,1985-06-27,German
3,4,alonso,14,ALO,Fernando,1981-07-29,Spanish
4,5,kovalainen,\N,KOV,Heikki,1981-10-19,Finnish


In [81]:
df_7=pd.merge(df_6,drivers,on=['driverId'])

In [82]:
df_7.shape

(25660, 36)

In [78]:
lap_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528785 entries, 0 to 528784
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   raceId        528785 non-null  int64
 1   driverId      528785 non-null  int64
 2   lap           528785 non-null  int64
 3   position      528785 non-null  int64
 4   milliseconds  528785 non-null  int64
dtypes: int64(5)
memory usage: 20.2 MB


In [133]:
df_7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25660 entries, 0 to 25659
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   raceId                  25660 non-null  int64  
 1   year                    25660 non-null  int64  
 2   round                   25660 non-null  int64  
 3   circuitId               25660 non-null  int64  
 4   name                    25660 non-null  object 
 5   date                    25660 non-null  object 
 6   time                    25660 non-null  object 
 7   circuit_id              24620 non-null  object 
 8   weather_warm            24620 non-null  float64
 9   weather_cold            24620 non-null  float64
 10  weather_dry             24620 non-null  float64
 11  weather_wet             24620 non-null  float64
 12  weather_cloudy          24620 non-null  float64
 13  resultId                25660 non-null  int64  
 14  driverId                25660 non-null