In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

#### Reading dataset.csv

In [40]:
df = pd.read_csv('./data/dataset.csv', low_memory=False)
df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
df.drop(columns=['index'], inplace=True)
df.columns

Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time',
       'circuit_id', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'resultId', 'driverId',
       'constructorId', 'car_number', 'grid', 'results_positionOrder',
       'results_points', 'laps', 'milliseconds', 'fastestLap',
       'fastestLapTime', 'fastestLapSpeed', 'statusId', 'status',
       'circuit_location', 'country', 'constructorStandingsId',
       'constructor_points', 'constructor_position', 'constructor_wins',
       'constructorRef', 'constructor_nationality', 'driverRef',
       'driver_number', 'driver_code', 'forename', 'dob', 'driver_nationality',
       'driverStandingsId', 'driver_points', 'driver_position', 'driver_wins',
       'qualifyId', 'number', 'qualifying_position', 'q1_time', 'q2_time',
       'q3_time', 'no_of_stops', 'ms_avg', 'first_5_avg_time',
       'last_10_avg_time', 'lap_position_after_5',
       'lap_position_before_last_10'],
      dt

#### Dropping the columns

In [41]:
cols_to_be_dropped = ['raceId', 'year', 'round', 'name', 'time', 'circuit_id', 'car_number', 'resultId', 'driverRef', 'results_points', 'laps', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'first_5_avg_time', 'last_10_avg_time', 'lap_position_after_5', 'lap_position_before_last_10', 'circuit_location', 'constructorStandingsId', 'constructorRef', 'constructor_points', 'driverRef', 'driver_number', 'driver_code', 'forename', 'driverStandingsId', 'driver_points', 'number', 'status', 'qualifying_position', 'q1_time', 'q2_time', 'q3_time', 'ms_avg', 'no_of_stops', 'driver_position', 'qualifyId']
df_after_dropping = df.drop(columns = cols_to_be_dropped)

#### Getting age from date & dob

In [42]:
from datetime import datetime
from dateutil import relativedelta

# get two dates
race_date = df_after_dropping['date']
driver_dob = df_after_dropping['dob']

ages = []

for i in range(len(race_date)):
    start_date = datetime.strptime(race_date[i],"%Y-%m-%d")
    end_date = datetime.strptime(driver_dob[i], "%Y-%m-%d")
    delta = relativedelta.relativedelta(start_date, end_date)
    ages.append(delta.years)

df_after_dropping['driver_age'] = ages
df_after_dropping.drop(columns = ['date', 'dob'], inplace = True)
df_after_dropping

Unnamed: 0,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,grid,results_positionOrder,milliseconds,statusId,country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age
0,1,1.0,0.0,0.0,0.0,0.0,18,23,1,1,5655784,1,Australia,1.0,1.0,British,British,1.0,29
1,1,0.0,0.0,1.0,0.0,0.0,18,16,4,6,5126355,1,Australia,4.0,0.0,British,British,0.0,24
2,1,1.0,0.0,1.0,1.0,0.0,18,16,8,10,5748074,1,Australia,6.0,0.0,British,British,0.0,23
3,1,0.0,0.0,0.0,1.0,1.0,18,1,4,1,5616531,1,Australia,2.0,1.0,British,British,1.0,30
4,1,0.0,0.0,0.0,0.0,0.0,18,1,4,6,5424563,1,Australia,2.0,0.0,British,British,0.0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25655,66,0.0,0.0,1.0,0.0,0.0,720,6,19,8,\N,111,Switzerland,,,Italian,Swiss,0.0,34
25656,66,1.0,0.0,0.0,0.0,0.0,720,141,21,21,\N,121,Switzerland,,,French,Swiss,0.0,33
25657,66,0.0,0.0,1.0,0.0,0.0,719,133,18,9,\N,116,Switzerland,,,British,Swiss,0.0,45
25658,67,0.0,0.0,1.0,0.0,0.0,783,154,16,16,\N,5,Spain,,,French,French,0.0,46


#### Changing status IDs to 0s and 1s

In [43]:
df_after_dropping['statusId'][df_after_dropping['statusId'] > 1] = 0



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Replace Nan in Constructor_position by max Value = 21 & Replace Nan in Constructor_wins by 0

In [44]:
df_after_dropping['constructor_position'] = df_after_dropping['constructor_position'].fillna(21)
df_after_dropping['constructor_wins'] = df_after_dropping['driver_wins'].fillna(0)

## Analysis Graphs

In [45]:
# Drivers by nationality
nationality_driver = df_after_dropping.groupby(['driver_nationality'])['driver_nationality'].count().sort_values(ascending = False).reset_index(name = 'number_of_drivers')
nationality_driver['driver_nationality'][nationality_driver['number_of_drivers'] < 200] = 'Other'
nationality_driver = nationality_driver.groupby(['driver_nationality']).sum().reset_index()

fig = go.Figure(data=go.Pie(labels = nationality_driver['driver_nationality'], values=nationality_driver.number_of_drivers))
fig.update_layout(title = 'Drivers by Nationality')
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [46]:
# Champions by nationality
champions = df_after_dropping[df_after_dropping['results_positionOrder'] == 1]
champions = champions.groupby(['driver_nationality'])['driver_nationality'].count().sort_values(ascending = False).reset_index(name = 'number_of_drivers')

fig = go.Figure(data=go.Pie(labels = champions['driver_nationality'], values=champions.number_of_drivers))
fig.update_layout(title = 'Champions by Driver Nationality')
fig.show()

In [47]:
# Completion rate by circuit
status_on_circuits = df_after_dropping[['circuitId', 'statusId']].groupby(['circuitId']).mean()
fig = px.bar(status_on_circuits, y = 'statusId')
fig.update_layout(title = 'Completion rate by circuit')
fig.show()

In [48]:
len(df_after_dropping.columns)

19

In [49]:
winners = df_after_dropping[['driverId', 'results_positionOrder', 'driver_age']][df_after_dropping['results_positionOrder'] == 1]
winners.drop(columns = ['driverId'], inplace=True)
winners_by_age = winners.groupby(['driver_age']).sum()
fig = px.bar(winners_by_age)
fig.update_layout(title = 'No. of races won by age')
fig.show()

In [50]:
wins = df_after_dropping[['grid', 'results_positionOrder']][df_after_dropping['results_positionOrder'] == 1]
wins = wins.groupby(['grid']).count()

fig = px.bar(wins)
fig.update_layout(title = 'No. of races won by grid position')
fig.show()