In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from category_encoders.wrapper import PolynomialWrapper

#### Reading dataset.csv

In [None]:
# Load the raw dataset, rename `country` so it clearly refers to the circuit,
# and discard the unnamed first column (presumably a saved row index).
df = (
    pd.read_csv('./data/dataset.csv', low_memory=False)
    .rename(columns={'Unnamed: 0': 'index', 'country': 'circuit_country'})
    .drop(columns=['index'])
)
df.columns

#### Dropping the columns

In [None]:
# Columns removed before modelling. Each name appears exactly once
# ('driverRef' was previously listed twice).
cols_to_be_dropped = [
    'round', 'name', 'time', 'circuit_id', 'car_number', 'resultId',
    'driverRef', 'results_points', 'laps',
    'fastestLap', 'fastestLapTime', 'fastestLapSpeed',
    'first_5_avg_time', 'last_10_avg_time',
    'lap_position_after_5', 'lap_position_before_last_10',
    'circuit_location',
    'constructorStandingsId', 'constructorRef', 'constructor_points',
    'driver_number', 'driver_code', 'forename',
    'driverStandingsId', 'driver_points',
    'number', 'status',
    'qualifying_position', 'q1_time', 'q2_time', 'q3_time',
    'ms_avg', 'no_of_stops', 'driver_position', 'qualifyId',
]
# drop() returns a new frame, so the raw `df` stays untouched.
df_after_dropping = df.drop(columns=cols_to_be_dropped)

#### Getting age from date & dob

In [None]:
from datetime import datetime
from dateutil import relativedelta

# Driver age in completed years on race day, computed from the race date and
# the driver's date of birth (both parsed as "YYYY-MM-DD" strings).
DATE_FORMAT = "%Y-%m-%d"

race_date = df_after_dropping['date']
driver_dob = df_after_dropping['dob']

# Walk the two columns pairwise by position. Looking rows up with
# `race_date[i]` is a label lookup and only works while the frame still has
# its default 0..n-1 index.
ages = [
    relativedelta.relativedelta(
        datetime.strptime(race_day, DATE_FORMAT),
        datetime.strptime(birth_day, DATE_FORMAT),
    ).years
    for race_day, birth_day in zip(race_date, driver_dob)
]

df_after_dropping['driver_age'] = ages
# The raw date columns are no longer needed once the age is derived.
df_after_dropping = df_after_dropping.drop(columns=['date', 'dob'])
# Preview only the first rows instead of rendering the whole frame.
df_after_dropping.head()

#### Changing status IDs to 0s and 1s

In [None]:
# Collapse statusId into a 0/1 flag: every id greater than 1 becomes 0.
# (Presumably statusId == 1 means the driver finished; verify against the
# status lookup table.)
# A single .loc assignment replaces the chained `df[col][mask] = ...` form,
# which raises SettingWithCopyWarning and has no effect under pandas
# Copy-on-Write.
df_after_dropping.loc[df_after_dropping['statusId'] > 1, 'statusId'] = 0

#### Replace NaN in `constructor_position` with the maximum value (21) and NaN in `constructor_wins` with 0

In [None]:
# A missing constructor standing is replaced by 21, the value this notebook
# treats as the maximum position.
df_after_dropping['constructor_position'] = df_after_dropping['constructor_position'].fillna(21)
# A missing constructor win count means no wins. The fill must read from
# `constructor_wins` itself; reading from `driver_wins` overwrote the column
# with the driver's win count.
df_after_dropping['constructor_wins'] = df_after_dropping['constructor_wins'].fillna(0)

## Analysis Graphs

In [None]:
# Drivers by nationality
# Count rows per nationality; each row is one result entry, so this counts
# entries rather than distinct drivers.
nationality_driver = (
    df_after_dropping.groupby(['driver_nationality'])['driver_nationality']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='number_of_drivers')
)
# Fold nationalities with fewer than 200 rows into one 'Other' slice.
# .loc is used because chained indexing (`df[col][mask] = ...`) warns and is a
# no-op under pandas Copy-on-Write.
nationality_driver.loc[nationality_driver['number_of_drivers'] < 200, 'driver_nationality'] = 'Other'
# Re-aggregate so that all 'Other' rows collapse into a single total.
nationality_driver = nationality_driver.groupby(['driver_nationality']).sum().reset_index()

fig = go.Figure(data=go.Pie(labels=nationality_driver['driver_nationality'], values=nationality_driver['number_of_drivers']))
fig.update_layout(title='Drivers by Nationality')
fig.show()


In [None]:
# Champions by nationality
# Keep only rows whose finishing order is 1, then count them per nationality.
is_race_winner = df_after_dropping['results_positionOrder'] == 1
champions = (
    df_after_dropping[is_race_winner]
    .groupby(['driver_nationality'])['driver_nationality']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='number_of_drivers')
)

champions_pie = go.Pie(labels=champions['driver_nationality'], values=champions['number_of_drivers'])
fig = go.Figure(data=champions_pie)
fig.update_layout(title='Champions by Driver Nationality')
fig.show()

In [None]:
# Completion rate by circuit
# statusId is a 0/1 flag at this point, so its per-circuit mean is the share
# of rows with statusId == 1.
circuit_status = df_after_dropping.loc[:, ['circuitId', 'statusId']]
status_on_circuits = circuit_status.groupby(['circuitId']).mean()

fig = px.bar(status_on_circuits, y='statusId')
fig.update_layout(title='Completion rate by circuit')
fig.show()

In [None]:
# Number of columns remaining in the working frame.
len(df_after_dropping.columns)

In [None]:
# Races won by driver age.
# Select only the columns the chart needs in one .loc call. Selecting
# `driverId` just to drop it in place from a filtered slice triggered
# SettingWithCopyWarning.
winners = df_after_dropping.loc[
    df_after_dropping['results_positionOrder'] == 1,
    ['results_positionOrder', 'driver_age'],
].copy()
# Every remaining row has results_positionOrder == 1, so the sum per age is
# the number of wins at that age.
winners_by_age = winners.groupby(['driver_age']).sum()
fig = px.bar(winners_by_age)
fig.update_layout(title='No. of races won by age')
fig.show()

In [None]:
# Races won per starting grid slot: keep the winning rows, then count them
# for each grid value.
won_race = df_after_dropping['results_positionOrder'] == 1
wins = df_after_dropping.loc[won_race, ['grid', 'results_positionOrder']]
wins = wins.groupby(['grid']).count()

fig = px.bar(wins)
fig.update_layout(title='No. of races won by grid position')
fig.show()

In [None]:
# List the columns that remain after the drops and feature engineering above.
df_after_dropping.columns

In [None]:
# Handle on the race-time column, inspected in the next two cells.
temp = df_after_dropping['milliseconds']

In [None]:
# Count real NaN values; the '\N' placeholder strings handled in later cells
# are not NaN and are not counted here.
temp.isnull().sum()

In [None]:
# Frequency of each distinct value in the race-time column.
temp.value_counts()

In [None]:
# Index of rows with statusId == 1 but no recorded race time ('\N').
# Both conditions are combined into one mask with `&`; applying a second
# full-length mask to an already filtered frame makes pandas reindex the mask
# and emit a UserWarning.
missing_time_but_finished = (
    (df_after_dropping['milliseconds'] == '\\N')
    & (df_after_dropping['statusId'] == 1)
)
df_after_dropping[missing_time_but_finished].index

In [None]:
# Remove rows with statusId == 1 but a missing ('\N') race time.
# The two conditions are combined with `&` rather than chaining boolean
# filters, which relied on pandas reindexing the second mask and warned.
inconsistent_rows = df_after_dropping.index[
    (df_after_dropping['milliseconds'] == '\\N')
    & (df_after_dropping['statusId'] == 1)
]
df_after_dropping.drop(index=inconsistent_rows, inplace=True)

In [None]:
# Missing weather flags and driver win counts are filled with 0.
zero_filled_columns = [
    'weather_warm',
    'weather_cold',
    'weather_dry',
    'weather_wet',
    'weather_cloudy',
    'driver_wins',
]
for column in zero_filled_columns:
    df_after_dropping[column] = df_after_dropping[column].fillna(0)
# Remaining null count per column.
df_after_dropping.isnull().sum()

In [None]:
# '\N' marks a missing race time; swap it for 0 as a temporary numeric
# placeholder so the column can be cast to integers.
race_time_raw = df_after_dropping['milliseconds']
df_after_dropping['milliseconds'] = race_time_raw.replace({'\\N': 0})

In [None]:
# Cast the race time column to 64-bit integers.
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].astype(np.int64)

In [None]:
# Replace the 0 placeholder with a penalty time of twice the largest race
# time in the column.
longest_race_ms = df_after_dropping['milliseconds'].max()
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].replace(0, longest_race_ms * 2)

In [None]:
# Summary statistics of the race-time column after the placeholder replacement.
df_after_dropping['milliseconds'].describe()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df_after_dropping.info()

In [None]:
# One histogram per column of the cleaned frame, each in its own figure.
for column in df_after_dropping.columns:
    column_values = df_after_dropping[column]
    sns.histplot(column_values)
    plt.show()

In [None]:
# Box plot of every numeric column, titled with the column name.
df_num = df_after_dropping.select_dtypes(include=[np.number])
for column_name, column_values in df_num.items():
    sns.boxplot(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Cap driver_age at its 97th percentile: any age above the threshold is
# replaced by the threshold itself.
upper_limit = df_after_dropping["driver_age"].quantile(0.97)
current_age = df_after_dropping['driver_age']
exceeds_cap = current_age > upper_limit
df_after_dropping['driver_age'] = np.where(exceeds_cap, upper_limit, current_age)

In [None]:
dictDriverCountry={'Argentine':1,'Australian':2,'Austrian':3,'Belgian':6,'Brazilian':7,'Canadian':8,'Chinese':9,
'French':10,'German':11,'Hungarian':12,'Indian':13,'Italian':14,'Japanese':15,'Malaysian':17,
'Mexican':18,'Monegasque':19,'Portuguese':22,'Russian':24,'South African':27,'Spanish':28,
'Swedish':29,'Swiss':30,'British':33,'American':34,'American-Italian':14,'Argentine-Italian':1,
'Chilean':35,'Colombian':36,'Czech':37,'Danish':38,'Dutch':39,'East German':11,'Finnish':40,'Indonesian':41,'Irish':42,
'Liechtensteiner':43,'New Zealander':44,'Polish':45,'Rhodesian':46,'Thai':47,'Uruguayan':48,'Venezuelan':49
}

In [None]:
# Integer code for each circuit country: 1..34 in alphabetical order of the
# country names as spelled here.
dictCircuitCountry = {
    'Argentina': 1, 'Australia': 2, 'Austria': 3, 'Azerbaijan': 4,
    'Bahrain': 5, 'Belgium': 6, 'Brazil': 7,
    'Canada': 8, 'China': 9,
    'France': 10, 'Germany': 11,
    'Hungary': 12, 'India': 13, 'Italy': 14,
    'Japan': 15, 'Korea': 16,
    'Malaysia': 17, 'Mexico': 18, 'Monaco': 19, 'Morocco': 20,
    'Netherlands': 21, 'Portugal': 22, 'Qatar': 23, 'Russia': 24,
    'Saudi Arabia': 25, 'Singapore': 26, 'South Africa': 27,
    'Spain': 28, 'Sweden': 29, 'Switzerland': 30,
    'Turkey': 31, 'UAE': 32, 'UK': 33, 'USA': 34,
}

In [None]:
# Replace each circuit country name with its integer code from dictCircuitCountry.
# NOTE(review): names absent from the dictionary would map to NaN — confirm coverage.
df_after_dropping['circuit_country'] = df_after_dropping['circuit_country'].map(dictCircuitCountry)

In [None]:
# Replace each driver nationality with its integer code from dictDriverCountry.
# NOTE(review): nationalities absent from the dictionary would map to NaN — confirm coverage.
df_after_dropping['driver_nationality'] = df_after_dropping['driver_nationality'].map(dictDriverCountry)

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(20,10))
# Restrict to numeric columns explicitly: since pandas 2.0 DataFrame.corr()
# raises on non-numeric columns instead of silently skipping them, and the
# frame still holds un-encoded categorical columns at this point.
numeric_features = df_after_dropping.select_dtypes(include=[np.number])
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')

In [None]:
import random

# Hold out whole seasons as the test set: three years from 2010-2020 and three
# from 1950-1989. A dedicated, seeded generator makes the split reproducible
# across kernel restarts, and sample() guarantees the three years in each
# range are distinct (repeated draws used to duplicate test rows).
SPLIT_SEED = 42
year_rng = random.Random(SPLIT_SEED)
testing_years = (
    year_rng.sample(range(2010, 2021), 3)
    + year_rng.sample(range(1950, 1990), 3)
)

# Split with one boolean mask, so every row lands in exactly one of the two
# sets. The earlier concat + drop_duplicates(keep=False) approach also removed
# training rows that merely happened to be exact duplicates of each other.
is_test_row = df_after_dropping['year'].isin(testing_years)
df_test = df_after_dropping[is_test_row].copy()
df_train = df_after_dropping[~is_test_row].copy()

# Separate the target (finishing order) from the features.
target_column = 'results_positionOrder'
x_train, y_train = df_train.drop(columns=[target_column]), df_train[target_column]
x_test, y_test = df_test.drop(columns=[target_column]), df_test[target_column]

In [None]:
# Scaler instances shared by the following cells; each cell re-fits the scaler
# on one training column and applies the fitted transform to the test column.
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

In [None]:
# Scale the numeric features one column at a time. For each column the scaler
# is fitted on the training data only and the fitted transform is then applied
# to the test data.
column_scalers = [
    ('milliseconds', min_max_scaler),
    ('driver_age', min_max_scaler),
    ('driver_wins', standard_scaler),
    ('constructor_wins', standard_scaler),
]
for feature_name, feature_scaler in column_scalers:
    x_train[feature_name] = feature_scaler.fit_transform(x_train[[feature_name]])
    x_test[feature_name] = feature_scaler.transform(x_test[[feature_name]])

In [None]:
# Target-encode the identifier-like categorical columns. The encoder is fitted
# on the training features and target only, then applied to the test features.
target_encoded_columns = ['constructor_nationality', 'driverId', 'constructorId', 'circuitId']
encoder = TargetEncoder(
    cols=target_encoded_columns,
    smoothing=10,
    min_samples_leaf=20,
)
x_train = encoder.fit_transform(x_train, y_train)
x_test = encoder.transform(x_test)

In [None]:
# Inspect the training features after target encoding.
x_train

In [None]:
# Standardise the target-encoded columns: fit on the training column, then
# apply the fitted transform to the matching test column.
for encoded_column in ('constructor_nationality', 'driverId', 'constructorId', 'circuitId'):
    x_train[encoded_column] = standard_scaler.fit_transform(x_train[[encoded_column]])
    x_test[encoded_column] = standard_scaler.transform(x_test[[encoded_column]])

In [None]:
# Persist each split as a CSV with the target appended as the last column.
splits_to_save = (
    (x_train, y_train, './data/train.csv'),
    (x_test, y_test, './data/test.csv'),
)
for split_features, split_target, csv_path in splits_to_save:
    pd.concat([split_features, split_target], axis=1).to_csv(csv_path, index=False)

#### Final EDA

In [None]:
# Reload the training split written to disk earlier in this notebook.
data = pd.read_csv('./data/train.csv')

In [None]:
# One histogram per column of the processed training data.
for column in data.columns:
    column_values = data[column]
    sns.histplot(column_values)
    plt.show()

In [None]:
# Box plot of every numeric column of the processed training data.
df_num = data.select_dtypes(include=[np.number])
for column_name, column_values in df_num.items():
    sns.boxplot(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Correlation heatmap of the processed training data.
plt.figure(figsize=(20,20))
# Select numeric columns explicitly: since pandas 2.0 DataFrame.corr() raises
# if any non-numeric column is present instead of silently skipping it.
numeric_data = data.select_dtypes(include=[np.number])
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')

In [None]:
# Distribution of the target: number of training rows per finishing position.
target_column = 'results_positionOrder'
sns.countplot(data=data, x=target_column)