In [1]:
# importing required libraries

import pandas as pd
from pandas import read_csv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")


In [2]:
# importing the dataset
result_df = pd.read_csv("./data/results.csv")
stats_df = pd.read_csv("./data/status.csv")
drivers_df = pd.read_csv("./data/drivers.csv")
races_df = pd.read_csv("./data/races.csv")
constructor_df = pd.read_csv("./data/constructors.csv")
driver_standings_df = pd.read_csv("./data/driver_standings.csv")
pd.get_option("display.max_columns", None)


20

In [5]:
# Viewing the data
result_df.head() 

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [3]:
# merging all seperate dataframe into single dataframe as df
con1 = pd.merge(result_df, races_df, on='raceId')
con2 = pd.merge(con1, drivers_df, on='driverId')
con3 = pd.merge(con2, driver_standings_df, on='driverId')
con4 = pd.merge(con3, constructor_df, on='constructorId')
df = pd.merge(con4, stats_df, on='statusId')
pd.get_option("display.max_columns", None)

# dropping the columns that don't have any value
df.dropna

# checking the data types
df.head()


Unnamed: 0,resultId,raceId_x,driverId,constructorId,number_x,grid,position_x,positionText_x,positionOrder,points_x,...,raceId_y,points_y,position_y,positionText_y,wins,constructorRef,name_y,nationality_y,url,status
0,1,18,1,1,22,1,1,1,1,10.0,...,18,10.0,1,1,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,Finished
1,1,18,1,1,22,1,1,1,1,10.0,...,19,14.0,1,1,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,Finished
2,1,18,1,1,22,1,1,1,1,10.0,...,20,14.0,3,3,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,Finished
3,1,18,1,1,22,1,1,1,1,10.0,...,21,20.0,2,2,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,Finished
4,1,18,1,1,22,1,1,1,1,10.0,...,22,28.0,3,3,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren,Finished


In [4]:
# zero tables were dropped, there are no null values in this dataframe

# checking what types of data are in the dataframe and how much ram they take up
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3348650 entries, 0 to 3348649
Data columns (total 44 columns):
 #   Column             Dtype  
---  ------             -----  
 0   resultId           int64  
 1   raceId_x           int64  
 2   driverId           int64  
 3   constructorId      int64  
 4   number_x           object 
 5   grid               int64  
 6   position_x         object 
 7   positionText_x     object 
 8   positionOrder      int64  
 9   points_x           float64
 10  laps               int64  
 11  time_x             object 
 12  milliseconds       object 
 13  fastestLap         object 
 14  rank               object 
 15  fastestLapTime     object 
 16  fastestLapSpeed    object 
 17  statusId           int64  
 18  year               int64  
 19  round              int64  
 20  circuitId          int64  
 21  name_x             object 
 22  date               object 
 23  time_y             object 
 24  url_x              object 
 25  driverRef         

In [None]:
# viewing the columns in the dataframe
df.columns


In [None]:
# dropping unnecessary columns
df = df.drop(['url', 'url_x', 'position_x', 'fastestLapTime', 'positionText_x', 'time_x', 'time_y', 'driverRef',
              'constructorRef', 'nationality_y', 'url_y', 'positionText_y', 'raceId_y', 'points_y'], 1)


In [None]:
# renaming the column names to less confusing names

col_name = {'number_x': 'number', 'milliseconds': 'timetaken_in_millisec', 'fastestLapSpeed': 'max_speed',
            'name_x': 'grand_prix', 'number_y': 'driver_num', 'code': 'driver_code', 'nationality_x': 'nationality', 'name_y': 'constructor',
            'raceId_x': 'racerId', 'points_x': 'points', 'position_y': 'position', 'forename': 'firstname'}

df.rename(columns=col_name, inplace=True)
df.head()


In [None]:
# combining the two columns into one column for better readability

df['driver_name'] = df['firstname']+' '+df['surname']

# dropping the now unnecessary columns
df = df.drop(['firstname', 'surname'], 1)


In [None]:
# converting to date format from string

pd.to_datetime(df.date)


In [None]:
# putting columns into seperate dataframes

df['dob'] = pd.to_datetime(df['dob'])
df['date'] = pd.to_datetime(df['date'])


In [None]:
# importing a required library to work with dates

from datetime import datetime


In [None]:
# calculating driver's age and creating it as a new column

dates = datetime.today()-df['dob']
age = dates.dt.days/365


# rounding the age to the nearest year (for better readability)

df['age'] = round(age)
pd.set_option('display.max_columns', None)
df.head()


In [None]:
# changing datatype to match the dataframe

l = ['number', 'timetaken_in_millisec',
     'fastestLap', 'rank', 'max_speed', 'driver_num']
for i in l:
    df[i] = pd.to_numeric(df[i], errors='coerce')


In [None]:
# driver_num is a value that is not consistent, so it is dropped

df.drop('driver_num', 1, inplace=True)


In [None]:
# seperating categorical (cat) and numerical (num) columns for understading

cat = []
num = []
for i in df.columns:
    if df[i].dtypes == 'O':
        cat.append(i)
    else:
        num.append(i)


In [None]:
# showing that the data processing lessens the ram usage (from 43 columns to 26 columns)

df.info()


In [None]:
# checking for null values

df.isnull().sum() / len(df) * 100


In [None]:
# filling the missing values within columns with the mean value of that column or a 0

df[['rank', 'fastestLap']] = df[['rank', 'fastestLap']].fillna(0)
df['timetaken_in_millisec'] = df['timetaken_in_millisec'].fillna(
    df['timetaken_in_millisec'].mean())
df['max_speed'] = df['max_speed'].fillna(df['max_speed'].mean())
df['number'] = df['number'].fillna(0)


In [None]:
# checking if null values are still present, if not, then the data is ready to be used (no null values present)

df.isnull().sum() / len(df) * 100


In [None]:
# removing rows where the driver does not finish the grand prix

df_fin = df[df['status'] == 'Finished']

# showing the end of the dataframe to see if the data is correctly processed

df_fin.tail(n=10)


In [None]:
# gathering the mean values of the numerical columns to variables

meanMS = df.max_speed.mean()
meanFL = df.fastestLap.mean()

# using values above meanMS in the dataframe to rule out outliers

df = df_fin[df_fin['max_speed'] > meanMS]
df.head()


In [None]:
# using values above meanFL in the dataframe to rule out outliers

df[df['fastestLap'] > meanFL]


In [None]:
# Returns unique values based on a hash table.

df.year.unique()


In [None]:
# filtering the data by mean of driver's age and events after year 2012

df = df[(df['age'] < df['age'].mean()) & (df['year'] > 2012)]
df


In [None]:
# droping unwanted columns these columns are not needed for the analysis

df.drop('date', 1, inplace=True)
df.drop('dob', 1, inplace=True)
df.drop('statusId', 1, inplace=True)


In [None]:
# Return unbiased skew of the dataframe

df.skew()

# skew is used to determine if the data is normally distributed or not


In [None]:
# as can be seen from the above output, the data is not normally distributed, the data will be transformed to normal distribution in the next step

# Q1 and Q3 are the first and third quartiles of the data

Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)

# IQR is the interquartile range so it is the difference between the Q3 and Q1

IQR = Q3 - Q1

# outliers are the data points that are more than 1.5 times IQR away from the Q1 and Q3 so we will remove them

df = df[~((df < (Q1-1.5*IQR)) | (df > (Q3+1.5*IQR))).any(axis=1)]
df.head()


In [None]:
# the data is now normally distributed and can be used for analysis

# removing junk data from the dataframe to make the figures more meaningful

num.remove('date')
num.remove('dob')
num.remove('statusId')

# creating figures to show the distribution of the data from the columns of the dataframe

plt.figure(figsize=(15, 50))
for i, j in zip(num, range(1, len(num)+1)):
    plt.subplot(11, 2, j)
    sns.kdeplot(df[i], shade=True, color='darkred')
plt.show()


In [None]:
# importing a library to do preprocessing with

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


In [None]:
# encoding categorical columns for faster processing in knn algorithm

for i in cat:
    df[i] = le.fit_transform(df[i])
df.head()


In [None]:
# creating two seperate dataframes for the training and testing data

x = df.drop('driver_name', 1)
y = df.driver_name


In [None]:
# importing a library to split the data into training and testing data

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=40)


In [None]:
df.info()
