In [None]:
# importing required libraries

import pandas as pd
from pandas import read_csv
from datetime import datetime
from dmba import regressionSummary
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import tree
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation notices;
# consider disabling it while debugging or upgrading libraries.
warnings.simplefilter("ignore")


In [None]:
# Load the raw CSV tables from ./data, one DataFrame per file.
result_df = pd.read_csv("./data/results.csv")
stats_df = pd.read_csv("./data/status.csv")
drivers_df = pd.read_csv("./data/drivers.csv")
races_df = pd.read_csv("./data/races.csv")
constructor_df = pd.read_csv("./data/constructors.csv")
driver_standings_df = pd.read_csv("./data/driver_standings.csv")
qualifying_df = pd.read_csv("./data/qualifying.csv")
# To display every column when previewing wide frames, use:
# pd.set_option("display.max_columns", None)


In [None]:
# Preview the first five rows of the raw results table.
result_df.head(n=5)

In [None]:
# Merge the separate tables into a single DataFrame `df`.
# Every step is an inner join (the pd.merge default), so a row survives only
# if it has a match in each table. Non-key columns that share a name receive
# pandas' default `_x` / `_y` suffixes.
con1 = pd.merge(result_df, races_df, on='raceId')
con2 = pd.merge(con1, drivers_df, on='driverId')
con3 = pd.merge(con2, driver_standings_df, on=['driverId', 'raceId'])
con4 = pd.merge(con3, constructor_df, on='constructorId')
con5 = pd.merge(con4, qualifying_df, on=['raceId', 'driverId'])
df = pd.merge(con5, stats_df, on=['statusId'])

# Show every column when a frame is displayed. pd.get_option would only READ
# the current setting; set_option is what actually changes it.
pd.set_option("display.max_columns", None)

# Preview the merged frame.
df.head()


In [None]:
# Inspect the merged frame: column dtypes, non-null counts and memory usage.
# NOTE(review): an earlier note stated that the frame contains no null values;
# confirm that against the non-null counts printed below.
df.info()


In [None]:
# List the column labels of the merged frame (including the `_x` / `_y`
# suffixed names created by the merges).
df.columns


In [None]:
# Remove the columns that are not used further on in the notebook.
unused_columns = [
    'url', 'url_x', 'fastestLapTime', 'positionText_x', 'time_x', 'time_y',
    'driverRef', 'constructorRef', 'nationality_y', 'url_y', 'positionText_y',
    'points_y', 'rank', 'number_y', 'milliseconds', 'fastestLapSpeed',
    'number_x', 'code', 'fastestLap', 'driverStandingsId', 'q1', 'q2', 'q3',
    'status', 'constructorId_x', 'constructorId_y', 'points_x', 'qualifyId',
    'wins', 'resultId', 'positionOrder', 'position_y', 'grid', 'statusId',
]
df = df.drop(columns=unused_columns)


In [None]:
# Rename the suffixed merge columns to clearer names.
# NOTE: 'points_x' is dropped earlier in the notebook and 'raceId_x' is never
# created (raceId is always used as a join key), so those two entries match
# nothing; DataFrame.rename silently ignores keys that are not present.
col_name = {'name_x': 'grand_prix', 'nationality_x': 'nationality', 'name_y': 'constructor',
            'raceId_x': 'racerId', 'points_x': 'points', 'forename': 'firstname', 'position_x': 'finnishPosition'}

# Apply the renames, then drop every ROW that has a missing value in any column.
df = df.rename(columns=col_name).dropna()

# Preview last so the cell actually displays the result (an expression in the
# middle of a cell is evaluated but never shown).
df.head()


In [None]:
# Build a single 'driver_name' column from the first name and surname.
# pop() removes each source column from df while returning its values,
# so no separate drop of 'firstname' / 'surname' is needed afterwards.
first_names = df.pop('firstname')
surnames = df.pop('surname')
df['driver_name'] = first_names + ' ' + surnames


In [None]:
# Preview the race dates parsed as datetimes.
# The result is only displayed here; it is not written back to df.
pd.to_datetime(df['date'])


In [None]:
# Convert the date-of-birth and race-date columns from strings to datetimes,
# storing the parsed values back into df.
for date_column in ('dob', 'date'):
    df[date_column] = pd.to_datetime(df[date_column])


In [None]:
# Driver age on race day, stored as a new 'age' column.
# 365.25 is the average year length including leap days; dividing by 365
# would overstate the age by roughly one day for every four years lived.
dates = df['date'] - df['dob']
age = dates.dt.days / 365.25

# Round to the nearest whole year for readability.
df['age'] = age.round()

# Show every column when previewing the frame.
pd.set_option('display.max_columns', None)
df.head()


In [None]:
# Re-inspect dtypes, non-null counts and memory usage after the column clean-up.
# NOTE(review): the original note quoted a reduction from 43 to 26 columns —
# confirm the counts against the output below.
df.info()


In [None]:
# Percentage of missing values per column.
df.isnull().sum().div(len(df)).mul(100)


# since there are no null values in this dataframe, no null handling is needed; the code below is kept from an earlier build for reference

# filling the missing values within columns with the mean value of that column or a 0 if it is a category column

# df[['rank', 'fastestLap']] = df[['rank', 'fastestLap']].fillna(0)
# df['timetaken_in_millisec'] = df['timetaken_in_millisec'].fillna(
#     df['timetaken_in_millisec'].mean())
# df['max_speed'] = df['max_speed'].fillna(df['max_speed'].mean())
# df['number'] = df['number'].fillna(0)

# # checking if null values are still present, if not, then the data is ready to be used (no null values present)

# df.isnull().sum() / len(df) * 100


In [None]:
# Create the LabelEncoder used further down to turn categorical (string)
# columns into integer codes that the model can consume.

le = LabelEncoder()




In [None]:
# # encoding the categorical columns

# le_grand_prix = LabelEncoder()
# le_nationality = LabelEncoder()
# le_constructor = LabelEncoder()
# le_status = LabelEncoder()
# le_driver_name = LabelEncoder()


In [None]:
# # labeling the categorical values

# df['grand_prix_n'] = le_grand_prix.fit_transform(df['grand_prix'])
# df['nationality_n'] = le_nationality.fit_transform(df['nationality'])
# df['constructor_n'] = le_constructor.fit_transform(df['constructor'])
# df['status_n'] = le_status.fit_transform(df['status'])
# df['driver_name_n'] = le_driver_name.fit_transform(df['driver_name'])


In [None]:
# dropping the categorical columns which contain strings (and other unused columns)

# df = df.drop(['grand_prix', 'nationality', 'constructor',
#              'status', 'driver_name', 'dob', 'date', 'points'], 1)


In [None]:
# Keep only rows that are not from seasons before 2010, because a new points
# system was introduced that year.  (consider 2016 as the cut-off as well)
before_2010 = df['year'] < 2010
df = df.loc[~before_2010].copy()

In [None]:
# Separate the column names into categorical (object dtype) and
# everything else, for easier handling below.
cat = [column for column in df.columns if df[column].dtypes == 'O']
num = [column for column in df.columns if not (df[column].dtypes == 'O')]


In [None]:
# Integer-encode every categorical (object) column.
# The single encoder `le` is re-fitted for each column, so after the loop it
# only remembers the mapping of the last column processed.
# NOTE(review): LabelEncoder numbers classes in sorted order; if a column that
# is meant to be numeric (e.g. the target) is stored as strings, its codes will
# follow string order rather than numeric order — confirm the dtypes in `cat`.
for column in cat:
    df[column] = le.fit_transform(df[column])

# Remove the raw datetime columns (not numeric, and 'age' already captures
# them), then drop any rows that still contain missing values.
# `columns=` is used because the positional axis argument (`df.drop(labels, 1)`)
# was deprecated in pandas 1.3 and removed in pandas 2.0, where it raises TypeError.
df = df.drop(columns=['date', 'dob']).dropna()

# Preview last so the cell actually displays the encoded frame.
df.head()


In [None]:
# Persist the processed frame. With to_csv's defaults the DataFrame index is
# written as the first, unnamed CSV column.
# NOTE(review): the ./newdata directory must already exist; to_csv does not create it.
df.to_csv(r'./newdata/collectionfile.csv')

In [None]:
# Renumber the rows 0..n-1 after the filtering steps.
# drop=True discards the old index instead of inserting it as an 'index'
# column. Without it that column would become a model feature (every column
# except 'finnishPosition' is used as a feature), and re-running this cell
# would keep adding 'level_0'-style columns until it errors.
df = df.reset_index(drop=True)

In [None]:
# Define the regression tree: depth capped at 5 levels, with a fixed
# random_state so repeated runs build the same tree.

model = tree.DecisionTreeRegressor(max_depth=5, random_state=1234)


In [None]:
# Split the frame into the feature matrix and the target.

# Features: every column except the target.
X = df.drop(columns=['finnishPosition'])

# Target: kept as a one-column DataFrame.
y = df[['finnishPosition']]


In [None]:
# Hold out 33% of the rows as a test split so the model can be evaluated on
# data it was not trained on; random_state fixes the shuffle for reproducibility.

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, random_state=42)


In [None]:
# Fit the regression tree on the training split.
model.fit(Xtrain, ytrain)

In [None]:
# Coefficient of determination (R²) of `model` on the training split.
model.score(Xtrain, ytrain)


In [None]:
# Deliberately NOT refitting here. Calling model.fit(Xtest, ytest) would train
# the tree on the held-out rows, so any score or prediction computed on Xtest
# afterwards would be in-sample and say nothing about generalisation.
# Keep the model fitted on the training split and just preview what it
# predicts for the first held-out rows.
model.predict(Xtest)[:10]

In [None]:
# Coefficient of determination (R²) of `model` on the Xtest / ytest split.
model.score(Xtest, ytest)


In [None]:
# Predictions of `model` for the Xtest rows, used by the comparison plot below.
y_pred_tree = model.predict(Xtest)


In [None]:
# Actual vs. predicted values for the first 50 rows of the test split.
preview_len = len(ytest[0:50])
x_positions = list(range(preview_len))

plt.plot(x_positions, ytest.values[0:50], label="Actual Data", linestyle=':')
plt.plot(x_positions, y_pred_tree[0:50], label="Predicted Data")
plt.ylabel("FinnishPosition")
plt.xlabel("Races")
plt.legend(loc='best')
plt.show()


In [None]:
# Feature names for the tree plot, taken from X — the matrix the tree was
# fitted on — so names and feature positions stay in sync even if df is
# modified after X was built.
fn = list(X.columns)

In [None]:
# Draw the fitted tree on a dedicated figure; `fig` is kept for saving later.
# NOTE(review): 150 x 150 inches is a very large canvas and is slow and
# memory-hungry to render — confirm this size is really needed.
fig, ax = plt.subplots(figsize=(150, 150))
_ = tree.plot_tree(model, feature_names=fn, filled=True, ax=ax)

In [None]:
# Write the tree figure to decision_tree.png in the current working directory.
fig.savefig("decision_tree.png")

In [None]:
# # removing rows where the driver does not finish the grand prix

# df_fin = df[df['status'] == 'Finished']

# # showing the end of the dataframe to see if the data is correctly processed

# df_fin.tail(n=10)


In [None]:
# # gathering the mean values of the numerical columns to variables

# meanMS = df.max_speed.mean()
# meanFL = df.fastestLap.mean()

# # using values above meanMS in the dataframe to rule out outliers

# df = df_fin[df_fin['max_speed'] > meanMS]
# df.head()


In [None]:
# # using values above meanFL in the dataframe to rule out outliers

# df[df['fastestLap'] > meanFL]


In [None]:
# # Returns unique values based on a hash table.

# df.year.unique()


In [None]:
# # filtering the data by mean of driver's age and events after year 2012

# df = df[(df['age'] < df['age'].mean()) & (df['year'] > 2012)]
# df


In [None]:
# # droping unwanted columns these columns are not needed for the analysis

# df.drop('date', 1, inplace=True)
# df.drop('dob', 1, inplace=True)
# df.drop('statusId', 1, inplace=True)


In [None]:
# # Return unbiased skew of the dataframe

# df.skew()

# # skew is used to determine if the data is normally distributed or not


In [None]:
# # as the skew values above show, the data is not normally distributed; the next step removes outliers with the IQR rule to reduce that skew

# # Q1 and Q3 are the first and third quartiles of the data

# Q1 = df.quantile(0.25)
# Q3 = df.quantile(0.75)

# # IQR is the interquartile range so it is the difference between the Q3 and Q1

# IQR = Q3 - Q1

# # outliers are the data points that are more than 1.5 times IQR away from the Q1 and Q3 so we will remove them

# df = df[~((df < (Q1-1.5*IQR)) | (df > (Q3+1.5*IQR))).any(axis=1)]
# df.head()


In [None]:
# # with the outliers removed the data is less skewed (not necessarily normally distributed) and can be used for analysis

# # removing junk data from the dataframe to make the figures more meaningful

# num.remove('date')
# num.remove('dob')
# num.remove('statusId')

# # creating figures to show the distribution of the data from the columns of the dataframe

# plt.figure(figsize=(15, 50))
# for i, j in zip(num, range(1, len(num)+1)):
#     plt.subplot(11, 2, j)
#     sns.kdeplot(df[i], shade=True, color='darkred')
# plt.show()


In [None]:
# # importing a library to do preprocessing with

# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()


In [None]:
# # encoding categorical columns for faster processing in knn algorithm

# for i in cat:
#     df[i] = le.fit_transform(df[i])
# df.head()


In [None]:
# # creating two seperate dataframes for the training and testing data

# x = df.drop('driver_name', 1)
# y = df.driver_name


In [None]:
# # importing a library to split the data into training and testing data

# from sklearn.model_selection import train_test_split
# xtrain, xtest, ytrain, ytest = train_test_split(
#     x, y, test_size=0.3, random_state=40)


In [None]:
# df.info()
