In [None]:
# importing required libraries

import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import datasets
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.simplefilter("ignore")


In [None]:
# Load each raw CSV table from ./data into its own DataFrame.
# Paths are relative to the notebook's working directory.
result_df = pd.read_csv("./data/results.csv")
stats_df = pd.read_csv("./data/status.csv")
drivers_df = pd.read_csv("./data/drivers.csv")
races_df = pd.read_csv("./data/races.csv")
constructor_df = pd.read_csv("./data/constructors.csv")
driver_standings_df = pd.read_csv("./data/driver_standings.csv")
qualifying_df = pd.read_csv("./data/qualifying.csv")


In [None]:
# Preview the first rows of the results table.
result_df.head()


In [None]:
# Merge the separate tables step by step into one frame, df.
# pd.merge defaults to an inner join, so rows without a match in the
# right-hand table are dropped at each step. Columns that exist on both
# sides (and are not join keys) get the _x / _y suffixes seen later.
con1 = pd.merge(result_df, races_df, on='raceId')
con2 = pd.merge(con1, drivers_df, on='driverId')
con3 = pd.merge(con2, driver_standings_df, on=['driverId', 'raceId'])
con4 = pd.merge(con3, constructor_df, on='constructorId')
con5 = pd.merge(con4, qualifying_df, on=['raceId', 'driverId'])
df = pd.merge(con5, stats_df, on=['statusId'])

# Show every column when a frame is displayed (the merged frame is wide).
# This must be set_option: get_option only reads the current value and
# never changed the display setting.
pd.set_option("display.max_columns", None)

# Preview the first rows of the merged frame.
df.head()


In [None]:
# df.info() lists every column with its dtype and non-null count and
# reports the frame's memory usage, so missing values would show up here
# as a non-null count below the total row count.
# NOTE(review): the original note stated that nothing was dropped and that
# the merged frame has no null values — confirm against the output.
df.info()


In [None]:
# List the column names of the merged frame (including the _x / _y
# suffixed ones) before deciding which to drop.
df.columns


In [None]:
# Remove the columns that are not used in the rest of the notebook.
unused_columns = [
    'url', 'url_x', 'fastestLapTime', 'positionText_x', 'time_x', 'time_y',
    'driverRef', 'constructorRef', 'nationality_y', 'url_y', 'positionText_y',
    'points_y', 'rank', 'number_y', 'milliseconds', 'fastestLapSpeed',
    'number_x', 'code', 'fastestLap', 'driverStandingsId', 'q1', 'q2', 'q3',
    'status', 'constructorId_x', 'constructorId_y', 'points_x', 'qualifyId',
    'wins', 'resultId', 'positionOrder', 'position_y', 'grid', 'statusId',
]
df = df.drop(columns=unused_columns)


In [None]:
# Rename the suffixed merge columns to clearer names.
# Two entries of the original mapping were removed because they could never
# match a column: 'points_x' is already dropped in the previous cell, and
# 'raceId_x' is never created since raceId is always used as a join key.
# 'finnishPosition' (sic) is kept as-is because later cells reference the
# column by exactly this name.
col_name = {
    'name_x': 'grand_prix',
    'nationality_x': 'nationality',
    'name_y': 'constructor',
    'forename': 'firstname',
    'position_x': 'finnishPosition',
}
df = df.rename(columns=col_name)

# Drop every ROW that contains at least one missing value
# (dropna() with default arguments works row-wise, not column-wise).
df = df.dropna()


In [None]:
# Build a single 'driver_name' column ("<firstname> <surname>").
# pop() removes the two source columns from df while returning them.
first_names = df.pop('firstname')
last_names = df.pop('surname')
df['driver_name'] = first_names + ' ' + last_names


In [None]:
# Preview the frame after adding 'driver_name'.
df.head()


In [None]:
# Preview the 'date' column parsed as datetimes.
# The parsed result is only displayed here, not assigned, so df itself is
# unchanged by this cell.

pd.to_datetime(df.date)


In [None]:
# Convert the date-of-birth and race-date columns to datetimes in place
# so they can be subtracted from each other.
for date_col in ('dob', 'date'):
    df[date_col] = pd.to_datetime(df[date_col])


In [None]:
# Driver age at race time: race date minus date of birth.
# pop() removes 'date' and 'dob' from df, so only the derived age remains.
dates = df.pop('date') - df.pop('dob')

# Convert the day count to years using 365-day years (leap days ignored).
age = dates.dt.days / 365

# Store the age rounded to whole years.
df['age'] = age.round()

# Show every column when displaying frames, then preview the result.
pd.set_option('display.max_columns', None)
df.head()


In [None]:
# Drop rows that contain any missing value (for example an age that could
# not be computed). 'date' and 'dob' themselves were already removed by
# pop() when the age was calculated.
df = df.dropna()


In [None]:
# Re-inspect dtypes, non-null counts and memory usage after the clean-up to
# compare with the earlier df.info() output.
# NOTE(review): the original note reported a reduction from 43 to 26
# columns — confirm against the output.

df.info()


In [None]:
# Percentage of missing values per column
# (the mean of a boolean mask is the fraction of True values).
# NOTE(review): the original note stated that no nulls remain at this
# point, so no further null handling is done — confirm against the output.
df.isna().mean() * 100


In [None]:
# Create the LabelEncoder instance that is used further down to turn the
# categorical (object-typed) columns into integer codes the model can use.

le = LabelEncoder()


In [None]:
# Keep only seasons from 2010 onwards. Original author's rationale: a new
# points system was introduced then, so earlier results are not comparable.
pre_2010_rows = df.index[df.year < 2010]
df = df.drop(index=pre_2010_rows)

In [None]:
# Split the column names into categorical (object dtype) and everything
# else, so the categorical ones can be encoded separately.
cat = [column for column in df.columns if df[column].dtypes == 'O']
num = [column for column in df.columns if df[column].dtypes != 'O']


In [None]:
# Replace every categorical column with integer codes.
# The same encoder instance is refit for each column, so after the loop
# `le` only holds the mapping of the last column encoded.
# NOTE(review): every object-typed column is encoded, which would include
# the target 'finnishPosition' if it is stored as strings — confirm its dtype.
for column in cat:
    encoded = le.fit_transform(df[column])
    df[column] = encoded
df.head()


In [None]:
# Persist the processed, encoded frame (the row index is written as the
# first, unnamed column). The output directory is created first so the
# notebook also runs on a fresh checkout: DataFrame.to_csv does not create
# missing parent directories.
os.makedirs('./newdata', exist_ok=True)
df.to_csv(r'./newdata/collectionfile.csv')

In [None]:
# Renumber the rows 0..n-1 after the earlier row drops.
# drop=True discards the old index instead of inserting it as an 'index'
# column, which would otherwise end up in X as a meaningless feature. It
# also makes the cell safe to re-run (no extra 'level_0' column on repeat).
df = df.reset_index(drop=True)

In [None]:
# Define the model: a decision-tree regressor limited to depth 5.
# random_state is fixed so repeated fits give the same tree.

model = tree.DecisionTreeRegressor(max_depth=5, random_state=1234)


In [None]:
# Separate the features (X) from the target (y).
# X is every column except 'finnishPosition'; y is a one-column frame
# holding only 'finnishPosition'.
X = df.drop(columns=['finnishPosition'])
y = df[['finnishPosition']]


In [None]:
# Hold out 33% of the rows as a test set so the model can be evaluated on
# data it was not fitted on. random_state fixes the shuffle so the split is
# reproducible.

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, random_state=42)


In [None]:
# Fit the decision tree on the training split.
model.fit(Xtrain, ytrain)

In [None]:
# Score on the training split (for a regressor, score() returns R^2).
# This is an in-sample figure and is expected to be optimistic.
model.score(Xtrain, ytrain)


In [None]:
# Evaluate on the held-out test split using the model fitted on the
# training split (for a regressor, score() returns R^2).
# The model is deliberately NOT refit here: fitting on (Xtest, ytest) and
# then scoring on the same rows would report an in-sample score instead of
# how well the model generalises to unseen data.
model.score(Xtest, ytest)


In [None]:
# Predict the target for every row of the test features.
y_pred_tree = model.predict(Xtest)


In [None]:
# Compare actual and predicted values for the first 50 rows of the test set.
n_shown = 50
sample_positions = list(range(len(ytest[:n_shown])))

plt.plot(sample_positions, ytest.values[:n_shown], label="Actual Data", linestyle=':')
plt.plot(sample_positions, y_pred_tree[:n_shown], label="Predicted Data")
plt.xlabel("Drivers")
plt.ylabel("FinnishPosition")
plt.legend(loc='best')
plt.show()


In [None]:
# Feature names for the tree plot: every column of df except the target.
fn = [column for column in df.columns if column != 'finnishPosition']

In [None]:
# Draw the fitted tree on a very large canvas (150 x 150 inches) so the
# node labels stay readable; nodes are colour-filled and labelled with the
# feature names.
fig, ax = plt.subplots(figsize=(150, 150))
_ = tree.plot_tree(model, feature_names=fn, filled=True, ax=ax)

In [None]:
# Write the rendered tree figure to decision_tree.png in the working directory.
fig.savefig("decision_tree.png")