In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from category_encoders.wrapper import PolynomialWrapper
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('../data/dataset.csv', low_memory=False)
df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
df.rename(columns = {'country': 'circuit_country'}, inplace = True)
df.drop(columns=['index'], inplace=True)
df.columns

Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time',
       'circuit_id', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'resultId', 'driverId',
       'constructorId', 'car_number', 'grid', 'results_positionOrder',
       'results_points', 'laps', 'milliseconds', 'fastestLap',
       'fastestLapTime', 'fastestLapSpeed', 'statusId', 'status',
       'circuit_location', 'circuit_country', 'constructorStandingsId',
       'constructor_points', 'constructor_position', 'constructor_wins',
       'constructorRef', 'constructor_nationality', 'driverRef',
       'driver_number', 'driver_code', 'forename', 'dob', 'driver_nationality',
       'driverStandingsId', 'driver_points', 'driver_position', 'driver_wins',
       'qualifyId', 'number', 'qualifying_position', 'q1_time', 'q2_time',
       'q3_time', 'no_of_stops', 'ms_avg', 'first_5_avg_time',
       'last_10_avg_time', 'lap_position_after_5',
       'lap_position_before_last_10'],


In [3]:
cols_to_be_dropped = ['round', 'name', 'time', 'car_number', 'resultId', 'laps', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'first_5_avg_time', 'last_10_avg_time', 'lap_position_after_5', 'lap_position_before_last_10', 'circuit_location', 'constructorStandingsId', 'constructorRef', 'driver_number', 'driver_code', 'forename', 'driverStandingsId', 'number', 'status', 'qualifying_position', 'q1_time', 'q2_time', 'q3_time', 'ms_avg', 'no_of_stops', 'driver_position', 'qualifyId']
df_after_dropping = df.drop(columns = cols_to_be_dropped)

#### Getting age from date & dob

In [4]:
from datetime import datetime
from dateutil import relativedelta

# get two dates
race_date = df_after_dropping['date']
driver_dob = df_after_dropping['dob']

ages = []

for i in range(len(race_date)):
    start_date = datetime.strptime(race_date[i],"%Y-%m-%d")
    end_date = datetime.strptime(driver_dob[i], "%Y-%m-%d")
    delta = relativedelta.relativedelta(start_date, end_date)
    ages.append(delta.years)

df_after_dropping['driver_age'] = ages
df_after_dropping.drop(columns = ['date', 'dob'], inplace = True)
df_after_dropping

Unnamed: 0,raceId,year,circuitId,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,...,circuit_country,constructor_points,constructor_position,constructor_wins,constructor_nationality,driverRef,driver_nationality,driver_points,driver_wins,driver_age
0,1,2009,1,albert_park,1.0,0.0,0.0,0.0,0.0,18,...,Australia,18.0,1.0,1.0,British,button,British,10.0,1.0,29
1,90,2004,1,albert_park,0.0,0.0,1.0,0.0,0.0,18,...,Australia,3.0,4.0,0.0,British,button,British,3.0,0.0,24
2,108,2003,1,albert_park,1.0,0.0,1.0,1.0,0.0,18,...,Australia,0.0,6.0,0.0,British,button,British,0.0,0.0,23
3,338,2010,1,albert_park,0.0,0.0,0.0,1.0,1.0,18,...,Australia,54.0,2.0,1.0,British,button,British,31.0,1.0,30
4,841,2011,1,albert_park,0.0,0.0,0.0,0.0,0.0,18,...,Australia,26.0,2.0,0.0,British,button,British,8.0,0.0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25655,815,1953,66,bremgarten,0.0,0.0,1.0,0.0,0.0,720,...,Switzerland,,,,Italian,terra,Swiss,0.0,0.0,34
25656,817,1952,66,bremgarten,1.0,0.0,0.0,0.0,0.0,720,...,Switzerland,,,,French,terra,Swiss,0.0,0.0,33
25657,815,1953,66,bremgarten,0.0,0.0,1.0,0.0,0.0,719,...,Switzerland,,,,British,scherrer,Swiss,0.0,0.0,45
25658,832,1951,67,pedralbes,0.0,0.0,1.0,0.0,0.0,783,...,Spain,,,,French,grignard,French,0.0,0.0,46


#### Changing status IDs to 0s and 1s

In [5]:
df_after_dropping['statusId'][df_after_dropping['statusId'] > 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_after_dropping['statusId'][df_after_dropping['statusId'] > 1] = 0


#### Replace Nan in Constructor_position by max Value = 21 & Replace Nan in Constructor_wins by 0

In [6]:
df_after_dropping['constructor_position'] = df_after_dropping['constructor_position'].fillna(21)
df_after_dropping['constructor_wins'] = df_after_dropping['driver_wins'].fillna(0)

In [7]:
df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index

  df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index


Int64Index([19871, 21594], dtype='int64')

In [8]:
df_after_dropping.drop(df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index, inplace=True)

  df_after_dropping.drop(df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index, inplace=True)


In [9]:
index = df_after_dropping[df_after_dropping['results_points'] > 26].index
df_after_dropping.drop(index, inplace=True)

In [10]:
df_after_dropping['weather_warm'] = df_after_dropping['weather_warm'].fillna(0)
df_after_dropping['weather_cold'] = df_after_dropping['weather_cold'].fillna(0)
df_after_dropping['weather_dry'] = df_after_dropping['weather_dry'].fillna(0)
df_after_dropping['weather_wet'] = df_after_dropping['weather_wet'].fillna(0)
df_after_dropping['weather_cloudy'] = df_after_dropping['weather_cloudy'].fillna(0)
df_after_dropping['driver_wins'] = df_after_dropping['driver_wins'].fillna(0)
df_after_dropping.isnull().sum()

raceId                        0
year                          0
circuitId                     0
circuit_id                 1040
weather_warm                  0
weather_cold                  0
weather_dry                   0
weather_wet                   0
weather_cloudy                0
driverId                      0
constructorId                 0
grid                          0
results_positionOrder         0
results_points                0
milliseconds                  0
statusId                      0
circuit_country               0
constructor_points         1865
constructor_position          0
constructor_wins              0
constructor_nationality       0
driverRef                     0
driver_nationality            0
driver_points               469
driver_wins                   0
driver_age                    0
dtype: int64

In [11]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].replace('\\N', 0)

In [12]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].astype('int64')

In [13]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].replace(0, df_after_dropping['milliseconds'].max() * 2)

In [14]:
df_after_dropping['milliseconds'].describe()

count    2.565500e+04
mean     2.368220e+07
std      1.068698e+07
min      2.070710e+05
25%      8.109250e+06
50%      3.018108e+07
75%      3.018108e+07
max      3.018108e+07
Name: milliseconds, dtype: float64

In [15]:
upper_limit = df_after_dropping["driver_age"].quantile(0.97)
df_after_dropping['driver_age'] = np.where(df_after_dropping['driver_age'] > upper_limit, upper_limit, df_after_dropping['driver_age'])

In [16]:
df_after_dropping['results_positionOrder'] = np.where(df_after_dropping['results_positionOrder'] > 20, 20, df_after_dropping['results_positionOrder'])

In [17]:
dictDriverCountry={'Argentine':1,'Australian':2,'Austrian':3,'Belgian':6,'Brazilian':7,'Canadian':8,'Chinese':9,
'French':10,'German':11,'Hungarian':12,'Indian':13,'Italian':14,'Japanese':15,'Malaysian':17,
'Mexican':18,'Monegasque':19,'Portuguese':22,'Russian':24,'South African':27,'Spanish':28,
'Swedish':29,'Swiss':30,'British':33,'American':34,'American-Italian':14,'Argentine-Italian':1,
'Chilean':35,'Colombian':36,'Czech':37,'Danish':38,'Dutch':39,'East German':11,'Finnish':40,'Indonesian':41,'Irish':42,
'Liechtensteiner':43,'New Zealander':44,'Polish':45,'Rhodesian':46,'Thai':47,'Uruguayan':48,'Venezuelan':49
}

In [18]:
dictCircuitCountry = {'Argentina': 1, 'Australia': 2, 'Austria': 3, 'Azerbaijan': 4, 'Bahrain': 5, 'Belgium': 6, 
'Brazil': 7, 'Canada': 8, 'China': 9, 'France': 10, 'Germany': 11, 'Hungary': 12, 'India': 13, 'Italy': 14, 'Japan': 15, 
'Korea': 16, 'Malaysia': 17, 'Mexico': 18, 'Monaco': 19, 'Morocco': 20, 'Netherlands': 21, 'Portugal': 22, 'Qatar': 23, 
'Russia': 24, 'Saudi Arabia': 25, 'Singapore': 26, 'South Africa': 27, 'Spain': 28, 'Sweden': 29, 'Switzerland': 30, 
'Turkey': 31, 'UAE': 32, 'UK': 33, 'USA': 34}

In [19]:
df_after_dropping['circuit_country'] = df_after_dropping['circuit_country'].map(dictCircuitCountry)

In [20]:
df_after_dropping['driver_nationality'] = df_after_dropping['driver_nationality'].map(dictDriverCountry)

In [21]:
# import random
# # x_train, x_test, y_train, y_test = train_test_split(df_after_dropping.drop(columns = ['results_positionOrder']), df_after_dropping['results_positionOrder'], test_size = 0.15, random_state = 42, stratify=df_after_dropping['results_positionOrder'])
# testing_years = [1965]
# # r1 = random.sample(range(2010, 2021), 3)    
# # r2 = random.sample(range(1950, 1990), 3)
# # testing_years.extend(r1)
# # testing_years.extend(r2)
# # # print(testing_years)


# df_test = pd.DataFrame()
# for i in testing_years:
#     df_temp = df_after_dropping[df_after_dropping['year'] == i]
#     df_test = pd.concat([df_test, df_temp])

# df_train = pd.concat([df_after_dropping, df_test, df_test]).drop_duplicates(keep=False)

# x_train, x_test, y_train, y_test = df_train.drop(columns = ['results_points']), df_test.drop(columns = ['results_points']), df_train['results_points'], df_test['results_points']
# print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [22]:
testing_year = 2019
training_years = range(1950, testing_year)

df_test = df_after_dropping[df_after_dropping['year'] == testing_year]

df_train = pd.DataFrame()
for i in training_years:
    df_temp = df_after_dropping[df_after_dropping['year'] == i]
    df_train = pd.concat([df_temp, df_train])

x_train, x_test, y_train, y_test = df_train.drop(columns = ['results_points']), df_test.drop(columns = ['results_points']), df_train['results_points'], df_test['results_points']



In [None]:
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

In [None]:
x_train['milliseconds'] = min_max_scaler.fit_transform(x_train[['milliseconds']])
x_test['milliseconds'] = min_max_scaler.transform(x_test[['milliseconds']])

In [None]:
x_train['driver_age'] = min_max_scaler.fit_transform(x_train[['driver_age']])
x_test['driver_age'] = min_max_scaler.transform(x_test[['driver_age']])

In [None]:
x_train['driver_wins'] = standard_scaler.fit_transform(x_train[['driver_wins']])
x_test['driver_wins'] = standard_scaler.transform(x_test[['driver_wins']])

In [None]:
x_train['constructor_wins'] = standard_scaler.fit_transform(x_train[['constructor_wins']])
x_test['constructor_wins'] = standard_scaler.transform(x_test[['constructor_wins']])

In [None]:
encoder = TargetEncoder(cols = ['constructor_nationality', 'driverId', 'constructorId', 'circuitId'], smoothing = 10, min_samples_leaf = 20)
# encoder = PolynomialWrapper(encoder_target)
x_train = encoder.fit_transform(x_train, y_train)
x_test = encoder.transform(x_test)

In [None]:
x_train['constructor_nationality'] = standard_scaler.fit_transform(x_train[['constructor_nationality']])
x_test['constructor_nationality'] = standard_scaler.transform(x_test[['constructor_nationality']])

In [None]:
x_train['driverId'] = standard_scaler.fit_transform(x_train[['driverId']])
x_test['driverId'] = standard_scaler.transform(x_test[['driverId']])

In [None]:
x_train['constructorId'] = standard_scaler.fit_transform(x_train[['constructorId']])
x_test['constructorId'] = standard_scaler.transform(x_test[['constructorId']])

In [None]:
x_train['circuitId'] = standard_scaler.fit_transform(x_train[['circuitId']])
x_test['circuitId'] = standard_scaler.transform(x_test[['circuitId']])

In [None]:
x_train['constructor_points'].fillna(0, inplace = True)
x_test['constructor_points'].fillna(0, inplace = True)

In [None]:
x_train['constructor_points'] = standard_scaler.fit_transform(x_train[['constructor_points']])
x_test['constructor_points'] = standard_scaler.transform(x_test[['constructor_points']])

In [None]:
x_train['driver_points'].fillna(0, inplace = True)
x_test['driver_points'].fillna(0, inplace = True)

In [None]:
x_train['driver_points'] = standard_scaler.fit_transform(x_train[['driver_points']])
x_test['driver_points'] = standard_scaler.transform(x_test[['driver_points']])

In [None]:
train_driver_mapping = pd.DataFrame(x_train[['driverId', 'driverRef']])
train_driver_mapping.set_index('driverId')
train_driver_mapping.drop_duplicates()

test_driver_mapping = pd.DataFrame(x_test[['driverId', 'driverRef']])
test_driver_mapping.set_index('driverId')
test_driver_mapping.drop_duplicates()

driver_mapping = pd.concat([train_driver_mapping, test_driver_mapping]).drop_duplicates()
driver_mapping.to_csv('data/driver_mapping.csv')


x_train.drop(columns = ['driverRef'], inplace = True)
x_test.drop(columns = ['driverRef'], inplace = True)

In [None]:
train_circuit_mapping = pd.DataFrame(x_train[['circuitId', 'circuit_id']])
train_circuit_mapping.set_index('circuitId')
train_circuit_mapping.drop_duplicates()

test_circuit_mapping = pd.DataFrame(x_test[['circuitId', 'circuit_id']])
test_circuit_mapping.set_index('circuitId')
test_circuit_mapping.drop_duplicates()

driver_mapping = pd.concat([train_circuit_mapping, test_circuit_mapping]).drop_duplicates()
driver_mapping.to_csv('data/circuit_mapping.csv')

x_train.drop(columns = ['circuit_id'], inplace = True)
x_test.drop(columns = ['circuit_id'], inplace = True)


In [None]:
pd.concat([x_train, y_train], axis = 1).to_csv('./data/train.csv', index = False)
pd.concat([x_test, y_test], axis = 1).to_csv('./data/test.csv', index = False)

In [None]:
x_train_pre = x_train.drop(columns = ['driver_points', 'constructor_points', 'milliseconds', 'constructor_position', 'statusId'])
x_test_pre = x_test.drop(columns = ['driver_points', 'constructor_points', 'milliseconds', 'constructor_position', 'statusId'])

pd.concat([x_train_pre, y_train], axis = 1).to_csv('./data/train_pre.csv', index = False)
pd.concat([x_test_pre, y_test], axis = 1).to_csv('./data/test_pre.csv', index = False)