In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.graph_objects as go
# import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from category_encoders.wrapper import PolynomialWrapper
from sklearn.decomposition import PCA

In [7]:
df = pd.read_csv('./data/dataset.csv', low_memory=False)
df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
df.rename(columns = {'country': 'circuit_country'}, inplace = True)
df.drop(columns=['index'], inplace=True)
df.columns

Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time',
       'circuit_id', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'resultId', 'driverId',
       'constructorId', 'car_number', 'grid', 'results_positionOrder',
       'results_points', 'laps', 'milliseconds', 'fastestLap',
       'fastestLapTime', 'fastestLapSpeed', 'statusId', 'status',
       'circuit_location', 'circuit_country', 'constructorStandingsId',
       'constructor_points', 'constructor_position', 'constructor_wins',
       'constructorRef', 'constructor_nationality', 'driverRef',
       'driver_number', 'driver_code', 'forename', 'dob', 'driver_nationality',
       'driverStandingsId', 'driver_points', 'driver_position', 'driver_wins',
       'qualifyId', 'number', 'qualifying_position', 'q1_time', 'q2_time',
       'q3_time', 'no_of_stops', 'ms_avg', 'first_5_avg_time',
       'last_10_avg_time', 'lap_position_after_5',
       'lap_position_before_last_10'],


In [8]:
cols_to_be_dropped = ['round', 'name', 'time', 'circuit_id', 'car_number', 'resultId', 'driverRef', 'laps', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'first_5_avg_time', 'last_10_avg_time', 'lap_position_after_5', 'lap_position_before_last_10', 'circuit_location', 'constructorStandingsId', 'constructorRef', 'driverRef', 'driver_number', 'driver_code', 'forename', 'driverStandingsId', 'number', 'status', 'qualifying_position', 'q1_time', 'q2_time', 'q3_time', 'ms_avg', 'no_of_stops', 'driver_position', 'qualifyId']
df_after_dropping = df.drop(columns = cols_to_be_dropped)

#### Merging with qualifier

In [9]:
qualifiers = pd.read_csv('./qualifier_data/qualify.csv')

In [10]:
temp = pd.merge(df_after_dropping, qualifiers, how = 'inner')
temp.shape

(9385, 30)

#### Getting age from date & dob

In [11]:
from datetime import datetime
from dateutil import relativedelta

# get two dates
race_date = df_after_dropping['date']
driver_dob = df_after_dropping['dob']

ages = []

for i in range(len(race_date)):
    start_date = datetime.strptime(race_date[i],"%Y-%m-%d")
    end_date = datetime.strptime(driver_dob[i], "%Y-%m-%d")
    delta = relativedelta.relativedelta(start_date, end_date)
    ages.append(delta.years)

df_after_dropping['driver_age'] = ages
df_after_dropping.drop(columns = ['date', 'dob'], inplace = True)

### Merging qualifier data

In [12]:
qualify_df = pd.read_csv('qualifier_data/qualify.csv')

In [13]:
df_after_dropping = pd.merge(df_after_dropping, qualify_df, how = 'inner', on = ['raceId', 'year', 'circuitId', 'driverId', 'constructorId'])

#### Changing status IDs to 0s and 1s

In [14]:
df_after_dropping['statusId'][df_after_dropping['statusId'] > 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_after_dropping['statusId'][df_after_dropping['statusId'] > 1] = 0


#### Replace Nan in Constructor_position by max Value = 21 & Replace Nan in Constructor_wins by 0

In [15]:
df_after_dropping['constructor_position'] = df_after_dropping['constructor_position'].fillna(21)
df_after_dropping['constructor_wins'] = df_after_dropping['driver_wins'].fillna(0)

In [16]:
df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index

  df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index


Int64Index([], dtype='int64')

In [17]:
df_after_dropping.drop(df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index, inplace=True)

  df_after_dropping.drop(df_after_dropping[df_after_dropping['milliseconds'] == '\\N'][df_after_dropping['statusId'] == 1].index, inplace=True)


In [18]:
index = df_after_dropping[df_after_dropping['results_points'] > 26].index
df_after_dropping.drop(index, inplace=True)

In [19]:
df_after_dropping['weather_warm'] = df_after_dropping['weather_warm'].fillna(0)
df_after_dropping['weather_cold'] = df_after_dropping['weather_cold'].fillna(0)
df_after_dropping['weather_dry'] = df_after_dropping['weather_dry'].fillna(0)
df_after_dropping['weather_wet'] = df_after_dropping['weather_wet'].fillna(0)
df_after_dropping['weather_cloudy'] = df_after_dropping['weather_cloudy'].fillna(0)
df_after_dropping['driver_wins'] = df_after_dropping['driver_wins'].fillna(0)
df_after_dropping.isnull().sum()

raceId                       0
year                         0
circuitId                    0
weather_warm                 0
weather_cold                 0
weather_dry                  0
weather_wet                  0
weather_cloudy               0
driverId                     0
constructorId                0
grid                         0
results_positionOrder        0
results_points               0
milliseconds                 0
statusId                     0
circuit_country              0
constructor_points          95
constructor_position         0
constructor_wins             0
constructor_nationality      0
driver_nationality           0
driver_points              241
driver_wins                  0
driver_age                   0
Unnamed: 0                   0
qualifying_position          0
q1_time_ms                   0
q2_time_ms                   0
q3_time_ms                   0
dtype: int64

In [20]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].replace('\\N', 0)

In [21]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].astype('int64')

In [22]:
df_after_dropping['milliseconds'] = df_after_dropping['milliseconds'].replace(0, df_after_dropping['milliseconds'].max() * 2)

In [23]:
df_after_dropping['milliseconds'].describe()

count    9.382000e+03
mean     1.941755e+07
std      1.171622e+07
min      2.070710e+05
25%      5.781706e+06
50%      2.948629e+07
75%      2.948629e+07
max      2.948629e+07
Name: milliseconds, dtype: float64

In [24]:
upper_limit = df_after_dropping["driver_age"].quantile(0.97)
df_after_dropping['driver_age'] = np.where(df_after_dropping['driver_age'] > upper_limit, upper_limit, df_after_dropping['driver_age'])

In [25]:
df_after_dropping['results_positionOrder'] = np.where(df_after_dropping['results_positionOrder'] > 20, 20, df_after_dropping['results_positionOrder'])

In [26]:
dictDriverCountry={'Argentine':1,'Australian':2,'Austrian':3,'Belgian':6,'Brazilian':7,'Canadian':8,'Chinese':9,
'French':10,'German':11,'Hungarian':12,'Indian':13,'Italian':14,'Japanese':15,'Malaysian':17,
'Mexican':18,'Monegasque':19,'Portuguese':22,'Russian':24,'South African':27,'Spanish':28,
'Swedish':29,'Swiss':30,'British':33,'American':34,'American-Italian':14,'Argentine-Italian':1,
'Chilean':35,'Colombian':36,'Czech':37,'Danish':38,'Dutch':39,'East German':11,'Finnish':40,'Indonesian':41,'Irish':42,
'Liechtensteiner':43,'New Zealander':44,'Polish':45,'Rhodesian':46,'Thai':47,'Uruguayan':48,'Venezuelan':49
}

In [27]:
dictCircuitCountry = {'Argentina': 1, 'Australia': 2, 'Austria': 3, 'Azerbaijan': 4, 'Bahrain': 5, 'Belgium': 6, 
'Brazil': 7, 'Canada': 8, 'China': 9, 'France': 10, 'Germany': 11, 'Hungary': 12, 'India': 13, 'Italy': 14, 'Japan': 15, 
'Korea': 16, 'Malaysia': 17, 'Mexico': 18, 'Monaco': 19, 'Morocco': 20, 'Netherlands': 21, 'Portugal': 22, 'Qatar': 23, 
'Russia': 24, 'Saudi Arabia': 25, 'Singapore': 26, 'South Africa': 27, 'Spain': 28, 'Sweden': 29, 'Switzerland': 30, 
'Turkey': 31, 'UAE': 32, 'UK': 33, 'USA': 34}

In [28]:
df_after_dropping['circuit_country'] = df_after_dropping['circuit_country'].map(dictCircuitCountry)

In [29]:
df_after_dropping['driver_nationality'] = df_after_dropping['driver_nationality'].map(dictDriverCountry)

In [30]:
import random
# x_train, x_test, y_train, y_test = train_test_split(df_after_dropping.drop(columns = ['results_positionOrder']), df_after_dropping['results_positionOrder'], test_size = 0.15, random_state = 42, stratify=df_after_dropping['results_positionOrder'])
testing_years = []
r1 = random.sample(range(2010, 2021), 3)    
r2 = random.sample(range(1950, 1990), 3)
testing_years.extend(r1)
testing_years.extend(r2)

df_test = pd.DataFrame()
for i in testing_years:
    df_temp = df_after_dropping[df_after_dropping['year'] == i]
    df_test = pd.concat([df_test, df_temp])

df_train = pd.concat([df_after_dropping, df_test, df_test]).drop_duplicates(keep=False)

x_train, x_test, y_train, y_test = df_train.drop(columns = ['results_points']), df_test.drop(columns = ['results_points']), df_train['results_points'], df_test['results_points']
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(8051, 28) (1331, 28) (8051,) (1331,)


In [31]:
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

In [32]:
x_train['milliseconds'] = min_max_scaler.fit_transform(x_train[['milliseconds']])
x_test['milliseconds'] = min_max_scaler.transform(x_test[['milliseconds']])

In [33]:
x_train['driver_age'] = min_max_scaler.fit_transform(x_train[['driver_age']])
x_test['driver_age'] = min_max_scaler.transform(x_test[['driver_age']])

In [34]:
x_train['driver_wins'] = standard_scaler.fit_transform(x_train[['driver_wins']])
x_test['driver_wins'] = standard_scaler.transform(x_test[['driver_wins']])

In [35]:
x_train['constructor_wins'] = standard_scaler.fit_transform(x_train[['constructor_wins']])
x_test['constructor_wins'] = standard_scaler.transform(x_test[['constructor_wins']])

In [36]:
encoder = TargetEncoder(cols = ['constructor_nationality', 'driverId', 'constructorId', 'circuitId'], smoothing = 10, min_samples_leaf = 20)
# encoder = PolynomialWrapper(encoder_target)
x_train = encoder.fit_transform(x_train, y_train)
x_test = encoder.transform(x_test)

In [37]:
x_train['constructor_nationality'] = standard_scaler.fit_transform(x_train[['constructor_nationality']])
x_test['constructor_nationality'] = standard_scaler.transform(x_test[['constructor_nationality']])

In [38]:
x_train['driverId'] = standard_scaler.fit_transform(x_train[['driverId']])
x_test['driverId'] = standard_scaler.transform(x_test[['driverId']])

In [39]:
x_train['constructorId'] = standard_scaler.fit_transform(x_train[['constructorId']])
x_test['constructorId'] = standard_scaler.transform(x_test[['constructorId']])

In [40]:
x_train['circuitId'] = standard_scaler.fit_transform(x_train[['circuitId']])
x_test['circuitId'] = standard_scaler.transform(x_test[['circuitId']])

In [41]:
x_train['constructor_points'].fillna(0, inplace = True)
x_test['constructor_points'].fillna(0, inplace = True)

In [42]:
x_train['constructor_points'] = standard_scaler.fit_transform(x_train[['constructor_points']])
x_test['constructor_points'] = standard_scaler.transform(x_test[['constructor_points']])

In [43]:
x_train['driver_points'].fillna(0, inplace = True)
x_test['driver_points'].fillna(0, inplace = True)

In [44]:
x_train['driver_points'] = standard_scaler.fit_transform(x_train[['driver_points']])
x_test['driver_points'] = standard_scaler.transform(x_test[['driver_points']])

In [45]:
pd.concat([x_train, y_train], axis = 1).to_csv('./data_new/qualifying_train.csv', index = False)
pd.concat([x_test, y_test], axis = 1).to_csv('./data_new/qualifying_test.csv', index = False)

In [46]:
x_train_pre = x_train.drop(columns = ['driver_points', 'constructor_points', 'milliseconds', 'constructor_position', 'statusId'])
x_test_pre = x_test.drop(columns = ['driver_points', 'constructor_points', 'milliseconds', 'constructor_position', 'statusId'])

pd.concat([x_train_pre, y_train], axis = 1).to_csv('./data_new/qualifying_train_pre.csv', index = False)
pd.concat([x_test_pre, y_test], axis = 1).to_csv('./data_new/qualifying_test_pre.csv', index = False)