In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Loading data and prining out head & shape ###

In [2]:
#Importing data to dataframe
raw_data = pd.read_csv('data.csv')
print(raw_data.head().to_string())
print("Shape: ", raw_data.shape)

   resultId  raceId  year  round  grid  positionOrder  points  laps milliseconds fastestLap rank fastestLapTime fastestLapSpeed        driverRef     surname forename         dob nationality_x constructorRef                          name nationality_y     circuitRef  circuitId        name_y    location  country      lat        lng  alt        date  target_finish
0      2460     136  2002     13    11              4     3.0  77.0          NaN         \N   \N             \N              \N        raikkonen   Räikkönen     Kimi  1979-10-17       Finnish        mclaren                   Hungaroring       British    hungaroring         11       McLaren    Budapest  Hungary  47.5789   19.24860  264  2002-08-18              1
1     11565     483  1981      1    23             21     0.0  16.0           \N         \N   \N             \N              \N           watson      Watson     John  1946-05-04       British        mclaren                    Long Beach       British     long_beach       

### Print overview of columns containing NA values, and dropping using 90% threshold ###

In [3]:
# Make sure other types of missing data is also registered as missing data
missing_markers = ['\\N', 'NULL', 'null', ''] 
raw_data = raw_data.replace(missing_markers, np.nan)

nullValues = raw_data.isnull().sum()
print("Rows with NA and NA count: ")
print(nullValues[nullValues > 0])
print("\nShape with 90% threshold for dropping column:")
thresh = round(0.9*raw_data.shape[0])
trimmed_raw_data = raw_data.dropna(axis=1, thresh=thresh)
print(trimmed_raw_data.shape)
print(trimmed_raw_data.columns)

Rows with NA and NA count: 
points              971
laps                978
milliseconds       7393
fastestLap         6895
rank               6798
fastestLapTime     6895
fastestLapSpeed    7191
dtype: int64

Shape with 90% threshold for dropping column:
(10000, 26)
Index(['resultId', 'raceId', 'year', 'round', 'grid', 'positionOrder',
       'points', 'laps', 'driverRef', 'surname', 'forename', 'dob',
       'nationality_x', 'constructorRef', 'name', 'nationality_y',
       'circuitRef', 'circuitId', 'name_y', 'location', 'country', 'lat',
       'lng', 'alt', 'date', 'target_finish'],
      dtype='object')


### Further feature selection ###
Some columns have leakage features, meaning they have a 1-1 correlation with what we try to predict.These need to be removed as they will make the predictions too "easy". Some of these are also measurements made after after the target value is measured, and therefore don't have any predictive power. E.g. "positionOrder" and "points", which are based upon whether the driver finishes the race or not. <br>
Will keep features 'year', 'round', 'grid', 'constructorRef', 'circuitRef', 'alt', 'date' as well as 'target_finish' <br>
Reasons one by one: <br><br>

##### Id's #####
- resultId: DROP – not useful

- raceId: DROP – not useful

- circuitId: DROP – Redundant with circuitRef

##### Race metadata #####

- year: KEEP – DNF rates may vary over the years

- round: KEEP – Early/late season influences DNF

- date: KEEP – Weather/season patterns

- country: DROP - redundant with circuitRef

##### Driver & team #####

- driverRef: DROP - High cardinality, too sparse. Data of retired drivers unuseful for future drivers

- surname: DROP – same reason as driverRef

- forename: DROP – same reason as driverRef

- dob: DROP – weak predictor

- nationality_x: DROP – Weak predictor

- constructorRef: KEEP – Team strongly impacts DNF

- nationality_y: DROP – irrelevant / redundant to constructorRef

##### Performance stats #####

- grid: KEEP – Starting position affects crash risk

- positionOrder: DROP – Leakage (reveals final result)

##### Circuit info #####

- circuitRef: KEEP – Tracks differ in DNF probability

- name: DROP – Duplicate of circuitRef

- name_y: DROP – Another duplicate

- location: DROP – Text field, not useful

##### Geographical #####

- lat: DROP – Raw coordinate not meaningful

- lng: DROP – Same as above

- alt: KEEP - altitude of tracks may be predicitive  


In [4]:
keep_cols = [
    'year', 'round', 'grid', 'constructorRef',
    'circuitRef', 'alt', 'date',
    'target_finish'
]
clean_data = trimmed_raw_data[keep_cols]
print("Shape:", clean_data.shape)
print("Features:",clean_data.columns)

Shape: (10000, 8)
Features: Index(['year', 'round', 'grid', 'constructorRef', 'circuitRef', 'alt', 'date',
       'target_finish'],
      dtype='object')


### Dropping rows with NA values (redundant as columns containing NA are removed already) ###

In [5]:
clean_data = clean_data.dropna()
print("Shape after dropping rows with NA values and columns with >90% NA values:\n",clean_data.shape)

Shape after dropping rows with NA values and columns with >90% NA values:
 (10000, 8)


### Printing out describtion of dataframe, and ranked correlation between numerical features for analysis ###

In [6]:
print(clean_data.describe().to_string())
print("\nCorrelation between numerical features and target_finish ranked on abs value")
print(clean_data.corr(numeric_only=True)['target_finish'].sort_values(key=abs, ascending=False)[1:])

               year         round          grid           alt  target_finish
count  10000.000000  10000.000000  10000.000000  10000.000000   10000.000000
mean    1991.466600      8.540100     11.176200    281.465900       0.289500
std       20.060237      5.085487      7.241008    414.586363       0.453553
min     1950.000000      1.000000      0.000000     -7.000000       0.000000
25%     1977.000000      4.000000      5.000000     18.000000       0.000000
50%     1991.000000      8.000000     11.000000    153.000000       0.000000
75%     2009.000000     12.000000     17.000000    401.000000       1.000000
max     2024.000000     24.000000     34.000000   2227.000000       1.000000

Correlation between numerical features and target_finish ranked on abs value
grid    -0.344964
year     0.276936
round    0.123115
alt     -0.043222
Name: target_finish, dtype: float64


### Handling 'object' values ###

Will use a reduced One Hot Encoding. Let the top 10 most common values per feature be one hot encoded, place all other values under a group called 'other...'

Then convert date to month, and OHE it

In [7]:
clean_data["date"] = pd.to_datetime(clean_data["date"]).dt.month
clean_data = clean_data.rename(columns={"date":"month"})

In [8]:
top_10_constructorRefs = clean_data['constructorRef'].value_counts().nlargest(10).index

print("The Top 10 constructorRefs are:", list(top_10_constructorRefs))

# Replace country if not in top 10 with 'Other'
clean_data['constructorRef'] = clean_data['constructorRef'].apply(
    lambda x: x if x in top_10_constructorRefs else 'other_const'
)

The Top 10 constructorRefs are: ['ferrari', 'mclaren', 'williams', 'sauber', 'team_lotus', 'tyrrell', 'red_bull', 'renault', 'ligier', 'brabham']


In [9]:
top_10_circuitRef = clean_data['circuitRef'].value_counts().nlargest(10).index

print("The Top 10 circuitRef are:", list(top_10_circuitRef))

# Replace country if not in top 10 with 'Other'
clean_data['circuitRef'] = clean_data['circuitRef'].apply(
    lambda x: x if x in top_10_circuitRef else 'other_circ'
)

The Top 10 circuitRef are: ['monza', 'monaco', 'silverstone', 'spa', 'villeneuve', 'hungaroring', 'nurburgring', 'hockenheimring', 'interlagos', 'red_bull_ring']


### Dataset before OHE ###

In [10]:
print(clean_data.head().to_string())

   year  round  grid constructorRef     circuitRef  alt  month  target_finish
0  2002     13    11        mclaren    hungaroring  264      8              1
1  1981      1    23        mclaren     other_circ   12      3              0
2  1958      8     0    other_const    nurburgring  578      8              0
3  2021      8    19    other_const  red_bull_ring  678      6              0
4  1988     12     0    other_const          monza  162      9              0


In [11]:
from sklearn.preprocessing import OneHotEncoder

cat = ["constructorRef", "circuitRef", "month"]
enc = OneHotEncoder()
enc.fit(clean_data[cat])
cat_data = enc.transform(clean_data[cat]).toarray()

cat_df = pd.DataFrame(data = cat_data, columns = enc.get_feature_names_out(), index = clean_data.index)
encoded_data = clean_data.join(cat_df).drop(columns = cat)

# Dropped to avoid linear dependencies 
encoded_data = encoded_data.drop(columns=["circuitRef_other_circ", "constructorRef_other_const", "month_12"])

### Dataset after OHE ###

In [12]:
print(encoded_data.head().to_string())
print(encoded_data.shape)
print(encoded_data.columns)

   year  round  grid  alt  target_finish  constructorRef_brabham  constructorRef_ferrari  constructorRef_ligier  constructorRef_mclaren  constructorRef_red_bull  constructorRef_renault  constructorRef_sauber  constructorRef_team_lotus  constructorRef_tyrrell  constructorRef_williams  circuitRef_hockenheimring  circuitRef_hungaroring  circuitRef_interlagos  circuitRef_monaco  circuitRef_monza  circuitRef_nurburgring  circuitRef_red_bull_ring  circuitRef_silverstone  circuitRef_spa  circuitRef_villeneuve  month_1  month_2  month_3  month_4  month_5  month_6  month_7  month_8  month_9  month_10  month_11
0  2002     13    11  264              1                     0.0                     0.0                    0.0                     1.0                      0.0                     0.0                    0.0                        0.0                     0.0                      0.0                        0.0                     1.0                    0.0                0.0               

## Flip values in target_finish ## 
We want to detect DNF, make this our positive

In [13]:
encoded_data['target_dnf'] = 1 - encoded_data['target_finish']
encoded_data = encoded_data.drop(columns=["target_finish"])
print(encoded_data["target_dnf"].value_counts())

target_dnf
1    7105
0    2895
Name: count, dtype: int64


## Data split ##

cutoff_year set to 2015 arbitrarily for now. Can change this, but gives a reasonable split (test size ~15% of dataset)

In [14]:
cutoff_year = 2015

train_data = encoded_data[encoded_data['year'] <= cutoff_year].copy()
test_data  = encoded_data[encoded_data['year'] >  cutoff_year].copy()

X_train = train_data.drop(columns=['target_dnf'])
y_train = train_data['target_dnf']

X_test  = test_data.drop(columns=['target_dnf'])
y_test  = test_data['target_dnf']

print("Training size (X, y):", X_train.shape, ",", len(y_train))
print("Test size (X, y):", X_test.shape, ",", len(y_test))

Training size (X, y): (8550, 35) , 8550
Test size (X, y): (1450, 35) , 1450


Beginning of logistic regression
Starting with finding strong features using Lasso

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
random_seed = 2334

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

feature_extractor = LogisticRegression(penalty='l1', solver='liblinear', random_state=random_seed, C=1)

feature_extractor.fit(X_train_scaled, y_train)

#Threshold 0.05, this gives 19 features with C=1
thresh = 0.05

significant = X_train.columns[abs(feature_extractor.coef_[0])>thresh]
coefficients = feature_extractor.coef_[0][abs(feature_extractor.coef_[0])>thresh]
z = list(zip(significant, coefficients))
z.sort(key=lambda x: abs(x[1]), reverse=True)
significantSorted = [p[0] for p in z]

19


After Lasso has had its way with our features, we are left with 19
We will then test out different subsets of these in conjunction with changing the levels of regularization
To do this we will create a pipeline and use a Grid search

What happens in the following cell
1. We define a list of lists of features. These are selected based on the output of the previous cell, where lasso assigns higher coefficients to features with more predictive power. I try all, then only the significant ones, the 7 best, the best half, best quarter, worst half and then in the end constructors and circuits for fun (might remove these)
2. I define a parametergrid, currently the only parameter is C, which is the amount of lasso-regularization. 
3. I iterate through the different subsets of features, and do gridsearch, with a cv of 5, for each one of them. 
4. We save the best result of each feature-set

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

print(significantSorted)

#We try all the features, the best half and quarter, as well as the worst half for fun
listsOfFeatures = [
    X_train.columns,
    significantSorted,
    significantSorted[0:7],
    significantSorted[0:len(significantSorted)//2],
    significantSorted[0:len(significantSorted)//4],
    significantSorted[len(significantSorted)//2:-1],
    [s for s in X_train.columns if 'constructorRef' in s],
    [s for s in X_train.columns if 'circuitRef' in s],
]

param_grid = {
    'model__C' : np.logspace(-2, 2, 7),
}


pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1', random_state=random_seed))
])


results = {}

for i, features in enumerate(listsOfFeatures):
    X_train_iter = X_train[features]
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    if i == 6:
        key = f"Feature_Set_constructors_({len(features)} columns)"
    elif i == 7:
        key = f"Feature_Set_circuits_({len(features)} columns)"
    else:
        key = f"Feature_Set_{i+1}_({len(features)} columns)"
    grid.fit(X_train_iter, y_train)


    #Storing results
    results[key] = {
        'best_score': grid.best_score_,
        'best_params': grid.best_params_,
        'features_used': features
    }

for key, res in results.items():
    print(f"**{key}**: Best Score = {res['best_score']:.4f}, Best C = {res['best_params']['model__C']:.4f}")

print(sum(y_train)/len(y_train))

['grid', 'year', 'constructorRef_ferrari', 'constructorRef_mclaren', 'constructorRef_williams', 'circuitRef_nurburgring', 'constructorRef_red_bull', 'circuitRef_monaco', 'constructorRef_ligier', 'month_10', 'constructorRef_renault', 'constructorRef_team_lotus', 'circuitRef_spa', 'circuitRef_interlagos', 'alt', 'constructorRef_tyrrell', 'month_3', 'circuitRef_silverstone', 'circuitRef_villeneuve']
**Feature_Set_1_(35 columns)**: Best Score = 0.8022, Best C = 0.0464
**Feature_Set_2_(19 columns)**: Best Score = 0.8016, Best C = 0.0464
**Feature_Set_3_(7 columns)**: Best Score = 0.8012, Best C = 0.0464
**Feature_Set_4_(9 columns)**: Best Score = 0.8006, Best C = 0.2154
**Feature_Set_5_(4 columns)**: Best Score = 0.7927, Best C = 0.0464
**Feature_Set_6_(9 columns)**: Best Score = 0.7523, Best C = 4.6416
**Feature_Set_constructors_(10 columns)**: Best Score = 0.7588, Best C = 0.0464
**Feature_Set_circuits_(10 columns)**: Best Score = 0.7520, Best C = 0.0100
0.752046783625731
