In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('df_all.csv')
df

Unnamed: 0.1,Unnamed: 0,Driver,DriverNumber,LapTime,FreshTyre,Position
0,0,HAM,44,95.270,1,6.0
1,1,HAM,44,95.120,1,6.0
2,2,HAM,44,93.731,1,4.0
3,3,HAM,44,93.955,1,4.0
4,4,HAM,44,93.679,1,4.0
...,...,...,...,...,...,...
566003,566003,TSU,22,95.133,0,13.0
566004,566004,ZHO,24,97.487,1,14.0
566005,566005,ZHO,24,96.727,1,14.0
566006,566006,ZHO,24,95.731,1,14.0


In [3]:
df.dtypes

Unnamed: 0        int64
Driver           object
DriverNumber      int64
LapTime         float64
FreshTyre         int64
Position        float64
dtype: object

In [4]:
df_copy = df.copy()

In [7]:
#convert position to categorical
#1=1 rest =0
df['Position'] = df['Position'].apply(lambda x: 1 if x == 1 else 0) 
df['Position'].unique()

array([0, 1])

In [17]:
#count number of 1s and 0s
df['Position'].value_counts()

Position
0    533756
1     32252
Name: count, dtype: int64

In [8]:
features = df[['DriverNumber', 'LapTime', 'FreshTyre']]
features

Unnamed: 0,DriverNumber,LapTime,FreshTyre
0,44,95.270,1
1,44,95.120,1
2,44,93.731,1
3,44,93.955,1
4,44,93.679,1
...,...,...,...
566003,22,95.133,0
566004,24,97.487,1
566005,24,96.727,1
566006,24,95.731,1


In [9]:
label = df['Position']

In [18]:
#increase number of 1s
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_resampled, Y_resampled = smote.fit_resample(features, label)


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
print(X_resampled.shape)
print(Y_resampled.shape)

(1067512, 3)
(1067512,)


In [22]:
# Splitting the dataset into training and a temporary set (80% training, 20% temp)
X_train, X_temp, Y_train, Y_temp = train_test_split(X_resampled, Y_resampled, test_size=0.2, shuffle=False)

# Splitting the temporary set into validation and test sets (50% validation, 50% test of the temp set)
# This results in 80% training, 10% validation, and 10% test sets of the original dataset
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, shuffle=False)

In [23]:
scaler = StandardScaler()

In [24]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [25]:
def create_sequences(X, Y, sequence_length):
        X_seq, Y_seq = [], []
        for i in range(len(X) - sequence_length):
            X_seq.append(X[i: i + sequence_length])
            Y_seq.append(Y.iloc[i + sequence_length])
        return np.array(X_seq), np.array(Y_seq)

In [26]:
X_train_seq, Y_train_seq = create_sequences(X_train_scaled, Y_train, 30)
X_test_seq, Y_test_seq = create_sequences(X_test_scaled, Y_test, 30)
X_val_seq, Y_val_seq = create_sequences(X_val_scaled, Y_val, 30)

In [27]:
print("X_train:",X_train_seq.shape)
print("Y_train:",Y_train_seq.shape)
print("X_test:",X_test_seq.shape)
print("Y_test:",Y_test_seq.shape)
print("X_val:",X_val_seq.shape)
print("Y_val:",Y_val_seq.shape)

X_train: (853979, 30, 3)
Y_train: (853979,)
X_test: (106722, 30, 3)
Y_test: (106722,)
X_val: (106721, 30, 3)
Y_val: (106721,)


In [29]:
# Initialize the LSTM model
model = Sequential()

# First LSTM layer with dropout
model.add(LSTM(units=100, return_sequences=True, input_shape=(30, 3)))
model.add(Dropout(0.3))

# Second LSTM layer with dropout
model.add(LSTM(units=100, return_sequences=True))
model.add(Dropout(0.3))

# Third LSTM layer with dropout
model.add(LSTM(units=100, return_sequences=False))
model.add(Dropout(0.3))

# Additional Dense layers before the output
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=25, activation='relu'))

# Output layer for classification
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with Early Stopping
history = model.fit(
    X_train_seq, 
    Y_train_seq, 
    epochs=5, 
    batch_size=64, 
    validation_data=(X_val_seq, Y_val_seq),
    callbacks=[early_stopping]
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
# Assuming model is your trained LSTM model
# X_test and Y_test are your test features and labels
loss, accuracy = model.evaluate(X_test_seq, Y_test_seq, verbose=0)

print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Accuracy: 100.00%


In [31]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [32]:
explainer = LimeTextExplainer(class_names=['winner', 'loser'])  # Fill in class names as appropriate


In [33]:
def predict_fn(data):
    # Preprocess data as required by your model
    # For example, tokenizing, padding, etc.
    preprocessed_data = data
    x = model.predict(X_test_seq[0].reshape(1,30,3))
    # Make predictions
    return x


In [39]:
idx = 0  # Index of the sample in your dataset you want to explain
exp = explainer.explain_instance(X_test_seq[0].reshape(1,30,3), predict_fn, num_features=3)
exp.show_in_notebook(show_table=True)


TypeError: cannot use a string pattern on a bytes-like object

In [37]:
model.predict(X_test_seq[0].reshape(1,30,3))



array([[1.]], dtype=float32)

In [38]:
Y_test_seq[0]

1