## Dataset Link

https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/data

In [30]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Import the data

In [31]:
# Directory that holds the competition CSV files. The original local path is
# kept as the default; set the TPS_APR22_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TPS_APR22_DATA_DIR", "C:/My TA Resources/ML SP22/LSTM"))

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
labels = pd.read_csv(DATA_DIR / "train_labels.csv")

In [32]:
# Count how many rows carry each value of the `state` column.
labels['state'].value_counts()

1    13014
0    12954
Name: state, dtype: int64

# Import libraries

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import plot_model

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GroupKFold


In [34]:
# Select the entry of `train` labelled 1 via .loc and call reset_index() on it.
# NOTE(review): `data` is not referenced again in the visible part of the
# notebook - confirm it is still needed.
data=train.loc[1].reset_index()

In [35]:
# Model inputs are every raw column that is not an identifier. Selecting by
# name (instead of slicing off the first three positions) stays correct if the
# CSV column order changes, and skipping the engineered suffixes keeps this
# cell idempotent when it is re-run after preprocessing() has added columns.
ID_COLUMNS = ('sequence', 'subject', 'step')
features = [
    col for col in train.columns
    if col not in ID_COLUMNS and not col.endswith(('_lag1', '_diff1'))
]


## Data Processing, Feature Extraction

In [36]:
def preprocessing(df, feature_cols=None):
    """Add per-sequence lag-1 and first-difference columns to ``df`` in place.

    For every feature column ``f`` two columns are appended:

    * ``f_lag1``  - the previous row's value of ``f`` within the same
      ``sequence`` group; the first row of each sequence gets 0.
    * ``f_diff1`` - ``f - f_lag1`` (so on the first row of a sequence it
      equals the raw value of ``f``).

    Any NaN already present in ``df`` is replaced by 0, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sequence`` column and the feature columns. Modified
        in place.
    feature_cols : iterable of str, optional
        Columns to derive lag/diff features from. Defaults to the
        notebook-level ``features`` list.

    Returns
    -------
    None
    """
    cols = features if feature_cols is None else list(feature_cols)
    if not cols:
        return

    # Fill pre-existing gaps once up front instead of re-scanning the whole
    # frame on every loop iteration.
    df.fillna(0, inplace=True)

    # The grouping key does not change inside the loop, so build it once.
    by_sequence = df.groupby('sequence')

    for feature in cols:
        # shift(1) leaves NaN on the first row of each sequence; there is no
        # previous step there, so use 0.
        lagged = by_sequence[feature].shift(1).fillna(0)
        df[feature + '_lag1'] = lagged
        df[feature + '_diff1'] = df[feature] - lagged
      

In [37]:
# Add the lag/diff columns to BOTH frames. The network's input width is taken
# from `train`, so `test` must carry exactly the same engineered features;
# otherwise model.predict(test) fails with a shape mismatch.
preprocessing(train)
preprocessing(test)

In [38]:
# Keep the per-row sequence ids and reduce `labels` to its target column.
groups = train['sequence']
labels = labels['state']

# Strip the identifier columns, then fold the flat rows into 3-D arrays with
# 60 rows per block: (n_blocks, 60, n_columns).
train = train.drop(columns=['sequence', 'subject', 'step']).to_numpy()
train = train.reshape((-1, 60, train.shape[-1]))

test = test.drop(columns=['sequence', 'subject', 'step']).to_numpy()
test = test.reshape((-1, 60, test.shape[-1]))

In [39]:
# Show the array shape produced by the reshape to (-1, 60, n_columns).
train.shape

(25968, 60, 39)

# Model

In [40]:
def lstm(input_shape=None):
    """Build the (uncompiled) recurrent binary classifier.

    Architecture, in order:

    1. Bidirectional LSTM (512 units per direction), full sequence output.
    2. Two parallel branches fed by layer 1: a bidirectional LSTM (256) and a
       bidirectional GRU (256), both returning full sequences.
    3. Concatenation of the two branches along the last (feature) axis.
    4. Bidirectional LSTM (128), full sequence output.
    5. Global max pooling over the time axis.
    6. Dense(128, selu) followed by a single sigmoid output unit.

    Parameters
    ----------
    input_shape : tuple of int, optional
        ``(timesteps, n_features)`` of one sample. Defaults to the last two
        dimensions of the notebook-level ``train`` array.

    Returns
    -------
    tensorflow.keras.Model
        Model named ``'lstm_model'``; the caller is responsible for compiling.
    """
    if input_shape is None:
        # Fall back to the shape of the global training tensor.
        input_shape = train.shape[-2:]

    x_input = Input(shape=tuple(input_shape))

    x1 = Bidirectional(LSTM(units=512, return_sequences=True))(x_input)

    # Both branches read the same first-layer output.
    x2 = Bidirectional(LSTM(units=256, return_sequences=True))(x1)
    z1 = Bidirectional(GRU(units=256, return_sequences=True))(x1)

    c = Concatenate(axis=2)([x2, z1])

    x3 = Bidirectional(LSTM(units=128, return_sequences=True))(c)

    # Collapse the time axis before the dense head.
    x4 = GlobalMaxPooling1D()(x3)
    x5 = Dense(units=128, activation='selu')(x4)
    x_output = Dense(1, activation='sigmoid')(x5)

    model = Model(inputs=x_input, outputs=x_output, name='lstm_model')

    return model

In [41]:
# Instantiate the network and attach optimizer, loss and the AUC metric
# (reported by Keras as `auc` / `val_auc`).
model = lstm()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[AUC(name='auc')],
)

In [42]:
# Print the layer-by-layer summary (output shapes, parameter counts) of `model`.
model.summary()

Model: "lstm_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 60, 39)]     0                                            
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 60, 1024)     2260992     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 60, 512)      2623488     bidirectional_4[0][0]            
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 60, 512)      1969152     bidirectional_4[0][0]            
_________________________________________________________________________________________

In [44]:
# 5-fold group-aware splitter plus the containers the training loop fills:
# one validation score and one test-prediction vector per fold.
kf = GroupKFold(n_splits=5)
scores, test_preds = [], []

## Model Training

In [45]:
# Start from empty result lists so that re-running this cell (e.g. after an
# interrupted run) never mixes in scores/predictions from an earlier attempt.
scores = []
test_preds = []

# NOTE(review): `groups.unique()` supplies one distinct sequence id per sample,
# so every sample is its own group and GroupKFold cannot keep related
# sequences together. Grouping by subject would need one subject id per
# sequence - confirm the intended grouping.
for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(train, labels, groups.unique())):

    print('\n')
    print('*'*15, f'↓ Fold {fold_idx+1} ↓', '*'*15)

    # Separate into train data and validation data
    X_train, X_valid = train[train_idx], train[valid_idx]
    y_train, y_valid = labels.iloc[train_idx].values, labels.iloc[valid_idx].values

    # Build a fresh, untrained model for every fold. Re-using one instance
    # would carry weights over from earlier folds, which were fitted on rows
    # that are validation data in later folds, inflating the reported scores.
    model = lstm()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])

    # Train the model
    # NOTE(review): patience=7 exceeds epochs=5, so EarlyStopping cannot
    # trigger with these settings - confirm the intended values.
    model.fit(X_train, y_train,
              validation_data=(X_valid, y_valid),
              epochs=5,
              batch_size=256,
              callbacks=[EarlyStopping(monitor='val_auc', patience=7, mode='max',
                                       restore_best_weights=True),
                         # mode must be explicit: with the default 'auto',
                         # Keras treats any metric without "acc" in its name
                         # as something to minimise, which is wrong for AUC.
                         ReduceLROnPlateau(monitor='val_auc', factor=0.6,
                                           patience=4, mode='max', verbose=False)]
             )

    # Save score (ROC AUC on this fold's held-out part)
    score = roc_auc_score(y_valid, model.predict(X_valid, batch_size=512).squeeze())
    scores.append(score)

    # Predict on the test set with this fold's model
    test_preds.append(model.predict(test, batch_size=512).squeeze())

    print(f'Fold {fold_idx+1} | Score: {score}')
    print('*'*15, f'↑ Fold {fold_idx+1} ↑', '*'*15)

# The collected scores are ROC AUC values, not accuracies.
print(f'Mean ROC AUC on {kf.n_splits} folds {np.mean(scores)}')



*************** ↓ Fold 1 ↓ ***************
Epoch 1/5
 5/82 [>.............................] - ETA: 13:46 - loss: 0.7774 - auc: 0.5565

KeyboardInterrupt: 