## Import libraries

In [1]:
# Environment check: report the installed TensorFlow version and the GPUs it can see.
import tensorflow as tf

print(tf.__version__)
# Last expression of the cell, so the notebook displays the list of GPU devices.
tf.config.list_physical_devices('GPU')

2.8.0


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import GlobalMaxPooling1D, Add
from tensorflow.keras.layers import Bidirectional, LSTM, GRU
from tensorflow.keras.layers import Input, BatchNormalization
from tensorflow.keras.layers import Dense, Concatenate, Multiply

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including pandas/Keras warnings that may point at real problems.
warnings.filterwarnings("ignore")

# Seed the NumPy and TensorFlow global random generators.
np.random.seed(12)
tf.random.set_seed(12)

## Load source datasets

In [3]:
# Location of the competition files on the local machine.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DIR_DATASET = "C:/Users/0stxx/Datasets/"
NAME_PROJECT = '2204-kaggle-tps2204'

# Read the four CSV files in one pass; the order of the file names matches
# the order of the target variables on the left-hand side.
df_train, df_train_labels, df_test, df_sub = [
    pd.read_csv(DIR_DATASET + NAME_PROJECT + csv_name)
    for csv_name in ('/train.csv', '/train_labels.csv', '/test.csv', '/sample_submission.csv')
]

In [4]:
# `train` is an alias of `df_train` (no copy), so the in-place sort below also
# reorders `df_train`.
train = df_train
# Order rows by sequence and then by step within each sequence.
train.sort_values(by=['sequence','step'], inplace=True)
print(f"train: {train.shape}")
train.head()

train: (1558080, 16)


Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
3,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241
4,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359


In [5]:
# `train_labels` is an alias of `df_train_labels` (no copy); the sort is in place.
train_labels = df_train_labels
# Order the labels by sequence id, the same leading key used to sort `train`.
train_labels.sort_values(by=['sequence'], inplace=True)
print(f"train_labels: {train_labels.shape}")
train_labels.head()

train_labels: (25968, 2)


Unnamed: 0,sequence,state
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1


In [6]:
# `test` is an alias of `df_test` (no copy), so the in-place sort below also
# reorders `df_test`.
test = df_test
# Order rows by sequence and then by step within each sequence.
test.sort_values(by=['sequence','step'], inplace=True)
print(f"test: {test.shape}")
test.head()

test: (733080, 16)


Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,25968,684,0,2.427357,19.639706,1.0,-1.466372,-1.289973,-4.207928,2.486339,-2.493893,8.0,-1.123555,-1.673048,10.980453,0.419011
1,25968,684,1,-4.950541,-21.747899,1.0,0.983186,-0.569053,1.845924,-3.887978,1.727481,-2.9,0.395231,-0.882233,-1.871399,-0.008525
2,25968,684,2,1.136012,-10.756303,1.0,1.016814,0.964157,2.454749,0.312386,1.154198,-5.6,1.114162,1.525273,-11.584362,0.139812
3,25968,684,3,0.806028,6.504202,1.0,-0.179646,0.969221,-1.035153,-0.457195,0.254962,-2.7,-0.588873,0.608761,-4.24177,-0.462916
4,25968,684,4,1.288253,5.552521,1.0,-0.493805,-1.036124,-1.126402,2.008197,-0.730534,0.0,0.899566,-1.259615,-0.472222,-0.121483


In [7]:
# `submission` is an alias of `df_sub`; its "state" column is overwritten later
# with the model predictions.
submission = df_sub
print(f"submission: {submission.shape}")
submission.head()

submission: (12218, 2)


Unnamed: 0,sequence,state
0,25968,0
1,25969,0
2,25970,0
3,25971,0
4,25972,0


## Feature Engineering

In [8]:
def sub_imp(x):
    """Bucket a count into three levels.

    Returns 0 when ``x`` is below 25, 2 when ``x`` is above 95, and 1 for
    everything in between (both 25 and 95 fall into the middle bucket).
    """
    if x < 25:
        return 0
    return 2 if x > 95 else 1

In [9]:
def add_features(df, feature_cols=None):
    """Add per-sequence difference and rolling-window features plus `sub_imp`.

    For every column in ``feature_cols`` the following columns are added,
    all computed within each ``sequence`` group:

    * ``{col}_lead_diff{w}`` / ``{col}_lag_diff{w}`` for w in 1, 2, 3, 6
    * ``{col}_roll_{w}_mean`` / ``_std`` / ``_sum`` for w in 3, 6, 12, 24

    Afterwards all NaNs are replaced by 0 and a ``sub_imp`` column is added:
    the number of distinct sequences of each subject, bucketed by ``sub_imp()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sequence``, ``subject`` and every feature
        column. NOTE: the new feature columns and the NaN fill are written
        into ``df`` itself; only the final merge produces a new frame.
    feature_cols : iterable of str, optional
        Columns to derive features from. Defaults to the module-level
        ``sensor_cols`` list, which was the only behaviour previously.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of the merge) with all added columns.
    """
    if feature_cols is None:
        feature_cols = sensor_cols

    for col in tqdm(feature_cols):
        # Build the per-sequence grouping once per column instead of once per
        # derived feature; neither `sequence` nor `col` changes inside the loop.
        per_sequence = df.groupby('sequence')[col]

        for window in [1,2,3,6]:
            # NOTE(review): the names are historical. "lead_diff" subtracts the
            # value `window` steps EARLIER (shift(+window)) and "lag_diff" the
            # value `window` steps LATER (shift(-window)). Missing neighbours at
            # the sequence edges are treated as 0, so the diff equals the raw value.
            df[f'{col}_lead_diff{window}'] = df[col] - per_sequence.shift(window).fillna(0)
            df[f'{col}_lag_diff{window}'] = df[col] - per_sequence.shift(-1*window).fillna(0)

        for window in [3,6,12,24]:
            # One rolling object serves all three statistics for this window.
            rolled = per_sequence.rolling(window=window, min_periods=1)
            prefix = col+'_roll_'+str(window)

            # reset_index(level=0, drop=True) removes the `sequence` level that
            # the grouped rolling adds, so assignment aligns on the row index.
            df[prefix+'_mean'] = rolled.mean().reset_index(level=0,drop=True)
            df[prefix+'_std'] = rolled.std().reset_index(level=0,drop=True)
            df[prefix+'_sum'] = rolled.sum().reset_index(level=0,drop=True)

    # Replace every remaining NaN (e.g. the std of a single-element window) with 0.
    df.fillna(0, inplace=True)

    # Number of distinct sequences recorded for each subject.
    sub_stat = df[['sequence', 'subject']]\
                .drop_duplicates()\
                .groupby('subject')\
                .agg({'sequence': 'count'})\
                .rename(columns={'sequence': 'count'}).reset_index()

    df = df.merge(sub_stat, on='subject', how='left')
    df['sub_imp'] = df['count'].apply(sub_imp)
    df.drop('count', axis=1, inplace=True)

    return df

In [10]:
# Names of the raw sensor columns, taken before any derived feature is added.
sensor_cols = list(filter(lambda name: 'sensor' in name, train.columns))

# Derive the features for each dataset; the frames are rebound one after the
# other so the previous train frame can be released before test is processed.
train = add_features(train)
test = add_features(test)

print(f"train: {train.shape} \ntest: {test.shape}")

100%|██████████| 13/13 [02:29<00:00, 11.49s/it]
100%|██████████| 13/13 [01:10<00:00,  5.44s/it]


train: (1558080, 277) 
test: (733080, 277)


In [11]:
# Identifier columns are not model inputs; remove them from both frames in place.
id_columns = ["sequence","step","subject"]
for frame in (train, test):
    frame.drop(id_columns, axis=1, inplace=True)

In [12]:
# Map every feature to a normal distribution; the quantiles are learned from
# the training rows only and then applied to both datasets.
scaler = QuantileTransformer(
    random_state=42,
    output_distribution='normal',
    n_quantiles=2000,
)
scaler.fit(train)

# From here on `train` and `test` hold the transformer's array output.
train = scaler.transform(train)
test = scaler.transform(test)

In [13]:
# Fold the flat (rows, features) matrices into (sequences, 60 steps, features).
# This relies on the rows still being ordered by sequence and step.
n_features = train.shape[-1]
train = train.reshape(-1, 60, n_features).copy()
test = test.reshape(-1, 60, n_features).copy()

# Labels become a column vector with one row per sequence.
train_labels = train_labels['state'].values.reshape(-1,1)
print(f"train: {train.shape} \ntest: {test.shape} \ntrain_labels {train_labels.shape}")

train: (25968, 60, 274) 
test: (12218, 60, 274) 
train_labels (25968, 1)


In [14]:
# The fitted transformer is no longer needed; drop the reference and run the
# garbage collector (its return value is what the cell displays).
del scaler
gc.collect()

0

## Hardware config

In [15]:
# Pick the distribution strategy and batch size: use a TPU when one can be
# resolved, otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.TPUStrategy` replaces the deprecated
    # `tf.distribute.experimental.TPUStrategy` alias.
    strategy = tf.distribute.TPUStrategy(tpu)
    BATCH_SIZE = strategy.num_replicas_in_sync * 32
    print("Running on TPU:", tpu.master())

except ValueError:
    # No TPU could be resolved: use the default strategy.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 8
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

print(f"Batch Size: {BATCH_SIZE}")

Running on 1 replicas
Batch Size: 8


## Keras Model

In [16]:
def dnn_model(input_shape=None):
    """Build the (uncompiled) bidirectional LSTM/GRU binary classifier.

    Architecture: a stack of five bidirectional LSTMs (768, 512, 384, 256,
    128 units) with a parallel branch of three bidirectional GRUs that is
    multiplied element-wise with the LSTM outputs of matching width. Selected
    LSTM and GRU outputs are concatenated on the feature axis, max-pooled
    over time and passed through two SELU dense layers to a sigmoid unit.

    Parameters
    ----------
    input_shape : tuple, optional
        Shape of one sample, ``(steps, features)``. When omitted it is taken
        from the last two dimensions of the module-level ``train`` array,
        which was the only behaviour previously.

    Returns
    -------
    tensorflow.keras.Model
        Model named ``TPS_Apr22_TFv2_Model``; the caller must compile it.
    """
    if input_shape is None:
        input_shape = train.shape[-2:]

    # NOTE: layers are created in the same order as before; the order
    # determines the auto-generated layer names and seeded initialisation.
    x_input = Input(shape=input_shape)

    # LSTM stack; each Bidirectional wrapper outputs 2 * units features per step.
    x1 = Bidirectional(LSTM(units=768, return_sequences=True))(x_input)
    x2 = Bidirectional(LSTM(units=512, return_sequences=True))(x1)
    x3 = Bidirectional(LSTM(units=384, return_sequences=True))(x2)
    x4 = Bidirectional(LSTM(units=256, return_sequences=True))(x3)
    x5 = Bidirectional(LSTM(units=128, return_sequences=True))(x4)

    # GRU branch; unit counts mirror x3 / x4 so the Multiply inputs match in width.
    z1 = Bidirectional(GRU(units=384, return_sequences=True))(x2)
    z2 = Multiply()([x3, z1])

    z3 = Bidirectional(GRU(units=256, return_sequences=True))(z2)
    z4 = Multiply()([x4, z3])

    z5 = Bidirectional(GRU(units=128, return_sequences=True))(z4)

    # Merge both branches on the feature axis and collapse the time axis.
    x = Concatenate(axis=2)([x1, x3, x5, z1, z3, z5])
    x = GlobalMaxPooling1D()(x)

    x = Dense(units=1024, activation='selu')(x)
    x = Dense(units=128, activation='selu')(x)

    # Single sigmoid unit for the binary `state` target.
    x_output = Dense(units=1, activation='sigmoid')(x)

    model = Model(inputs=x_input, outputs=x_output,
                  name='TPS_Apr22_TFv2_Model')
    return model

In [17]:
# Build one model instance only to inspect its layers and parameter counts.
model = dnn_model()
model.summary()

Model: "TPS_Apr22_TFv2_Model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 60, 274)]    0           []                               
                                                                                                  
 bidirectional (Bidirectional)  (None, 60, 1536)     6408192     ['input_1[0][0]']                
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 60, 1024)    8392704     ['bidirectional[0][0]']          
 )                                                                                                
                                                                                                  
 bidirectional_2 (Bidirectional  (None, 60, 768)     4328448     ['bidirectiona

In [18]:
# Render the model graph to a PNG file.
# NOTE(review): the captured output shows this needs `pydot` and graphviz,
# which were not installed when the notebook was last run.
plot_model(
    model, 
    to_file='TPS_Apr22_TFv2_Model.png', 
    show_shapes=True,
    show_layer_names=True
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [None]:
# 10-fold stratified cross-validation: train one model per fold, score it on
# the held-out fold and collect its predictions for the test set.
with strategy.scope():

    VERBOSE = 1
    test_preds = []
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train_labels)):
        X_train, X_valid = train[train_idx], train[valid_idx]
        y_train, y_valid = train_labels[train_idx], train_labels[valid_idx]

        model = dnn_model()
        # Name the metric explicitly. With the string 'AUC' Keras auto-names
        # the metric, and later instances in the same session get a numeric
        # suffix (auc_1, auc_2, ...). From the second fold on "val_auc" would
        # then be missing from the logs, so no checkpoint would be written
        # and early stopping would never trigger.
        model.compile(optimizer='adam', loss="binary_crossentropy",
                      metrics=[tf.keras.metrics.AUC(name='auc')])

        # mode="max" is required: with the default mode='auto' this callback
        # treats any monitor without 'acc' in its name as a quantity to
        # minimise, which would reduce the learning rate while AUC improves.
        lr = ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.75,
                               patience=4, verbose=VERBOSE)

        checkpoint_path = f'./TPS_Apr22_TFv2_Model_{fold+1}C.h5'

        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        chk_point = ModelCheckpoint(checkpoint_path, options=save_locally,
                                    monitor='val_auc', verbose=VERBOSE,
                                    save_best_only=True, mode='max')

        es = EarlyStopping(monitor="val_auc", patience=10,
                           verbose=VERBOSE, mode="max",
                           restore_best_weights=True)

        model.fit(X_train, y_train,
                  validation_data=(X_valid, y_valid),
                  epochs=100,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE,
                  callbacks=[lr, chk_point, es])

        # Reload the best checkpoint of this fold before predicting.
        load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
        model = load_model(checkpoint_path, options=load_locally)

        y_pred = model.predict(X_valid, batch_size=BATCH_SIZE, verbose=VERBOSE).squeeze()
        score = roc_auc_score(y_valid, y_pred)
        print(f"Fold-{fold+1} | OOF Score: {score}")

        test_preds.append(model.predict(test, batch_size=BATCH_SIZE, verbose=VERBOSE).squeeze())

        # Release per-fold objects before the next model is built.
        del model, y_pred
        del X_train, X_valid
        del y_train, y_valid
        gc.collect()

Epoch 1/100
  17/2922 [..............................] - ETA: 4:02 - loss: 2.7696 - auc: 0.5376

## Create submission file

In [None]:
# Average the per-fold test predictions (one row per fold) column-wise.
fold_predictions = np.vstack(test_preds)
submission["state"] = fold_predictions.mean(axis=0)
submission.to_csv('mean_submission.csv', index=False)
submission.head()

In [None]:
# Take the column-wise median of the per-fold test predictions (one row per fold).
stacked_preds = np.vstack(test_preds)
submission["state"] = np.median(stacked_preds, axis=0)
submission.to_csv('median_submission.csv', index=False)
submission.head()

In [None]:
# Good Day!!