In [1]:
import os, gc, random, time
import pandas as pd
import datatable as dt
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

# Processing

In [2]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Integer columns (signed or unsigned NumPy dtypes) are cast to the
      smallest of int8 / int16 / int32 whose range contains the column's
      min and max; columns that need 64 bits are left unchanged.
    * Float columns are cast to float16 if their range fits, otherwise to
      float32 if their range fits, otherwise left unchanged.  Only the value
      *range* is checked, so precision may be lost by the downcast.
    * ``object`` columns are converted to ``category``.
    * Any other dtype (bool, datetime, categorical, pandas extension dtypes)
      is left untouched.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, ...) are not np.dtype
        # instances and have no reliable `kind`; leave them alone.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Try the narrowest signed type first; bounds are inclusive so
            # that e.g. -128 and 127 still fit in int8.
            for target in (np.int8, np.int16, np.int32):
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # An all-NaN column yields NaN min/max; every comparison is then
            # False and the column keeps its dtype.
            for target in (np.float16, np.float32):
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break

    end_memory = df.memory_usage().sum() / 1024 ** 2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard against a zero-sized frame to avoid dividing by zero.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction}")
    return df

In [3]:
%%time
# Load the training CSV with datatable and convert it to a pandas DataFrame.
train = dt.fread('/kaggle/working/input/train.csv').to_pandas()
# Keep only rows with date > 85, then only rows with a positive weight.
train = train.query('date > 85').reset_index(drop=True)
train = train.loc[train.weight > 0].reset_index(drop = True)

# Every column whose name contains 'feature' is a model input.
features = [c for c in train.columns if 'feature' in c]
# Mean-impute missing values in all feature columns except the first one
# (features[0] is skipped — presumably it has no NaNs; TODO confirm).
f_mean = train[features[1:]].mean()
train[features[1:]] = train[features[1:]].fillna(f_mean)
# Persist the imputation means as a plain array in f_mean.npy
# (presumably for reuse at inference time — confirm against the caller).
f_mean = f_mean.values
np.save('f_mean.npy', f_mean)

# Binary target: 1 when resp is strictly positive, else 0.
train['action'] = (train['resp'] > 0).astype('int')
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']

# Optional dtype downcasting, currently disabled:
# train = reduce_memory_usage(train)

CPU times: user 2min 32s, sys: 16.5 s, total: 2min 49s
Wall time: 11.4 s


In [34]:
def create_lstm(lookback, num_columns, num_labels, head_hidden_units, lstm_units, tail_hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build and compile a Dense -> LSTM -> Dense binary classifier.

    Architecture, in order:
      1. BatchNorm + Dropout on the ``(lookback, num_columns)`` input.
      2. One Dense/BatchNorm/swish/Dropout group per entry of
         ``head_hidden_units`` (applied to every time step).
      3. One LSTM/BatchNorm/Dropout group per entry of ``lstm_units``.
         Every LSTM except the last returns the full sequence so that the
         next LSTM still receives a 3-D input; the last one returns only its
         final state.
      4. One Dense/BatchNorm/swish/Dropout group per entry of
         ``tail_hidden_units``.
      5. A Dense layer with ``num_labels`` units followed by a sigmoid.

    Parameters
    ----------
    lookback : int
        Number of time steps in each input window.
    num_columns : int
        Number of features per time step.
    num_labels : int
        Number of sigmoid outputs.
    head_hidden_units, lstm_units, tail_hidden_units : sequence of int
        Layer widths for the three stages above.
    dropout_rates : sequence of float
        One rate for the input dropout plus one per head/LSTM/tail layer,
        consumed in that order.
    label_smoothing : float
        Passed to ``BinaryCrossentropy``.
    learning_rate : float
        Passed to the ``RectifiedAdam`` optimizer.

    Returns
    -------
    tf.keras.Model
        The compiled model, tracking an AUC metric named ``'AUC'``.

    Raises
    ------
    AssertionError
        If ``len(dropout_rates)`` does not equal the number of layers + 1.

    NOTE(review): the "Cannot convert a symbolic Tensor ... to a numpy array"
    error recorded in this notebook's output is raised while building the LSTM
    layer; it is commonly reported for incompatible NumPy/TensorFlow versions
    rather than for a bug in this function — verify the installed versions.
    """
    assert (len(dropout_rates) == 1 + len(head_hidden_units) + len(lstm_units) + len(tail_hidden_units)), "number of dropout_rates is not equal to number of layers!"

    # Rates are consumed strictly in layer order: input, head, LSTM, tail.
    rates = iter(dropout_rates)

    inp = tf.keras.layers.Input(shape = (lookback, num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(next(rates))(x)

    for units in head_hidden_units:
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(next(rates))(x)

    last_lstm = len(lstm_units) - 1
    for i, units in enumerate(lstm_units):
        # Intermediate LSTMs must emit the whole sequence, otherwise the next
        # LSTM would receive a 2-D tensor and fail to build.
        x = tf.keras.layers.LSTM(units, return_sequences=(i < last_lstm))(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(next(rates))(x)

    for units in tail_hidden_units:
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(next(rates))(x)

    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation('sigmoid')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer = tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics = [tf.keras.metrics.AUC(name='AUC')],
    )
    return model

In [37]:
def prepare_dataset(X, y, window_length, batch_size, mode='train'):
    """Turn row-wise data into a batched ``tf.data.Dataset`` of sliding windows.

    ``X`` is sliced row by row, grouped into overlapping windows of
    ``window_length`` consecutive rows (stride 1, incomplete trailing windows
    dropped), and each window is emitted as one element.

    Parameters
    ----------
    X : array-like
        Feature rows; anything accepted by ``Dataset.from_tensor_slices``.
    y : array-like
        Targets aligned with the rows of ``X``. Only read in ``'train'``
        mode, where window ``k`` is paired with ``y[k + window_length - 1]``
        (the target of the window's last row).
    window_length : int
        Number of consecutive rows per window.
    batch_size : int
        Number of windows per batch.
    mode : {'train', 'predict'}
        ``'train'`` yields shuffled ``(window, target)`` batches;
        ``'predict'`` yields unshuffled window batches in original order.

    Returns
    -------
    tf.data.Dataset
        The batched dataset with a prefetch buffer of 30 batches.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'train'`` nor ``'predict'``.
    """
    # Validate first so a bad mode fails loudly instead of leaving `ds` unset.
    if mode not in ('train', 'predict'):
        raise ValueError(f"mode must be 'train' or 'predict', got {mode!r}")

    x_ds = tf.data.Dataset.from_tensor_slices(X)
    x_ds = x_ds.window(window_length, shift=1, drop_remainder=True)
    # Each window is itself a dataset; batching it by its own length
    # collapses it into a single (window_length, ...) tensor.
    x_ds = x_ds.flat_map(lambda window: window.batch(window_length))

    if mode == 'train':
        y_ds = tf.data.Dataset.from_tensor_slices(y[window_length-1:])
        ds = tf.data.Dataset.zip((x_ds, y_ds))
        ds = ds.shuffle(10000).batch(batch_size)
    else:  # mode == 'predict'
        ds = x_ds.batch(batch_size)

    return ds.prefetch(30)

In [38]:
# Grouped cross-validation of the LSTM, one group per trading date.
#
# NOTE: this cell reads lookback, batch_size, head_hidden_units, lstm_units,
# tail_hidden_units, dropout_rates, label_smoothing and learning_rate from the
# notebook namespace; they must be assigned before this cell is run.
# NOTE(review): GroupKFold keeps each date in a single fold but does not
# preserve time order, and the sliding windows built by prepare_dataset run
# over whatever rows land in a fold — confirm this is acceptable.
oof = np.zeros(len(train['action']))
gkf = GroupKFold(n_splits = 5)

val_idx = []
oof_scores = []

for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):

    print("#"*75)
    print(f"Fold{fold}")

    X_tr, X_val = train.loc[tr, features], train.loc[te, features]
    y_tr, y_val = train.loc[tr, 'action'], train.loc[te, 'action']

    print(f"train split shape is {X_tr.shape}, validation split shape is {X_val.shape}")
    train_steps = int(len(tr)//batch_size)
    val_steps = int(len(te)//batch_size)
    print(f"batch_size is {batch_size}")
    print(f"train_steps is {train_steps}, val_steps is {val_steps}")

    dataset = prepare_dataset(X_tr,y_tr,lookback,batch_size)
    val_dataset = prepare_dataset(X_val,y_val,lookback,batch_size)

    print('Training...')
    ckp_path = f'JS_LSTM_Model_{fold}.hdf5'
    model = create_lstm(lookback, len(features), 1, head_hidden_units,lstm_units,tail_hidden_units, dropout_rates, label_smoothing, learning_rate)
    rlr = ReduceLROnPlateau(monitor = 'val_AUC', factor = 0.2, patience = 5, verbose = 1, 
                            min_delta = 1e-4, mode = 'max')
    ckp = ModelCheckpoint(ckp_path, monitor = 'val_AUC', verbose = 1, 
                          save_best_only = True, save_weights_only = True, mode = 'max')
    es = EarlyStopping(monitor = 'val_AUC', min_delta = 1e-4, patience = 12, mode = 'max', 
                       baseline = None, restore_best_weights = True, verbose = 0)
    # The datasets are already batched, so batch_size must not be passed to fit().
    model.fit(dataset, validation_data = val_dataset, epochs = 1000, 
              callbacks = [rlr, ckp, es], verbose = 2)

    # Rebuild the model with a 100x smaller learning rate and start from the
    # best checkpoint of the main training run.
    model = create_lstm(lookback, len(features), 1, head_hidden_units,lstm_units,tail_hidden_units, dropout_rates, label_smoothing, learning_rate / 100)
    model.load_weights(ckp_path)

    # Predict out-of-fold BEFORE fine-tuning: the fine-tuning step below trains
    # on this very fold, so predicting afterwards would leak the labels into
    # the OOF predictions and inflate the reported AUC.
    val_set = prepare_dataset(X_val,[],lookback,batch_size, mode='predict')
    val_pred = model.predict(val_set).ravel()

    # The first lookback-1 rows of the fold have no complete window and
    # therefore no prediction.
    oof[te[lookback-1:]] += val_pred
    val_idx.append(te[lookback-1:])
    score = roc_auc_score(y_val[lookback-1:], oof[te[lookback-1:]])
    oof_scores.append(score)
    print(f'Fold {fold} ROC AUC:\t', score)

    print("Fine tuning...")

    # Finetune 5 epochs on validation set with small learning rate
    ckp_path_ft = f'JS_LSTM_Model_{fold}_finetuning.hdf5'
    ckp_ft = ModelCheckpoint(ckp_path_ft, monitor = 'AUC', verbose = 1,
                             save_best_only = True, save_weights_only = True, mode = 'max')
    model.fit(val_dataset, epochs = 5, callbacks = [ckp_ft], verbose = 2)
    # Overwrites the checkpoint written by ckp_ft with the final-epoch weights.
    model.save_weights(ckp_path_ft)

    # Free graph state and large per-fold objects before the next fold.
    K.clear_session()
    del model, dataset, val_dataset, val_pred, val_set, X_tr, X_val, y_tr, y_val
    gc.collect()

print("#"*75)

###########################################################################
Fold0
train split shape is (1256716, 130), validation split shape is (314699, 130)
batch_size is 4096
train_steps is 306, val_steps is 76
Training...


NotImplementedError: Cannot convert a symbolic Tensor (lstm_9/strided_slice:0) to a numpy array.

In [33]:
# Hyperparameters for the windowed LSTM.
lookback = 10                     # rows per input window
batch_size = 4096                 # windows per batch
head_hidden_units = [256]
lstm_units = [64]
tail_hidden_units = [512, 394]
# One rate for the input dropout plus one per head / LSTM / tail layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3
epoch = 1000                      # upper bound; early stopping ends training sooner

oof = np.zeros(len(train['action']))
gkf = GroupKFold(n_splits=5)
val_idx = []
oof_scores = []

# Grouped cross-validation, one group per trading date.
for fold, (tr, te) in enumerate(gkf.split(
    train[features].values,
    train['action'].values,
    train['date'].values,
)):
    X_tr, y_tr = train.loc[tr, features].values, train.loc[tr, 'action'].values
    X_te, y_te = train.loc[te, features].values, train.loc[te, 'action'].values
    
    dataset = prepare_dataset(X_tr, y_tr, lookback, batch_size)
    val_dataset = prepare_dataset(X_te, y_te, lookback, batch_size)
    
    ckpt_path = f"JS_LSTM_MODEL_{fold}.hdf5"
    
    model = create_lstm(
        lookback=lookback,
        num_columns=len(features),
        num_labels=1,
        head_hidden_units=head_hidden_units,
        lstm_units=lstm_units,
        tail_hidden_units=tail_hidden_units,
        dropout_rates=dropout_rates,
        label_smoothing=label_smoothing,
        learning_rate=learning_rate
    )
    
    # save_best_only / save_weights_only are ModelCheckpoint options, not
    # ReduceLROnPlateau options; use min_delta as the improvement threshold.
    rlr = ReduceLROnPlateau(
        monitor='val_AUC',
        factor=0.2,
        patience=5,
        verbose=1,
        min_delta=1e-4,
        mode='max'
    )
    
    ckp = ModelCheckpoint(
        ckpt_path,
        monitor='val_AUC',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='max'
    )
    
    es = EarlyStopping(
        monitor='val_AUC',
        min_delta=1e-4,
        patience=12,
        mode='max',
        baseline=None,
        restore_best_weights=True,
        verbose=0
    )
    
    # The datasets are already batched, so batch_size is not passed to fit().
    model.fit(
        dataset,
        validation_data=val_dataset,
        epochs=epoch,
        callbacks=[rlr, ckp, es],
        verbose=2
    )
    
    # Rebuild the model with a 100x smaller learning rate and start from the
    # best checkpoint of the main training run.
    model = create_lstm(
        lookback=lookback,
        num_columns=len(features),
        num_labels=1,
        head_hidden_units=head_hidden_units,
        lstm_units=lstm_units,
        tail_hidden_units=tail_hidden_units,
        dropout_rates=dropout_rates,
        label_smoothing=label_smoothing,
        learning_rate=learning_rate / 100,
    )
    model.load_weights(ckpt_path)
    
    # Predict out-of-fold BEFORE fine-tuning: the fine-tuning step below trains
    # on this very fold, so predicting afterwards would leak the labels into
    # the OOF predictions and inflate the reported AUC.
    val_set = prepare_dataset(X_te, [], lookback, batch_size, mode='predict')
    val_pred = model.predict(val_set).ravel()
    print(val_pred.shape)
    
    # The first lookback - 1 rows of the fold have no complete window and
    # therefore no prediction.
    oof[te[lookback - 1:]] += val_pred
    val_idx.append(te[lookback - 1:])
    
    score = roc_auc_score(y_te[lookback - 1:], oof[te[lookback - 1:]])
    oof_scores.append(score)
    print(f"Fold {fold} ROC AUC:\t", score)
    
    print('Fine tuning...')
    
    # Fine-tune for 5 epochs on the validation fold with the small learning rate.
    ckpt_path_ft = f"JS_LSTM_MODEL_{fold}_finetuning.hdf5"
    ckpt_ft = ModelCheckpoint(
        ckpt_path_ft,
        monitor='AUC',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='max'
    )
    model.fit(
        val_dataset,
        epochs=5,
        callbacks=[ckpt_ft],
        verbose=2
    )
    
    # Free graph state and large per-fold objects before the next fold.
    K.clear_session()
    
    del model, dataset, val_dataset, val_pred, val_set, X_tr, X_te, y_tr, y_te
    gc.collect()

NotImplementedError: Cannot convert a symbolic Tensor (lstm_8/strided_slice:0) to a numpy array.