# Setup

In [1]:
%load_ext autotime
%matplotlib inline

Hack to load TensorFlow from system libraries:

In [2]:
import sys

time: 878 µs


In [3]:
sys.path.append("/usr/lib/python3.6/site-packages/")

time: 124 ms


(I run everything from virtual environment, but there's already built package of TensorFlow with CUDA support in ArchLinux repo, so I use it instead of building myself.)

In [4]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


time: 4.87 s


Ensure that it will run on GPU:

In [5]:
from tensorflow.python.client import device_lib


def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']


available_gpus = get_available_gpus()
print("Available GPUs:", available_gpus)
assert len(available_gpus) > 0

Available GPUs: ['/device:GPU:0']
time: 1.14 s


Other usual stuff:

In [6]:
import pickle

time: 986 µs


In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

time: 117 ms


In [8]:
import tqdm

time: 318 ms


# Load the Data

In [9]:
with open("chunks_neg_1000_900.pickle", "rb") as f:
    X_neg = pickle.load(f)

time: 1min 3s


In [10]:
with open("chunks_pos_1000_900.pickle", "rb") as f:
    X_pos = pickle.load(f)

time: 52.4 s


In [11]:
y_neg = np.zeros(X_neg.shape[0])
y_pos = np.ones(X_pos.shape[0])

time: 2.43 ms


Full dataset:

In [12]:
X = np.vstack([X_neg, X_pos])
y = np.concatenate((y_neg, y_pos))

time: 2.98 s


Normalize dataset:

In [13]:
def normalize(X):
    X_std = X.std(axis=1, keepdims=True)
    X_mean = X.mean(axis=1, keepdims=True)
    X_std[np.where(X_std == 0)[0]] = 1
    return (X - X_mean) / X_std

time: 5.57 ms


In [14]:
X = normalize(X)

time: 11.2 s


Prepare training data:

In [15]:
from sklearn.model_selection import train_test_split

time: 785 ms


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

time: 2.6 s


Well, "test" is actually "validation".

# The Model

## Build

In [17]:
from keras.layers import (
    Conv1D,
    Dense,
    Flatten,
    Input,
    MaxPooling1D,
    Reshape,
)
from keras.models import Model
from keras.optimizers import Adam

time: 276 ms


Using TensorFlow backend.


In [48]:
def make_model():
    inputs = Input(shape=X.shape[1:])
    
    x = inputs
    x = Reshape(X.shape[1:] + (-1,))(x)
    x = Conv1D(8, 3, activation="relu")(x)
    x = Conv1D(8, 3, activation="relu")(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(16, 3, activation="relu")(x)
    x = Conv1D(16, 3, activation="relu")(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(32, 3, activation="relu")(x)
    x = Conv1D(32, 3, activation="relu")(x)
    x = MaxPooling1D(2)(x)
    x = Flatten()(x)
    x = Dense(100, activation="relu")(x)
    x = Dense(10, activation="relu")(x)
    x = Dense(1, activation="sigmoid")(x)
    outputs = x
    
    model = Model(inputs=inputs, outputs=outputs)
    model.summary()
    
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(lr=0.0008),
    )
    return model

time: 42.8 ms


## Train

Sample for evaluating performance on a training set:

In [19]:
i_train_sample = np.random.choice(range(X_train.shape[0]), size=X_test.shape[0], replace=False)
X_train_sample = X_train[i_train_sample]
y_train_sample = y_train[i_train_sample]

time: 502 ms


In [51]:
BATCH_SIZE = 5000
EPOCHS = 22
LOG_EVERY = 1

time: 1.37 ms


In [22]:
from sklearn.metrics import roc_auc_score

time: 101 ms


***N.b.: actually you should not run the next two code cells, they were used to test different architectures and to find out a number of epochs required to reach the optimum.***

Instantiate the model:

```python
m = make_model()
```

Train the model on a part of train set and evaluate both on train and on a validation (called "test" here):

```python
df_log = pd.DataFrame(columns=["epoch", "train_auc", "test_auc"])

for epoch in range(1, EPOCHS + 1, LOG_EVERY):
    m.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=LOG_EVERY)
    
    yhat_train_sample = m.predict(X_train_sample)
    train_auc = roc_auc_score(y_train_sample, yhat_train_sample)
    
    yhat_test = m.predict(X_test)
    test_auc = roc_auc_score(y_test, yhat_test)
    
    row = (epoch + LOG_EVERY - 1, train_auc, test_auc)
    print("\nEPOCH #{} - train AUC {:.3f} test AUC {:.3f}".format(*row))
    
    df_log.loc[df_log.shape[0]] = row
    
    f, a = plt.subplots(figsize=(10, 6))
    df_log.set_index("epoch").plot(ax=a, marker=".")
    plt.grid()
    plt.show()
```

## Score the real thing

Model to fit on full train:

In [52]:
m = make_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
reshape_5 (Reshape)          (None, 1000, 1)           0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 998, 8)            32        
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 996, 8)            200       
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 498, 8)            0         
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 496, 16)           400       
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 494, 16)           784       
__________

Fit on the full train:

In [53]:
m.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS)

Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22


<keras.callbacks.History at 0x7f41bc3155c0>

time: 1h 2min 36s


Metric on a training set:

In [54]:
roc_auc_score(y, m.predict(X, batch_size=BATCH_SIZE))

0.9998657225194145

time: 57.5 s


Loading the real data:

In [25]:
with open("data.pickle", "rb") as f:
    data_real = pickle.load(f)

time: 20.6 s


In [26]:
X_real = data_real["X"][:, 2:, :]
y_real = data_real["y"]
names_real = data_real["names"]

time: 2.9 ms


In [27]:
X_real.shape, y_real.shape, names_real.shape

((500, 54, 5760), (500,), (500,))

time: 111 ms


Treat each of the time series individually:

In [28]:
X_real = np.vstack(X_real)
X_real.shape

(27000, 5760)

time: 578 ms


Split them into chunks:

In [29]:
def generate_chunks(l, o):
    chunks = []
    start = 0
    while start + l < X_real.shape[-1]:
        chunks.append((start, start + l))
        start += (l - o)
    return chunks


CHUNK_LENGTH = 1000
CHUNK_OVERLAP = 900

chunks = generate_chunks(CHUNK_LENGTH, CHUNK_OVERLAP)

time: 8.52 ms


In [30]:
len(chunks)

48

time: 98.8 ms


In [34]:
i_known = np.where((y_real[:, None] == [0, 1]).max(axis=1))[0]
i_unknown = np.where(~(y_real[:, None] == [0, 1]).max(axis=1))[0]

time: 81.7 ms


Calculate predicts for each chunk:

In [55]:
yhats_real = []

for chunk in tqdm.tqdm(chunks):
    yhat_real_one = m.predict(normalize(X_real[:, chunk[0]:chunk[1]]))
    yhats_real.append(yhat_real_one)

100%|██████████| 48/48 [03:34<00:00,  4.48s/it]

time: 3min 34s





In [56]:
yhat_real = np.hstack(yhats_real)

time: 7.37 ms


Smooth with moving average:

In [61]:
MA_PERIOD = 3

yhat_convolved = np.vstack([
    np.convolve(yhat_real[i], np.ones((MA_PERIOD,)) / MA_PERIOD, mode="valid")
    for i in range(yhat_real.shape[0])
])

time: 376 ms


In [62]:
yhat_convolved.shape

(27000, 46)

time: 2.11 ms


In [64]:
yhat_reshaped = yhat_convolved.max(axis=1).reshape(y_real.shape[0], -1).max(axis=1)

time: 4.13 ms


Evaluate thus created scores on the train dataset:

In [65]:
roc_auc_score(y_real[i_known], yhat_reshaped[i_known])

0.9800495321959273

time: 3.4 ms


For submission:

In [66]:
pd.DataFrame({
    "Id": names_real[i_unknown],
    "Attack": yhat_reshaped[i_unknown],
})[["Id", "Attack"]].to_csv("cnn.980.csv", index=False)

time: 7.5 ms


For stacking:

In [69]:
df_y = pd.DataFrame({
    "y": y_real,
    "yhat": yhat_reshaped,
})

time: 2.06 ms


In [74]:
(
    df_y
    .rename(columns={"y": "Attack", "yhat": "AttackHat"})
    .to_csv("cnn.980.full.csv", index_label="Id")
)

time: 5.36 ms
