# Setup

In [1]:
%load_ext autotime
%matplotlib inline

In [2]:
import pickle

time: 649 µs


In [3]:
import numpy as np
import pandas as pd
import scipy as sp

from scipy import stats

time: 414 ms


# Load the Data

In [4]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load "data.pickle" when it comes from a trusted source.
with open("data.pickle", "rb") as f:
    # The loaded object is indexed with the keys "X", "y" and "names" later on.
    data = pickle.load(f)

time: 1.06 s


In [5]:
# Unpack the loaded container. For X, the first two entries along the second
# axis are dropped and everything from index 2 onward is kept.
names = data["names"]
y = data["y"]
X = data["X"][:, 2:, :]

time: 1.77 ms


# Normalize the Data

In [6]:
def normalize(X):
    """Robustly rescale X along its third axis (axis=2).

    Every slice along axis 2 is centred on its median, divided by its
    interquartile range and then divided by 2. Slices whose IQR is exactly
    zero are divided by 1 instead, so they are centred but not rescaled by
    the IQR.

    Parameters
    ----------
    X : array with at least three dimensions.

    Returns
    -------
    Array of the same shape as X holding the rescaled values.
    """
    center = np.median(X, axis=2, keepdims=True)
    spread = stats.iqr(X, axis=2, keepdims=True)
    # Guard against division by zero for slices with a zero IQR.
    safe_spread = np.where(spread == 0, np.ones_like(spread), spread)
    centered = X - center
    return centered / safe_spread / 2

time: 86.6 ms


In [7]:
# Median/IQR-scaled copy of X; the raw X is kept unchanged for the unscaled features.
X_normalized = normalize(X)

time: 10.6 s


# NN predictions

In [8]:
# Load the neural-network predictions; the AttackHat column is used later as a
# stacking input and Attack holds the known labels.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a saved index;
# passing index_col=0 would drop it — confirm before changing.
df_y = pd.read_csv("cnn.980.full.csv")
df_y.head()

Unnamed: 0,Id,Attack,AttackHat
0,0,0.0,0.644155
1,1,1.0,1.0
2,2,0.0,0.69447
3,3,1.0,1.0
4,4,0.0,0.350756


time: 12.5 ms


In [9]:
from sklearn.metrics import roc_auc_score

time: 137 ms


In [10]:
# ROC AUC of the NN predictions, restricted to rows whose true label is present.
roc_auc_score(
    y_true=df_y.loc[df_y.Attack.notnull(), "Attack"],
    y_score=df_y.loc[df_y.Attack.notnull(), "AttackHat"],
)

0.9800495321959273

time: 105 ms


In [11]:
# NN predictions as a plain NumPy array. `.values` replaces `Series.as_matrix()`,
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
# NOTE(review): later cells index this with i_train / i_test, which assumes the
# CSV rows are in the same order as y — confirm.
nn_pred = df_y.AttackHat.values

time: 87.5 ms


# Statistical features

In [12]:
def make_stat_features(X):
    """Summarise every slice along axis 2 of X with nine statistics.

    The statistics are, in output order: min, max, mean, peak-to-peak range,
    standard deviation, median, interquartile range, maximum absolute
    deviation from the mean and maximum absolute deviation from the median.

    Parameters
    ----------
    X : 3-D array; the statistics are taken over the last axis (axis=2).

    Returns
    -------
    2-D array of shape (X.shape[0], 9 * X.shape[1]). Columns are grouped by
    statistic: first the min of every axis-1 entry, then the max, and so on.
    """
    X_min = X.min(axis=2, keepdims=True)
    X_max = X.max(axis=2, keepdims=True)
    X_mean = X.mean(axis=2, keepdims=True)
    X_ptp = X_max - X_min
    X_std = X.std(axis=2, keepdims=True)
    X_median = np.median(X, axis=2, keepdims=True)
    X_iqr = stats.iqr(X, axis=2, keepdims=True)
    X_mean_diff = np.abs(X - X_mean).max(axis=2, keepdims=True)
    X_median_diff = np.abs(X - X_median).max(axis=2, keepdims=True)

    raw_features = [
        X_min,
        X_max,
        X_mean,
        X_ptp,
        X_std,
        X_median,
        X_iqr,
        X_mean_diff,
        X_median_diff,
    ]

    # Drop only the reduced axis. A bare squeeze() would also remove a size-1
    # first or second axis and collapse the result into a flat vector when X
    # holds a single sample or a single axis-1 entry.
    return np.concatenate(
        [v.squeeze(axis=2) for v in raw_features],
        axis=1,
    )

time: 96.3 ms


In [13]:
# Statistical features computed on the raw (unnormalized) X.
X_features_1 = make_stat_features(X)

time: 12.6 s


In [14]:
# The same statistical features, computed on the median/IQR-normalized X.
X_features_2 = make_stat_features(X_normalized)

time: 12.9 s


# Model

In [15]:
# Rows whose label equals 0 or 1 form the training set; all remaining rows are
# treated as test rows (presumably unlabeled — TODO confirm how they are encoded in y).
i_train = np.flatnonzero(np.any(y[:, None] == [0, 1], axis=1))
i_test = np.flatnonzero(~np.any(y[:, None] == [0, 1], axis=1))

time: 3.55 ms


In [16]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression

time: 159 ms


### Linear model

Evaluation:

In [17]:
# Mean 5-fold ROC AUC of a default logistic regression on the raw-X features.
np.mean(cross_val_score(
    estimator=LogisticRegression(),
    X=X_features_1[i_train],
    y=y[i_train],
    cv=5,
    scoring="roc_auc",
))

0.8649399877381626

time: 1.01 s


In [18]:
# Mean 5-fold ROC AUC of a default logistic regression on the normalized-X features.
np.mean(cross_val_score(
    estimator=LogisticRegression(),
    X=X_features_2[i_train],
    y=y[i_train],
    cv=5,
    scoring="roc_auc",
))

0.7684693925674401

time: 773 ms


Out-of-fold predictions for train:

In [19]:
# Out-of-fold probabilities for the training rows (raw-X features): each row is
# scored by a model that did not see it during fitting. Column 1 of the
# predict_proba output is kept.
lr_pred_1_train = cross_val_predict(
    estimator=LogisticRegression(),
    X=X_features_1[i_train],
    y=y[i_train],
    method="predict_proba",
    cv=5,
)[:, 1]

time: 970 ms


In [20]:
# Out-of-fold probabilities for the training rows (normalized-X features): each
# row is scored by a model that did not see it during fitting. Column 1 of the
# predict_proba output is kept.
lr_pred_2_train = cross_val_predict(
    estimator=LogisticRegression(),
    X=X_features_2[i_train],
    y=y[i_train],
    method="predict_proba",
    cv=5,
)[:, 1]

time: 791 ms


Predictions for test:

In [21]:
# Fit on all training rows (raw-X features), then score the test rows;
# column 1 of the predict_proba output is kept.
lr_pred_1_test = (
    LogisticRegression()
    .fit(X=X_features_1[i_train], y=y[i_train])
    .predict_proba(X=X_features_1[i_test])[:, 1]
)

time: 216 ms


In [22]:
# Fit on all training rows (normalized-X features), then score the test rows;
# column 1 of the predict_proba output is kept.
lr_pred_2_test = (
    LogisticRegression()
    .fit(X=X_features_2[i_train], y=y[i_train])
    .predict_proba(X=X_features_2[i_test])[:, 1]
)

time: 181 ms


  np.exp(prob, prob)


Concatenate train and test predictions:

In [23]:
# Full-length prediction vector aligned with y: out-of-fold values on the
# training rows, fitted-model values on the test rows.
# The buffer is explicitly float: np.zeros_like(y) would inherit y's dtype, and
# an integer-typed y would silently truncate every probability to 0.
lr_pred_1 = np.zeros(y.shape, dtype=np.float64)
lr_pred_1[i_train] = lr_pred_1_train
lr_pred_1[i_test] = lr_pred_1_test

time: 2.27 ms


In [24]:
# Full-length prediction vector aligned with y: out-of-fold values on the
# training rows, fitted-model values on the test rows.
# The buffer is explicitly float: np.zeros_like(y) would inherit y's dtype, and
# an integer-typed y would silently truncate every probability to 0.
lr_pred_2 = np.zeros(y.shape, dtype=np.float64)
lr_pred_2[i_train] = lr_pred_2_train
lr_pred_2[i_test] = lr_pred_2_test

time: 93.1 ms


### LightGBM as a stacker

In [25]:
from lightgbm import LGBMClassifier

time: 98.4 ms


First setup — stack both statistical feature sets together with the logistic-regression and NN predictions:

In [26]:
# Stacker configuration: 80 estimators with 4 leaves each; reg_alpha is set to a
# negligible 1e-6 and reg_lambda to 0. Every other setting is left at the
# library default.
lgbm = LGBMClassifier(
    n_estimators=80,
    num_leaves=4,
    reg_alpha=1e-6,
    reg_lambda=0,
)

time: 91.6 ms


In [27]:
# Mean 5-fold ROC AUC of the stacker on the training rows. Inputs: both
# statistical feature sets plus one column each for the two logistic-regression
# predictions and the NN prediction.
np.mean(cross_val_score(
    estimator=lgbm,
    X=np.column_stack([
        X_features_1[i_train],
        X_features_2[i_train],
        lr_pred_1[i_train],
        lr_pred_2[i_train],
        nn_pred[i_train],
    ]),
    y=y[i_train],
    cv=5,
    scoring="roc_auc",
))

0.9898380022637238

time: 514 ms


In [28]:
# Fit the stacker on every training row, using the same column layout as the
# cross-validation cell. The trailing semicolon suppresses the estimator repr.
lgbm.fit(
    X=np.column_stack([
        X_features_1[i_train],
        X_features_2[i_train],
        lr_pred_1[i_train],
        lr_pred_2[i_train],
        nn_pred[i_train],
    ]),
    y=y[i_train],
);

time: 151 ms


In [29]:
# Score the test rows with the fitted stacker. The column layout must match the
# one used for fitting; column 1 of the predict_proba output is kept.
lgbm_pred_test = lgbm.predict_proba(
    X=np.column_stack([
        X_features_1[i_test],
        X_features_2[i_test],
        lr_pred_1[i_test],
        lr_pred_2[i_test],
        nn_pred[i_test],
    ]),
)[:, 1]

time: 34.2 ms


In [30]:
# Write the test-row predictions as a two-column CSV (Id, Attack) without the index.
pd.DataFrame(
    {"Id": names[i_test], "Attack": lgbm_pred_test},
    columns=["Id", "Attack"],
).to_csv("lgbm.9898.csv", index=False)

time: 101 ms


Second setup — stack only the logistic-regression and NN predictions, without the raw statistical features:

In [31]:
# Stacker configuration: 350 estimators with 4 leaves each and both reg_alpha
# and reg_lambda set to 0. Every other setting is left at the library default.
# NOTE: this rebinds the name `lgbm`, replacing the estimator that was
# configured and fitted earlier in the notebook.
lgbm = LGBMClassifier(
    n_estimators=350,
    num_leaves=4,
    reg_alpha=0,
    reg_lambda=0,
)

time: 135 ms


In [32]:
# Mean 5-fold ROC AUC of the stacker on the training rows, using only the three
# model-level predictions (the statistical feature sets are deliberately left out).
np.mean(cross_val_score(
    estimator=lgbm,
    X=np.column_stack([
        lr_pred_1[i_train],
        lr_pred_2[i_train],
        nn_pred[i_train],
    ]),
    y=y[i_train],
    cv=5,
    scoring="roc_auc",
))

0.9841999151103564

time: 258 ms


In [33]:
# Fit the stacker on every training row using only the three model-level
# predictions (the statistical feature sets are deliberately left out).
# The trailing semicolon suppresses the estimator repr.
lgbm.fit(
    X=np.column_stack([
        lr_pred_1[i_train],
        lr_pred_2[i_train],
        nn_pred[i_train],
    ]),
    y=y[i_train],
);

time: 127 ms


In [34]:
# Score the test rows with the fitted stacker, using only the three model-level
# predictions (the statistical feature sets are deliberately left out).
# Column 1 of the predict_proba output is kept.
lgbm_pred_test = lgbm.predict_proba(
    X=np.column_stack([
        lr_pred_1[i_test],
        lr_pred_2[i_test],
        nn_pred[i_test],
    ]),
)[:, 1]

time: 6.74 ms


In [35]:
# Write the test-row predictions as a two-column CSV (Id, Attack) without the index.
pd.DataFrame(
    {"Id": names[i_test], "Attack": lgbm_pred_test},
    columns=["Id", "Attack"],
).to_csv("lgbm.984.csv", index=False)

time: 109 ms
