In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
f1_scorer = make_scorer(f1_score, average="macro")
from sklearn.metrics import roc_auc_score

from torch.optim import Adam
# because we're in a nested folder, add the parent directory to the import path
sys.path.append('../')

from utils.preprocess import *
from models.AEAD import AEAD

## TAKE AWAY FROM WEEKEND: WINDOW-LEVEL NORMALIZATION PLAYS VERY NICELY WITH NEURAL NETWORKS. 

In [5]:
# Base directory of the data set, given relative to this notebook's folder.
data_dir = "../../for_students/data_v2"

In [6]:
def load_data(data_dir, window_size, window_func, n_train=3, n_test=6):
    '''
    Read the labelled training series and the unlabelled test series and
    turn each of them into windows.

    data_dir (str): Base directory of data. Training files are read from
                    <data_dir>/training/training_<i>.csv and test files
                    from <data_dir>/dataset_<i>.csv, with i starting at 1.
    window_size (int): Window size for input examples
    window_func (callable): Windowing function, called as
                            window_func(values, window_size) on one series
                            at a time (this notebook passes window_offset
                            from utils.preprocess). It has to return one
                            window per input row, otherwise the windows no
                            longer line up with the labels and the length
                            check below fails.
    n_train (int): Number of training files to read (default 3).
    n_test (int): Number of test files to read (default 6).

    Returns a tuple (x_train, y_train, x_test): the concatenated training
    windows, the concatenated training labels and the concatenated test
    windows. The resulting shapes are also printed.
    '''
    train_dir = os.path.join(data_dir, 'training')
    train_str = os.path.join(train_dir, 'training_{}.csv')
    test_str = os.path.join(data_dir, 'dataset_{}.csv')

    train_xs = []
    train_ys = []
    for i in range(1, n_train + 1):
        train_df_i = pd.read_csv(train_str.format(i))
        train_xs.append(window_func(train_df_i.kpi_value.values, window_size))
        train_ys.append(train_df_i.anomaly_label.values)
    x_train = np.concatenate(train_xs)
    y_train = np.concatenate(train_ys)
    assert len(x_train) == len(y_train), \
        "window_func must return exactly one window per labelled row"

    test_xs = []
    for i in range(1, n_test + 1):
        test_df_i = pd.read_csv(test_str.format(i))
        # The test series is taken from the second column by position.
        # iloc keeps that column's own dtype; DataFrame.values would first
        # collapse all columns to one common (possibly object) dtype.
        test_values = test_df_i.iloc[:, 1].to_numpy()
        test_xs.append(window_func(test_values, window_size))
    x_test = np.concatenate(test_xs)
    print("Train x shape: {}\nTrain y shape: {}\n\nTest x shape: {}".format(x_train.shape, y_train.shape, x_test.shape))
    return x_train, y_train, x_test

def window_min_max(x):
    '''
    Min-max scale every window (row) of x independently to the range [0, 1].

    x (array-like): 2-D array with one window per row.

    Returns a new floating point array of the same shape; the input is left
    untouched. A window whose values are all identical has no range to scale
    by and is returned as all zeros, which avoids the 0/0 division that
    would otherwise produce NaNs.
    '''
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer input would truncate the scaled values back to 0/1.
        x = x.astype(float)

    x_min = x.min(axis=1, keepdims=True)
    x_range = x.max(axis=1, keepdims=True) - x_min

    scaled = np.zeros_like(x)
    # Only divide where the window actually has a spread; rows with zero
    # range keep the zeros they were initialised with.
    np.divide(x - x_min, x_range, out=scaled, where=x_range > 0)
    return scaled

### Window Data

In [7]:
# Cut every series into windows of length 100, then rescale each window on
# its own (window-level min-max normalisation).
x_train, y_train, x_test = load_data(data_dir, 100, window_offset)
x_train, x_test = window_min_max(x_train), window_min_max(x_test)

Train x shape: (12096, 100)
Train y shape: (12096,)

Test x shape: (39476, 100)


In [8]:
# Cross-validation indices: five stratified folds taken in file order
# (no shuffling), so every model below is scored on the same splits.
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [43]:
## We may be able to get min max parameters from training set. We should do cross validation also. 
# minmax_scalar = MinMaxScaler()
# minmax_scalar.fit(x_train_normal)
# x_train_normal_min_max = minmax_scalar.transform(x_train_normal)
# x_train_min_max = minmax_scalar.transform(x_train)

### OCSVM

In [9]:
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold, y_train_fold = x_train[train_index], y_train[train_index]
    x_val_fold, y_val_fold = x_train[val_index], y_train[val_index]

    # The one-class SVM is fitted on the label-0 (normal) training windows only.
    x_train_normal = x_train_fold[y_train_fold == 0]
    ocsvm = OneClassSVM(gamma='auto')
    ocsvm.fit(x_train_normal)

    # Score the held-out validation fold. predict() yields +1 for inliers and
    # -1 for outliers; recode that to 0 = normal, 1 = anomaly to match the labels.
    y_pred_ocsvm = np.where(ocsvm.predict(x_val_fold) == -1, 1, 0)
    ocsvm_f1 = f1_score(y_val_fold, y_pred_ocsvm, average='macro')
    print(ocsvm_f1)

0.44234858809509703
0.3060673555749642
0.021835826930853213
0.44661543494676426
0.2661503285140415


### iForest

In [10]:
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    x_train_normal = x_train_fold[y_train_fold == 0]

    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]

    # Train on normal data only. An isolation forest is built from random
    # sub-samples and random splits, so the seed is pinned to make the fold
    # scores repeatable from one run of the notebook to the next.
    iforest = IsolationForest(random_state=0).fit(x_train_normal)

    # predict() yields +1 for inliers and -1 for outliers; recode that to
    # 0 = normal, 1 = anomaly to match the labels.
    y_pred_iforest = np.where(iforest.predict(x_val_fold) == -1, 1, 0)
    iforest_f1 = f1_score(y_val_fold, y_pred_iforest, average='macro')
    print(iforest_f1)

0.019008264462809916
0.2583036974704906
0.7870261132925992
0.2636592729359515
0.02223120452708165


### Normal AE (Without finetuning)

In [11]:
# Validation ROC AUC of the plain autoencoder, one entry per fold.
aead_aucs = []
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    # Plain autoencoder: only label-0 (normal) windows are used for fitting.
    normal_mask = y_train_fold == 0
    x_train_normal = x_train_fold[normal_mask]
    y_train_normal = y_train_fold[normal_mask]

    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]

    # No per-fold window_min_max here: x_train is already window-normalised
    # right after loading, so rescaling the folds again changes nothing. The
    # former rescale of x_train_fold also ran after x_train_normal had been
    # copied out of it, so its result never reached the model.

    # NOTE(review): the positional AEAD arguments are presumably
    # (input size, hidden size, learning rate, epochs, device, optimizer)
    # - confirm against models.AEAD.
    aead = AEAD(100, 256, 0.0001, 20, 'cpu', Adam).fit(x_train_normal, y_train_normal)
    y_pred_aead = aead.predict(x_val_fold)
    val_auc = roc_auc_score(y_val_fold, y_pred_aead)
    # Number of anomalous windows in this validation fold, then its AUC.
    print(sum(y_val_fold))
    aead_aucs.append(val_auc)
    print(val_auc)
print(np.mean(aead_aucs))

Train Epoch: 1	Loss: 0.210391
Train Epoch: 2	Loss: 0.185857
Train Epoch: 3	Loss: 0.120992
Train Epoch: 4	Loss: 0.074716
Train Epoch: 5	Loss: 0.065574
Train Epoch: 6	Loss: 0.063741
Train Epoch: 7	Loss: 0.063076
Train Epoch: 8	Loss: 0.062695
Train Epoch: 9	Loss: 0.062409
Train Epoch: 10	Loss: 0.062172
Train Epoch: 11	Loss: 0.062006
Train Epoch: 12	Loss: 0.061868
Train Epoch: 13	Loss: 0.061753
Train Epoch: 14	Loss: 0.061636
Train Epoch: 15	Loss: 0.061505
Train Epoch: 16	Loss: 0.061363
Train Epoch: 17	Loss: 0.061200
Train Epoch: 18	Loss: 0.060997
Train Epoch: 19	Loss: 0.060769
Train Epoch: 20	Loss: 0.060507
55
0.48581203152027663
Train Epoch: 1	Loss: 0.233726
Train Epoch: 2	Loss: 0.211161
Train Epoch: 3	Loss: 0.143856
Train Epoch: 4	Loss: 0.078989
Train Epoch: 5	Loss: 0.069230
Train Epoch: 6	Loss: 0.067862
Train Epoch: 7	Loss: 0.067334
Train Epoch: 8	Loss: 0.066995
Train Epoch: 9	Loss: 0.066754
Train Epoch: 10	Loss: 0.066550
Train Epoch: 11	Loss: 0.066389
Train Epoch: 12	Loss: 0.066218
Tra

## AE with positive penalty (Fancy cost function)

## FANCY COST FUNCTION 
### (I don't know if this exists already, I thought of it while lazing around the house) I think it was inspired by this paper: https://papers.nips.cc/paper/1998/file/b710915795b9e9c02cf10d6d2bdb688c-Paper.pdf ...maybe...anyway, it goes like this:

$\Delta = \frac{1}{n}\sum^{n}_{i=1}(X_i - \hat{X}_i)^2$ => i.e. the normal mean squared error used for standard AEs
    
- where $n$ is the number of examples, $X$ is the input, and $\hat{X}$ is the autoencoder's reconstruction of it.

$(1-y)\Delta - y\Delta$
    
- where $y$ is the ground truth binary label. 

This looks a little similar to binary cross entropy but is not for classification. Basically, normal examples ($y=0$) are trained as usual to be reconstructed well, while the autoencoder is penalised for reconstructing anomalous examples ($y=1$) well. The algorithms below need thresholds for F1 scores, but I'll work that out later.

In [13]:
# Validation ROC AUC of the penalised autoencoder, one entry per fold.
aead_aucs = []
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold, y_train_fold = x_train[train_index], y_train[train_index]
    x_val_fold, y_val_fold = x_train[val_index], y_train[val_index]

    # Unlike the plain autoencoder, this variant is fitted on the whole fold
    # (normal and anomalous windows) together with the labels.
    aead = AEAD(
        100, 256, 0.0001, 20, 'cpu', Adam, normal_only=False
    ).fit(x_train_fold, y_train_fold)

    y_pred_aead = aead.predict(x_val_fold)
    val_auc = roc_auc_score(y_val_fold, y_pred_aead)
    aead_aucs.append(val_auc)

    # Number of anomalous windows in this validation fold, then its AUC.
    print(sum(y_val_fold ))
    print(val_auc)
print(np.mean(aead_aucs))

Train Epoch: 1	Loss: 0.146212	Train auc: 0.572925
Train Epoch: 2	Loss: 0.131324	Train auc: 0.572902
Train Epoch: 3	Loss: 0.091672	Train auc: 0.843739
Train Epoch: 4	Loss: 0.057077	Train auc: 0.984190
Train Epoch: 5	Loss: 0.050393	Train auc: 0.986628
Train Epoch: 6	Loss: 0.048898	Train auc: 0.986744
Train Epoch: 7	Loss: 0.048212	Train auc: 0.986908
Train Epoch: 8	Loss: 0.047811	Train auc: 0.987046
Train Epoch: 9	Loss: 0.047531	Train auc: 0.987078
Train Epoch: 10	Loss: 0.047372	Train auc: 0.987086
Train Epoch: 11	Loss: 0.047242	Train auc: 0.987068
Train Epoch: 12	Loss: 0.047165	Train auc: 0.987076
Train Epoch: 13	Loss: 0.047043	Train auc: 0.987007
Train Epoch: 14	Loss: 0.046961	Train auc: 0.987020
Train Epoch: 15	Loss: 0.046898	Train auc: 0.987120
Train Epoch: 16	Loss: 0.046772	Train auc: 0.987177
Train Epoch: 17	Loss: 0.046655	Train auc: 0.987416
Train Epoch: 18	Loss: 0.046497	Train auc: 0.987644
Train Epoch: 19	Loss: 0.046330	Train auc: 0.988289
Train Epoch: 20	Loss: 0.046107	Train auc