In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
f1_scorer = make_scorer(f1_score, average="macro")
from sklearn.metrics import roc_auc_score

from torch.optim import Adam
# becuase we're in a nested folder...
sys.path.append('../')

from utils.preprocess import *
from models.AEAD import AEAD

In [6]:
data_dir_v2 ="../../for_students/data_v2"

In [5]:
# os.getcwd()

'C:\\Users\\Pheobe\\Documents\\[B]Huawei-Challenge-2020\\anomaly-detection-challenge-2020\\notebooks'

In [7]:
def load_data(data_dir, window_size):
    '''
    data_dir (str): Base directory of data 
    window_size (str): Window size for input examples
    window_func (str): Window function reference as defined in utils.preprocess
                       Option are either 'window' or 'window_func'
    '''
    train_dir = os.path.join(data_dir, 'training')
    train_str = os.path.join(train_dir, 'training_{}.csv')
    test_str = os.path.join(data_dir, 'dataset_{}.csv')

    train_xs = []
    train_ys = []
    for i in [1, 2, 3, 4, 5, 100]: # file name updated to v2
        train_df_i = pd.read_csv(train_str.format(str(i)))
        
    # adding padded values and then windowing
        #         train_xi = window_func(train_df_i.kpi_value.values, window_size)
        local_min = train_df_i.kpi_value[0:window_size].min() # Using the global min as the padding 
        pad_min = np.ones(window_size) * local_min
        x_padded = np.concatenate([pad_min, train_df_i.kpi_value.values])
        train_xi = [x_padded[j:j+window_size] for j in range(len(x_padded)-(window_size))]
     
        train_xs.append(train_xi)
        train_ys.append(train_df_i.anomaly_label.values)
    x_train = np.concatenate(train_xs)
    y_train = np.concatenate(train_ys)
    assert len(x_train) == len(y_train)
    
    test_xs = []
    test_ys = []
    for i in [1,2,3,4,5,6,7,8,9,10,11,12,13,100,101,102,103,105,106]:  # file name updated to v2
        test_df_i = pd.read_csv(test_str.format(str(i)))
#         test_xi = window_func(test_df_i.values[:,1], window_size)
        test_local_min = test_df_i.kpi_value.values[0:window_size].min() # Using the local min as the padding 
        test_pad_min = np.ones(window_size) * test_local_min
        test_x_padded = np.concatenate([test_pad_min, train_df_i.kpi_value.values])
        test_xi = [test_x_padded[j:j+window_size] for j in range(len(test_x_padded)-(window_size))]

    test_xs.append(test_xi)
    x_test = np.concatenate(test_xs)
    print("Train_x shape: {}\nTrain_y shape: {}\n\nTest_x shape: {}".format(x_train.shape, y_train.shape, x_test.shape))
    return x_train, y_train, x_test

def window_min_max(x):
    x_min = x.min(axis=1).reshape(-1, 1)
    x_max = x.max(axis=1).reshape(-1, 1)
    for i in range(len(x)):
        if x_max[i] > x_min[i]:
            x[i] =  (x[i] - x_min[i])/(x_max[i] - x_min[i])
        else:  # add scenario where x_max = x_min in a window 
            x[i] = 0
    return x

In [8]:
x_train, y_train, x_test = load_data(data_dir_v2, 100)

Train_x shape: (54337, 100)
Train_y shape: (54337,)

Test_x shape: (20159, 100)


In [9]:
# Window level normalisation
x_train = window_min_max(x_train)
x_test = window_min_max(x_test)

In [10]:
# Cross val incides
skf = StratifiedKFold(n_splits=5)

In [11]:
aead_aucs = []
for train_index, val_index in skf.split(x_train, y_train):
#     x_train_fold = x_train[train_index]
#     y_train_fold = y_train[train_index]
#     x_train_normal = x_train_fold[y_train_fold == 0]
#     y_train_normal = y_train_fold[y_train_fold == 0]

#     x_val_fold = x_train[val_index]
#     y_val_fold = y_train[val_index]
    
#     x_train_fold = window_min_max(x_train_fold)
#     x_val_fold = window_min_max(x_val_fold)
    
    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    
    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]  

#     aead = AEAD(100,256, 0.0001, 20, 'cpu', Adam).fit(x_train_normal,  y_train_normal)
    aead = AEAD(100,256, 0.0001, 20, 'cpu', Adam, normal_only=False).fit(x_train_fold,  y_train_fold)
    y_pred_aead = aead.predict(x_val_fold)
    val_auc = roc_auc_score(y_val_fold, y_pred_aead)
    print(sum(y_val_fold ))
    aead_aucs.append(val_auc)
    print(val_auc)
print(np.mean(aead_aucs))

Train Epoch: 1	Loss: 0.139521	Train auc: 0.555441
Train Epoch: 2	Loss: 0.044801	Train auc: 0.573403
Train Epoch: 3	Loss: 0.041435	Train auc: 0.621690
Train Epoch: 4	Loss: 0.035134	Train auc: 0.709723
Train Epoch: 5	Loss: 0.028953	Train auc: 0.754626
Train Epoch: 6	Loss: 0.024517	Train auc: 0.757208
Train Epoch: 7	Loss: 0.022002	Train auc: 0.755020
Train Epoch: 8	Loss: 0.020749	Train auc: 0.755518
Train Epoch: 9	Loss: 0.019927	Train auc: 0.760466
Train Epoch: 10	Loss: 0.019314	Train auc: 0.763746
Train Epoch: 11	Loss: 0.018788	Train auc: 0.765525
Train Epoch: 12	Loss: 0.018336	Train auc: 0.768733
Train Epoch: 13	Loss: 0.017964	Train auc: 0.770827
Train Epoch: 14	Loss: 0.017664	Train auc: 0.771705
Train Epoch: 15	Loss: 0.017421	Train auc: 0.773691
Train Epoch: 16	Loss: 0.017225	Train auc: 0.774971
Train Epoch: 17	Loss: 0.017054	Train auc: 0.777121
Train Epoch: 18	Loss: 0.016907	Train auc: 0.778925
Train Epoch: 19	Loss: 0.016762	Train auc: 0.780258
Train Epoch: 20	Loss: 0.016617	Train auc

In [12]:
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    x_train_normal = x_train_fold[y_train_fold == 0]

    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]
    
    # Train on normal data
    iforest = IsolationForest().fit(x_train_normal)
    y_pred_iforest = iforest.predict(x_val_fold)

    y_pred_iforest[y_pred_iforest==1] = 0
    y_pred_iforest[y_pred_iforest==-1] = 1
    iforest_f1 = f1_score(y_val_fold, y_pred_iforest, average='macro')
    print(iforest_f1)

0.31950022426601254
0.5855826880076329
0.40918100626886766
0.30489379560745794
0.14871715669956698


In [13]:
for train_index, val_index in skf.split(x_train, y_train):
    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    x_train_normal = x_train_fold[y_train_fold == 0]

    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]
    
    # Train on normal data
    ocsvm = OneClassSVM(gamma='auto').fit(x_train_normal)
    # check performance on training set for sanity check. 
    y_pred_ocsvm = ocsvm.predict(x_val_fold)

    y_pred_ocsvm[y_pred_ocsvm==1] = 0
    y_pred_ocsvm[y_pred_ocsvm==-1] = 1
    ocsvm_f1 = f1_score(y_val_fold, y_pred_ocsvm, average='macro')
    print(ocsvm_f1)

0.38577033519720005
0.2213723586102315
0.37481302496835806
0.3713325260196587
0.3159978299371823
