In [113]:
from helper import *
sys.path.insert(0,'../') 
import plotter as pltr

In [114]:
# Hyper-parameters
sw = 1  # length of slide window in days
fs_sw_ratio = 0.25
detector = 'knn'
threshold_opt = 'hybrid'
train_contextual = False
dataset = 'SMD'

## Step 1. Load Dataset

In [115]:
if dataset == 'SMD':
    GROUP = 1
    ENTITY = 3
    SMD_BASE_PATH = 'Dataset/SMD'

    X_train = load_data('%s/train/machine-%d-%d.txt' % (SMD_BASE_PATH, GROUP, ENTITY), header=False)
    X_train.columns = ['m%d' % i for i in range(X_train.shape[1])]
    X_train.index = pd.date_range('2021/03/02', '2021/03/21', periods=X_train.shape[0])
    X_train.index.name = 'timestamp'

    X_test = load_data('%s/test/machine-%d-%d.txt' % (SMD_BASE_PATH, GROUP, ENTITY), header=False)
    X_test.columns = ['m%d' % i for i in range(X_test.shape[1])]
    X_test.index = pd.date_range('2021/03/21', '2021/4/8', periods=X_test.shape[0])
    X_test.index.name = 'timestamp'

    y = pd.read_csv('Dataset/SMD/test_label/machine-%d-%d.txt' % (GROUP, ENTITY), header=None)
    y.columns = ['label']
    y.index = X_test.index
    y = y.iloc[:-1000]

    X_test = X_test.iloc[:-1000]

In [116]:
if dataset == 'CTF':
    import pickle as pkl

    ENTITY = 0
    CTF_BASE_PATH = 'Dataset/CTF/processed'

    X_train = load_data('%s/train/%d.csv' % (CTF_BASE_PATH, ENTITY)).iloc[:-1]
    X_train.columns = ['m%d' % i for i in range(X_train.shape[1])]
    X_train.index = pd.date_range(start='2020/04/18', freq='30s', periods=X_train.shape[0])
    X_train.index.name = 'timestamp'

    X_test = load_data('%s/test/%d.csv' % (CTF_BASE_PATH, ENTITY)).iloc[:-1]
    X_test.columns = ['m%d' % i for i in range(X_test.shape[1])]
    X_test.index = pd.date_range(start='2020/04/23', freq='30s', periods=X_test.shape[0])
    X_test.index.name = 'timestamp'

    with open('Dataset/CTF/label_result/%d.pkl' % ENTITY, 'rb') as f:
        y = pd.DataFrame(pkl.load(f), columns=['label'])
        y.index = X_test.index

In [117]:
print('Train Shape:', X_train.shape)
print('Test Shape:', X_test.shape)

Train Shape: (23702, 37)
Test Shape: (22703, 37)


In [118]:
# plot the first dimension
pltr.plot_data(pd.concat([X_train, X_test], axis=0), 'm0')

In [119]:
# plot the first dimension
pltr.plot_data(X_test, 'm0')

## Step 2. Feature Analysis

In [120]:
# # PCA Dimension Reduction
# tmp = pca_dr(X, 0.95, transform=True)
# X = pd.DataFrame(tmp, columns=['m%d' % i for i in range(tmp.shape[1])], index=X.index)

In [121]:
# # plot the first dimension
# pltr.plot_data(X, 'm0')

## Step 3. Anomaly Detection

In [122]:
from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization, median
from pyod.utils.utility import standardizer
from pyod.utils.data import evaluate_print
from joblib import dump, load
import os

def knn_detector(sw_X_train, sw_X_test):
    n_clf = 20
    k_list = [10*x for x in range(1, n_clf+1)]
    test_scores = np.zeros([sw_X_test.shape[0], n_clf])
    train_scores = np.zeros([sw_X_train.shape[0], n_clf])
    for i, k in enumerate(k_list):
        clf = KNN(n_neighbors=k, method='largest', n_jobs=-1)
        clf.fit(sw_X_train)
        score = clf.decision_function(sw_X_test)  # outlier scores
        test_scores[:, i] = score
        train_scores[:, i] = clf.decision_scores_
    train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)
    # return average(train_scores_norm), average(test_scores_norm)
    return aom(train_scores_norm), aom(test_scores_norm)

In [123]:
from pyod.models.auto_encoder import AutoEncoder

def ae_detector(sw_X_train, sw_X_test):
    hidden_neurons=[64, 32, 2, 32, 64]
    if not os.path.exists('ae.joblib'):
        clf = AutoEncoder(hidden_neurons=hidden_neurons, epochs=30, preprocessing=False, validation_size=0.3, verbose=0)
        clf.fit(sw_X_train)
        dump(clf, 'ae.joblib')
    else:
        clf = load('ae.joblib')
    test_scores = clf.decision_function(sw_X_test)
    train_scores = clf.decision_scores_
    return train_scores, test_scores

In [None]:
from pyod.models.vae import VAE

def vae_detector(sw_X_train, sw_X_test):
    encoder_neurons = [64, 32]
    decoder_neurons = [32, 64]
    if not os.path.exists('vae.joblib'):
        clf = VAE(encoder_neurons=encoder_neurons, decoder_neurons=decoder_neurons, latent_dim=2)
        clf.fit(sw_X_train)
        dump(clf, 'vae.joblib')
    else:
        clf = load('vae.joblib')
    test_scores = clf.decision_function(sw_X_test)
    train_scores = clf.decision_scores_
    return train_scores, test_scores

In [124]:
# the size of slide window is 1 day, prediction window is 1 hour
test_start_date = X_test.index[0]
slide_window_size = X_test.loc[(X_test.index < pd.Timestamp(test_start_date.year, test_start_date.month, test_start_date.day+sw)) & (X_test.index >= pd.Timestamp(test_start_date.year, test_start_date.month, test_start_date.day))].shape[0]
forward_steps = int(fs_sw_ratio * slide_window_size)

if detector == 'knn':
    detector = knn_detector
elif detector == 'autoencoder':
    detector = ae_detector
elif detector == 'vae':
    detector = vae_detector

train_results, test_results = slide_window_detector(detector, X_train, X_test, slide_window_size, forward_steps, train_contextual=train_contextual)
y_labels = []
k = 0
while k < len(y):
    y_labels.append(y.iloc[k:k+forward_steps]['label'].to_list())
    k += forward_steps

In [125]:
alph1 = 3
alph2 = 1
results1 = label_anomalies_1(train_results, test_results, slide_window_size, forward_steps, alph1, eval_opt='global')
results2 = label_anomalies_2(test_results, y_labels, slide_window_size, forward_steps, alph2, eval_opt='global')

In [126]:
results = results2
results['timestamp'] = X_test.index[:results.shape[0]]

In [127]:
import plotly.graph_objects as go

## Plot decision scores
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_test.index, y=results['anomaly_score'], mode='lines', name='anomaly_score'))
fig.update_layout(height=400, width=800, showlegend=True)
fig.show()

## Step 4. Label Anomalies on the Test Dataset

In [128]:
match_data = X_test.loc[results['timestamp']]
pred_anomalies = X_test.loc[results[results['label'] == 1]['timestamp']]

col='m0'
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_test.index, y=X_test[col], mode='lines', name='y_test'))
fig.add_trace(go.Scatter(x=pred_anomalies.index, y=pred_anomalies[col], mode='markers', name='Anomaly'))
fig.update_layout(showlegend=True, xaxis_title="Time", yaxis_title="value", height=400, width=800)
fig.show()

## Step 5. Evaluate Results

In [129]:
pltr.plot_anomaly(match_data, y.loc[results['timestamp']], col)

In [130]:
from sklearn.metrics import accuracy_score, confusion_matrix

ac = accuracy_score(results['label'], y.loc[results['timestamp']])
tn, fp, fn, tp  = confusion_matrix(results['label'], y.loc[results['timestamp']]['label']).ravel()

print('Accurancy:', ac)
print('TN:', tn)
print('FP:', fp)
print('FN:', fn)
print('TP:', tp)

Accurancy: 0.9354653980000881
TN: 20909
FP: 393
FN: 1072
TP: 327


In [131]:
recall = tp / (tp + fn)
precision = tp / (tp + fp)

In [132]:
print('Precision:', precision)
print('Recall:', recall)

Precision: 0.45416666666666666
Recall: 0.23373838456040028
