<a href="https://colab.research.google.com/github/ZhuangweiKang/VU-AD/blob/master/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys

# Make the parent directory importable so the project-local helper modules
# (`plotter`, `dim_reduce`) resolve when the notebook runs from its own folder.
# `sys` must be imported first: the original cell used it without importing it,
# which raises NameError on a fresh kernel.
sys.path.insert(0, '../')

import numpy as np
import pandas as pd

import plotter as pltr
# NOTE(review): star import kept for compatibility; `pca_dr` (referenced in a
# disabled cell below) presumably comes from here — confirm and import it by name.
from dim_reduce import *

# from google.colab import drive
# drive.mount('/content/drive/Cisco/')

## SMD Dataset

In [2]:
# Which SMD machine to analyse: files are named machine-<GROUP>-<ENTITY>.txt
GROUP = 1
ENTITY = 3
SMD_BASE_PATH = '../Dataset/SMD/processed'

# Number of trailing test rows excluded from both features and labels.
# Must be > 0 (`iloc[:-0]` would select nothing).
# NOTE(review): the reason for dropping the tail is not stated here — confirm.
TEST_TAIL_DROP = 1000

X_train = pd.read_csv('%s/train/machine-%d-%d.txt' % (SMD_BASE_PATH, GROUP, ENTITY), index_col=0)
X_test = pd.read_csv('%s/test/machine-%d-%d.txt' % (SMD_BASE_PATH, GROUP, ENTITY), index_col=0)

# Labels come as a single header-less column; align them with the test index
# before trimming so features and labels stay row-aligned.
y_test = pd.read_csv('../Dataset/SMD/test_label/machine-%d-%d.txt' % (GROUP, ENTITY), header=None)
y_test.columns = ['label']
y_test.index = X_test.index
y_test = y_test.iloc[:-TEST_TAIL_DROP]

# Keep only the columns that carry at least one non-zero value in BOTH the
# training and the testing set.
train_nonzero_cols = X_train.columns[(X_train != 0).any()]
test_nonzero_cols = X_test.columns[(X_test != 0).any()]
shared_nonzero = set(train_nonzero_cols) & set(test_nonzero_cols)

# Use an ordered list rather than a set: recent pandas versions reject a set as
# an indexer, and a list keeps the original column order deterministic.
cols = [c for c in X_train.columns if c in shared_nonzero]

X_train = X_train[cols]
X_test = X_test[cols].iloc[:-TEST_TAIL_DROP]

train_index = X_train.index
train_lines = X_train.shape[0]
test_index = X_test.index
test_lines = X_test.shape[0]

# Train and test stacked row-wise (train rows first).
X = pd.concat([X_train, X_test], axis=0)

In [3]:
# Metric column shown in every plot of this notebook.
col = 'm0'
# Plot that column over the concatenated train + test frame `X`.
pltr.plot_data(X, col)

In [4]:
"""
PCA worsens anomaly prediction accuracy, so it's better to apply KNN on real feature space.
"""

# X = pca_dr(X, 0.90, transform=True)
# X = pd.DataFrame(X, columns=['m%d' % i for i in range(X.shape[1])], index=list(X_train.index)+list(X_test.index))

# X_train = X.iloc[:train_lines]
# X_test = X.iloc[train_lines:]

In [5]:
from sklearn.preprocessing import MinMaxScaler, Normalizer


"""
The dataset has been normalized, so skip the pre-processing step.
"""

# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# X_train = pd.DataFrame(X_train, columns=['m%d' % i for i in range(X_train.shape[1])], index=train_index)
# X_test = pd.DataFrame(X_test, columns=['m%d' % i for i in range(X_test.shape[1])], index=test_index)

In [6]:
from pyod.models.knn import KNN

"""
Critical hyper-params
---------
contamination: The amount of contamination of the data set. [0, 1]
n_neighbors: we use ensemble learning to explore n_clf different values of k
"""

# share of points labelled anomalous (label == 1) in the test labels
anomaly_por = int((y_test['label'] == 1).sum()) / len(y_test)

# one detector per neighbourhood size: k = 5, 10, ..., 5 * n_clf
n_clf = 20
k_list = list(range(5, 5 * n_clf + 1, 5))

# score matrices: one row per sample, one column per detector
train_scores = np.zeros((len(X_train), n_clf))
test_scores = np.zeros((len(X_test), n_clf))

for i, k in enumerate(k_list):
    clf = KNN(n_neighbors=k, method='largest', contamination=anomaly_por, n_jobs=-1)
    clf.fit(X_train)

    # column i keeps the anomaly scores of the k-th detector for the
    # training points (from the fit) and for the test points
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test)

In [8]:
from pyod.models.combination import aom, moa, average, maximization, median
from pyod.utils.utility import standardizer

# Number of detector subgroups used by AOM / MOA (must divide the number of
# detectors evenly in pyod's default 'static' mode).
N_BUCKETS = 5
# Fixed seed: aom/moa assign detectors to buckets via a random shuffle, so
# without a seed the combined scores change between runs.
COMBINATION_SEED = 42

# scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# per-sample statistics over the n_clf detectors (one per k)
comb_by_median = median(test_scores_norm)
comb_by_average = average(test_scores_norm)
comb_by_maximization = maximization(test_scores_norm)

# Average of Maximum (AOM): divide base detectors into subgroups and take the maximum score for each subgroup. The final score is the average of all subgroup scores.
comb_by_aom = aom(test_scores_norm, n_buckets=N_BUCKETS, random_state=COMBINATION_SEED)

# Maximum of Average (MOA): divide base detectors into subgroups and take the average score for each subgroup. The final score is the maximum of all subgroup scores.
comb_by_moa = moa(test_scores_norm, n_buckets=N_BUCKETS, random_state=COMBINATION_SEED)

In [9]:
from pyod.utils.data import evaluate_print

# Report ROC and precision @ rank n for every combination strategy, in a fixed order.
combined_scores = {
    'Combination by Median': comb_by_median,
    'Combination by Average': comb_by_average,
    'Combination by Maximization': comb_by_maximization,
    'Combination by AOM': comb_by_aom,
    'Combination by MOA': comb_by_moa,
}

for combo_name, combo_scores in combined_scores.items():
    evaluate_print(combo_name, y_test, combo_scores)

Combination by Median ROC:0.8521, precision @ rank n:0.4014
Combination by Average ROC:0.8516, precision @ rank n:0.3986
Combination by Maximization ROC:0.8395, precision @ rank n:0.3708
Combination by AOM ROC:0.8477, precision @ rank n:0.3958
Combination by MOA ROC:0.8486, precision @ rank n:0.3903


In [10]:
"""
threshold is critical when labeling anomalies on the test dataset.
The threshold defined below ensures the amount of outliers is equal to that in y_test.
"""

# Cut the AOM scores at the (1 - anomaly_por) quantile, so the share of points
# above the cut matches the anomaly share measured on y_test.
threshold = np.quantile(comb_by_aom, 1 - anomaly_por)

# Boolean 'label' column: True where the combined score exceeds the threshold.
y_test_pred_label = pd.DataFrame(
    {'label': comb_by_aom > threshold},
    index=X_test.index,
)

pltr.plot_anomaly(X_test, y_test_pred_label, col)

In [11]:
# Same plot as for the predictions, but using the ground-truth labels in y_test.
pltr.plot_anomaly(X_test, y_test, col) # anomalies labeled by the SMD vendor

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compare ground truth and predictions as plain 0/1 integer vectors so both
# metrics see the same label type (the predicted labels are booleans).
y_true = y_test['label'].astype(int)
y_pred = y_test_pred_label['label'].astype(int)

# scikit-learn metrics take (y_true, y_pred) in that order.
ac = accuracy_score(y_true, y_pred)

# labels=[0, 1] forces a 2x2 matrix, so the 4-way unpacking below also works
# when one of the classes is absent from the data or the predictions.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

print('Accuracy:', ac)
print('TN:', tn)
print('FP:', fp)
print('FN:', fn)
print('TP:', tp)

Accurancy: 0.9616790732502313
TN: 21548
FP: 435
FN: 435
TP: 285
