In [None]:
from pyod.models.copod import COPOD
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import pyod
import pyod.models.auto_encoder_torch as ae
import pyod.models.cblof as cblof
import pyod.models.knn as knn
# from dao import DataLoading

# May be added at a later point, but SUOD first needs to be installed via Conda
# import pyod.models.suod as SUOD

In [None]:
def plot_results(data, anamoly_score, threshold, anamoly_pos, title, file_to_load):
    """Plot a series and its anomaly scores as two stacked panels.

    The top panel shows `data`, the bottom panel shows `anamoly_score`.
    In both panels the part before `threshold` is drawn in gray and the
    part from `threshold` onward in blue, with a black vertical line at
    `threshold` and a red vertical line at `anamoly_pos`.

    Parameters
    ----------
    data : sliceable, plottable object
        The values to draw in the top panel.
    anamoly_score : sliceable, plottable object
        The scores to draw in the bottom panel.
    threshold : int
        Position separating the gray segment from the blue segment.
    anamoly_pos : int
        x-position of the predicted anomaly marker.
    title : str
        Figure title.
    file_to_load : str
        Unused; kept so existing callers keep working.
    """
    fig, (ax1, ax2) = plt.subplots(2)
    fig.suptitle(title)

    # Both panels share the same layout; only the series and y-label differ.
    # The top panel uses the `data` argument rather than a notebook global,
    # so the function plots what the caller actually passed in.
    panels = (
        (ax1, data, 'Value'),
        (ax2, anamoly_score, 'Anamoly Score'),
    )
    for ax, series, ylabel in panels:
        ax.plot(series[0:threshold], color='gray')
        ax.plot(series[threshold:], color='blue')
        ax.set_ylabel(ylabel)
        ax.axvline(threshold, label='threshold', color='black')
        ax.axvline(anamoly_pos, label='predicted anamoly', color='red')
        # Without a legend the labels given to axvline are never displayed.
        ax.legend()

    plt.show()

In [None]:
# Define split data

def split_data(file_path, index):
    """Load one data file and split it at the position encoded in its name.

    The file is chosen as entry `index` of `os.listdir(file_path)` (in
    whatever order the OS returns it). The split position is the run of
    digits immediately before ``.txt`` in the file name, e.g. ``35000``
    for ``001_UCR_Anomaly_35000.txt``.

    Parameters
    ----------
    file_path : str
        Directory containing the data files.
    index : int
        Position of the file within the directory listing.

    Returns
    -------
    tuple
        ``(train_data, test_data, total_data, test_data_start_pt)`` where
        `train_data` is rows ``[0, test_data_start_pt)``, `test_data` is
        rows ``[test_data_start_pt, end)`` and `total_data` is the whole
        frame.

    Raises
    ------
    ValueError
        If the file name has no digits directly before ``.txt``.
    """
    # List the directory once so the name and the loaded file always agree.
    file_name = os.listdir(file_path)[index]

    # Require at least one digit and a literal dot before "txt".
    match = re.search(r'(\d+)\.txt', file_name)
    if match is None:
        raise ValueError(
            "Cannot find a split position in file name: " + file_name)
    test_data_start_pt = int(match.group(1))

    # NOTE(review): read_csv uses its default header handling, so the first
    # line of the file becomes the column header — confirm this is intended.
    total_data = pd.read_csv(os.path.join(file_path, file_name))

    train_data = total_data[:test_data_start_pt]
    # Start the test slice exactly where the training slice ends so that no
    # row is dropped between the two.
    test_data = total_data[test_data_start_pt:]

    return train_data, test_data, total_data, test_data_start_pt

In [None]:
def train_models():
    """Run the four detectors in turn and gather their headline results.

    Calls, in order, Detect_AE, Detect_CBLOF, Detect_KNN and Detect_COPOD.
    Each returns (position, scores, prominence); the per-sample scores are
    discarded here.

    Returns
    -------
    tuple
        Eight values: the four predicted positions (AutoEncoder, CBLOF,
        KNN, COPOD) followed by the four prominence scores in the same
        detector order.
    """
    # Autoencoder
    autoenc_pos, _, autoenc_prom = Detect_AE()
    # Cluster-based Local Outlier Factor
    cblof_pos, _, cblof_prom = Detect_CBLOF()
    # K-Nearest Neighbors
    knn_pos, _, knn_prom = Detect_KNN()
    # Copula-Based Outlier Detection
    copod_pos, _, copod_prom = Detect_COPOD()

    positions = (autoenc_pos, cblof_pos, knn_pos, copod_pos)
    prominences = (autoenc_prom, cblof_prom, knn_prom, copod_prom)
    return positions + prominences
    

In [None]:
def Detect_AE():
    """Detector #1: PyOD AutoEncoder fitted on `train`, scored on `total`.

    Reads the notebook-level variables `train`, `total` and `threshold`
    instead of taking arguments, so they must be set before calling.
    https://pyod.readthedocs.io/en/latest/pyod.models.html#pyod-models-auto-encoder-module

    Returns
    -------
    tuple
        ``(anamoly_pos, test_outlier_scores, prominence_score)``:
        the index label of the highest score at or after `threshold`,
        a pd.Series with one score per row of `total`, and the ratio of
        the largest score to the second-largest score.
    """
    # Define model (TBD hidden neuron details...)
    clf = ae.AutoEncoder(epochs = 2)

    # Fit model on the training segment only
    clf.fit(train)

    # Score every row of the full series
    test_outlier_scores = pd.Series(clf.decision_function(total))

    # Only positions from `threshold` onward are candidates for the anomaly
    anamoly_pos = test_outlier_scores[threshold:].idxmax()

    # NOTE(review): the two largest scores are taken over the whole series,
    # not just the segment after `threshold` — confirm this is intended.
    top_two = test_outlier_scores.nlargest(2)
    prominence_score = top_two.iloc[0] / top_two.iloc[1]
    print("Predicted Anamoloy from Autoencoder algorithm is located at location: " + str(anamoly_pos))

    return anamoly_pos, test_outlier_scores, prominence_score

In [None]:
def Detect_CBLOF():
    """Detector #2: PyOD CBLOF fitted on `train`, scored on `total`.

    Reads the notebook-level variables `train`, `total` and `threshold`
    instead of taking arguments, so they must be set before calling.

    Returns
    -------
    tuple
        ``(anamoly_pos, test_outlier_scores, prominence_score)``:
        the index label of the highest score at or after `threshold`,
        a pd.Series with one score per row of `total`, and the ratio of
        the largest score to the second-largest score.
    """
    outliers_fraction = 0.000000001

    # Define model; random_state is fixed so repeated runs use the same seed
    clf = cblof.CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=0)

    # Fit model on the training segment only
    clf.fit(train)

    # Score every row of the full series
    test_outlier_scores = pd.Series(clf.decision_function(total))

    # Only positions from `threshold` onward are candidates for the anomaly
    anamoly_pos = test_outlier_scores[threshold:].idxmax()

    # NOTE(review): the two largest scores are taken over the whole series,
    # not just the segment after `threshold` — confirm this is intended.
    top_two = test_outlier_scores.nlargest(2)
    prominence_score = top_two.iloc[0] / top_two.iloc[1]
    print("Predicted Anamoloy from CBLOF algorithm is located at location: " + str(anamoly_pos))

    return anamoly_pos, test_outlier_scores, prominence_score

In [7]:
def Detect_KNN():
    """Detector #3: PyOD KNN fitted on `train`, scored on `total`.

    Reads the notebook-level variables `train`, `total` and `threshold`
    instead of taking arguments, so they must be set before calling.

    Returns
    -------
    tuple
        ``(anamoly_pos, test_outlier_scores, prominence_score)``:
        the index label of the highest score at or after `threshold`,
        a pd.Series with one score per row of `total`, and the ratio of
        the largest score to the second-largest score.
    """
    outliers_fraction = 0.000000001

    # Define model
    clf = knn.KNN(contamination=outliers_fraction)

    # Fit model on the training segment only
    clf.fit(train)

    # Score every row of the full series
    test_outlier_scores = pd.Series(clf.decision_function(total))

    # Only positions from `threshold` onward are candidates for the anomaly
    anamoly_pos = test_outlier_scores[threshold:].idxmax()

    # NOTE(review): the two largest scores are taken over the whole series,
    # not just the segment after `threshold` — confirm this is intended.
    top_two = test_outlier_scores.nlargest(2)
    prominence_score = top_two.iloc[0] / top_two.iloc[1]
    print("Predicted anamoloy from KNN algorithm is located at location: " + str(anamoly_pos))

    return anamoly_pos, test_outlier_scores, prominence_score

In [8]:
def Detect_COPOD():
    """Detector #4: PyOD COPOD fitted on `train`, scored on `total`.

    Reads the notebook-level variables `train`, `total` and `threshold`
    instead of taking arguments, so they must be set before calling.

    Returns
    -------
    tuple
        ``(anamoly_pos, test_outlier_scores, prominence_score)``:
        the index label of the highest score at or after `threshold`,
        a pd.Series with one score per row of `total`, and the ratio of
        the largest score to the second-largest score.
    """
    # Define model
    clf = COPOD()

    # Fit model once on the training segment; a second fit on the same data
    # would only repeat the work.
    clf.fit(train)

    # Score every row of the full series
    test_outlier_scores = pd.Series(clf.decision_function(total))

    # Only positions from `threshold` onward are candidates for the anomaly
    anamoly_pos = test_outlier_scores[threshold:].idxmax()

    # NOTE(review): the two largest scores are taken over the whole series,
    # not just the segment after `threshold` — confirm this is intended.
    top_two = test_outlier_scores.nlargest(2)
    prominence_score = top_two.iloc[0] / top_two.iloc[1]
    print("Predicted anamoloy from COPOD algorithm  is located at location: " + str(anamoly_pos))

    return anamoly_pos, test_outlier_scores, prominence_score

In [9]:
# Run every detector on each data file and combine the predictions.
# Per-file results are accumulated in the lists below (one entry per file).

# Filename
file_name_list = []

# Autoencoder
anamoly_pos_autoenc = []
anamoly_prom_autoenc = []

# Cluster-based Local Outlier Factor
anamoly_pos_CBLOF = []
anamoly_prom_CBLOF = []

# K-Nearest Neighbors
anamoly_pos_KNN = []
anamoly_prom_KNN = []

# Copula-Based Outlier Detection
anamoly_pos_COPOD = []
anamoly_prom_COPOD = []

# Ensemble prediction (placeholders; filled in on the data frame below)
predicted_voting = []
predicted_prom = []

# Number of files to process: 10 is a subset for testing, 250 is the full set
N_FILES = 10

# The data directory and its listing do not change between iterations,
# so resolve them once instead of on every pass through the loop.
file_path = os.path.join(os.getcwd(), 'KDD-Cup', 'data')
data_files = os.listdir(file_path)

for k in range(N_FILES):
    # NOTE(review): entry 0 of the listing is skipped (presumably a non-data
    # entry), and os.listdir returns entries in arbitrary order — confirm the
    # listing order is what is expected on the target machine.
    file_name = data_files[k+1]
    file_to_load = os.path.join(file_path, file_name)

    # train/total/threshold are read as globals by the Detect_* functions
    train, test, total, threshold = split_data(file_path, k+1)

    print("Analysis started for file " + file_name)

    autoenc_pred, CBLOF_pred, KNN_pred, COPOD_pred, autoenc_prom, CBLOF_prom, KNN_prom, COPOD_prom = train_models()

    # Add value for each item to list
    file_name_list.append(file_name)
    anamoly_pos_autoenc.append(autoenc_pred)
    anamoly_pos_CBLOF.append(CBLOF_pred)
    anamoly_pos_KNN.append(KNN_pred)
    anamoly_pos_COPOD.append(COPOD_pred)
    anamoly_prom_autoenc.append(autoenc_prom)
    anamoly_prom_CBLOF.append(CBLOF_prom)
    anamoly_prom_KNN.append(KNN_prom)
    anamoly_prom_COPOD.append(COPOD_prom)

    # Initially add zero to the list (voting mechanism handled within data frame below)
    predicted_voting.append(0)
    predicted_prom.append(0)


# Create Data frame of predicted anamolies
predictions = pd.DataFrame({
    'File Name': file_name_list,
    'AutoEncoder': anamoly_pos_autoenc,
    'CBLOF': anamoly_pos_CBLOF,
    'KNN': anamoly_pos_KNN,
    'COPOD': anamoly_pos_COPOD,
    'AutoEncoder_prom': anamoly_prom_autoenc,
    'CBLOF_prom': anamoly_prom_CBLOF,
    'KNN_prom': anamoly_prom_KNN,
    'COPOD_prom': anamoly_prom_COPOD,
    'Predicted_voting': predicted_voting,
    'Predicted_prom': predicted_prom,
})

# Column groups, listed in matching detector order
position_cols = ['AutoEncoder', 'CBLOF', 'KNN', 'COPOD']
prominence_cols = ['AutoEncoder_prom', 'CBLOF_prom', 'KNN_prom', 'COPOD_prom']

# Prominence-based voting: take the position from the detector whose
# prominence equals the row maximum (first detector in list order on ties;
# -9999 if no comparison succeeds).
best_prominence = predictions[prominence_cols].values.max(1)
predictions['Predicted_prom'] = np.select(
    [predictions[col] >= best_prominence for col in prominence_cols],
    [predictions[col] for col in position_cols],
    default=-9999
)

# Mode-based voting: DataFrame.mode(axis=1) yields several columns whenever a
# row has tied modes (e.g. a 2-2 split), which cannot be assigned to a single
# column. Reduce each row to one value instead; Series.mode returns its modes
# sorted, so ties resolve to the smallest position.
predictions['Predicted_voting'] = predictions[position_cols].apply(
    lambda row: row.mode(dropna=False).iloc[0], axis=1
)

# Display data frame
predictions

Analysis started for file 001_UCR_Anomaly_35000.txt
inner_autoencoder(
  (activation): ReLU()
  (encoder): Sequential(
    (batch_norm0): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear0): Linear(in_features=1, out_features=64, bias=True)
    (relu0): ReLU()
    (dropout0): Dropout(p=0.2, inplace=False)
    (batch_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear1): Linear(in_features=64, out_features=32, bias=True)
    (relu1): ReLU()
    (dropout1): Dropout(p=0.2, inplace=False)
  )
  (decoder): Sequential(
    (batch_norm0): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear0): Linear(in_features=32, out_features=64, bias=True)
    (dropout0): Dropout(p=0.2, inplace=False)
    (batch_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear1): Linear(in_features=64, out_features=1, bias=True)
    (dropout1): D

Predicted Anamoloy from CBLOF algorithm is located at location: 5101
Predicted anamoloy from KNN algorithm is located at location: 6252
Predicted anamoloy from COPOD algorithm  is located at location: 5101
Analysis started for file 007_UCR_Anomaly_4000.txt
inner_autoencoder(
  (activation): ReLU()
  (encoder): Sequential(
    (batch_norm0): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear0): Linear(in_features=1, out_features=64, bias=True)
    (relu0): ReLU()
    (dropout0): Dropout(p=0.2, inplace=False)
    (batch_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear1): Linear(in_features=64, out_features=32, bias=True)
    (relu1): ReLU()
    (dropout1): Dropout(p=0.2, inplace=False)
  )
  (decoder): Sequential(
    (batch_norm0): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear0): Linear(in_features=32, out_features=64, bias=True)
    (dropout0): Dropou

Unnamed: 0,File Name,AutoEncoder,CBLOF,KNN,COPOD,AutoEncoder_prom,CBLOF_prom,KNN_prom,COPOD_prom,Predicted_voting,Predicted_prom
0,001_UCR_Anomaly_35000.txt,71184,66888,71184,66888,1.108196,1.001774,1.119512,1.063256,0,0
1,002_UCR_Anomaly_35000.txt,71390,67095,67095,67095,1.0877,1.012925,1.084305,1.063246,0,0
2,003_UCR_Anomaly_35000.txt,71389,71389,67094,67094,1.05438,1.041123,1.056425,1.063246,0,0
3,004_UCR_Anomaly_2500.txt,5590,5590,5590,5590,1.027281,1.010809,1.020513,1.078613,0,0
4,005_UCR_Anomaly_4000.txt,4768,6253,6254,6253,1.000762,1.029672,1.309945,1.086411,0,0
5,006_UCR_Anomaly_4000.txt,5101,5101,6252,5101,1.02907,1.021937,1.159905,1.086411,0,0
6,007_UCR_Anomaly_4000.txt,5078,5078,5078,5078,1.01034,1.015862,1.084188,1.086411,0,0
7,008_UCR_Anomaly_4000.txt,4837,4791,4790,4791,1.001873,1.001247,1.006973,1.086411,0,0
8,009_UCR_Anomaly_4000.txt,6253,6253,6700,6253,1.002657,1.008614,1.071063,1.086411,0,0
9,010_UCR_Anomaly_4000.txt,5101,5101,5101,5101,1.000367,1.005557,1.053841,1.132217,0,0


In [10]:
# Creation of final prediction file: the file number is the first three
# characters of the file name, the location is the prominence-based prediction.
submission_file = pd.DataFrame()
submission_file['No.'] = predictions['File Name'].str[:3].astype('int32')
submission_file['Location of Anomaly'] = predictions['Predicted_prom']

# Styler.hide_index() was removed in pandas 2.0 in favour of hide(axis="index")
# (available since pandas 1.4); use whichever this pandas version provides.
submission_styler = submission_file.style
submission_styler.hide(axis="index") if hasattr(submission_styler, "hide") else submission_styler.hide_index()

No.,Location of Anomaly
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
10,0


In [None]:
#### Below for development purposes ####
# Re-run each detector once on the data currently held in the notebook-level
# `train` / `total` / `threshold` variables and keep the per-sample scores so
# the plotting cells can use them.
print(file_name)

# Autoencoder
(anamoly_pos_autoenc,
 test_outlier_scores_autoenc,
 prom_autoenc) = Detect_AE()

# Cluster-based Local Outlier Factor
(anamoly_pos_CBLOF,
 test_outlier_scores_CBLOF,
 prom_CBLOF) = Detect_CBLOF()

# K-Nearest Neighbors
(anamoly_pos_KNN,
 test_outlier_scores_KNN,
 prom_KNN) = Detect_KNN()

# Copula-Based Outlier Detection
(anamoly_pos_COPOD,
 test_outlier_scores_COPOD,
 prom_COPOD) = Detect_COPOD()

In [None]:
# Plot the series together with the AutoEncoder scores and predicted position
plot_results(
    data=total,
    anamoly_score=test_outlier_scores_autoenc,
    threshold=threshold,
    anamoly_pos=anamoly_pos_autoenc,
    title="Auto Encoder Anomoly Scores",
    file_to_load=file_to_load,
)

In [None]:
# Plot the series together with the CBLOF scores and predicted position
plot_results(
    data=total,
    anamoly_score=test_outlier_scores_CBLOF,
    threshold=threshold,
    anamoly_pos=anamoly_pos_CBLOF,
    title="Cluster-based Local Outlier Factor Anomoly Scores",
    file_to_load=file_to_load,
)

In [None]:
# Plot the series together with the KNN scores and predicted position
plot_results(
    data=total,
    anamoly_score=test_outlier_scores_KNN,
    threshold=threshold,
    anamoly_pos=anamoly_pos_KNN,
    title="K-Nearest Neighbors Anomoly Scores",
    file_to_load=file_to_load,
)

In [None]:
# Plot the series together with the COPOD scores and predicted position
plot_results(
    data=total,
    anamoly_score=test_outlier_scores_COPOD,
    threshold=threshold,
    anamoly_pos=anamoly_pos_COPOD,
    title="Copula-Based Outlier Detection Anomoly Scores",
    file_to_load=file_to_load,
)