In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cycler import cycler
import mplhep as hep
from src.models import TranAD, iTransformer
from torch.utils.data import Dataset, DataLoader
import torch


In [2]:
# plt.rcParams['axes.prop_cycle'] = cycler('color', ['#17becf','#8c564b', '#e377c2', '#7f7f7f','#bcbd22', '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
plt.style.use([hep.style.ROOT, hep.style.firamath])

## Deterministic baseline algorithm
apply POT directly to time series features without applying any model before

In [3]:
import math
import os
from src.data_loader import MyDataset
from src.pot import calc_point2point
from src.pot import pot_eval
from src.plotting import plot_ascore, plot_labels, plot_metrics, compare_labels

In [4]:
dataset = 'GECCO'
flag_less = False
modeltype = 'None'
n_windows = 10
feats = -1
res_path = f'None/None_{dataset}/feats{feats}'
plot_path = res_path
os.makedirs(plot_path, exist_ok=True)

In [5]:
train_set = MyDataset(dataset, n_windows, n_windows, modeltype, flag='train', feats=feats, less=flag_less, enc=False, k=-1)
test_set = MyDataset(dataset, n_windows, n_windows, modeltype, flag='test', feats=feats, enc=False, less=flag_less, k=-1)
feats = test_set.feats
enc_feats = test_set.enc_feats
print(feats, enc_feats)

9 0


In [6]:
labels = test_set.get_labels()
train = train_set.get_complete_data()
test = test_set.get_complete_data()
print(train.shape, test.shape, labels.shape)

(69783, 9) (69783, 9) (69783, 9)


In [7]:
# don't apply any model
lossT = np.abs(train)
loss = np.abs(test)

In [8]:
# local anomaly labels
df_res_local = pd.DataFrame()
preds = []
for i in range(loss.shape[1]):
    lt, l, ls = lossT[:, i], loss[:, i], labels[:, i]  
    result_local, pred = pot_eval(lt, l, ls, plot_path, f'dim{i}', q=1e-5)
    preds.append(pred)
    df_res = pd.DataFrame.from_dict(result_local, orient='index').T
    df_res_local = pd.concat([df_res_local, df_res], ignore_index=True)
lossTfinal, lossFinal = np.mean(lossT, axis=1), np.mean(loss, axis=1)
true_labels = (np.sum(labels, axis=1) >= 1) + 0
preds = np.array(preds).T
preds = preds.astype(int)
labelspred = (np.sum(preds, axis=1) >= 1) + 0


In [9]:
plot_ascore(plot_path, 'ascore_local', ascore=loss, labels=true_labels)
plot_labels(plot_path, 'labels_local', y_pred=labelspred, y_true=true_labels)
result_local = calc_point2point(predict=labelspred, actual=true_labels)
result_local1 = {'f1': result_local[0], 'precision': result_local[1], 'recall': result_local[2], 
                'TP': result_local[3], 'TN': result_local[4], 'FP': result_local[5], 'FN': result_local[6], 
                'ROC/AUC': result_local[7], 'MCC': result_local[8]}
print('local results')
print(result_local1)

# do majority voting over dimensions for local results instead of inclusive OR
majority = math.ceil(labels.shape[1] / 2)
labelspred_maj = (np.sum(preds, axis=1) >= majority) + 0
plot_labels(plot_path, 'labels_local_maj', y_pred=labelspred_maj, y_true=true_labels)
result_local = calc_point2point(predict=labelspred_maj, actual=true_labels)
result_local2 = {'f1': result_local[0], 'precision': result_local[1], 'recall': result_local[2], 
                'TP': result_local[3], 'TN': result_local[4], 'FP': result_local[5], 'FN': result_local[6], 
                'ROC/AUC': result_local[7], 'MCC': result_local[8]}
print('\nlocal results with majority voting')
print(result_local2)
temp = np.where(labelspred_maj != true_labels)
print(temp, np.all(labelspred_maj == true_labels))

# global anomaly labels
result_global, pred2 = pot_eval(lossTfinal, lossFinal, true_labels, plot_path, f'all_dim', q=1e-5)
labelspred_glob = (pred2 >= 1) + 0
plot_ascore(plot_path, 'ascore_global', ascore=lossFinal, labels=true_labels)
plot_labels(plot_path, 'labels_global', y_pred=labelspred_glob, y_true=true_labels)
metrics_global = calc_point2point(predict=labelspred_glob, actual=true_labels)
print('\nglobal results') 
print(result_global)

plot_metrics(plot_path, ['local (incl. OR)', 'local (maj. voting)', 'global'], 
			  y_pred=[labelspred, labelspred_maj, labelspred_glob], y_true=true_labels)

# compare local & global anomaly labels
compare_labels(plot_path, pred_labels=[labelspred, labelspred_maj], true_labels=true_labels, 
            plot_labels=['Local anomaly\n(inclusive OR)', 'Local anomaly\n(majority voting)'], name='_loc_vs_maj')
compare_labels(plot_path, pred_labels=[labelspred, labelspred_maj, labelspred_glob], true_labels=true_labels, 
            plot_labels=['Local anomaly\n(inclusive OR)', 'Local anomaly\n(majority voting)', 'Global anomaly'], name='_all')

# saving results
df_res_global = pd.DataFrame.from_dict(result_global, orient='index').T
df_res_global.index = ['global']
result_local1 = pd.DataFrame.from_dict(result_local1, orient='index').T
result_local2 = pd.DataFrame.from_dict(result_local2, orient='index').T
result_local1.index = ['local_all']
result_local2.index = ['local_all_maj']
df_res_local = pd.concat([df_res_local, result_local1, result_local2])
df_res = pd.concat([df_res_local, df_res_global]) 
df_labels = pd.DataFrame( {'local': labelspred, 'local_maj': labelspred_maj, 'global': labelspred_glob} )

df_res.to_csv(f'{res_path}/res.csv')	
df_labels.to_csv(f'{res_path}/pred_labels.csv', index=False)


local results
{'f1': 0.08288311118528607, 'precision': 0.043233639299239776, 'recall': 0.9999999863013701, 'TP': 730, 'TN': 52898, 'FP': 16155, 'FN': 0, 'ROC/AUC': 0.883024633252719, 'MCC': 0.18198653162571624}

local results with majority voting
{'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'TP': 0, 'TN': 69053, 'FP': 0, 'FN': 730, 'ROC/AUC': 0.5, 'MCC': 0.0}
(array([ 2112,  2113,  2114,  2115,  2116,  2117,  2118,  2119,  2120,
        2121,  2122,  2123,  2124,  2125,  2126,  2127,  2128,  2129,
        2130,  2131,  2132,  2133,  2134,  2135,  2136,  2137,  2138,
        2139,  2140,  2141,  2142,  2143,  2144,  2145,  2146,  2147,
        2148,  2149,  2150,  2151,  2152,  2153,  2154,  2155,  2156,
        2157,  2158,  2159,  2160,  2161,  2162,  2163,  2164,  2165,
        2166,  2167,  2168,  2169,  2170,  2171,  2172,  2173,  2174,
        2175,  2176,  2177,  2178,  2179,  2180,  2181,  2182,  2183,
        2184,  2185,  2186,  2187,  2188,  2189,  2190,  2191,  2192,
       

## Isolation forest
apply Isolation Forest (IF) model to time windows

In [10]:
import math
import os
from src.data_loader import MyDataset
from src.pot import calc_point2point
from src.pot import pot_eval
from src.plotting import plot_ascore, plot_labels, plot_metrics, compare_labels

In [11]:
dataset = 'GECCO'
feats = -1

res_path = f'IF/IF_{dataset}/feats{feats}'
plot_path = f'{res_path}/plots'
os.makedirs(plot_path, exist_ok=True)
res_path = f'{res_path}/results'
os.makedirs(res_path, exist_ok=True)
flag_less = True
n_windows = 10
modeltype = 'IF'

In [12]:
train_set = MyDataset(dataset, n_windows, n_windows, modeltype, flag='train', feats=feats, less=flag_less, enc=False, k=-1)
test_set = MyDataset(dataset, n_windows, n_windows, modeltype, flag='test', feats=feats, enc=False, less=flag_less, k=-1)
feats = test_set.feats
enc_feats = test_set.enc_feats
print(feats, enc_feats)

90 0


In [13]:
labels = test_set.get_labels()
train = train_set.get_complete_data()
test = test_set.get_complete_data()
print(train.shape, test.shape, labels.shape)

(10000, 9) (10000, 9) (10000, 90)


In [14]:
# print(len(labels[:,0][labels[:,0]==1])/len(labels[:,0]))
print(len(labels[labels==1])/len(labels), len(labels[labels==1]))

4.311 43110


In [15]:
contamination = 0.02

In [16]:
# isolation forest
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination=contamination, random_state=0).fit(train)
lossTfinal = clf.decision_function(train)
lossFinal = clf.decision_function(test)
labelspredIF = clf.predict(test)
labelspredIF = (labelspredIF == -1) + 0
print(len(labelspredIF[labelspredIF == 1]), len(labelspredIF[labelspredIF == 0]))
print(labelspredIF.shape)
print(lossTfinal.shape, lossFinal.shape)

2110 7890
(10000,)
(10000,) (10000,)


In [17]:
true_labels = (np.sum(labels, axis=1) >= 1) + 0

In [18]:
plot_labels(plot_path, 'labels_local', y_pred=labelspredIF, y_true=true_labels)
metrics_IF = calc_point2point(predict=labelspredIF, actual=true_labels)
result_IF = {'f1': metrics_IF[0], 'precision': metrics_IF[1], 'recall': metrics_IF[2],
            'TP': metrics_IF[3], 'TN': metrics_IF[4], 'FP': metrics_IF[5], 'FN': metrics_IF[6],
            'ROC/AUC': metrics_IF[7], 'MCC': metrics_IF[8]}
print('IF results')
print(result_IF)
df_res_IF = pd.DataFrame.from_dict(result_IF, orient='index').T


# global anomaly labels
result_global, pred2 = pot_eval(lossTfinal, lossFinal, true_labels, plot_path, f'all_dim', q=1e-5)
labelspred_glob = (pred2 >= 1) + 0
plot_ascore(plot_path, 'ascore_global', ascore=lossFinal, labels=true_labels)
plot_labels(plot_path, 'labels_global', y_pred=labelspred_glob, y_true=true_labels)
metrics_global = calc_point2point(predict=labelspred_glob, actual=true_labels)
print('\nglobal results') 
print(result_global)

plot_metrics(plot_path, ['IF', 'global'], 
			  y_pred=[labelspredIF, labelspred_glob], y_true=true_labels)

# compare local & global anomaly labels
compare_labels(plot_path, pred_labels=[labelspredIF, labelspred_glob], true_labels=true_labels, 
            plot_labels=['Anomaly from IF', 'Global anomaly'], name='_all')

# saving results
df_res_global = pd.DataFrame.from_dict(result_global, orient='index').T
df_res_global.index = ['global']
df_res = pd.concat([df_res_IF, df_res_global]) 
df_labels = pd.DataFrame( {'IF': labelspredIF, 'global': labelspred_glob} )

df_res.to_csv(f'{res_path}/res.csv')	
df_labels.to_csv(f'{res_path}/pred_labels.csv', index=False)

IF results
{'f1': 0.3530290403381545, 'precision': 0.21658767669863657, 'recall': 0.9540709612928818, 'TP': 457, 'TN': 7868, 'FP': 1653, 'FN': 22, 'ROC/AUC': 0.8902273822134523, 'MCC': 0.4084858773765253}

global results
{'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'TP': 0, 'TN': 9521, 'FP': 0, 'FN': 479, 'ROC/AUC': 0.5, 'MCC': 0.0, 'threshold': 0.20852962159184285}
