In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cycler import cycler
import mplhep as hep
from src.models import TranAD, iTransformer
from torch.utils.data import Dataset, DataLoader
import torch
from src.data_loader import convert_to_windows_new

In [4]:
# plt.rcParams['axes.prop_cycle'] = cycler('color', ['#17becf','#8c564b', '#e377c2', '#7f7f7f','#bcbd22', '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
plt.style.use([hep.style.ROOT, hep.style.firamath])

### redo older plots

In [None]:
iTransf_n_windows = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500, 1000, 2000]
TranAD_n_windows = [10, 20, 30, 40, 50, 60, 70]

In [None]:
N = 25
dataset = 'SMAP'

In [None]:
# def load_dataset(dataset):
# 	loader = []
# 	for file in ['train', 'test', 'labels']:
# 		if dataset == 'SMD': file = 'machine-1-1_' + file
# 		if dataset == 'SMAP': file = 'P-1_' + file
# 		if dataset == 'MSL': file = 'C-1_' + file
# 		if dataset == 'UCR': file = '136_' + file
# 		if dataset == 'NAB': file = 'ec2_request_latency_system_failure_' + file
# 		loader.append(np.load(os.path.join(f'processed/{dataset}', f'{file}.npy')))
	
# 	train_loader = DataLoader(loader[0], batch_size=loader[0].shape[0])
# 	test_loader = DataLoader(loader[1], batch_size=loader[1].shape[0])
# 	labels = loader[2]
# 	print('training set shape:', train_loader.dataset.shape)
# 	print('test set shape:', test_loader.dataset.shape)
# 	return train_loader, test_loader, labels

In [None]:
from src.plotting import plot_ascore, plot_labels, plot_metrics, compare_labels
from main import backprop
from src.pot import pot_eval

In [None]:
def replotting(model_name, n_windows, dataset='SMAP', N=25):
    for i, elem in enumerate(n_windows):
        # path = f'studies/{model_name}_{dataset}_2/n_window{elem}/results'
        model_path = f'studies/{model_name}_{dataset}_2/n_window{elem}/checkpoints/model.ckpt'
        plot_path = f'studies/{model_name}_{dataset}_2/n_window{elem}/plots'

        # initialise model etc
        checkpoint = torch.load(model_path)
        if model_name == 'TranAD':
            model = TranAD(N, elem).double()
        elif model_name == 'iTransformer':
            model = iTransformer(N, elem).double()
        else:
            print('pbm with model loading')
            break
        optimizer = torch.optim.AdamW(model.parameters() , lr=model.lr, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, 0.9)
        # load model etc
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        # epoch = checkpoint['epoch']
        # accuracy_list = checkpoint['accuracy_list']
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f'total params: {total_params}, trainable params: {trainable_params}')

        train_loader, test_loader, labels = load_dataset(dataset)
        trainD, testD = next(iter(train_loader)), next(iter(test_loader))
        trainO, testO = trainD, testD
        # prepare data
        if model.name in ['Attention', 'DAGMM', 'USAD', 'MSCRED', 'CAE_M', 'GDN', 'MTAD_GAT', 'MAD_GAN', 'iTransformer'] or 'TranAD' in model.name: 
            trainD, testD = convert_to_windows(trainD, model), convert_to_windows(testD, model)
        
        ### Testing phase
        torch.zero_grad = True
        model.eval()
        lossT, _ = backprop(0, model, trainD, trainO, optimizer, scheduler, training=False)  # training loss for POT
        loss, y_pred = backprop(0, model, testD, testO, optimizer, scheduler, training=False)

        preds = []
        for i in range(loss.shape[1]):
            lt, l, ls = lossT[:, i], loss[:, i], labels[:, i]
            result, pred = pot_eval(lt, l, ls, plot_path, f'dim{i}')
            preds.append(pred)
            df_res = pd.DataFrame.from_dict(result, orient='index').T
        lossTfinal, lossFinal = np.mean(lossT, axis=1), np.mean(loss, axis=1)
        labelsFinal = (np.sum(labels, axis=1) >= 1) + 0
        preds = np.array(preds).T
        preds = preds.astype(int)
        df_labels = pd.DataFrame(preds)
        labelspred = (np.sum(preds, axis=1) >= 1) + 0
        print(loss.shape, labelsFinal.shape)
        
        plot_ascore(plot_path, 'ascore_local', loss, labelsFinal)
        plot_labels(plot_path, 'labels_local', labelspred, labelsFinal)
        plot_metrics(plot_path, 'metrics_local', labelspred, labelsFinal)

        result2, pred2 = pot_eval(lossTfinal, lossFinal, labelsFinal, plot_path, f'all_dim')
        labelspred2 = (pred2 >= 1) + 0
        plot_ascore(plot_path, 'ascore_global', lossFinal, labelsFinal)
        plot_labels(plot_path, 'labels_global', labelspred2, labelsFinal)
        plot_metrics(plot_path, 'metrics_global', labelspred2, labelsFinal)
        
        # arr = np.where(labelspred!=labelspred2)
        # print(len(arr[0]), len(np.where(labelspred==labelspred2)[0]))

        compare_labels(plot_path, labels_loc=labelspred, labels_glob=labelspred2, labels=labelsFinal)


In [None]:
replotting('TranAD', TranAD_n_windows, dataset, 25)
replotting('iTransformer', iTransf_n_windows, dataset, 25)

## apply POT directly to ATLAS time series features without applying any model before

In [58]:
import math
import os
from main import load_dataset
from src.pot import calc_point2point
from src.pot import pot_eval
from src.plotting import plot_ascore, plot_labels, plot_metrics, compare_labels

In [59]:
dataset = 'GECCO'
feats = -1
res_path = f'None/None_{dataset}/feats{feats}'
plot_path = res_path
os.makedirs(plot_path, exist_ok=True)

In [60]:
train_loader, test_loader, label, _ = load_dataset(dataset, feats=feats, less=False)
train, test = next(iter(train_loader)), next(iter(test_loader))
print(train.shape, test.shape, label.shape)
train = np.array(train)
test = np.array(test)
labels = np.array(label)
print(train.shape, test.shape, labels.shape)

training set shape: (83739, 9)
test set shape: (55827, 9)
labels shape: (55827, 9)
ts_lengths 0: 83739
ts_lengths 1: 55827
torch.Size([83739, 9]) torch.Size([55827, 9]) (55827, 9)
(83739, 9) (55827, 9) (55827, 9)


In [61]:
lossT = np.abs(train)
loss = np.abs(test)

In [62]:
# local anomaly labels
df_res_local = pd.DataFrame()
preds = []
for i in range(loss.shape[1]):
    lt, l, ls = lossT[:, i], loss[:, i], labels[:, i]  
    result_local, pred = pot_eval(lt, l, ls, plot_path, f'dim{i}', q=1e-5)
    preds.append(pred)
    df_res = pd.DataFrame.from_dict(result_local, orient='index').T
    df_res_local = pd.concat([df_res_local, df_res], ignore_index=True)
lossTfinal, lossFinal = np.mean(lossT, axis=1), np.mean(loss, axis=1)
true_labels = (np.sum(labels, axis=1) >= 1) + 0
preds = np.array(preds).T
preds = preds.astype(int)
labelspred = (np.sum(preds, axis=1) >= 1) + 0


In [63]:
plot_ascore(plot_path, 'ascore_local', ascore=loss, labels=true_labels)
plot_labels(plot_path, 'labels_local', y_pred=labelspred, y_true=true_labels)
result_local = calc_point2point(predict=labelspred, actual=true_labels)
result_local1 = {'f1': result_local[0], 'precision': result_local[1], 'recall': result_local[2], 
                'TP': result_local[3], 'TN': result_local[4], 'FP': result_local[5], 'FN': result_local[6], 
                'ROC/AUC': result_local[7], 'MCC': result_local[8]}
print('local results')
print(result_local1)

# do majority voting over dimensions for local results instead of inclusive OR
majority = math.ceil(labels.shape[1] / 2)
labelspred_maj = (np.sum(preds, axis=1) >= majority) + 0
plot_labels(plot_path, 'labels_local_maj', y_pred=labelspred_maj, y_true=true_labels)
result_local = calc_point2point(predict=labelspred_maj, actual=true_labels)
result_local2 = {'f1': result_local[0], 'precision': result_local[1], 'recall': result_local[2], 
                'TP': result_local[3], 'TN': result_local[4], 'FP': result_local[5], 'FN': result_local[6], 
                'ROC/AUC': result_local[7], 'MCC': result_local[8]}
print('\nlocal results with majority voting')
print(result_local2)
temp = np.where(labelspred_maj != true_labels)
print(temp, np.all(labelspred_maj == true_labels))

# global anomaly labels
result_global, pred2 = pot_eval(lossTfinal, lossFinal, true_labels, plot_path, f'all_dim', q=1e-5)
labelspred_glob = (pred2 >= 1) + 0
plot_ascore(plot_path, 'ascore_global', ascore=lossFinal, labels=true_labels)
plot_labels(plot_path, 'labels_global', y_pred=labelspred_glob, y_true=true_labels)
metrics_global = calc_point2point(predict=labelspred_glob, actual=true_labels)
print('\nglobal results') 
print(result_global)

plot_metrics(plot_path, ['local (incl. OR)', 'local (maj. voting)', 'global'], 
			  y_pred=[labelspred, labelspred_maj, labelspred_glob], y_true=true_labels)

# compare local & global anomaly labels
compare_labels(plot_path, pred_labels=[labelspred, labelspred_maj], true_labels=true_labels, 
            plot_labels=['Local anomaly\n(inclusive OR)', 'Local anomaly\n(majority voting)'], name='_loc_vs_maj')
compare_labels(plot_path, pred_labels=[labelspred, labelspred_maj, labelspred_glob], true_labels=true_labels, 
            plot_labels=['Local anomaly\n(inclusive OR)', 'Local anomaly\n(majority voting)', 'Global anomaly'], name='_all')

# saving results
df_res_global = pd.DataFrame.from_dict(result_global, orient='index').T
df_res_global.index = ['global']
result_local1 = pd.DataFrame.from_dict(result_local1, orient='index').T
result_local2 = pd.DataFrame.from_dict(result_local2, orient='index').T
result_local1.index = ['local_all']
result_local2.index = ['local_all_maj']
df_res_local = pd.concat([df_res_local, result_local1, result_local2])
df_res = pd.concat([df_res_local, df_res_global]) 
df_labels = pd.DataFrame( {'local': labelspred, 'local_maj': labelspred_maj, 'global': labelspred_glob} )

df_res.to_csv(f'{res_path}/res.csv')	
df_labels.to_csv(f'{res_path}/pred_labels.csv', index=False)


local results
{'f1': 0.011370215364927579, 'precision': 0.005717670105078095, 'recall': 0.9999999601593641, 'TP': 251, 'TN': 11928, 'FP': 43648, 'FN': 0, 'ROC/AUC': 0.6073125089966892, 'MCC': 0.03503077289273145}

local results with majority voting
{'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'TP': 0, 'TN': 55576, 'FP': 0, 'FN': 251, 'ROC/AUC': 0.5, 'MCC': 0.0}
(array([ 7828,  7829,  7830,  7831,  7832,  7833,  7834,  7835,  7836,
        7837,  7838,  7839,  7840,  7841,  7842,  7843,  7844,  7845,
        7846,  7847,  7848,  7849,  7850,  7851,  7852,  7853,  7854,
        7855,  7856,  9027,  9028,  9029,  9030,  9031,  9032,  9033,
        9034,  9035,  9036,  9037,  9038,  9039,  9040,  9041,  9042,
        9043,  9044,  9045,  9046,  9047,  9048,  9049,  9050,  9051,
        9052,  9053,  9054,  9055, 10324, 10325, 10326, 10327, 10328,
       10329, 10330, 10331, 10332, 10333, 10334, 10335, 10336, 10337,
       10338, 10339, 10340, 10341, 10342, 10343, 10344, 10345, 10346,
     

### test Isolation forest

In [64]:
import math
import os
from main import load_dataset
from src.pot import calc_point2point
from src.pot import pot_eval
from src.plotting import plot_ascore, plot_labels, plot_metrics, compare_labels

In [65]:
dataset = 'GECCO'
feats = -1
contamination = 0.01
res_path = f'IF/IF_{dataset}/feats{feats}'
plot_path = f'{res_path}/plots'
os.makedirs(plot_path, exist_ok=True)
res_path = f'{res_path}/results'
os.makedirs(res_path, exist_ok=True)


In [66]:
train_loader, test_loader, labels, _ = load_dataset(dataset, feats=feats)
train, test = next(iter(train_loader)), next(iter(test_loader))
print(train.shape, test.shape, labels.shape)
train = np.array(train)
test = np.array(test)
labels = np.array(labels)
print(train.shape, test.shape, labels.shape)

training set shape: (83739, 9)
test set shape: (55827, 9)
labels shape: (55827, 9)
ts_lengths 0: 83739
ts_lengths 1: 55827
torch.Size([83739, 9]) torch.Size([55827, 9]) (55827, 9)
(83739, 9) (55827, 9) (55827, 9)


In [67]:
# print(len(labels[:,0][labels[:,0]==1])/len(labels[:,0]))
print(len(labels[labels==1])/len(labels), len(labels[labels==1]))

0.040464291471868453 2259


In [68]:
# isolation forest
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination=contamination, random_state=0).fit(train)
lossTfinal = clf.decision_function(train)
lossFinal = clf.decision_function(test)
labelspredIF = clf.predict(test)
labelspredIF = (labelspredIF == -1) + 0
print(len(labelspredIF[labelspredIF == 1]), len(labelspredIF[labelspredIF == 0]))
print(labelspredIF.shape)
print(lossTfinal.shape, lossFinal.shape)

2015 53812
(55827,)
(83739,) (55827,)


In [69]:
true_labels = (np.sum(labels, axis=1) >= 1) + 0

In [70]:
plot_labels(plot_path, 'labels_local', y_pred=labelspredIF, y_true=true_labels)
metrics_IF = calc_point2point(predict=labelspredIF, actual=true_labels)
result_IF = {'f1': metrics_IF[0], 'precision': metrics_IF[1], 'recall': metrics_IF[2],
            'TP': metrics_IF[3], 'TN': metrics_IF[4], 'FP': metrics_IF[5], 'FN': metrics_IF[6],
            'ROC/AUC': metrics_IF[7], 'MCC': metrics_IF[8]}
print('IF results')
print(result_IF)
df_res_IF = pd.DataFrame.from_dict(result_IF, orient='index').T


# global anomaly labels
result_global, pred2 = pot_eval(lossTfinal, lossFinal, true_labels, plot_path, f'all_dim', q=1e-5)
labelspred_glob = (pred2 >= 1) + 0
plot_ascore(plot_path, 'ascore_global', ascore=lossFinal, labels=true_labels)
plot_labels(plot_path, 'labels_global', y_pred=labelspred_glob, y_true=true_labels)
metrics_global = calc_point2point(predict=labelspred_glob, actual=true_labels)
print('\nglobal results') 
print(result_global)

plot_metrics(plot_path, ['IF', 'global'], 
			  y_pred=[labelspredIF, labelspred_glob], y_true=true_labels)

# compare local & global anomaly labels
compare_labels(plot_path, pred_labels=[labelspredIF, labelspred_glob], true_labels=true_labels, 
            plot_labels=['Anomaly from IF', 'Global anomaly'], name='_all')

# saving results
df_res_global = pd.DataFrame.from_dict(result_global, orient='index').T
df_res_global.index = ['global']
df_res = pd.concat([df_res_IF, df_res_global]) 
df_labels = pd.DataFrame( {'IF': labelspredIF, 'global': labelspred_glob} )

df_res.to_csv(f'{res_path}/res.csv')	
df_labels.to_csv(f'{res_path}/pred_labels.csv', index=False)

IF results
{'f1': 0.03795036872341088, 'precision': 0.021339950266302976, 'recall': 0.17131473421056836, 'TP': 43, 'TN': 53604, 'FP': 1972, 'FN': 208, 'ROC/AUC': 0.5679158993793073, 'MCC': 0.04871972896281428}

global results
{'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'TP': 0, 'TN': 55576, 'FP': 0, 'FN': 251, 'ROC/AUC': 0.5, 'MCC': 0.0, 'threshold': 0.21992938575143553}
