In [1]:
import pandas as pd
import numpy as np
# import h5py
import matplotlib.pyplot as plt
import os
import glob

In [2]:
from src.data_loader_old import load_dataset

In [3]:
def consecutive_nonzero_lengths(arr):
    """Return the length of every maximal run of consecutive non-zero values.

    Args:
        arr: Iterable of values that can be compared against ``0``.

    Returns:
        A list of run lengths, in the order the runs occur in ``arr``.
        Empty when ``arr`` holds no non-zero value.
    """
    run_lengths = []
    run = 0
    for item in arr:
        if item != 0:
            # Still inside a run: extend it and move on.
            run += 1
            continue
        # A zero ends the current run, if one was open.
        if run > 0:
            run_lengths.append(run)
        run = 0
    # A run that reaches the end of the input has no closing zero.
    if run > 0:
        run_lengths.append(run)
    return run_lengths

In [5]:
# Load WADI through the project loader imported from src.data_loader_old.
train_loader, test_loader, labels, ts_lengths, enc_feats = load_dataset('WADI')

# Shapes of the arrays held by the train / test loaders' datasets.
print(train_loader.dataset.data.shape, test_loader.dataset.data.shape)
# Collapse labels along axis 1: 1 where the row sum is >= 1, else 0
# (`+ 0` turns the boolean mask into integers).
true_labels = (np.sum(labels, axis=1) >= 1) + 0
print(true_labels.shape)
# Count of rows with true_labels == 1 and their share of all rows in percent.
print(np.sum(true_labels), f'{np.sum(true_labels) / len(true_labels)*100:.2f} %')

# Lengths of the consecutive runs of rows with true_labels == 1
# (presumably the individual anomaly segments - verify against the dataset docs).
anomaly_lens = consecutive_nonzero_lengths(true_labels)
print(len(anomaly_lens), anomaly_lens)
# Mean, longest and shortest run length.
print(np.mean(anomaly_lens), np.max(anomaly_lens), np.min(anomaly_lens))

training set shape: (784571, 127)
test set shape: (172803, 127)
labels shape: (172803, 127)
ts_lengths 0: 784571
ts_lengths 1: 172803
(784571, 127) (172803, 127)
(172803,)
9977 5.77 %
14 [1501, 591, 1741, 851, 671, 699, 581, 203, 88, 807, 675, 361, 577, 631]
712.6428571428571 1741 88


In [6]:
def load_dataset2(dataset, feats=-1, less=False, enc=False):
	"""Load the train / test / labels ``.npy`` files of a processed dataset as arrays.

	For each of ``'train'``, ``'test'`` and ``'labels'`` every file matching
	``processed/<dataset>/*<name>*.npy`` is loaded in sorted path order and
	concatenated along axis 0. For datasets whose name contains ``'ATLAS_DQM'``
	the test and labels files are looked up with the ``_pumpNoise`` suffix.

	Args:
		dataset: Name of the sub-folder of ``processed`` to read from.
		feats: If > 0, keep only the first ``feats`` columns of the train and
			test arrays. Labels are not truncated.
		less: Unused; kept so existing callers keep working.
		enc: Unused; kept so existing callers keep working.

	Returns:
		Tuple ``(train, test, labels, ts_lengths, enc_feats)`` where
		``ts_lengths`` holds, for train, test and labels in that order, the
		list of per-file row counts, and ``enc_feats`` is always 0.

	Raises:
		FileNotFoundError: If the dataset folder does not exist or if no file
			matches one of the three patterns.
	"""
	folder = os.path.join('processed', dataset)
	if not os.path.exists(folder):
		raise FileNotFoundError(f'Processed Data not found. Expected folder: {folder}')
	loader = []
	ts_lengths = []
	enc_feats = 0

	for file in ['train', 'test', 'labels']:
		if 'ATLAS_DQM' in dataset and file != 'train':
			# Other suffixes used so far: _cosmicCalo, _hardProbes, _hvononNominal
			file = f'{file}_pumpNoise'
		pattern = os.path.join(folder, f'*{file}*.npy')
		# sort paths to ensure correct order, otherwise labels & test files are mismatched
		paths = sorted(glob.glob(pattern))
		if not paths:
			# Fail with the offending pattern instead of np.concatenate's
			# generic "need at least one array to concatenate".
			raise FileNotFoundError(f'No files matching {pattern}')

		# Load every file once and reuse the arrays for the length bookkeeping.
		arrays = [np.load(p) for p in paths]
		loader.append(np.concatenate(arrays))
		ts_lengths.append([a.shape[0] for a in arrays])

	if feats > 0:  # reduce number of features
		print(f'data set has {loader[0].shape[1]} features, only using {feats}')
		max_feats = feats + enc_feats
		for i in range(2):  # train and test only
			loader[i] = loader[i][:, :max_feats]

	train_loader = loader[0]
	test_loader = loader[1]
	labels = loader[2]

	# if labels are one dimensional, add axis
	if len(labels.shape) == 1:
		labels = labels[:, np.newaxis]
	if labels.shape[1] == 1:  # single label column: repeat it once per (kept) training feature
		labels = np.repeat(labels, train_loader.shape[1], axis=1)

	print('training set shape:', train_loader.shape)
	print('test set shape:', test_loader.shape)
	print('labels shape:', labels.shape)
	print('ts_lengths 0:', np.sum(ts_lengths[0]))
	print('ts_lengths 1:', np.sum(ts_lengths[1]))
	return train_loader, test_loader, labels, ts_lengths, enc_feats

In [7]:
# Reload WADI with the notebook-local load_dataset2, which returns plain arrays.
train_loader, test_loader, labels, ts_lengths, enc_feats = load_dataset2('WADI')
print(train_loader.shape, test_loader.shape)
# Collapse labels along axis 1: 1 where the row sum is >= 1, else 0
# (`+ 0` turns the boolean mask into integers).
true_labels = (np.sum(labels, axis=1) >= 1) + 0
print(true_labels.shape)
# Count of rows with true_labels == 1 and their share of all rows in percent.
print(np.sum(true_labels), f'{np.sum(true_labels) / len(true_labels)*100:.2f} %')

# Lengths of the consecutive runs of rows with true_labels == 1.
anomaly_lens = consecutive_nonzero_lengths(true_labels)
print(len(anomaly_lens), anomaly_lens)
# Mean, longest and shortest run length.
print(np.mean(anomaly_lens), np.max(anomaly_lens), np.min(anomaly_lens))

training set shape: (784571, 127)
test set shape: (172803, 127)
labels shape: (172803, 127)
ts_lengths 0: 784571
ts_lengths 1: 172803
(784571, 127) (172803, 127)
(172803,)
9977 5.77 %
14 [1501, 591, 1741, 851, 671, 699, 581, 203, 88, 807, 675, 361, 577, 631]
712.6428571428571 1741 88
