In [1]:
import sys, os, time
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest

%reload_ext autoreload
%autoreload 2

Importing PBAD and PreProcessor requires the Cython code to be compiled first (using setup.py).

The path they are imported from can differ between setups; the next cell derives it from the current working directory.

In [2]:
# Make the project's `src` directory importable.
# The directory is located by taking the part of the current working directory
# that precedes 'notebooks' and appending 'src'.
try:
    cwd = os.getcwd()
    src_path = os.path.join(cwd.split('notebooks')[0], 'src')
    # Guard against stacking duplicate entries when this cell is re-run.
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
except OSError as err:
    # os.getcwd() is the only call here that can fail (e.g. the working
    # directory was removed); stay best-effort but report the actual cause
    # instead of silently swallowing every exception.
    print('Failed to add path:', err)

In [4]:
from methods.PreProcessor import PreProcessor
from methods.PBAD import PBAD
from baselines.PAV import PAV
from baselines.FPOF import FPOF
from baselines.MPAD import MPAD

# Univariate time series

## load the data

In [6]:
# Build the data location from the part of the working directory that
# precedes 'notebooks' (`cwd` is set in the path-setup cell).
base_dir = cwd.split('notebooks')[0]
data_path = os.path.join(base_dir, 'data/univariate')

# Read the training file; the first CSV column becomes the index.
train_csv = os.path.join(data_path, 'ambient_temperature', 'train_data.csv')
univariate_data = pd.read_csv(train_csv, index_col=0, header=0)

# First remaining column -> series values (stored under key 0),
# second remaining column -> labels.
series_values = univariate_data.iloc[:, 0].values
continuous_data = {0: series_values}
labels = univariate_data.iloc[:, 1].values

In [7]:
# read out the recommended preprocessing settings for this dataset (THIS STEP CAN BE SKIPPED)
# NOTE: pickle.load can execute arbitrary code; only load settings files you trust.
settings_file = os.path.join(data_path, 'ambient_temperature', 'data_settings.pickle')
# Use a context manager so the file handle is closed (a bare open() leaks it).
with open(settings_file, 'rb') as fh:
    recommended_settings = pickle.load(fh)
recommended_settings

{'alphabet_size': 30,
 'bin_size': 1,
 'capvalue': 0.5,
 'data_type': 'univariate',
 'discretize': False,
 'dname': 'ambient_temperature',
 'mph': 1,
 'scaler': 1.0,
 'scaling': False,
 'wincrement': 6.0,
 'wsize': 12.0}

In [8]:
# read out the recommended preprocessing settings for this dataset (THIS STEP CAN BE SKIPPED)
# NOTE: pickle.load can execute arbitrary code; only load settings files you trust.
settings_file = os.path.join(data_path, 'request_latency', 'data_settings.pickle')
# Use a context manager so the file handle is closed (a bare open() leaks it).
with open(settings_file, 'rb') as fh:
    recommended_settings = pickle.load(fh)
recommended_settings

{'alphabet_size': 30,
 'bin_size': 1,
 'capvalue': 0.5,
 'data_type': 'univariate',
 'discretize': False,
 'dname': 'request_latency',
 'mph': 12,
 'scaler': 1.0,
 'scaling': False,
 'wincrement': 6.0,
 'wsize': 12.0}

In [10]:
# read out the recommended preprocessing settings for this dataset (THIS STEP CAN BE SKIPPED)
# NOTE: pickle.load can execute arbitrary code; only load settings files you trust.
settings_file = os.path.join(data_path, 'new_york_taxi', 'data_settings.pickle')
# Use a context manager so the file handle is closed (a bare open() leaks it).
with open(settings_file, 'rb') as fh:
    recommended_settings = pickle.load(fh)
recommended_settings

{'alphabet_size': 30,
 'bin_size': 1,
 'capvalue': 0.5,
 'data_type': 'univariate',
 'discretize': False,
 'dname': 'new_york_taxi',
 'mph': 2,
 'scaler': 1.0,
 'scaling': False,
 'wincrement': 6.0,
 'wsize': 12.0}

## PBAD

In [6]:
# Preprocessing: no scaling, smoothing or discretisation; windows of 12 with increment 6.
pbad_prep_kwargs = dict(
    scaling=False,
    smoothing=False,
    discretize=False,
    window_size=12,
    window_incr=6,
    bin_size=1,
    alphabet_size=30,
)
prep = PreProcessor(**pbad_prep_kwargs)
# The second return value is not needed here.
cont_prep, _, window_labels = prep.preprocess(continuous_series=continuous_data, labels=labels)

# Fit PBAD on the preprocessed series and collect its scores.
pbad_kwargs = dict(
    relative_minsup=0.01,
    jaccard_threshold=0.9,
    pattern_type='all',
    pattern_pruning='closed',
)
mdl = PBAD(**pbad_kwargs)
scores = mdl.fit_predict(continuous_data=cont_prep)


Running preprocessor on TIME SERIES with settings & steps:
0. remove extreme values (mean +/- 3 * stdv) + min-max scaling is always applied FIRST
1. scaling:             NO
2. smoothing:           NO
3. binning:             NO
4. subsampling:         NO
5. discretizing:        NO
6. window (size - inc): 12 - 6

DEBUG: Mining CLOSED ITEMSETS with SPMF CHARM; #rows:1211 minsup absolute: 0.01

DEBUG: Found #4509 patterns
DEBUG: # Jaccard thresholded patterns: 4461
DEBUG: most frequent patterns: [(array([ 0.55737921]), 285), (array([ 0.56866954]), 281), (array([ 0.60254055]), 271), (array([ 0.53479853]), 270), (array([ 0.59125022]), 269)]
DEBUG: least frequent patterns: [(array([ 0.61383089,  0.62512123,  0.6477019 ,  0.68157291,  0.69286324]), 13), (array([ 0.61383089,  0.63641156,  0.6477019 ,  0.65899223,  0.67028257,
        0.69286324]), 13), (array([ 0.50092753,  0.5235082 ,  0.53479853,  0.54608887,  0.55737921,
        0.59125022]), 13), (array([ 0.60254055,  0.63641156,  0.658992

In [7]:
# Evaluation is restricted to the labeled segments: only windows whose
# label is non-zero are scored.
is_labeled = window_labels != 0
ixl = np.nonzero(is_labeled)[0]
auroc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
print('AUROC =', auroc)

AUROC = 0.999236874237


## PAV

In [8]:
# Preprocessing: no scaling, smoothing or discretisation; windows of 12 with increment 6.
pav_prep_kwargs = dict(
    scaling=False,
    smoothing=False,
    discretize=False,
    window_size=12,
    window_incr=6,
    bin_size=1,
    alphabet_size=30,
)
prep = PreProcessor(**pav_prep_kwargs)
# The second return value is not needed here.
cont_prep, _, window_labels = prep.preprocess(continuous_series=continuous_data, labels=labels)

# Fit PAV; it is given the same window size and increment as the preprocessor.
mdl = PAV()
scores = mdl.fit_predict(continuous_data=cont_prep, window_size=12, window_incr=6)

# AUROC over the windows with a non-zero label only.
is_labeled = window_labels != 0
ixl = np.nonzero(is_labeled)[0]
auroc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
print('\nAUROC =', auroc)


Running preprocessor on TIME SERIES with settings & steps:
0. remove extreme values (mean +/- 3 * stdv) + min-max scaling is always applied FIRST
1. scaling:             NO
2. smoothing:           NO
3. binning:             NO
4. subsampling:         NO
5. discretizing:        NO
6. window (size - inc): 12 - 6

AUROC = 0.589667277167


## FPOF

In [9]:
# Preprocessing: this run enables discretisation (alphabet size 30);
# windows of 12 with increment 6.
fpof_prep_kwargs = dict(
    scaling=False,
    smoothing=False,
    discretize=True,
    window_size=12,
    window_incr=6,
    bin_size=1,
    alphabet_size=30,
)
prep = PreProcessor(**fpof_prep_kwargs)
# The second return value is not needed here.
cont_prep, _, window_labels = prep.preprocess(continuous_series=continuous_data, labels=labels)

# Fit FPOF on the preprocessed series and collect its scores.
fpof_kwargs = dict(relative_minsup=0.01, jaccard_threshold=0.9, pattern_pruning='closed')
mdl = FPOF(**fpof_kwargs)
scores = mdl.fit_predict(continuous_data=cont_prep)

# AUROC over the windows with a non-zero label only.
is_labeled = window_labels != 0
ixl = np.nonzero(is_labeled)[0]
auroc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
print('\nAUROC =', auroc)


Running preprocessor on TIME SERIES with settings & steps:
0. remove extreme values (mean +/- 3 * stdv) + min-max scaling is always applied FIRST
1. scaling:             NO
2. smoothing:           NO
3. binning:             NO
4. subsampling:         NO
5. discretizing:        YES
   alphabet size:       30
6. window (size - inc): 12 - 6

DEBUG: Mining CLOSED ITEMSETS with SPMF CHARM; #rows:1211 minsup absolute: 0.01

DEBUG: Found #650 patterns
DEBUG: # Jaccard thresholded patterns: 551
DEBUG: most frequent patterns: [(array([ 0.53]), 445), (array([ 0.5]), 428), (array([ 0.6]), 418), (array([ 0.57]), 418), (array([ 0.47]), 391)]
DEBUG: least frequent patterns: [(array([ 0.4 ,  0.53,  0.57,  0.6 ]), 13), (array([ 0.3 ,  0.4 ,  0.47,  0.5 ,  0.53]), 13), (array([ 0.13,  0.3 ,  0.37]), 13), (array([ 0.4,  0.5,  0.6]), 13), (array([ 0.13,  0.2 ,  0.23,  0.3 ]), 13)]

AUROC = 0.99503968254


## Matrix profile anomaly detection (MPAD)

In [10]:
# Matrix profile works on the original data, so no preprocessing is run here;
# the PreProcessor is only created to cut the scores into windows below.
prep = PreProcessor(window_size=12, window_incr=6)

# Fit MPAD directly on the raw series dictionary.
mdl = MPAD(window_size=12)
raw_scores = mdl.fit_predict(continuous_data)

# Turn the raw scores into one score per window by summing within each window.
w_scores = prep._fast_divide_series_into_windows(raw_scores, 'continuous')
scores = np.sum(w_scores, axis=1).T

# AUROC over the windows with a non-zero label only.
# NOTE(review): `window_labels` is not produced in this cell; it is whatever an
# earlier cell left behind. Presumably that is valid because the earlier cells
# use the same window size/increment (12/6) - confirm before changing either.
is_labeled = window_labels != 0
ixl = np.nonzero(is_labeled)[0]
auroc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
print('\nAUROC =', auroc)

100%|██████████| 7255/7255 [00:00<00:00, 9213.83it/s]


AUROC = 0.451923076923





## MIFPOD