## First tests of the PySAD package

The objective of this notebook is to test three anomaly detection models (LODA, RS-Hash and xStream) from the PySAD package on four different datasets (arrhythmia, optdigits, breastw and wine).

The retained performance metric is ROC-AUC; it is saved together with the execution time of each test.

### Imports

In [1]:
import pysad

In [2]:
from pysad.evaluation.metrics import AUROCMetric
from pysad.models import LODA, RSHash, xStream
from pysad.utils.data import Data
from pysad.utils import ArrayStreamer
from tqdm import tqdm

In [3]:
import os

# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io` is available as an attribute on every SciPy version.
import scipy.io

# Directory containing the .mat files. Set the PYSAD_DATA_DIR environment
# variable to run the notebook on another machine; when it is not set, the
# original location is used unchanged.
path = os.environ.get(
    "PYSAD_DATA_DIR",
    "C:/Users/e32cl/Documents/M2/P2 Data Stream/Projet/projet_v2/PySAD_to_River/data/",
)

# Each loadmat result is indexed below with the keys 'X' and 'y'.
data_arr = scipy.io.loadmat(os.path.join(path, "arrhythmia.mat"))
data_opt = scipy.io.loadmat(os.path.join(path, "optdigits.mat"))
data_breast = scipy.io.loadmat(os.path.join(path, "breastw.mat"))
data_wine = scipy.io.loadmat(os.path.join(path, "wine.mat"))

In [5]:
import numpy as np
import time

### Anomalies percentage

In [4]:
# For each dataset, show the shape of the label array and its mean.
# NOTE(review): the mean equals the anomaly fraction only if labels are 0/1 — confirm.
for dataset in (data_arr, data_opt, data_breast, data_wine):
    labels = dataset['y']
    print(labels.shape, labels.mean())

(452, 1) 0.14601769911504425
(5216, 1) 0.028757668711656442
(683, 1) 0.34992679355783307
(129, 1) 0.07751937984496124


### Dataset arrhythmia

In [7]:
# Arrhythmia: inputs, labels as loaded (shape (452, 1)), and a flattened copy of the labels.
X_arr, y_arr = data_arr['X'], data_arr['y']
y_flat_arr = y_arr.flatten()

#### LODA

In [8]:
# LODA on arrhythmia: stream the samples in their stored order, pass each one
# to fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = LODA(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_arr, y_arr)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_LODA_arr = metric.get()
print(f"Area under ROC metric is {AUC_LODA_arr}.")
time_LODA_arr = time.time() - start
print('time (sec) : ', time_LODA_arr)

452it [00:12, 37.59it/s]

Area under ROC metric is 0.49870466321243523.
time (sec) :  12.041186094284058





#### RS-Hash

In [9]:
# RS-Hash on arrhythmia. The model is constructed with the minima and maxima
# of X_arr along axis 0, computed over the whole array up front.
start = time.time()
model = RSHash(X_arr.min(axis=0), X_arr.max(axis=0))
metric = AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_arr, y_arr)):
    # small score = more suspect to be an anomaly, hence the sign flip
    # before the score is handed to the metric
    metric.update(label, -model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_RSHash_arr = metric.get()
print(f"Area under ROC metric is {AUC_RSHash_arr}.")
time_RSHash_arr = time.time() - start
print('time (sec) : ', time_RSHash_arr)

452it [00:04, 91.12it/s] 

Area under ROC metric is 0.736026063746271.
time (sec) :  4.98899245262146





#### xStream

In [10]:
# xStream on arrhythmia: stream the samples in their stored order, pass each
# one to fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = xStream(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_arr, y_arr)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_xstream_arr = metric.get()
print(f"Area under ROC metric is {AUC_xstream_arr}.")
time_xstream_arr = time.time() - start
print('time (sec) : ', time_xstream_arr)

452it [02:25,  3.10it/s]

Area under ROC metric is 0.7125333647354373.
time (sec) :  145.93011689186096





### Dataset wine

In [11]:
# Wine: inputs, labels as loaded (shape (129, 1)), and a flattened copy of the labels.
X_wine = data_wine['X']
y_wine = data_wine['y']
# Stored under a wine-specific name so the arrhythmia labels held in
# `y_flat_arr` are not silently overwritten by this cell.
y_flat_wine = y_wine.flatten()

#### LODA

In [12]:
# LODA on wine: stream the samples in their stored order, pass each one to
# fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = LODA(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_wine, y_wine)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_LODA_wine = metric.get()
print(f"Area under ROC metric is {AUC_LODA_wine}.")
time_LODA_wine = time.time() - start
print('time (sec) : ', time_LODA_wine)

129it [00:04, 27.40it/s]

Area under ROC metric is 0.7.
time (sec) :  4.860857009887695





#### RS-Hash

In [13]:
# RS-Hash on wine. The model is constructed with the minima and maxima of
# X_wine along axis 0, computed over the whole array up front.
start = time.time()
model = RSHash(X_wine.min(axis=0), X_wine.max(axis=0))
metric = AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_wine, y_wine)):
    # The score is negated before it is handed to the metric.
    metric.update(label, -model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_RSHash_wine = metric.get()
print(f"Area under ROC metric is {AUC_RSHash_wine}.")
time_RSHash_wine = time.time() - start
print('time (sec) : ', time_RSHash_wine)

129it [00:02, 56.62it/s]


Area under ROC metric is 0.8285714285714285.
time (sec) :  2.3164610862731934


#### xStream

In [14]:
# xStream on wine: stream the samples in their stored order, pass each one to
# fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = xStream(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_wine, y_wine)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_xstream_wine = metric.get()
print(f"Area under ROC metric is {AUC_xstream_wine}.")
time_xstream_wine = time.time() - start
print('time (sec) : ', time_xstream_wine)

129it [00:25,  4.98it/s]

Area under ROC metric is 0.5033613445378151.
time (sec) :  25.993932962417603





### Dataset breast

In [15]:
# Breast: inputs, labels as loaded (shape (683, 1)), and a flattened copy of the labels.
X_breast, y_breast = data_breast['X'], data_breast['y']
y_flat_breast = y_breast.flatten()

#### LODA

In [16]:
# LODA on breast: stream the samples in their stored order, pass each one to
# fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = LODA(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_breast, y_breast)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_LODA_breast = metric.get()
print(f"Area under ROC metric is {AUC_LODA_breast}.")
time_LODA_breast = time.time() - start
print('time (sec) : ', time_LODA_breast)

683it [00:19, 35.72it/s]

Area under ROC metric is 0.49436936936936937.
time (sec) :  19.320590496063232





#### RS-Hash

In [17]:
# RS-Hash on breast. The model is constructed with the minima and maxima of
# X_breast along axis 0, computed over the whole array up front.
start = time.time()
model = RSHash(X_breast.min(axis=0), X_breast.max(axis=0))
metric = AUROCMetric()
# Create the streamer in this cell so it does not depend on an `iterator`
# object left in the kernel by a previously executed cell.
iterator = ArrayStreamer(shuffle=False)

for xi, yi in tqdm(iterator.iter(X_breast, y_breast)):
    anomaly_score = model.fit_score_partial(xi)
    # The score is negated before it is handed to the metric.
    metric.update(yi, -anomaly_score)

# Compute the AUC once and reuse it for both the report and the saved value.
AUC_RSHash_breast = metric.get()
print(f"Area under ROC metric is {AUC_RSHash_breast}.")
time_RSHash_breast = time.time() - start
print('time (sec) : ', time_RSHash_breast)

683it [00:08, 79.78it/s] 

Area under ROC metric is 0.9707772626182668.
time (sec) :  8.58812665939331





#### xStream

In [18]:
# xStream on breast: stream the samples in their stored order, pass each one
# to fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model = xStream()
metric = AUROCMetric()
# Create the streamer in this cell so it does not depend on an `iterator`
# object left in the kernel by a previously executed cell.
iterator = ArrayStreamer(shuffle=False)

for xi, yi in tqdm(iterator.iter(X_breast, y_breast)):
    anomaly_score = model.fit_score_partial(xi)
    metric.update(yi, anomaly_score)

# Compute the AUC once and reuse it for both the report and the saved value.
AUC_xstream_breast = metric.get()
print(f"Area under ROC metric is {AUC_xstream_breast}.")
time_xstream_breast = time.time() - start
print('time (sec) : ', time_xstream_breast)

683it [02:15,  5.03it/s]

Area under ROC metric is 0.94319424026537.
time (sec) :  135.78693413734436





### Dataset optdigits

In [19]:
# Optdigits: inputs, labels as loaded (shape (5216, 1)), and a flattened copy of the labels.
X_opt, y_opt = data_opt['X'], data_opt['y']
y_flat_opt = y_opt.flatten()

#### LODA

In [20]:
# LODA on optdigits: stream the samples in their stored order, pass each one
# to fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model, metric = LODA(), AUROCMetric()
iterator = ArrayStreamer(shuffle=False)

for features, label in tqdm(iterator.iter(X_opt, y_opt)):
    metric.update(label, model.fit_score_partial(features))

# Keep the final AUC and the wall-clock duration of the whole cell.
AUC_LODA_opt = metric.get()
print(f"Area under ROC metric is {AUC_LODA_opt}.")
time_LODA_opt = time.time() - start
print('time (sec) : ', time_LODA_opt)

5216it [02:16, 38.21it/s]

Area under ROC metric is 0.4997039084090012.
time (sec) :  136.69508004188538





#### RS-Hash

In [21]:
# RS-Hash on optdigits. The model is constructed with the minima and maxima
# of X_opt along axis 0, computed over the whole array up front.
start = time.time()
model = RSHash(X_opt.min(axis=0), X_opt.max(axis=0))
metric = AUROCMetric()
# Create the streamer in this cell so it does not depend on an `iterator`
# object left in the kernel by a previously executed cell.
iterator = ArrayStreamer(shuffle=False)

for xi, yi in tqdm(iterator.iter(X_opt, y_opt)):
    anomaly_score = model.fit_score_partial(xi)
    # The score is negated before it is handed to the metric.
    metric.update(yi, -anomaly_score)

# Compute the AUC once and reuse it for both the report and the saved value.
AUC_RSHash_opt = metric.get()
print(f"Area under ROC metric is {AUC_RSHash_opt}.")
time_RSHash_opt = time.time() - start
print('time (sec) : ', time_RSHash_opt)

5216it [01:01, 85.09it/s] 

Area under ROC metric is 0.5126727200947494.
time (sec) :  61.35401439666748





#### xStream

In [22]:
# xStream on optdigits: stream the samples in their stored order, pass each
# one to fit_score_partial and feed the returned score to the ROC-AUC metric.
start = time.time()
model = xStream()
metric = AUROCMetric()
# Create the streamer in this cell so it does not depend on an `iterator`
# object left in the kernel by a previously executed cell.
iterator = ArrayStreamer(shuffle=False)

for xi, yi in tqdm(iterator.iter(X_opt, y_opt)):
    anomaly_score = model.fit_score_partial(xi)
    metric.update(yi, anomaly_score)

# Compute the AUC once and reuse it for both the report and the saved value.
AUC_xstream_opt = metric.get()
print(f"Area under ROC metric is {AUC_xstream_opt}.")
time_xstream_opt = time.time() - start
print('time (sec) : ', time_xstream_opt)

5216it [21:35,  4.03it/s]

Area under ROC metric is 0.663018160284248.
time (sec) :  1295.6687059402466



