# Basic experiments

Experiments comparing Random Isolation Similarity Forest to other outlier (anomaly) detection algorithms

In [1]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')
from data.data_getter import get_numerical_datasets

from sklearn.metrics import roc_auc_score

We will use different outlier detection algorithms to compare to RISF:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [2]:
from pyod.models.ecod import ECOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.hbos import HBOS
# from isf.forest import RandomIsolationSimilarityForest

In [3]:
SEED = 23


We will measure AUC (as a binary classification task of being an outlier) and processing time. We can show plots for every algorithm and the top-N feature importance

In [7]:
datasets = get_numerical_datasets()

In [79]:
from sklearn.metrics import roc_auc_score
from pyod.utils.utility import precision_n_scores
import matplotlib.pyplot as plt

In [26]:
def new_clf(name):
    if name == 'ECOD':
        return ECOD()
    if name == 'LOF':
        return LOF()
    if name == 'IForest':
        return IForest(random_state=SEED)
    if name == 'HBOS':
        return HBOS()
    else:
        raise NotImplementedError()

In [48]:
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS']
nan_datasets = []
results = {x: {} for x in clfs_names}

In [53]:
import time
import gc
from typing import Literal, Optional, NoReturn


class Timer:
    """
    timer type can only take the following string values:
    - "performance": the most precise clock in the system.
    - "process": measures the CPU time, meaning sleep time is not measured.
    - "long_running": it is an increasing clock that do not change when the
        date and or time of the machine is changed.
    """

    _counter_start: Optional[int] = None
    _counter_stop: Optional[int] = None

    def __init__(
        self,
        timer_type: Literal["performance", "process", "long_running"] = "performance",
        disable_garbage_collect: bool = True,
    ) -> None:
        self.timer_type = timer_type
        self.disable_garbage_collect = disable_garbage_collect

    def start(self) -> None:
        if self.disable_garbage_collect:
            gc.disable()
        self._counter_start = self._get_counter()

    def stop(self) -> None:
        self._counter_stop = self._get_counter()
        if self.disable_garbage_collect:
            gc.enable()

    @property
    def time_nanosec(self) -> int:
        self._valid_start_stop()
        return self._counter_stop - self._counter_start  # type: ignore

    @property
    def time_sec(self) -> float:
        return self.time_nanosec / 1e9

    def _get_counter(self) -> int:
        counter: int
        if self.timer_type == "performance":
            counter = time.perf_counter_ns()
        elif self.timer_type == "process":
            counter = time.process_time_ns()
        elif self.timer_type == "long_running":
                counter = time.monotonic_ns()
        return counter

    def _valid_start_stop(self) -> Optional[NoReturn]:
        if self._counter_start is None:
            raise ValueError("Timer has not been started.")
        if self._counter_stop is None:
            raise ValueError("Timer has not been stopped.")
        return None

# from https://towardsdatascience.com/execution-times-in-python-ed45ecc1bb4d

In [52]:
timer = Timer(timer_type="long_running")

In [54]:
for name, data in datasets.items():
    print(f'{name}:')
    for clf_name in clfs_names:
        print(clf_name, end=', ')
        clf = new_clf(clf_name)
        timer.start()
        clf.fit(data['X_train'])
        timer.stop()
        
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(data['X_test'])  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(data['X_test'])  # outlier scores

        
        if np.isnan(y_train_scores).any() or np.isnan(y_train_pred).any():
            nan_datasets.append(name)
            results[clf_name][name] = (np.nan, np.nan, np.nan, np.nan, np.nan) # AUC/ROC, Rank@N for train,test ; fit Time
            continue
        
        roc_train=np.round(roc_auc_score(data['y_train'], y_train_scores), decimals=4)
        prn_train=np.round(precision_n_scores(data['y_train'], y_train_scores), decimals=4)
        roc_test=np.round(roc_auc_score(data['y_test'], y_test_scores), decimals=4)
        prn_test=np.round(precision_n_scores(data['y_test'], y_test_scores), decimals=4)

        results[clf_name][name] = (roc_train, prn_train, roc_test, prn_test, timer.time_sec)
        
    print('\n')

01_ALOI.npz:
ECOD, LOF, IForest, HBOS, 

02_annthyroid.npz:
ECOD, LOF, IForest, HBOS, 

03_backdoor.npz:
ECOD, LOF, IForest, HBOS, 

04_breastw.npz:
ECOD, LOF, IForest, HBOS, 

05_campaign.npz:
ECOD, LOF, IForest, HBOS, 

06_cardio.npz:
ECOD, LOF, IForest, HBOS, 

07_Cardiotocography.npz:
ECOD, LOF, IForest, HBOS, 

08_celeba.npz:
ECOD, LOF, IForest, HBOS, 

09_census.npz:
ECOD, LOF, IForest, HBOS, 

10_cover.npz:
ECOD, LOF, IForest, HBOS, 

11_donors.npz:
ECOD, LOF, IForest, HBOS, 

12_fault.npz:
ECOD, LOF, IForest, HBOS, 

13_fraud.npz:
ECOD, LOF, IForest, HBOS, 

14_glass.npz:
ECOD, LOF, IForest, HBOS, 

15_Hepatitis.npz:
ECOD, LOF, IForest, HBOS, 

16_http.npz:
ECOD, LOF, IForest, HBOS, 

17_InternetAds.npz:
ECOD, LOF, IForest, HBOS, 

18_Ionosphere.npz:
ECOD, LOF, IForest, HBOS, 

19_landsat.npz:
ECOD, LOF, IForest, HBOS, 

20_letter.npz:
ECOD, LOF, IForest, HBOS, 

21_Lymphography.npz:
ECOD, LOF, IForest, HBOS, 

22_magic.gamma.npz:
ECOD, LOF, IForest, HBOS, 

23_mammography.npz:

  skewness = np.sign(skew(X, axis=0))


LOF, IForest, HBOS, 

25_musk.npz:
ECOD, LOF, IForest, HBOS, 

26_optdigits.npz:
ECOD, LOF, IForest, HBOS, 

27_PageBlocks.npz:
ECOD, LOF, IForest, HBOS, 

28_pendigits.npz:
ECOD, LOF, IForest, HBOS, 

29_Pima.npz:
ECOD, LOF, IForest, HBOS, 

30_satellite.npz:
ECOD, LOF, IForest, HBOS, 

31_satimage-2.npz:
ECOD, LOF, IForest, HBOS, 

32_shuttle.npz:
ECOD, LOF, IForest, HBOS, 

33_skin.npz:
ECOD, LOF, IForest, HBOS, 

34_smtp.npz:
ECOD, LOF, IForest, HBOS, 

35_SpamBase.npz:
ECOD, 

  _warn_prf(average, modifier, msg_start, len(result))


LOF, IForest, HBOS, 

36_speech.npz:
ECOD, LOF, IForest, HBOS, 

37_Stamps.npz:
ECOD, LOF, IForest, HBOS, 

38_thyroid.npz:
ECOD, LOF, IForest, HBOS, 

39_vertebral.npz:
ECOD, LOF, IForest, HBOS, 

40_vowels.npz:
ECOD, LOF, IForest, HBOS, 

41_Waveform.npz:
ECOD, LOF, IForest, HBOS, 

42_WBC.npz:
ECOD, LOF, IForest, HBOS, 

43_WDBC.npz:
ECOD, LOF, IForest, HBOS, 

44_Wilt.npz:
ECOD, LOF, IForest, HBOS, 

45_wine.npz:
ECOD, LOF, IForest, HBOS, 

46_WPBC.npz:
ECOD, LOF, IForest, HBOS, 

47_yeast.npz:
ECOD, LOF, IForest, HBOS, 



In [97]:
df = pd.DataFrame(results)
df.to_csv('../results/numerical.csv')

In [90]:
times = df.apply(lambda x: x.str[4])
times = times.style.highlight_min(color = 'green', axis = 1)
times

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.359,1.625,3.031,0.047
02_annthyroid.npz,0.015,0.078,0.359,0.0
03_backdoor.npz,,15.547,32.797,1.031
04_breastw.npz,0.016,0.015,0.203,0.0
05_campaign.npz,0.328,1.516,4.953,0.094
06_cardio.npz,0.0,0.047,0.297,0.016
07_Cardiotocography.npz,0.016,0.032,0.297,0.0
08_celeba.npz,1.204,344.797,18.64,2.547
09_census.npz,27.328,377.219,298.672,9.062
10_cover.npz,1.094,10.375,12.391,6.266


In [91]:
train_roc = df.apply(lambda x: x.str[0])
train_roc = train_roc.style.highlight_max(color = 'green', axis = 1)
train_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.5259,0.7401,0.5356,0.5338
02_annthyroid.npz,0.791,0.7198,0.8106,0.6286
03_backdoor.npz,,0.7212,0.7584,0.759
04_breastw.npz,0.9923,0.4467,0.9869,0.9847
05_campaign.npz,0.7696,0.5704,0.7065,0.7896
06_cardio.npz,0.9302,0.548,0.9215,0.8348
07_Cardiotocography.npz,0.7917,0.5982,0.6971,0.4912
08_celeba.npz,0.7558,0.4242,0.6769,0.7542
09_census.npz,0.6604,0.5441,0.6161,0.6339
10_cover.npz,0.9184,0.5609,0.8515,0.6448


In [92]:
train_prn = df.apply(lambda x: x.str[1])
train_prn = train_prn.style.highlight_max(color = 'green', axis = 1)
train_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.0208,0.1439,0.0331,0.0483
02_annthyroid.npz,0.3289,0.2781,0.3235,0.2842
03_backdoor.npz,,0.3751,0.0315,0.0227
04_breastw.npz,0.9401,0.2216,0.9102,0.9461
05_campaign.npz,0.3935,0.0868,0.3147,0.4046
06_cardio.npz,0.5041,0.2276,0.5122,0.4472
07_Cardiotocography.npz,0.5123,0.3252,0.408,0.2699
08_celeba.npz,0.1558,0.0082,0.1034,0.153
09_census.npz,0.0702,0.0152,0.0487,0.0605
10_cover.npz,0.1602,0.0426,0.0515,0.0455


In [93]:
test_roc = df.apply(lambda x: x.str[2])
test_roc = test_roc.style.highlight_max(color = 'green', axis = 1)
test_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.5407,0.7416,0.5523,0.5363
02_annthyroid.npz,0.7781,0.7335,0.7958,0.613
03_backdoor.npz,,0.6965,0.7523,0.7451
04_breastw.npz,0.9898,0.4722,0.9866,0.982
05_campaign.npz,0.7701,0.5752,0.7033,0.7924
06_cardio.npz,0.9496,0.6026,0.9495,0.8395
07_Cardiotocography.npz,0.7785,0.6035,0.6813,0.5
08_celeba.npz,0.7605,0.4265,0.6803,0.7589
09_census.npz,0.6578,0.5421,0.6116,0.6322
10_cover.npz,0.9257,0.5678,0.8593,0.6616


In [94]:
test_prn = df.apply(lambda x: x.str[3])
test_prn = test_prn.style.highlight_max(color = 'green', axis = 1)
test_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.0398,0.1084,0.031,0.042
02_annthyroid.npz,0.2438,0.2875,0.2438,0.2075
03_backdoor.npz,,0.372,0.0286,0.03
04_breastw.npz,0.9167,0.2083,0.9028,0.9167
05_campaign.npz,0.3915,0.0747,0.301,0.3922
06_cardio.npz,0.5849,0.2264,0.6038,0.5472
07_Cardiotocography.npz,0.4714,0.3286,0.3857,0.2429
08_celeba.npz,0.1437,0.0066,0.1012,0.143
09_census.npz,0.0734,0.0147,0.0492,0.0607
10_cover.npz,0.1711,0.0485,0.0546,0.0437


In [87]:
import dataframe_image as dfi

In [95]:
dfi.export(train_roc, '../results/figures/train_roc.png')
dfi.export(train_prn, '../results/figures/train_prn.png')
dfi.export(test_roc, '../results/figures/test_roc.png')
dfi.export(test_prn, '../results/figures/test_prn.png')
dfi.export(times, '../results/figures/times.png')