# Basic experiments

Experiments comparing Random Isolation Similarity Forest to other outlier (anomaly) detection algorithms

In [1]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')
from data.data_getter import get_numerical_datasets

from sklearn.metrics import roc_auc_score

from tqdm import tqdm

In [2]:
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from notebooks.utils import *

We will compare RISF against the following outlier detection algorithms:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [3]:
# Seed passed to new_clf(...) for every detector in the experiment loop.
# NOTE(review): presumably used as each detector's random state — confirm in notebooks.utils.
SEED = 23

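For every algorithm and dataset we measure ROC AUC (treating outlier detection as a binary classification task) and precision @ rank n on both the training and test splits, together with the fitting time. We can also show plots for every algorithm and the top-N feature importance.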

In [4]:
# Detectors to benchmark; each name is passed to new_clf(...) in the experiment
# loop and becomes one column of the results table.
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS']

# Names of datasets for which a detector produced NaN output.
nan_datasets = list()

# results[detector][dataset] -> metrics tuple. Every detector gets its own,
# independent inner dict.
results = dict((clf_name, dict()) for clf_name in clfs_names)

In [5]:
# Timer wrapped around each detector's fit() call in the experiment loop; its
# time_sec attribute is what gets stored as the last element of every result tuple.
# NOTE(review): Timer is not defined in this notebook — presumably it comes from the
# `notebooks.utils` star import; confirm what timer_type="long_running" selects.
timer = Timer(timer_type="long_running")

In [6]:
from tqdm.contrib.concurrent import process_map

# Progress bars for the benchmark: the bar at position 0 tracks datasets, the bar
# at position 1 tracks the detectors and is not left on screen once it finishes.
datasets_loop = tqdm(
    get_numerical_datasets(),
    desc="Datasets (outer loop)",
    position=0,
)
algorithms_loop = tqdm(
    clfs_names,
    desc=" Algorithms (inner loop)",
    position=1,
    leave=False,
)

Datasets (outer loop): 0it [00:00, ?it/s]

In [7]:
# Benchmark loop: fit every detector on every dataset and store, per
# (detector, dataset) pair, the tuple
#   (train ROC AUC, train precision@n, test ROC AUC, test precision@n, fit time).
for data in datasets_loop:
    datasets_loop.set_description(data['name'])

    # Build a fresh inner bar for each dataset. A tqdm instance closes (and
    # disables) itself once it has been iterated to the end, so a single bar
    # reused across datasets would only display progress for the first dataset.
    algorithms_loop = tqdm(clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        # Only fit() sits between start() and stop(); scoring is not timed.
        timer.start()
        clf.fit(data['X_train'])
        timer.stop()

        # Labels and raw outlier scores the fitted detector assigned to the training data.
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # Raw outlier scores on the test data. The binary test labels are not
        # needed by any metric below, so predict() is not called.
        y_test_scores = clf.decision_function(data['X_test'])

        # Check train AND test outputs so NaN scores never reach the metric
        # calls; such runs are recorded as all-NaN and the dataset is noted.
        has_nan = (
            np.isnan(y_train_scores).any()
            or np.isnan(y_train_pred).any()
            or np.isnan(y_test_scores).any()
        )
        if has_nan:
            nan_datasets.append(data['name'])
            results[clf_name][data['name']] = (np.nan, np.nan, np.nan, np.nan, np.nan)  # AUC/ROC, Rank@N for train,test ; fit Time
            continue

        roc_train = np.round(roc_auc_score(data['y_train'], y_train_scores), decimals=4)
        prn_train = np.round(precision_n_scores(data['y_train'], y_train_scores), decimals=4)
        roc_test = np.round(roc_auc_score(data['y_test'], y_test_scores), decimals=4)
        prn_test = np.round(precision_n_scores(data['y_test'], y_test_scores), decimals=4)

        results[clf_name][data['name']] = (roc_train, prn_train, roc_test, prn_test, timer.time_sec)


  skewness = np.sign(skew(X, axis=0))
  _warn_prf(average, modifier, msg_start, len(result))
47_yeast.npz: : 47it [1:00:11, 76.83s/it]    


In [18]:
# Turn the nested results dict into a table (outer keys -> columns, inner keys ->
# index), persist it as a pickle, and display it.
df = pd.DataFrame.from_dict(results, orient='columns')
df.to_pickle('../results/numerical.pkl')
df

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,"(0.5259, 0.0208, 0.5407, 0.0398, 0.469)","(0.7401, 0.1439, 0.7416, 0.1084, 2.141)","(0.5356, 0.0331, 0.5523, 0.031, 3.171)","(0.5338, 0.0483, 0.5363, 0.042, 1.985)"
02_annthyroid.npz,"(0.791, 0.3289, 0.7781, 0.2438, 0.016)","(0.7198, 0.2781, 0.7335, 0.2875, 0.078)","(0.8106, 0.3235, 0.7958, 0.2438, 0.36)","(0.6286, 0.2842, 0.613, 0.2075, 0.0)"
03_backdoor.npz,"(nan, nan, nan, nan, nan)","(0.7212, 0.3751, 0.6965, 0.372, 15.234)","(0.7584, 0.0315, 0.7523, 0.0286, 32.765)","(0.759, 0.0227, 0.7451, 0.03, 1.391)"
04_breastw.npz,"(0.9923, 0.9401, 0.9898, 0.9167, 0.016)","(0.4467, 0.2216, 0.4722, 0.2083, 0.016)","(0.9869, 0.9102, 0.9866, 0.9028, 0.406)","(0.9847, 0.9461, 0.982, 0.9167, 1.766)"
05_campaign.npz,"(0.7696, 0.3935, 0.7701, 0.3915, 0.407)","(0.5704, 0.0868, 0.5752, 0.0747, 1.719)","(0.7065, 0.3147, 0.7033, 0.301, 5.516)","(0.7896, 0.4046, 0.7924, 0.3922, 0.125)"
06_cardio.npz,"(0.9302, 0.5041, 0.9496, 0.5849, 0.015)","(0.548, 0.2276, 0.6026, 0.2264, 0.031)","(0.9215, 0.5122, 0.9495, 0.6038, 0.359)","(0.8348, 0.4472, 0.8395, 0.5472, 0.0)"
07_Cardiotocography.npz,"(0.7917, 0.5123, 0.7785, 0.4714, 0.015)","(0.5982, 0.3252, 0.6035, 0.3286, 0.046)","(0.6971, 0.408, 0.6813, 0.3857, 0.36)","(0.4912, 0.2699, 0.5, 0.2429, 0.0)"
08_celeba.npz,"(0.7558, 0.1558, 0.7605, 0.1437, 1.25)","(0.4242, 0.0082, 0.4265, 0.0066, 379.094)","(0.6769, 0.1034, 0.6803, 0.1012, 20.031)","(0.7542, 0.153, 0.7589, 0.143, 1.468)"
09_census.npz,"(0.6604, 0.0702, 0.6578, 0.0734, 28.329)","(0.5441, 0.0152, 0.5421, 0.0147, 399.297)","(0.6161, 0.0487, 0.6116, 0.0492, 296.672)","(0.6339, 0.0605, 0.6322, 0.0607, 10.297)"
10_cover.npz,"(0.9184, 0.1602, 0.9257, 0.1711, 1.062)","(0.5609, 0.0426, 0.5678, 0.0485, 10.609)","(0.8515, 0.0515, 0.8593, 0.0546, 12.375)","(0.6448, 0.0455, 0.6616, 0.0437, 16.891)"


In [40]:
# Expand every detector's metric tuple into five named sub-columns, giving a
# table with two-level columns: (detector, metric).
metric_names = ['TrainROC', 'TrainR@n', 'TestROC', 'TestR@n', 'Time']

# Everything is driven by clfs_names rather than a second hard-coded detector
# list, so the table stays correct when detectors are added or removed.
per_clf_frames = []
for name in clfs_names:
    per_clf_frames.append(
        pd.DataFrame(df[name].to_list(), index=df.index, columns=metric_names)
    )

# keys= supplies the outer column level, in the same order as clfs_names.
multiTable = pd.concat(per_clf_frames, axis=1, keys=clfs_names)

# From here on multiTable is a Styler (captioned view), which is what gets
# displayed and later exported as an image.
multiTable = multiTable.style.set_caption('All performed experiments').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
multiTable

Unnamed: 0_level_0,ECOD,ECOD,ECOD,ECOD,ECOD,LOF,LOF,LOF,LOF,LOF,IForest,IForest,IForest,IForest,IForest,HBOS,HBOS,HBOS,HBOS,HBOS
Unnamed: 0_level_1,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time
01_ALOI.npz,0.5259,0.0208,0.5407,0.0398,0.469,0.7401,0.1439,0.7416,0.1084,2.141,0.5356,0.0331,0.5523,0.031,3.171,0.5338,0.0483,0.5363,0.042,1.985
02_annthyroid.npz,0.791,0.3289,0.7781,0.2438,0.016,0.7198,0.2781,0.7335,0.2875,0.078,0.8106,0.3235,0.7958,0.2438,0.36,0.6286,0.2842,0.613,0.2075,0.0
03_backdoor.npz,,,,,,0.7212,0.3751,0.6965,0.372,15.234,0.7584,0.0315,0.7523,0.0286,32.765,0.759,0.0227,0.7451,0.03,1.391
04_breastw.npz,0.9923,0.9401,0.9898,0.9167,0.016,0.4467,0.2216,0.4722,0.2083,0.016,0.9869,0.9102,0.9866,0.9028,0.406,0.9847,0.9461,0.982,0.9167,1.766
05_campaign.npz,0.7696,0.3935,0.7701,0.3915,0.407,0.5704,0.0868,0.5752,0.0747,1.719,0.7065,0.3147,0.7033,0.301,5.516,0.7896,0.4046,0.7924,0.3922,0.125
06_cardio.npz,0.9302,0.5041,0.9496,0.5849,0.015,0.548,0.2276,0.6026,0.2264,0.031,0.9215,0.5122,0.9495,0.6038,0.359,0.8348,0.4472,0.8395,0.5472,0.0
07_Cardiotocography.npz,0.7917,0.5123,0.7785,0.4714,0.015,0.5982,0.3252,0.6035,0.3286,0.046,0.6971,0.408,0.6813,0.3857,0.36,0.4912,0.2699,0.5,0.2429,0.0
08_celeba.npz,0.7558,0.1558,0.7605,0.1437,1.25,0.4242,0.0082,0.4265,0.0066,379.094,0.6769,0.1034,0.6803,0.1012,20.031,0.7542,0.153,0.7589,0.143,1.468
09_census.npz,0.6604,0.0702,0.6578,0.0734,28.329,0.5441,0.0152,0.5421,0.0147,399.297,0.6161,0.0487,0.6116,0.0492,296.672,0.6339,0.0605,0.6322,0.0607,10.297
10_cover.npz,0.9184,0.1602,0.9257,0.1711,1.062,0.5609,0.0426,0.5678,0.0485,10.609,0.8515,0.0515,0.8593,0.0546,12.375,0.6448,0.0455,0.6616,0.0437,16.891


In [41]:
# Element 4 of every result tuple is the recorded fit time; the smallest value in
# each row (dataset) is highlighted.
times = (
    df.apply(lambda column: column.str.get(4))
    .style
    .highlight_min(color='green', axis=1)
    .set_caption('Time [s]')
    .set_table_styles([
        dict(selector='caption', props=[('font-size', '16px'), ('font-weight', 'bold')]),
    ])
)
times

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.469,2.141,3.171,1.985
02_annthyroid.npz,0.016,0.078,0.36,0.0
03_backdoor.npz,,15.234,32.765,1.391
04_breastw.npz,0.016,0.016,0.406,1.766
05_campaign.npz,0.407,1.719,5.516,0.125
06_cardio.npz,0.015,0.031,0.359,0.0
07_Cardiotocography.npz,0.015,0.046,0.36,0.0
08_celeba.npz,1.25,379.094,20.031,1.468
09_census.npz,28.329,399.297,296.672,10.297
10_cover.npz,1.062,10.609,12.375,16.891


In [42]:
# Element 0 of every result tuple is the ROC AUC on the training split; the
# largest value in each row (dataset) is highlighted.
train_roc = (
    df.apply(lambda column: column.str.get(0))
    .style
    .highlight_max(color='green', axis=1)
    .set_caption('AUC/ROC training set')
    .set_table_styles([
        dict(selector='caption', props=[('font-size', '16px'), ('font-weight', 'bold')]),
    ])
)
train_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.5259,0.7401,0.5356,0.5338
02_annthyroid.npz,0.791,0.7198,0.8106,0.6286
03_backdoor.npz,,0.7212,0.7584,0.759
04_breastw.npz,0.9923,0.4467,0.9869,0.9847
05_campaign.npz,0.7696,0.5704,0.7065,0.7896
06_cardio.npz,0.9302,0.548,0.9215,0.8348
07_Cardiotocography.npz,0.7917,0.5982,0.6971,0.4912
08_celeba.npz,0.7558,0.4242,0.6769,0.7542
09_census.npz,0.6604,0.5441,0.6161,0.6339
10_cover.npz,0.9184,0.5609,0.8515,0.6448


In [43]:
# Element 1 of every result tuple is the precision_n_scores value on the training
# split; the largest value in each row (dataset) is highlighted.
train_prn = (
    df.apply(lambda column: column.str.get(1))
    .style
    .highlight_max(color='green', axis=1)
    .set_caption('Rank @ n on training set')
    .set_table_styles([
        dict(selector='caption', props=[('font-size', '16px'), ('font-weight', 'bold')]),
    ])
)
train_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.0208,0.1439,0.0331,0.0483
02_annthyroid.npz,0.3289,0.2781,0.3235,0.2842
03_backdoor.npz,,0.3751,0.0315,0.0227
04_breastw.npz,0.9401,0.2216,0.9102,0.9461
05_campaign.npz,0.3935,0.0868,0.3147,0.4046
06_cardio.npz,0.5041,0.2276,0.5122,0.4472
07_Cardiotocography.npz,0.5123,0.3252,0.408,0.2699
08_celeba.npz,0.1558,0.0082,0.1034,0.153
09_census.npz,0.0702,0.0152,0.0487,0.0605
10_cover.npz,0.1602,0.0426,0.0515,0.0455


In [44]:
# Element 2 of every result tuple is the ROC AUC on the test split; the largest
# value in each row (dataset) is highlighted.
test_roc = (
    df.apply(lambda column: column.str.get(2))
    .style
    .highlight_max(color='green', axis=1)
    .set_caption('AUC/ROC on test set')
    .set_table_styles([
        dict(selector='caption', props=[('font-size', '16px'), ('font-weight', 'bold')]),
    ])
)
test_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.5407,0.7416,0.5523,0.5363
02_annthyroid.npz,0.7781,0.7335,0.7958,0.613
03_backdoor.npz,,0.6965,0.7523,0.7451
04_breastw.npz,0.9898,0.4722,0.9866,0.982
05_campaign.npz,0.7701,0.5752,0.7033,0.7924
06_cardio.npz,0.9496,0.6026,0.9495,0.8395
07_Cardiotocography.npz,0.7785,0.6035,0.6813,0.5
08_celeba.npz,0.7605,0.4265,0.6803,0.7589
09_census.npz,0.6578,0.5421,0.6116,0.6322
10_cover.npz,0.9257,0.5678,0.8593,0.6616


In [45]:
# Element 3 of every result tuple is the precision_n_scores value on the test
# split; the largest value in each row (dataset) is highlighted.
test_prn = (
    df.apply(lambda column: column.str.get(3))
    .style
    .highlight_max(color='green', axis=1)
    .set_caption('Rank @ n on test set')
    .set_table_styles([
        dict(selector='caption', props=[('font-size', '16px'), ('font-weight', 'bold')]),
    ])
)
test_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS
01_ALOI.npz,0.0398,0.1084,0.031,0.042
02_annthyroid.npz,0.2438,0.2875,0.2438,0.2075
03_backdoor.npz,,0.372,0.0286,0.03
04_breastw.npz,0.9167,0.2083,0.9028,0.9167
05_campaign.npz,0.3915,0.0747,0.301,0.3922
06_cardio.npz,0.5849,0.2264,0.6038,0.5472
07_Cardiotocography.npz,0.4714,0.3286,0.3857,0.2429
08_celeba.npz,0.1437,0.0066,0.1012,0.143
09_census.npz,0.0734,0.0147,0.0492,0.0607
10_cover.npz,0.1711,0.0485,0.0546,0.0437


In [46]:
import dataframe_image as dfi

In [95]:
# Render each styled table to a PNG file, in the same order as before.
figure_exports = [
    (train_roc, '../results/figures/train_roc.png'),
    (train_prn, '../results/figures/train_prn.png'),
    (test_roc, '../results/figures/test_roc.png'),
    (test_prn, '../results/figures/test_prn.png'),
    (times, '../results/figures/times.png'),
    (multiTable, '../results/figures/multiTable.png'),
]
for styled_table, png_path in figure_exports:
    dfi.export(styled_table, png_path)