# Basic experiments

Experiments comparing Random Isolation Similarity Forest to other outlier (anomaly) detection algorithms

In [1]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')
from data.data_getter import get_numerical_datasets

from sklearn.metrics import roc_auc_score

from tqdm import tqdm

In [2]:
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from notebooks.utils import *

We will use different outlier detection algorithms to compare to RISF:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [8]:
SEED = 23

We will measure AUC (as a binary classification task of being an outlier) and processing time. We can show plots for every algorithm and the top-N feature importance

In [9]:
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS', 'RISF']
results = {x: {} for x in clfs_names}

In [10]:
timer = Timer(timer_type="long_running")

In [11]:
datasets_loop = tqdm(get_numerical_datasets(), desc="Datasets (outer loop)", position=0)
algorithms_loop = tqdm(clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)

Datasets (outer loop): 0it [00:00, ?it/s]

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
cnt = 0
for data in datasets_loop:
    if cnt > 4:
        break
    cnt +=1
    datasets_loop.set_description(data['name'])
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)
        timer.start()
        clf.fit(data['X_train'])
        timer.stop()
        
        # get the prediction labels and outlier scores of the training and tests  data
        if clf_name == 'RISF': # other libs return sklearn UndefinedMetricWarning from predicting th train data
            y_train_pred = clf.predict(data['X_train']) # binary labels (0: inliers, 1: outliers)
        else:
            y_train_pred = clf.labels_
        y_test_pred = clf.predict(data['X_test'])

        if np.isnan(y_train_pred).any():
            results[clf_name][data['name']] = (np.nan, np.nan, np.nan, np.nan, np.nan) # AUC/ROC, Rank@N for train,test ; fit Time
            continue
        roc_train=np.round(roc_auc_score(data['y_train'], y_train_pred), decimals=4)
        prn_train=np.round(precision_n_scores(data['y_train'], y_train_pred), decimals=4)
        roc_test=np.round(roc_auc_score(data['y_test'], y_test_pred), decimals=4)
        prn_test=np.round(precision_n_scores(data['y_test'], y_test_pred), decimals=4)

        results[clf_name][data['name']] = (roc_train, prn_train, roc_test, prn_test, timer.time_sec)


05_campaign.npz: : 5it [26:01, 312.32s/it]  


In [14]:
df = pd.DataFrame(results)
df.to_pickle('../results/numericalRISF.pkl')
df

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,"(0.4992, 0.0, 0.5021, 0.0, 0.297)","(0.6169, 0.0, 0.6005, 0.0, 1.719)","(0.5119, 0.0, 0.5092, 0.0, 3.594)","(0.5119, 0.0, 0.5153, 0.0, 1.031)","(0.5, 0.0, 0.5, 0.0, 226.266)"
02_annthyroid.npz,"(0.6467, 0.0, 0.6106, 0.0, 0.015)","(0.6279, 0.0, 0.6441, 0.0, 0.109)","(0.6482, 0.0, 0.6131, 0.0, 0.343)","(0.6323, 0.0, 0.5929, 0.0, 0.0)","(0.4998, 0.0, 0.5, 0.0, 16.703)"
03_backdoor.npz,"(0.5, 0.0, 0.5, 0.0, 2.469)","(0.6915, 0.0, 0.6812, 0.0, 15.0)","(0.5638, 0.0, 0.5627, 0.0, 28.297)","(0.5824, 0.0, 0.5759, 0.0, 0.907)","(0.5, 0.0, 0.5, 0.0, 828.844)"
04_breastw.npz,"(0.6407, 1.0, 0.6458, 1.0, 0.016)","(0.4504, 0.125, 0.4616, 0.2424, 0.094)","(0.6391, 0.9792, 0.6111, 1.0, 0.218)","(0.6299, 0.9375, 0.6944, 1.0, 5.469)","(0.518, 1.0, 0.5208, 1.0, 2.172)"
05_campaign.npz,"(0.6529, 0.4183, 0.6506, 0.4256, 0.328)","(0.4849, 0.0826, 0.4767, 0.0678, 1.578)","(0.6107, 0.334, 0.604, 0.3267, 4.609)","(0.6577, 0.428, 0.6451, 0.4223, 0.109)","(0.5, 0.0, 0.5, 0.0, 122.516)"


In [15]:
multiTable = df.copy()

for name in clfs_names:
    multiTable[[(name, 'TrainROC'), 
        (name, 'TrainR@n'), 
        (name, 'TestROC'), 
        (name, 'TestR@n'), 
        (name, 'Time')]
        ] = pd.DataFrame(multiTable[name].to_list(), index=multiTable.index)

multiTable= multiTable.drop(clfs_names, axis=1)
multiTable.columns = pd.MultiIndex.from_tuples(multiTable.columns)
multiTable = multiTable.style.set_caption('All performed experiments').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
multiTable

Unnamed: 0_level_0,ECOD,ECOD,ECOD,ECOD,ECOD,LOF,LOF,LOF,LOF,LOF,IForest,IForest,IForest,IForest,IForest,HBOS,HBOS,HBOS,HBOS,HBOS,RISF,RISF,RISF,RISF,RISF
Unnamed: 0_level_1,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time,TrainROC,TrainR@n,TestROC,TestR@n,Time
01_ALOI.npz,0.4992,0.0,0.5021,0.0,0.297,0.6169,0.0,0.6005,0.0,1.719,0.5119,0.0,0.5092,0.0,3.594,0.5119,0.0,0.5153,0.0,1.031,0.5,0.0,0.5,0.0,226.266
02_annthyroid.npz,0.6467,0.0,0.6106,0.0,0.015,0.6279,0.0,0.6441,0.0,0.109,0.6482,0.0,0.6131,0.0,0.343,0.6323,0.0,0.5929,0.0,0.0,0.4998,0.0,0.5,0.0,16.703
03_backdoor.npz,0.5,0.0,0.5,0.0,2.469,0.6915,0.0,0.6812,0.0,15.0,0.5638,0.0,0.5627,0.0,28.297,0.5824,0.0,0.5759,0.0,0.907,0.5,0.0,0.5,0.0,828.844
04_breastw.npz,0.6407,1.0,0.6458,1.0,0.016,0.4504,0.125,0.4616,0.2424,0.094,0.6391,0.9792,0.6111,1.0,0.218,0.6299,0.9375,0.6944,1.0,5.469,0.518,1.0,0.5208,1.0,2.172
05_campaign.npz,0.6529,0.4183,0.6506,0.4256,0.328,0.4849,0.0826,0.4767,0.0678,1.578,0.6107,0.334,0.604,0.3267,4.609,0.6577,0.428,0.6451,0.4223,0.109,0.5,0.0,0.5,0.0,122.516


In [16]:
times = df.apply(lambda x: x.str[4])
times = times.style.highlight_min(color = 'green', axis = 1).set_caption('Time [s]').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
times   

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,0.297,1.719,3.594,1.031,226.266
02_annthyroid.npz,0.015,0.109,0.343,0.0,16.703
03_backdoor.npz,2.469,15.0,28.297,0.907,828.844
04_breastw.npz,0.016,0.094,0.218,5.469,2.172
05_campaign.npz,0.328,1.578,4.609,0.109,122.516


In [17]:
train_roc = df.apply(lambda x: x.str[0])
train_roc = train_roc.style.highlight_max(color = 'green', axis = 1).set_caption('AUC/ROC training set').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
train_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,0.4992,0.6169,0.5119,0.5119,0.5
02_annthyroid.npz,0.6467,0.6279,0.6482,0.6323,0.4998
03_backdoor.npz,0.5,0.6915,0.5638,0.5824,0.5
04_breastw.npz,0.6407,0.4504,0.6391,0.6299,0.518
05_campaign.npz,0.6529,0.4849,0.6107,0.6577,0.5


In [18]:
train_prn = df.apply(lambda x: x.str[1])
train_prn = train_prn.style.highlight_max(color = 'green', axis = 1).set_caption('Rank @ n on training set').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
train_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,0.0,0.0,0.0,0.0,0.0
02_annthyroid.npz,0.0,0.0,0.0,0.0,0.0
03_backdoor.npz,0.0,0.0,0.0,0.0,0.0
04_breastw.npz,1.0,0.125,0.9792,0.9375,1.0
05_campaign.npz,0.4183,0.0826,0.334,0.428,0.0


In [19]:
test_roc = df.apply(lambda x: x.str[2])
test_roc = test_roc.style.highlight_max(color = 'green', axis = 1).set_caption('AUC/ROC on test set').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
test_roc

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,0.5021,0.6005,0.5092,0.5153,0.5
02_annthyroid.npz,0.6106,0.6441,0.6131,0.5929,0.5
03_backdoor.npz,0.5,0.6812,0.5627,0.5759,0.5
04_breastw.npz,0.6458,0.4616,0.6111,0.6944,0.5208
05_campaign.npz,0.6506,0.4767,0.604,0.6451,0.5


In [20]:
test_prn = df.apply(lambda x: x.str[3])
test_prn = test_prn.style.highlight_max(color = 'green', axis = 1).set_caption('Rank @ n on test set').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px'),
        ('font-weight', 'bold')
    ]
}])
test_prn

Unnamed: 0,ECOD,LOF,IForest,HBOS,RISF
01_ALOI.npz,0.0,0.0,0.0,0.0,0.0
02_annthyroid.npz,0.0,0.0,0.0,0.0,0.0
03_backdoor.npz,0.0,0.0,0.0,0.0,0.0
04_breastw.npz,1.0,0.2424,1.0,1.0,1.0
05_campaign.npz,0.4256,0.0678,0.3267,0.4223,0.0


In [46]:
import dataframe_image as dfi

In [95]:
dfi.export(train_roc, '../results/figures/train_roc.png')
dfi.export(train_prn, '../results/figures/train_prn.png')
dfi.export(test_roc, '../results/figures/test_roc.png')
dfi.export(test_prn, '../results/figures/test_prn.png')
dfi.export(times, '../results/figures/times.png')
dfi.export(multiTable, '../results/figures/multiTable.png')