# Experiments

Experiments comparing Random Isolation Similarity Forest (RISF) to other outlier (anomaly) detection algorithms.

In [17]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')
from data.data_getter import get_numerical_datasets

from sklearn.metrics import roc_auc_score

from tqdm import tqdm

In [18]:
# from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from notebooks.utils import *

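We will compare RISF against the following outlier detection algorithms:
* LOF (Local Outlier Factor)
* ECOD (Empirical-Cumulative-distribution-based Outlier Detection)
* Isolation Forest
* HBOS (Histogram-Based Outlier Score)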


In [19]:
# Seed passed to new_clf() for every classifier in the benchmark loops
# (presumably so that repeated runs are reproducible — confirm in new_clf).
SEED = 23

We will measure ROC AUC, precision and recall (treating outlier detection as a binary classification task), as well as fit and prediction time, on both the training and the test split. We can show plots for every algorithm and the top-N feature importance.

In [20]:
# Detectors under comparison, identified by the names handed to new_clf().
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS', 'RISF']
# results[<classifier>][<dataset>] -> tuple of metrics; every classifier gets
# its own, initially empty, dict that the benchmark loop fills in.
results = {detector_name: dict() for detector_name in clfs_names}

In [21]:
# Single timer instance reused by the benchmark loops: start()/stop() bracket
# each fit and predict call and the duration is read back from `timer.time_sec`.
# Timer is not defined in this notebook — presumably it is provided by the
# star import from notebooks.utils (confirm).
timer = Timer(timer_type="long_running")

In [22]:
# Outer progress bar: one tick per dataset yielded by get_numerical_datasets().
datasets_loop = tqdm(
    get_numerical_datasets(),
    desc="Datasets (outer loop)",
    position=0,
)
# Inner progress bar: one tick per classifier name. It is drawn on the row
# below the outer bar (position=1) and is cleared once finished (leave=False).
algorithms_loop = tqdm(
    clfs_names,
    desc=" Algorithms (inner loop)",
    position=1,
    leave=False,
)

Datasets (outer loop): 0it [01:09, ?it/s]


In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
# Benchmark loop: fit every detector on every dataset (unsupervised fit on
# X_train only) and store, per classifier and dataset, the tuple
#   (roc_train, precision_train, recall_train,
#    roc_test,  precision_test,  recall_test,
#    train_time, test_time)
# A checkpoint of all results so far is pickled after every classifier.
#
# NOTE(review): ROC AUC is computed from hard 0/1 predictions, not from
# continuous anomaly scores, so it reduces to balanced accuracy at the
# detector's own threshold. Consider feeding decision scores to roc_auc_score
# if every detector (including RISF) exposes them — confirm.
metric_fns = (roc_auc_score, precision_score, recall_score)

for data in datasets_loop:
    datasets_loop.set_description(data['name'])
    # A tqdm bar closes itself once its iterable is exhausted, so one shared
    # inner bar would only be displayed for the first dataset. Build a fresh
    # inner bar for each dataset instead.
    algorithms_loop = tqdm(clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        timer.start()
        clf.fit(data['X_train'])
        timer.stop()
        train_time = timer.time_sec

        # Binary labels for the training data (0: inliers, 1: outliers).
        # RISF is asked to predict the training data explicitly; for the other
        # libraries the labels stored by fit() in `labels_` are used, because
        # (per the original author's note) re-predicting the train data with
        # them leads to sklearn's UndefinedMetricWarning.
        if clf_name == 'RISF':
            y_train_pred = clf.predict(data['X_train'])
        else:
            y_train_pred = clf.labels_

        timer.start()
        y_test_pred = clf.predict(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        if np.isnan(y_train_pred).any() or np.isnan(y_test_pred).any():
            # Metrics cannot be computed from NaN predictions (sklearn would
            # raise and abort the whole run), so record a NaN placeholder for
            # all eight fields: ROC/precision/recall for train and test, then
            # fit and predict time.
            results[clf_name][data['name']] = (np.nan,) * 8
        else:
            roc_train, precision_train, recall_train = (
                np.round(metric(data['y_train'], y_train_pred), decimals=4)
                for metric in metric_fns
            )
            roc_test, precision_test, recall_test = (
                np.round(metric(data['y_test'], y_test_pred), decimals=4)
                for metric in metric_fns
            )
            results[clf_name][data['name']] = (roc_train, precision_train, recall_train,
                                               roc_test, precision_test, recall_test,
                                               train_time, test_time)

        # Checkpoint after every classifier (including NaN placeholders) so an
        # interrupted run does not lose the results gathered so far.
        df = pd.DataFrame(results)
        df.to_pickle('../results/numerical_temporary.pkl')


23_WPBC.npz: : 23it [55:53, 145.80s/it]          


In [26]:
# Persist the complete results table: one column per classifier, one row per
# dataset, each cell holding that run's metrics tuple.
df = pd.DataFrame(results)
pd.to_pickle(df, '../results/numerical_selected.pkl')

In [8]:
# Variant of the benchmark loop in which the training labels are also passed
# to fit(); it stops after the first dataset whose name contains 'breast'.
# The result tuple per classifier and dataset is
#   (roc_train, precision_train, recall_train,
#    roc_test,  precision_test,  recall_test,
#    train_time, test_time)
# and entries are written into the shared `results` dict, overwriting any
# existing entry for the same classifier/dataset.
#
# NOTE(review): this loop iterates the shared `datasets_loop` bar. If the
# underlying iterable is a one-shot generator that an earlier loop already
# consumed, the body never runs — confirm and rebuild the iterator if needed.
# NOTE(review): ROC AUC is computed from hard 0/1 predictions, not continuous
# anomaly scores, so it reduces to balanced accuracy — confirm this is intended.
metric_fns = (roc_auc_score, precision_score, recall_score)

for data in datasets_loop:
    datasets_loop.set_description(data['name'])
    # A tqdm bar closes itself once its iterable is exhausted, so one shared
    # inner bar would only be displayed for the first dataset. Build a fresh
    # inner bar for each dataset instead.
    algorithms_loop = tqdm(clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        timer.start()
        clf.fit(data['X_train'], data['y_train'])
        timer.stop()
        train_time = timer.time_sec

        # Binary labels for the training data (0: inliers, 1: outliers).
        # RISF is asked to predict the training data explicitly; for the other
        # libraries the labels stored by fit() in `labels_` are used, because
        # (per the original author's note) re-predicting the train data with
        # them leads to sklearn's UndefinedMetricWarning.
        if clf_name == 'RISF':
            y_train_pred = clf.predict(data['X_train'])
        else:
            y_train_pred = clf.labels_

        timer.start()
        y_test_pred = clf.predict(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        if np.isnan(y_train_pred).any() or np.isnan(y_test_pred).any():
            # Metrics cannot be computed from NaN predictions (sklearn would
            # raise and abort the whole run), so record a NaN placeholder for
            # all eight fields: ROC/precision/recall for train and test, then
            # fit and predict time.
            results[clf_name][data['name']] = (np.nan,) * 8
        else:
            roc_train, precision_train, recall_train = (
                np.round(metric(data['y_train'], y_train_pred), decimals=4)
                for metric in metric_fns
            )
            roc_test, precision_test, recall_test = (
                np.round(metric(data['y_test'], y_test_pred), decimals=4)
                for metric in metric_fns
            )
            results[clf_name][data['name']] = (roc_train, precision_train, recall_train,
                                               roc_test, precision_test, recall_test,
                                               train_time, test_time)

        # Checkpoint after every classifier (including NaN placeholders) so an
        # interrupted run does not lose the results gathered so far.
        df = pd.DataFrame(results)
        df.to_pickle('../results/numerical_temporary.pkl')
    # Stop once the 'breast' dataset has been processed.
    if 'breast' in data['name']:
        break

04_breastw.npz: : 3it [10:31, 210.34s/it]   
