# Experiments

Experiments comparing Random Isolation Similarity Forest (RISF) to other outlier (anomaly) detection algorithms.

In [1]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from data.data_getter import get_numerical_datasets
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')


In [2]:
# from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from notebooks.utils import *


We will use different outlier detection algorithms to compare to RISF:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [3]:
# Seed passed to new_clf() every time a detector is instantiated
# (presumably so that stochastic detectors give repeatable results).
SEED = 23


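We will measure ROC AUC and average precision (treating outlier detection as a binary classification task) on both the training and test splits, together with the processing time for fitting and scoring. We can also show plots for every algorithm and the top-N feature importances.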

In [4]:
# Names of the detectors to benchmark; each one is handed to new_clf().
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS', 'RISF']

# Result stores, keyed as store[clf_name][dataset_name] -> tuple of metrics.
# `results` is filled by the run that fits on X only, `resultsY` by the run
# that also passes y_train to fit.
results = {x: {} for x in clfs_names}
# Build resultsY independently instead of using results.copy(): dict.copy()
# is shallow, so both stores would share the same inner per-detector dicts
# and writing to one experiment would silently overwrite the other.
resultsY = {x: {} for x in clfs_names}


In [5]:
# Shared stopwatch reused for every fit / scoring measurement in the loops
# below (start(), stop(), then read time_sec).
# NOTE(review): Timer is presumably provided by the star import from
# notebooks.utils; the meaning of timer_type="long_running" is defined
# there -- confirm.
timer = Timer(timer_type="long_running")


In [6]:
# Outer progress bar: one tick per dataset yielded by get_numerical_datasets().
datasets_loop = tqdm(
    get_numerical_datasets(),
    desc="Datasets (outer loop)",
    position=0,
)

# Inner progress bar: one tick per detector name; drawn on the second line
# (position=1) and removed once finished (leave=False).
algorithms_loop = tqdm(
    clfs_names,
    desc=" Algorithms (inner loop)",
    position=1,
    leave=False,
)


Datasets (outer loop): 0it [00:00, ?it/s]

In [7]:
import warnings
# Suppress every warning for the rest of the kernel session so that library
# warnings do not flood the benchmark output.
# NOTE(review): this also hides warnings that may be relevant to the results
# (e.g. convergence or deprecation warnings) -- consider a narrower filter.
warnings.filterwarnings('ignore')


## Only X known

In [8]:
# Benchmark every detector on every dataset using the features only
# (no labels are passed to fit).
# results[clf_name][dataset_name] is always a 6-tuple:
#   (roc_train, ap_train, roc_test, ap_test, train_time, test_time)
for data in datasets_loop:
    datasets_loop.set_description(data['name'])
    # Build a fresh inner progress bar per dataset: a tqdm bar is closed once
    # it has been iterated to the end, so a single shared bar would only be
    # displayed for the first dataset.
    algorithms_loop = tqdm(
        clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        # Time the fit on the training features.
        timer.start()
        clf.fit(data['X_train'])
        timer.stop()
        train_time = timer.time_sec

        # Outlier scores of the training data, as stored on the fitted detector.
        y_train_pred = clf.decision_scores_

        # Time the scoring of the test data.
        timer.start()
        y_test_pred = clf.decision_function(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        if np.isnan(y_train_pred).any() or np.isnan(y_test_pred).any():
            # The metrics cannot be computed from NaN scores. Store a
            # placeholder with the same six fields as a regular record so
            # every entry of the results table has the same length.
            results[clf_name][data['name']] = (np.nan,) * 6
        else:
            roc_train = np.round(roc_auc_score(
                data['y_train'], y_train_pred), decimals=4)
            ap_train = np.round(average_precision_score(
                data['y_train'], y_train_pred), decimals=4)
            roc_test = np.round(roc_auc_score(
                data['y_test'], y_test_pred), decimals=4)
            ap_test = np.round(average_precision_score(
                data['y_test'], y_test_pred), decimals=4)

            results[clf_name][data['name']] = (roc_train, ap_train,
                                               roc_test, ap_test,
                                               train_time, test_time)

        # Checkpoint after every (dataset, detector) pair, including NaN
        # placeholders, so a crash late in the run does not lose the results
        # gathered so far.
        df = pd.DataFrame(results)
        df.to_pickle('../results/numerical_temporary.pkl')


23_WPBC.npz: : 24it [1:32:34, 231.44s/it]        


In [9]:
# Persist the final table of the run above: one column per detector, one row
# per dataset, each cell holding that pair's tuple of metrics and timings.
df = pd.DataFrame(results)
df.to_pickle('../results/numerical_selected.pkl')


## Training labels (y) provided

In [13]:
# Re-create both progress bars for the second experiment; the outer one wraps
# a new call to get_numerical_datasets().
datasets_loop = tqdm(
    get_numerical_datasets(),
    desc="Datasets (outer loop)",
    position=0,
)

# Inner bar over the detector names, drawn on the second line (position=1)
# and removed once finished (leave=False).
algorithms_loop = tqdm(
    clfs_names,
    desc=" Algorithms (inner loop)",
    position=1,
    leave=False,
)


Datasets (outer loop): 0it [00:00, ?it/s]

In [14]:
# Benchmark every detector on every dataset, this time passing the training
# labels to fit as well.
# resultsY[clf_name][dataset_name] is always a 6-tuple:
#   (roc_train, ap_train, roc_test, ap_test, train_time, test_time)
for data in datasets_loop:
    datasets_loop.set_description(data['name'])
    # Build a fresh inner progress bar per dataset: a tqdm bar is closed once
    # it has been iterated to the end, so a single shared bar would only be
    # displayed for the first dataset.
    algorithms_loop = tqdm(
        clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        # Time the fit on the training features and labels.
        timer.start()
        clf.fit(data['X_train'], data['y_train'])
        timer.stop()
        train_time = timer.time_sec

        # Outlier scores of the training data, as stored on the fitted detector.
        y_train_pred = clf.decision_scores_

        # Time the scoring of the test data.
        timer.start()
        y_test_pred = clf.decision_function(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        if np.isnan(y_train_pred).any() or np.isnan(y_test_pred).any():
            # The metrics cannot be computed from NaN scores. Store a
            # placeholder with the same six fields as a regular record so
            # every entry of the results table has the same length.
            resultsY[clf_name][data['name']] = (np.nan,) * 6
            continue

        roc_train = np.round(roc_auc_score(
            data['y_train'], y_train_pred), decimals=4)
        ap_train = np.round(average_precision_score(
            data['y_train'], y_train_pred), decimals=4)
        roc_test = np.round(roc_auc_score(
            data['y_test'], y_test_pred), decimals=4)
        ap_test = np.round(average_precision_score(
            data['y_test'], y_test_pred), decimals=4)

        resultsY[clf_name][data['name']] = (roc_train, ap_train,
                                            roc_test, ap_test,
                                            train_time, test_time)


23_WPBC.npz: : 24it [2:43:55, 409.83s/it]        


In [15]:
# Persist the table of the run in which y_train was passed to fit: one column
# per detector, one row per dataset, each cell holding that pair's tuple of
# metrics and timings.
df = pd.DataFrame(resultsY)
df.to_pickle('../results/numerical_y.pkl')
