# Basic experiments

Experiments comparing Random Isolation Similarity Forest to other outlier (anomaly) detection algorithms

In [1]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '..')
from data.SF_outliers_data import get_datasets

from sklearn.metrics import roc_auc_score

We will use different outlier detection algorithms to compare to RISF:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [2]:
from pyod.models.ecod import ECOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.hbos import HBOS
#import RISF here

In [3]:
SEED = 23


We will measure AUC (as a binary classification task of being an outlier) and processing time. We can show plots for every algorithm and the top-N feature importance

In [4]:
datasets = get_datasets()

In [5]:
data = dict()

In [6]:
# loop for training
for X_train, X_test, y_train, y_test, name in datasets:
    print(name)
    data[name] = [X_train, X_test, y_train, y_test]


kddcup99_http
kddcup99_sf
kddcup99_sa
forestcover
breastw


In [7]:
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize


In [36]:
data['kddcup99_sa'][0]

array([[-0.15993384, -0.1962505 ,  0.5445351 , ..., -0.09555247,
        -0.25415296, -0.25241887],
       [-0.15993384, -0.1962505 , -1.83642891, ..., -0.09555247,
        -0.25415296, -0.25241887],
       [-0.15993384, -0.1962505 ,  0.5445351 , ..., -0.09555247,
        -0.25415296, -0.25241887],
       ...,
       [-0.15993384, -0.1962505 ,  0.5445351 , ..., -0.09555247,
         4.2221208 ,  3.8395796 ],
       [-0.15993384, -0.1962505 ,  0.5445351 , ..., -0.09555247,
        -0.25415296, -0.25241887],
       [-0.15993384, -0.1962505 ,  0.5445351 , ..., -0.09555247,
        -0.25415296, -0.25241887]])

In [22]:
def get_new_clf(name):
    if name == 'ECOD':
        return ECOD()
    if name == 'LOF':
        return LOF()
    if name == 'IForest':
        return IForest()
    if name == 'HBOS':
        return HBOS()

In [33]:
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS']
for key, item in data.items():
    if key == 'kddcup99_http':
        continue
    print(f'\n\n{key}:')
    for clf_name in clfs_names:
        clf = get_new_clf(clf_name)
        X_train, X_test, y_train, y_test = item
        clf.fit(X_train)
        
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        
        
        
        print("On Training Data:")
        evaluate_print(clf_name, y_train.astype('int'), y_train_scores)
        print("On Test Data:")
        evaluate_print(clf_name, y_test.astype('int'), y_test_scores)



kddcup99_sf:
On Training Data:
ECOD ROC:0.8183, precision @ rank n:0.1331
On Test Data:
ECOD ROC:0.8201, precision @ rank n:0.1301
On Training Data:
LOF ROC:0.3847, precision @ rank n:0.0702
On Test Data:
LOF ROC:0.3421, precision @ rank n:0.0625
On Training Data:
IForest ROC:0.9321, precision @ rank n:0.1649
On Test Data:
IForest ROC:0.9329, precision @ rank n:0.1602
On Training Data:
HBOS ROC:0.842, precision @ rank n:0.1496
On Test Data:
HBOS ROC:0.8433, precision @ rank n:0.1183


kddcup99_sa:
On Training Data:


ValueError: Input contains NaN.

In [26]:
y_train_scores

array([nan, nan, nan, ..., nan, nan, nan])

In [29]:
type(data['kddcup99_sa'][0][0][0])

numpy.float64