# Experiments

Experiments comparing the Random Isolation Similarity Forest (RISF) with other outlier (anomaly) detection algorithms.

In [1]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '../data')
from data.data_getter import get_numerical_datasets

from sklearn.metrics import roc_auc_score

from tqdm import tqdm

%load_ext autoreload

%autoreload 2

In [2]:
# from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

from notebooks.utils import *

We compare RISF against the following outlier detection algorithms:
* LOF
* ECOD
* Isolation Forest
* HBOS


In [3]:
# Seed handed to every detector built below (new_clf(..., SEED) and the
# random_state of RandomIsolationSimilarityForest).
SEED = 23

We measure ROC AUC, precision and recall (treating outlier detection as a binary classification task), together with fit and prediction time. We can also show plots for every algorithm and the top-N feature importances.

In [35]:
# Detectors benchmarked on the numerical datasets.
clfs_names = ['ECOD', 'LOF', 'IForest', 'HBOS', 'RISF']

# Both containers map {classifier name: {dataset name: metrics tuple}}.
#   results  -> experiment fitted on X only
#   resultsY -> experiment fitted with the training labels as well
results = {x: {} for x in clfs_names}
# Built independently on purpose: results.copy() is a shallow copy, so the two
# names would share the very same inner dicts and every write made by one
# experiment would silently show up in (and overwrite) the other's results.
resultsY = {x: {} for x in clfs_names}

In [36]:
# Single Timer instance reused by the experiment loops: it is started/stopped
# around fit and predict, and the elapsed time is read back via timer.time_sec.
# (Timer is presumably supplied by the star import from notebooks.utils - confirm.)
timer = Timer(timer_type="long_running")

In [37]:
# Progress bars shared by the experiment cells below: the outer bar wraps the
# dataset source, the inner bar wraps the list of classifier names.
# NOTE(review): the recorded output ("0it") shows the outer bar has no known
# length, so get_numerical_datasets() is presumably a generator. If so, it is
# exhausted after one pass and this cell must be re-run before each experiment
# loop - confirm.
datasets_loop = tqdm(get_numerical_datasets(), desc="Datasets (outer loop)", position=0)
algorithms_loop = tqdm(clfs_names, desc=" Algorithms (inner loop)", position=1, leave=False)

Datasets (outer loop): 0it [03:55, ?it/s]


In [38]:
import warnings
# Blanket filter: every warning raised from here on is suppressed.
# NOTE(review): this also hides the sklearn UndefinedMetricWarning mentioned in
# the experiment loops' comments; consider narrowing the filter to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

## Only X known

In [24]:
# Experiment 1 ("Only X known"): every detector is fitted on X_train alone; the
# labels are used only afterwards, to score the predictions.
for data in datasets_loop:
    dataset_name = data['name']
    datasets_loop.set_description(dataset_name)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        # Fit (timed).
        timer.start()
        clf.fit(data['X_train'])
        timer.stop()
        train_time = timer.time_sec

        # Binary training labels (0: inliers, 1: outliers). RISF predicts on the
        # training data explicitly; for the other libraries the fitted labels_
        # are used, because predicting the training data with them produces
        # sklearn's UndefinedMetricWarning.
        y_train_pred = clf.predict(data['X_train']) if clf_name == 'RISF' else clf.labels_

        # Test-set prediction (timed).
        timer.start()
        y_test_pred = clf.predict(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        # NaN training labels -> store an all-NaN record with the same layout as
        # a regular one: (ROC AUC, precision, recall) for train, the same three
        # for test, then fit time and predict time. No checkpoint is written.
        if np.isnan(y_train_pred).any():
            results[clf_name][dataset_name] = (np.nan,) * 8
            continue

        # Each metric is rounded to 4 decimals; evaluation order is
        # ROC AUC, precision, recall - first on train, then on test.
        roc_train, precision_train, recall_train = (
            np.round(metric(data['y_train'], y_train_pred), decimals=4)
            for metric in (roc_auc_score, precision_score, recall_score)
        )
        roc_test, precision_test, recall_test = (
            np.round(metric(data['y_test'], y_test_pred), decimals=4)
            for metric in (roc_auc_score, precision_score, recall_score)
        )

        results[clf_name][dataset_name] = (
            roc_train, precision_train, recall_train,
            roc_test, precision_test, recall_test,
            train_time, test_time,
        )

        # Checkpoint to the temporary pickle after every (dataset, classifier) pair.
        df = pd.DataFrame(results)
        df.to_pickle('../results/numerical_temporary.pkl')


23_WPBC.npz: : 23it [55:53, 145.80s/it]          


In [26]:
# Persist the results of the X-only experiment: one column per classifier,
# one row per dataset, each cell holding that pair's metrics tuple.
df = pd.DataFrame(results)
df.to_pickle('../results/numerical_selected.pkl')

## Training labels (y) provided

In [39]:
# Experiment 2: same protocol as the X-only run, except that the training
# labels are passed to fit() as well. Records go into resultsY.
for data in datasets_loop:
    dataset_name = data['name']
    datasets_loop.set_description(dataset_name)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        # Fit with labels (timed).
        timer.start()
        clf.fit(data['X_train'], data['y_train'])
        timer.stop()
        train_time = timer.time_sec

        # Binary training labels (0: inliers, 1: outliers). RISF predicts on the
        # training data explicitly; for the other libraries the fitted labels_
        # are used, because predicting the training data with them produces
        # sklearn's UndefinedMetricWarning.
        y_train_pred = clf.predict(data['X_train']) if clf_name == 'RISF' else clf.labels_

        # Test-set prediction (timed).
        timer.start()
        y_test_pred = clf.predict(data['X_test'])
        timer.stop()
        test_time = timer.time_sec

        # NaN training labels -> store an all-NaN record with the same layout as
        # a regular one: (ROC AUC, precision, recall) for train, the same three
        # for test, then fit time and predict time. No checkpoint is written.
        if np.isnan(y_train_pred).any():
            resultsY[clf_name][dataset_name] = (np.nan,) * 8
            continue

        # Each metric is rounded to 4 decimals; evaluation order is
        # ROC AUC, precision, recall - first on train, then on test.
        roc_train, precision_train, recall_train = (
            np.round(metric(data['y_train'], y_train_pred), decimals=4)
            for metric in (roc_auc_score, precision_score, recall_score)
        )
        roc_test, precision_test, recall_test = (
            np.round(metric(data['y_test'], y_test_pred), decimals=4)
            for metric in (roc_auc_score, precision_score, recall_score)
        )

        resultsY[clf_name][dataset_name] = (
            roc_train, precision_train, recall_train,
            roc_test, precision_test, recall_test,
            train_time, test_time,
        )

        # Checkpoint to the temporary pickle after every (dataset, classifier) pair.
        df = pd.DataFrame(resultsY)
        df.to_pickle('../results/numerical_temporary.pkl')

23_WPBC.npz: : 24it [2:31:05, 377.72s/it]        


In [40]:
# Persist the results of the labels-provided experiment: one column per
# classifier, one row per dataset, each cell holding that pair's metrics tuple.
df = pd.DataFrame(resultsY)
df.to_pickle('../results/numerical_y.pkl')

## GRAPHS

In [4]:
from risf.distance_functions import DegreeDivergenceDist, JaccardDist
from risf.risf_data import RisfData
from data.data_getter import get_ucr_time_series, get_glocalkd_dataset
from risf.forest import RandomIsolationSimilarityForest
from collections import defaultdict

In [5]:
# Detectors compared on the graph datasets.
clfs_names = ['IForest', 'ISF', 'ECOD', 'LOF', 'HBOS', 'RISF']
# Two-level auto-creating mapping; the outer key is the split name (it is used
# to name the pickle written per split further down).
results = defaultdict(lambda: defaultdict(dict))

# Alternative dataset source, kept for reference:
# datasets_loop = tqdm(get_graphs(), desc="Datasets (outer loop)", position=0)
DATA_DIR = '../data/graph'
datasets_loop = ['NCI1', 'REDDIT-BINARY', 'PPAR-gamma']
# Distance identifiers are stored as strings in an object array so that they can
# be selected by index/mask. NOTE(review): the JaccardDist / DegreeDivergenceDist
# classes are imported but only their names are used here - confirm intended.
distances = np.array(['JaccardDist', 'DegreeDivergenceDist'], dtype=object)
algorithms_loop = tqdm(clfs_names, position=1, leave=False, desc=" Algorithms (inner loop)")



In [9]:
# Selections of the `distances` array that RISF is run with (one train_risf call
# per selection).
distances_to_use = get_binary_distances_choice(distances)

for dataset_name in datasets_loop:
    data = get_glocalkd_dataset(DATA_DIR, dataset_name)
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)

        if clf_name != "RISF":
            # Baselines: fit on data["X_train_num"], then score train and test.
            # NOTE(review): decision_function is used for every baseline here,
            # ISF included, while the time-series loop scores ISF through
            # score_samples - confirm which orientation add_scores expects.
            clf = new_clf(clf_name, SEED)
            clf.fit(data["X_train_num"])
            y_train_pred, y_test_pred = (
                clf.decision_function(data[split_key])
                for split_key in ("X_train_num", "X_test_num")
            )
            add_scores(results, clf_name, data, y_train_pred, y_test_pred)
            continue

        # RISF: one training run per distance selection.
        for dist_to_use in distances_to_use:
            dist = distances[dist_to_use]
            train_risf(data, dataset_name, dist)

No node attributes
train_counts (array([0, 1]), array([1437,  144], dtype=int64))
test_counts (array([0, 1]), array([616, 617], dtype=int64))
IForest, on NCI1 obtained, auc_train: 0.5751 and auc_test: 0.5874
ISF, on NCI1 obtained, auc_train: 0.3856 and auc_test: 0.3691
ECOD, on NCI1 obtained, auc_train: 0.5944 and auc_test: 0.5643
LOF, on NCI1 obtained, auc_train: 0.5263 and auc_test: 0.54
HBOS, on NCI1 obtained, auc_train: 0.5742 and auc_test: 0.5608
Distances already calculated. Skipping...
Distances already calculated. Skipping...
Distances already calculated. Skipping...
Distances already calculated. Skipping...
RISF_Jac_Deg, on NCI1 obtained, auc_train: 0.5229 and auc_test: 0.4769
Distances already calculated. Skipping...
Distances already calculated. Skipping...
RISF_Jac, on NCI1 obtained, auc_train: 0.4033 and auc_test: 0.3719
Distances already calculated. Skipping...
Distances already calculated. Skipping...
RISF_Deg, on NCI1 obtained, auc_train: 0.5153 and auc_test: 0.4842
No 

KeyboardInterrupt: 

In [17]:
# Write one pickle per split: results is keyed by split name, and each value
# becomes a DataFrame saved as ../results/<split>_graphs.pkl.
# After the loop, `df` still refers to the frame of the last split iterated.
for split, result in results.items():
    df = pd.DataFrame(result)
    df.to_pickle(f'../results/{split}_graphs.pkl')

In [18]:
# TEST split: displays whatever `df` currently holds, i.e. the frame left over
# from the last cell that assigned it.
# NOTE(review): relies on cell execution order - confirm it is the test split.
df

Unnamed: 0,IForest,ISF,ECOD,LOF,HBOS,RISF_Jac_Deg,RISF_Jac,RISF_Deg
NCI1,0.5947,0.3731,0.5639,0.5403,0.5562,0.4769,0.3719,0.4842
REDDIT-BINARY,0.705,0.194,0.6375,0.6232,0.5127,0.5632,0.4132,0.5705
PPAR-gamma,0.5804,0.2866,0.5942,0.4368,0.5792,0.5204,0.4124,0.5522


In [14]:
# TRAIN split: displays whatever `df` currently holds.
# NOTE(review): this cell carries an earlier execution count (14) than the
# per-split save loop (17), so the table shown comes from an earlier state of
# `df`; re-run the notebook in order to confirm it is the train split.
df

Unnamed: 0,IForest,ISF,ECOD,LOF,HBOS,RISF_Jac_Deg,RISF_Jac,RISF_Deg
NCI1,0.5825,0.3873,0.5948,0.5276,0.5672,0.5229,0.4033,0.5153
REDDIT-BINARY,0.7143,0.2277,0.7075,0.6189,0.5132,0.5807,0.417,0.5904
PPAR-gamma,0.4775,0.4139,0.5258,0.4795,0.4936,0.4402,0.4282,0.4445


In [8]:
# Detectors compared on the UCR time-series datasets.
clfs_names = ['IForest', 'ISF', 'ECOD', 'LOF', 'HBOS', 'RISF']
# {classifier name: {dataset name: (auc_train, auc_test)}}; each classifier gets
# its own, independent inner dict.
results = {name: {} for name in clfs_names}

# Fresh progress bars: outer over the datasets, inner over the classifier names.
datasets_loop = tqdm(get_ucr_time_series(), position=0, desc="Datasets (outer loop)")
algorithms_loop = tqdm(clfs_names, position=1, leave=False, desc=" Algorithms (inner loop)")

Datasets (outer loop): 0it [00:00, ?it/s]

In [9]:
from fastdtw import fastdtw

def dtw(series1, series2):
    """Distance between two series as computed by ``fastdtw``.

    ``fastdtw`` returns more than the distance; only its first element
    (the distance) is handed back, so the function can be used directly as a
    pairwise distance callable.
    """
    outcome = fastdtw(series1, series2)
    return outcome[0]

In [10]:
# Time-series experiment: score every detector on each UCR dataset and record
# (auc_train, auc_test) per classifier and dataset.
for data in datasets_loop:
    datasets_loop.set_description(data['name'])
    for clf_name in algorithms_loop:
        algorithms_loop.set_description(clf_name)
        clf = new_clf(clf_name, SEED)

        if clf_name != "RISF":
            clf.fit(data["X_train"])
            # ISF is scored through score_samples, every other baseline through
            # decision_function.
            score_fn = clf.score_samples if clf_name == "ISF" else clf.decision_function
            y_train_pred = score_fn(data["X_train"])
            y_test_pred = score_fn(data["X_test"])
        else:
            # RISF works on a RisfData container with precomputed DTW distances;
            # the classifier created above is replaced by a forest built on them.
            X_risf = RisfData()
            X_risf.add_data(data["X_train"], dist = dtw)
            X_risf.precompute_distances(n_jobs=-2)
            forest = RandomIsolationSimilarityForest(
                random_state=SEED, distance=X_risf.distances, n_jobs=-3
            )
            clf = forest.fit(X_risf)
            # Raw scores are negated before being handed to calculate_stats.
            # NOTE(review): ISF's score_samples output above is used as-is -
            # confirm both follow the orientation calculate_stats expects.
            y_train_pred = (-1) * clf.predict(X_risf, return_raw_scores=True)

            X_test_risf = clf.transform([data["X_test"]], n_jobs=-2)
            y_test_pred = (-1) * clf.predict(X_test_risf, return_raw_scores=True)

        auc_train = calculate_stats(data["y_train"], y_train_pred)
        auc_test = calculate_stats(data["y_test"], y_test_pred)
        results[clf_name][data['name']] = (auc_train, auc_test)

        print(f"{clf_name}, on {data['name']} obtained, auc_train: {auc_train} and auc_test: {auc_test}")

    # Save once per dataset, after all classifiers have been scored on it.
    df = pd.DataFrame(results)
    df.to_pickle('../results/time_series.pkl')
        

Computers: : 0it [00:00, ?it/s]          

IForest, on Computers obtained, auc_train: 0.352 and auc_test: 0.3747




ISF, on Computers obtained, auc_train: 0.6 and auc_test: 0.448




ECOD, on Computers obtained, auc_train: 0.372 and auc_test: 0.3467
LOF, on Computers obtained, auc_train: 0.556 and auc_test: 0.588




HBOS, on Computers obtained, auc_train: 0.4093 and auc_test: 0.2947


HouseTwenty: : 1it [05:20, 320.88s/it]

RISF, on Computers obtained, auc_train: 0.3187 and auc_test: 0.296
IForest, on HouseTwenty obtained, auc_train: 1.0 and auc_test: 0.8841
ISF, on HouseTwenty obtained, auc_train: 0.75 and auc_test: 0.087
ECOD, on HouseTwenty obtained, auc_train: 1.0 and auc_test: 0.5604
LOF, on HouseTwenty obtained, auc_train: 0.85 and auc_test: 0.2053
HBOS, on HouseTwenty obtained, auc_train: 0.8 and auc_test: 0.9758


HouseTwenty: : 2it [06:33, 196.84s/it]

RISF, on HouseTwenty obtained, auc_train: 0.85 and auc_test: 0.7778





ValueError: high <= 0