In [1]:
import sys
import os

# Make the parent of the current working directory importable (presumably
# where the local `forests` package lives — confirm against the repo layout).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(parent_dir)

In [2]:
import time
import numpy as np
import pandas as pd
import zipfile

from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import roc_auc_score, average_precision_score

from forests.i_forest import IForest
from forests.ei_forest import EIForest
from forests.sci_forest import SCIForest
from forests.fc_forest import FCForest
from forests.rrc_forest import RRCForest

from pyod.models.ocsvm import OCSVM
from pyod.models.dif import DIF
from pyod.models.lof import LOF

In [3]:
# unpack the dataset archive into ./zip/arrhythmia so the next cell can read it
with zipfile.ZipFile("./zip/arrhythmia.zip", mode="r") as archive:
    archive.extractall(path="./zip/arrhythmia")

In [4]:
# "?" is the placeholder the raw file uses for (presumably) missing values.
# Parsing it as NaN while reading keeps every column numeric, so the resulting
# array is a plain float array instead of an object array of mixed str/int.
# The gaps are then imputed with 0, as before.
# NOTE(review): any other marker pandas treats as missing is zero-filled too.
df = (
    pd.read_csv("./zip/arrhythmia/arrhythmia.data", header=None, na_values="?")
    .fillna(0)
    .to_numpy()
)

In [5]:
# the last column holds the class label
y = df[:, -1].astype(int)

# every other column is a feature
X = df[:, :-1].astype(np.float32)

# keep only columns with more than one distinct value: a constant column has
# zero standard deviation and would make the z-score below divide by zero
X_mask = np.array([np.unique(column).size > 1 for column in X.T])
X = X[:, X_mask]

# z-score standardisation, column by column
X = (X - X.mean(axis=0)) / X.std(axis=0)

In [6]:
# fraction of the samples that falls into each class (index = class label)
y_bins_norm = np.bincount(y) / y.size
print(f"Class percentage: {y_bins_norm}")

# rank the classes from rarest to most frequent and reorder the fractions
y_bins_norm_sorted_idx = np.argsort(y_bins_norm)
y_bins_norm_sorted = y_bins_norm[y_bins_norm_sorted_idx]

# running total of the fractions in that rarest-first order
y_bins_cum_sum = y_bins_norm_sorted.cumsum()

Class percentage: [0.         0.5420354  0.09734513 0.03318584 0.03318584 0.02876106
 0.05530973 0.00663717 0.00442478 0.0199115  0.11061947 0.
 0.         0.         0.00884956 0.01106195 0.04867257]


In [7]:
# upper bound on the cumulative share of samples that may be labelled outliers
target = 0.15

# Index of the first cumulative share that reaches `target`. The classes before
# it (the rarest ones) together stay strictly below `target`.
n_outlier_classes = int(np.searchsorted(y_bins_cum_sum, target, side="left"))
if n_outlier_classes == len(y_bins_cum_sum):
    # every class would become an outlier class; fail with a clear message
    raise ValueError(
        f"target={target} is never reached by the cumulative class distribution"
    )

y_outliers = y_bins_norm_sorted_idx[:n_outlier_classes]
y_inliers = y_bins_norm_sorted_idx[n_outlier_classes:]

print(f"Classes used for outliers: {y_outliers}")
print(f"Classes used for inliers: {y_inliers}")

Classes used for outliers: [ 0 13 12 11  8  7 14 15  9  5  3  4]
Classes used for inliers: [16  6  2 10  1]


In [8]:
# Binarise the labels: 0 = sample of an inlier class, 1 = sample of an outlier class.
# The labels are recomputed from the raw label column of `df` instead of being
# rewritten in place, so re-running this cell gives the same result. Relabelling
# an already binarised `y` would be wrong because 0/1 would be looked up in
# `y_inliers` as if they were original class ids.
y = np.where(np.isin(df[:, -1].astype(int), y_inliers), 0, 1)

# contamination rate = fraction of samples labelled as outliers (a value in [0, 1])
contamination = np.round(np.sum(y == 1) / y.shape[0], 4)
# the value is a fraction, so let the format spec convert it to a percentage
print(f"Contamination rate: {contamination:.2%}")

Contamination rate: 0.146%


In [9]:
# constructor arguments for the baseline forest runs
iforest_params = dict(
    n_trees=100,
    sub_sample_size=256,
    contamination=contamination,
)

# constructor arguments for FCForest and for the "-U" runs of the other forests:
# twice as many trees and an explicit height limit
fcf_params = dict(
    n_trees=200,
    sub_sample_size=256,
    height_limit=256,
    contamination=contamination,
)

# one fit per seed; reported metrics are averaged over these runs
seeds = [42, 123, 456, 789, 1500, 1597, 3141, 4000, 5000, 10007]

In [10]:
def run_forest(
    forest_method: type[IForest | EIForest | SCIForest | FCForest | RRCForest],
    args: dict
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Fit one forest per seed and score each fit against the binary labels.

    Reads the notebook-level ``seeds``, ``X`` and ``y``.

    Parameters
    ----------
    forest_method
        The forest class itself (not an instance); it is instantiated as
        ``forest_method(**args, seed=seed)`` once per entry of ``seeds``.
    args
        Keyword arguments for the constructor. The dict is not mutated; a
        ``"seed"`` key, if present, is overridden by the current seed.

    Returns
    -------
    tuple of three 1-D arrays
        AUROC, AUPR (average precision) and fit time in seconds, one entry
        per seed, in the order of ``seeds``.
    """
    aurocs = []
    auprs = []
    times = []

    for seed in seeds:
        forest = forest_method(**{**args, "seed": seed})

        # perf_counter is the clock intended for measuring short intervals;
        # only fit() is timed, not construction or metric computation
        start = time.perf_counter()
        forest.fit(X)
        elapsed = time.perf_counter() - start

        # read the scores once and reuse them for both metrics
        # NOTE(review): assumes decision_scores has one score per row of X,
        # higher meaning more anomalous — confirm against the forests package
        scores = forest.decision_scores

        aurocs.append(roc_auc_score(y, scores))
        auprs.append(average_precision_score(y, scores))
        times.append(elapsed)

    return np.array(aurocs), np.array(auprs), np.array(times)

In [12]:
# baseline IForest, metrics averaged over all seeds
if_aurocs, if_auprs, if_times = run_forest(IForest, iforest_params)

print(
    "IForest:",
    "--------",
    f"AUROC {if_aurocs.mean():.4f}",
    f"AUPR {if_auprs.mean():.4f}",
    f"Time {if_times.mean():.4f}s",
    sep="\n",
)

IForest:
--------
AUROC 0.7115
AUPR 0.3268
Time 1.6867s


In [12]:
# IForest with the FCForest parameter set (the "-U" variant).
# Stored under its own names so the baseline IForest results in
# if_aurocs / if_auprs / if_times are not overwritten.
if_u_aurocs, if_u_auprs, if_u_times = run_forest(IForest, fcf_params)

print("IForest-U:")
print("--------")
print(f"AUROC {np.mean(if_u_aurocs):.4f}")
print(f"AUPR {np.mean(if_u_auprs):.4f}")
print(f"Time {np.mean(if_u_times):.4f}s")

IForest-U:
--------
AUROC 0.7658
AUPR 0.3994
Time 7.3050s


In [13]:
# baseline EIForest, metrics averaged over all seeds
eif_aurocs, eif_auprs, eif_times = run_forest(EIForest, iforest_params)

print(
    "EIForest:",
    "--------",
    f"AUROC {eif_aurocs.mean():.4f}",
    f"AUPR {eif_auprs.mean():.4f}",
    f"Time {eif_times.mean():.4f}s",
    sep="\n",
)

EIForest:
--------
AUROC 0.7071
AUPR 0.3205
Time 3.1947s


In [14]:
# EIForest with the FCForest parameter set (the "-U" variant).
# Stored under its own names so the baseline EIForest results in
# eif_aurocs / eif_auprs / eif_times are not overwritten.
eif_u_aurocs, eif_u_auprs, eif_u_times = run_forest(EIForest, fcf_params)

print("EIForest-U:")
print("--------")
print(f"AUROC {np.mean(eif_u_aurocs):.4f}")
print(f"AUPR {np.mean(eif_u_auprs):.4f}")
print(f"Time {np.mean(eif_u_times):.4f}s")

EIForest-U:
--------
AUROC 0.7821
AUPR 0.4252
Time 12.4911s


In [15]:
# baseline SCIForest, metrics averaged over all seeds
scif_aurocs, scif_auprs, scif_times = run_forest(SCIForest, iforest_params)

print(
    "SCIForest:",
    "--------",
    f"AUROC {scif_aurocs.mean():.4f}",
    f"AUPR {scif_auprs.mean():.4f}",
    f"Time {scif_times.mean():.4f}s",
    sep="\n",
)

SCIForest:
--------
AUROC 0.6672
AUPR 0.2855
Time 10.8679s


In [None]:
# SCIForest with the FCForest parameter set (the "-U" variant).
# Stored under its own names so the baseline SCIForest results in
# scif_aurocs / scif_auprs / scif_times are not overwritten.
scif_u_aurocs, scif_u_auprs, scif_u_times = run_forest(SCIForest, fcf_params)

print("SCIForest-U:")
print("--------")
print(f"AUROC {np.mean(scif_u_aurocs):.4f}")
print(f"AUPR {np.mean(scif_u_auprs):.4f}")
print(f"Time {np.mean(scif_u_times):.4f}s")

In [12]:
# FCForest with its own parameter set, metrics averaged over all seeds
fcf_aurocs, fcf_auprs, fcf_times = run_forest(FCForest, fcf_params)

print(
    "FCForest:",
    "--------",
    f"AUROC {fcf_aurocs.mean():.4f}",
    f"AUPR {fcf_auprs.mean():.4f}",
    f"Time {fcf_times.mean():.4f}s",
    sep="\n",
)

FCForest:
--------
AUROC 0.7184
AUPR 0.3632
Time 85.5051s


In [17]:
# One-class SVM baseline: a single fit, no seed is passed.
ocsvm_ = OCSVM(kernel="rbf", contamination=contamination)

# The fit takes only tens of milliseconds, so use perf_counter, the clock
# intended for short intervals, rather than the wall clock.
start = time.perf_counter()
ocsvm_.fit(X)
end = time.perf_counter()

# score the training-set outlier scores against the binary labels
auroc = roc_auc_score(y, ocsvm_.decision_scores_)
aupr = average_precision_score(y, ocsvm_.decision_scores_)
time_to_run = end - start

print("OCSVM:")
print("--------")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")
print(f"Time: {time_to_run:.4f}")

OCSVM:
--------
AUROC: 0.7776
AUPR: 0.3927
Time: 0.0321


In [18]:
# Deep Isolation Forest baseline: one fit per seed, metrics averaged afterwards.
dif_aurocs, dif_auprs, dif_times = [], [], []

for seed in seeds:
    dif_ = DIF(contamination=contamination, random_state=seed)

    # perf_counter is the clock intended for interval timing; only fit() is timed
    start = time.perf_counter()
    dif_.fit(X)
    end = time.perf_counter()

    # score the training-set outlier scores against the binary labels
    auroc = roc_auc_score(y, dif_.decision_scores_)
    aupr = average_precision_score(y, dif_.decision_scores_)
    time_to_run = end - start

    dif_aurocs.append(auroc)
    dif_auprs.append(aupr)
    dif_times.append(time_to_run)

# convert the per-seed lists to arrays, matching what run_forest returns
dif_aurocs = np.array(dif_aurocs)
dif_auprs = np.array(dif_auprs)
dif_times = np.array(dif_times)

print("DIForest:")
print("--------")
print(f"AUROC {np.mean(dif_aurocs):.4f}")
print(f"AUPR {np.mean(dif_auprs):.4f}")
print(f"Time {np.mean(dif_times):.4f}s")


DIForest:
--------
AUROC 0.7869
AUPR 0.4219
Time 2.3940s


In [15]:
# Local Outlier Factor baseline: a single fit, no seed is passed.
lof_ = LOF(contamination=contamination)

# The fit takes only tens of milliseconds, so use perf_counter, the clock
# intended for short intervals, rather than the wall clock.
start = time.perf_counter()
lof_.fit(X)
end = time.perf_counter()

# score the training-set outlier scores against the binary labels
auroc = roc_auc_score(y, lof_.decision_scores_)
aupr = average_precision_score(y, lof_.decision_scores_)
time_to_run = end - start

print("LOF:")
print("--------")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")
print(f"Time: {time_to_run:.4f}")

LOF:
--------
AUROC: 0.7640
AUPR: 0.3354
Time: 0.0230
