In [19]:
import sys
import os

# Make the parent of the current working directory importable (presumably so
# the local `forests` package imported below resolves -- confirm repo layout).
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not pile up duplicates.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [20]:
import time
import numpy as np

from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import roc_auc_score, average_precision_score

from forests.i_forest import IForest
from forests.ei_forest import EIForest
from forests.sci_forest import SCIForest
from forests.fc_forest import FCForest
from forests.rrc_forest import RRCForest

from pyod.models.ocsvm import OCSVM
from pyod.models.dif import DIF
from pyod.models.lof import LOF

In [21]:
# Fetch the dataset with id 94 from the UCI ML repository.
# Label convention used throughout this notebook:
#   outliers are labeled with 1
#   inliers are labeled with 0
spam_base = fetch_ucirepo(id=94) 

In [22]:
# get the features and standardize every column (zero mean, unit variance)
X = spam_base.data.features.to_numpy()
feature_std = np.std(X, axis=0)
# a constant column has std 0; dividing by 1 instead keeps it at 0 after
# centering rather than producing NaN/inf
feature_std[feature_std == 0] = 1.0
X = (X - np.mean(X, axis=0)) / feature_std

# get the labels as a flat 1-D vector
y = spam_base.data.targets.to_numpy()
y = y.reshape(-1)

# compute the contamination rate: the share of samples labeled 1.
# It stays a fraction in [0, 1] because it is passed on as `contamination`.
contamination = np.round(np.sum(y == 1) / y.shape[0], 4)
# the value is a fraction, so let the `%` format type scale it by 100
# (previously a literal "%" was appended to the raw fraction)
print(f"Contamination rate: {contamination:.2%}")

Contamination rate: 0.394%


In [23]:
# Constructor keyword arguments for the runs that use the smaller ensemble.
iforest_params = dict(
    n_trees=100,
    sub_sample_size=256,
    contamination=contamination,
)

# Constructor keyword arguments for FCForest; the "-U" runs of the other
# forests reuse these as well.
fcf_params = dict(
    n_trees=200,
    sub_sample_size=256,
    height_limit=256,
    contamination=contamination,
)

# Every seeded experiment is repeated once per entry and then averaged.
seeds = [
    42, 123, 456, 789, 1500,
    1597, 3141, 4000, 5000, 10007,
]

In [6]:
def run_forest(
    forest_method: type[IForest | EIForest | SCIForest | FCForest | RRCForest],
    args: dict
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Fit one forest per seed and collect ranking metrics and fit times.

    Reads the notebook-level globals ``X`` (feature matrix), ``y`` (labels)
    and ``seeds`` (the seeds to iterate over).

    Parameters
    ----------
    forest_method
        The forest *class* (not an instance). It is instantiated once per
        seed with ``args`` plus a ``seed`` keyword.
    args
        Keyword arguments for the constructor. The dict is not modified; a
        ``"seed"`` entry, if present, is overridden by the current seed.

    Returns
    -------
    tuple of three 1-D arrays, one entry per seed
        AUROC values, average-precision (AUPR) values, and the time spent
        in ``fit`` in seconds.
    """
    aurocs, auprs, times = [], [], []

    for seed in seeds:
        forest = forest_method(**{**args, "seed": seed})

        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump with system clock changes.
        start = time.perf_counter()
        forest.fit(X)
        elapsed = time.perf_counter() - start

        # Scoring happens outside the timed region: only fit() is measured.
        aurocs.append(roc_auc_score(y, forest.decision_scores))
        auprs.append(average_precision_score(y, forest.decision_scores))
        times.append(elapsed)

    return np.array(aurocs), np.array(auprs), np.array(times)

In [8]:
# Evaluate IForest with `iforest_params` over all seeds.
if_aurocs, if_auprs, if_times = run_forest(IForest, iforest_params)

# Report the averages over the seeds, one item per line.
print(
    "IForest:",
    "--------",
    f"AUROC {np.mean(if_aurocs):.4f}",
    f"AUPR {np.mean(if_auprs):.4f}",
    f"Time {np.mean(if_times):.4f}s",
    sep="\n",
)

IForest:
--------
AUROC 0.6137
AUPR 0.4726
Time 4.5179s


In [9]:
# Evaluate IForest again, this time with `fcf_params`.
# NOTE: this rebinds if_aurocs / if_auprs / if_times, so the results of the
# earlier IForest run with `iforest_params` are no longer available afterwards.
if_aurocs, if_auprs, if_times = run_forest(IForest, fcf_params)

# Report the averages over the seeds, one item per line.
print(
    "IForest-U:",
    "--------",
    f"AUROC {np.mean(if_aurocs):.4f}",
    f"AUPR {np.mean(if_auprs):.4f}",
    f"Time {np.mean(if_times):.4f}s",
    sep="\n",
)

IForest-U:
--------
AUROC 0.6740
AUPR 0.5124
Time 23.5158s


In [10]:
# Evaluate EIForest with `iforest_params` over all seeds.
eif_aurocs, eif_auprs, eif_times = run_forest(EIForest, iforest_params)

# Report the averages over the seeds, one item per line.
print(
    "EIForest:",
    "--------",
    f"AUROC {np.mean(eif_aurocs):.4f}",
    f"AUPR {np.mean(eif_auprs):.4f}",
    f"Time {np.mean(eif_times):.4f}s",
    sep="\n",
)

EIForest:
--------
AUROC 0.5660
AUPR 0.4291
Time 10.3861s


In [11]:
# Evaluate EIForest again, this time with `fcf_params`.
# NOTE: this rebinds eif_aurocs / eif_auprs / eif_times, so the results of the
# earlier EIForest run with `iforest_params` are no longer available afterwards.
eif_aurocs, eif_auprs, eif_times = run_forest(EIForest, fcf_params)

# Report the averages over the seeds, one item per line.
print(
    "EIForest-U:",
    "--------",
    f"AUROC {np.mean(eif_aurocs):.4f}",
    f"AUPR {np.mean(eif_auprs):.4f}",
    f"Time {np.mean(eif_times):.4f}s",
    sep="\n",
)

EIForest-U:
--------
AUROC 0.6722
AUPR 0.5103
Time 103.5736s


In [12]:
# Evaluate SCIForest with `iforest_params` over all seeds.
scif_aurocs, scif_auprs, scif_times = run_forest(SCIForest, iforest_params)

# Report the averages over the seeds, one item per line.
print(
    "SCIForest:",
    "--------",
    f"AUROC {np.mean(scif_aurocs):.4f}",
    f"AUPR {np.mean(scif_auprs):.4f}",
    f"Time {np.mean(scif_times):.4f}s",
    sep="\n",
)

SCIForest:
--------
AUROC 0.3593
AUPR 0.3405
Time 12.7269s


In [13]:
# Evaluate SCIForest again, this time with `fcf_params`.
# NOTE: this rebinds scif_aurocs / scif_auprs / scif_times, so the results of
# the earlier SCIForest run with `iforest_params` are no longer available.
scif_aurocs, scif_auprs, scif_times = run_forest(SCIForest, fcf_params)

# Report the averages over the seeds, one item per line.
print(
    "SCIForest-U:",
    "--------",
    f"AUROC {np.mean(scif_aurocs):.4f}",
    f"AUPR {np.mean(scif_auprs):.4f}",
    f"Time {np.mean(scif_times):.4f}s",
    sep="\n",
)

SCIForest-U:
--------
AUROC 0.4246
AUPR 0.3426
Time 254.8088s


In [14]:
# Evaluate FCForest with `fcf_params` over all seeds.
# NOTE(review): the saved output of this cell also shows a per-seed array that
# nothing below prints -- it looks stale; re-run the cell to refresh it.
fcf_aurocs, fcf_auprs, fcf_times = run_forest(FCForest, fcf_params)

# Report the averages over the seeds, one item per line.
print(
    "FCForest:",
    "--------",
    f"AUROC {np.mean(fcf_aurocs):.4f}",
    f"AUPR {np.mean(fcf_auprs):.4f}",
    f"Time {np.mean(fcf_times):.4f}s",
    sep="\n",
)

FCForest:
--------
AUROC 0.6844
AUPR 0.5529
Time 78.7472s
[0.65613295 0.68533026 0.73353573 0.64634839 0.67485237 0.67078601
 0.69700438 0.70243889 0.70262129 0.67490866]


In [10]:
# One-Class SVM baseline; no seed is passed, so it is fitted a single time.
ocsvm_ = OCSVM(kernel="rbf", contamination=contamination)

# Fit exactly once, inside the timed region (an extra untimed fit used to run
# first and doubled the work). perf_counter is the monotonic clock intended
# for measuring intervals.
start = time.perf_counter()
ocsvm_.fit(X)
end = time.perf_counter()

# Score the training data itself against the ground-truth labels.
auroc = roc_auc_score(y, ocsvm_.decision_scores_)
aupr = average_precision_score(y, ocsvm_.decision_scores_)
time_to_run = end - start

print("OCSVM:")
print("--------")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")
print(f"Time: {time_to_run:.4f}")


OCSVM:
--------
AUROC: 0.5369
AUPR: 0.3990
Time: 1.9410


In [18]:
# Deep Isolation Forest baseline: one fit per seed, metrics averaged below.
dif_aurocs, dif_auprs, dif_times = [], [], []

for seed in seeds:
    dif_ = DIF(
        contamination=contamination,
        random_state=seed
    )

    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump with system clock changes.
    start = time.perf_counter()
    dif_.fit(X)
    end = time.perf_counter()

    # Scoring is kept outside the timed region: only fit() is measured.
    auroc = roc_auc_score(y, dif_.decision_scores_)
    aupr = average_precision_score(y, dif_.decision_scores_)
    time_to_run = end - start

    dif_aurocs.append(auroc)
    dif_auprs.append(aupr)
    dif_times.append(time_to_run)

# Convert the per-seed lists to arrays, matching what run_forest returns.
dif_aurocs = np.array(dif_aurocs)
dif_auprs = np.array(dif_auprs)
dif_times = np.array(dif_times)

print("DIForest:")
print("--------")
print(f"AUROC {np.mean(dif_aurocs):.4f}")
print(f"AUPR {np.mean(dif_auprs):.4f}")
print(f"Time {np.mean(dif_times):.4f}s")


DIForest:
--------
AUROC 0.5143
AUPR 0.3671
Time 18.1357s


In [24]:
# Local Outlier Factor baseline; no seed is passed, so it is fitted once.
lof_ = LOF(contamination=contamination)

# This fit is short, so use perf_counter: it is monotonic and has the highest
# available resolution, whereas time.time() can be coarse or jump.
start = time.perf_counter()
lof_.fit(X)
end = time.perf_counter()

# Score the training data itself against the ground-truth labels.
auroc = roc_auc_score(y, lof_.decision_scores_)
aupr = average_precision_score(y, lof_.decision_scores_)
time_to_run = end - start

print("LOF:")
print("--------")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")
print(f"Time: {time_to_run:.4f}")

LOF:
--------
AUROC: 0.4577
AUPR: 0.3546
Time: 0.2623
