In [1]:
import sys
import os

# Make the parent of the current working directory importable so that
# project-local packages can be imported by later cells.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [70]:
import time
import numpy as np

from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import roc_auc_score, average_precision_score

from forests.i_forest import IForest
from forests.ei_forest import EIForest
from forests.sci_forest import SCIForest
from forests.fc_forest import FCForest
from forests.rrc_forest import RRCForest

from pyod.models.ocsvm import OCSVM
from pyod.models.dif import DIF

In [54]:
# fetch dataset id=146 through ucimlrepo's fetch_ucirepo
# (presumably Statlog Landsat Satellite, going by the variable name);
# later cells read `.data.features` and `.data.targets` from the result
statlog_landsat_satellite = fetch_ucirepo(id=146) 

In [55]:
# feature matrix, standardized column-wise: subtract each column's mean
# and divide by each column's standard deviation
X = statlog_landsat_satellite.data.features.to_numpy()
X = (X - X.mean(axis=0)) / X.std(axis=0)

# class labels, flattened to a 1-D vector
y = statlog_landsat_satellite.data.targets.to_numpy().reshape(-1)

In [56]:
# fraction of samples per class id (index k holds the share of class k)
y_bins_norm = np.bincount(y) / y.size
print(f"Class percentage: {y_bins_norm}")

# order the classes from rarest to most frequent
y_bins_norm_sorted_idx = np.argsort(y_bins_norm)
y_bins_norm_sorted = y_bins_norm[y_bins_norm_sorted_idx]

# running total of the sorted class fractions
y_bins_cum_sum = y_bins_norm_sorted.cumsum()

Class percentage: [0.         0.23822844 0.10924631 0.21103341 0.0972805  0.10986791
 0.         0.23434343]


In [57]:
# Fraction of the dataset that the outlier classes should (at most) cover.
target = 0.33

# Index of the first class (in rarest-first order) whose cumulative share
# reaches `target`. The cumulative sum is non-decreasing, so a binary search
# is valid. Unlike a manual `while` scan, searchsorted cannot run off the end
# of the array: if `target` is never reached it returns len(y_bins_cum_sum),
# i.e. every class becomes an outlier class instead of raising IndexError.
split_idx = int(np.searchsorted(y_bins_cum_sum, target, side="left"))

# classes before the split are treated as outliers, the rest as inliers
y_outliers = y_bins_norm_sorted_idx[:split_idx]
y_inliers = y_bins_norm_sorted_idx[split_idx:]

print(f"Classes used for outliers: {y_outliers}")
print(f"Classes used for inliers: {y_inliers}")

Classes used for outliers: [0 6 4 2 5]
Classes used for inliers: [3 7 1]


In [58]:
# Binarize the labels: 0 = sample belongs to an inlier class, 1 = outlier.
# The labels are rebuilt from the raw dataset targets instead of rewriting
# `y` element by element in place. The in-place version was not idempotent:
# after one run `y` only holds 0/1, so a second run would compare those
# values against the original class ids in `y_inliers` and produce wrong
# labels.
y_classes = statlog_landsat_satellite.data.targets.to_numpy().reshape(-1)
y = np.where(np.isin(y_classes, y_inliers), 0, 1)

# contamination rate = fraction of samples labeled as outliers (kept as a
# fraction in [0, 1] because it is passed to the models as-is)
contamination = np.round(np.sum(y == 1) / y.shape[0], 4)

# format the fraction as a real percentage (e.g. 0.3164 -> "31.64%")
print(f"Contamination rate: {contamination:.2%}")

Contamination rate: 0.3164%


In [59]:
# constructor settings used for the default IForest / EIForest / SCIForest runs
iforest_params = dict(
    n_trees=100,
    sub_sample_size=256,
    contamination=contamination,
)

# constructor settings used for FCForest and for the "-U" runs of the
# other forests (more trees plus an explicit height limit)
fcf_params = dict(
    n_trees=200,
    sub_sample_size=256,
    height_limit=256,
    contamination=contamination,
)

# every model is fitted once per seed and the metrics are averaged
seeds = [42, 123, 456, 789, 1500, 1597, 3141, 4000, 5000, 10007]

In [60]:
def run_forest(
    forest_method: type[IForest | EIForest | SCIForest | FCForest | RRCForest],
    args: dict
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Fit one forest per seed and collect AUROC, AUPR and fit time.

    Relies on the notebook-level variables ``seeds`` (seeds to iterate over),
    ``X`` (data the forest is fitted on) and ``y`` (binary labels the scores
    are evaluated against).

    Parameters
    ----------
    forest_method
        The forest *class* (not an instance). It is instantiated once per
        seed as ``forest_method(**args, seed=seed)``.
    args
        Keyword arguments for the forest constructor. A ``"seed"`` entry, if
        present, is overridden by the seed of the current run.

    Returns
    -------
    tuple of three 1-D arrays ``(aurocs, auprs, times)`` with one entry per
    seed; ``times`` holds the seconds spent inside ``fit`` only.
    """
    aurocs = []
    auprs = []
    times = []

    for seed in seeds:
        args_with_seed = {**args, "seed": seed}
        forest = forest_method(**args_with_seed)

        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments
        start = time.perf_counter()
        forest.fit(X)
        end = time.perf_counter()

        # the scores are read from the fitted forest, i.e. evaluated on the
        # same data it was fitted on
        auroc = roc_auc_score(y, forest.decision_scores)
        aupr = average_precision_score(y, forest.decision_scores)
        time_to_run = end - start

        aurocs.append(auroc)
        auprs.append(aupr)
        times.append(time_to_run)

    return np.array(aurocs), np.array(auprs), np.array(times)

In [61]:
# IForest with the default parameter set, averaged over all seeds
if_aurocs, if_auprs, if_times = run_forest(IForest, iforest_params)

print(
    "IForest:",
    "--------",
    f"AUROC {np.mean(if_aurocs):.4f}",
    f"AUPR {np.mean(if_auprs):.4f}",
    f"Time {np.mean(if_times):.4f}s",
    sep="\n",
)

IForest:
--------
AUROC 0.6582
AUPR 0.5418
Time 4.1136s


In [62]:
# IForest-U: the same IForest class, run with the FCForest parameter set.
# NOTE: this rebinds if_aurocs / if_auprs / if_times, so the results of the
# default IForest run are no longer available after this cell.
if_aurocs, if_auprs, if_times = run_forest(IForest, fcf_params)

print(
    "IForest-U:",
    "--------",
    f"AUROC {np.mean(if_aurocs):.4f}",
    f"AUPR {np.mean(if_auprs):.4f}",
    f"Time {np.mean(if_times):.4f}s",
    sep="\n",
)

IForest-U:
--------
AUROC 0.7130
AUPR 0.6103
Time 8.9876s


In [63]:
# EIForest with the default parameter set, averaged over all seeds
eif_aurocs, eif_auprs, eif_times = run_forest(EIForest, iforest_params)

print(
    "EIForest:",
    "--------",
    f"AUROC {np.mean(eif_aurocs):.4f}",
    f"AUPR {np.mean(eif_auprs):.4f}",
    f"Time {np.mean(eif_times):.4f}s",
    sep="\n",
)

EIForest:
--------
AUROC 0.6878
AUPR 0.5841
Time 12.4717s


In [64]:
# EIForest-U: the same EIForest class, run with the FCForest parameter set.
# NOTE: this rebinds eif_aurocs / eif_auprs / eif_times, so the results of
# the default EIForest run are no longer available after this cell.
eif_aurocs, eif_auprs, eif_times = run_forest(EIForest, fcf_params)

print(
    "EIForest-U:",
    "--------",
    f"AUROC {np.mean(eif_aurocs):.4f}",
    f"AUPR {np.mean(eif_auprs):.4f}",
    f"Time {np.mean(eif_times):.4f}s",
    sep="\n",
)

EIForest-U:
--------
AUROC 0.7234
AUPR 0.6059
Time 40.2775s


In [65]:
# SCIForest with the default parameter set, averaged over all seeds
scif_aurocs, scif_auprs, scif_times = run_forest(SCIForest, iforest_params)

print(
    "SCIForest:",
    "--------",
    f"AUROC {np.mean(scif_aurocs):.4f}",
    f"AUPR {np.mean(scif_auprs):.4f}",
    f"Time {np.mean(scif_times):.4f}s",
    sep="\n",
)

SCIForest:
--------
AUROC 0.6033
AUPR 0.5960
Time 12.5499s


In [66]:
# SCIForest-U: the same SCIForest class, run with the FCForest parameter set.
# NOTE: this rebinds scif_aurocs / scif_auprs / scif_times, so the results of
# the default SCIForest run are no longer available after this cell.
scif_aurocs, scif_auprs, scif_times = run_forest(SCIForest, fcf_params)

print(
    "SCIForest-U:",
    "--------",
    f"AUROC {np.mean(scif_aurocs):.4f}",
    f"AUPR {np.mean(scif_auprs):.4f}",
    f"Time {np.mean(scif_times):.4f}s",
    sep="\n",
)

SCIForest-U:
--------
AUROC 0.7331
AUPR 0.6715
Time 77.7249s


In [67]:
# FCForest with its own parameter set, averaged over all seeds
fcf_aurocs, fcf_auprs, fcf_times = run_forest(FCForest, fcf_params)

print(
    "FCForest:",
    "--------",
    f"AUROC {np.mean(fcf_aurocs):.4f}",
    f"AUPR {np.mean(fcf_auprs):.4f}",
    f"Time {np.mean(fcf_times):.4f}s",
    sep="\n",
)

FCForest:
--------
AUROC 0.7459
AUPR 0.6104
Time 55.4516s


In [None]:
# pyod OCSVM baseline with an RBF kernel. No seed is passed, so it is run
# once rather than once per seed.
ocsvm_ = OCSVM(kernel="rbf", contamination=contamination)

# Fit exactly once, inside the timed region. An extra untimed fit before the
# measurement would only double the cell's runtime and make the timing
# protocol differ from the other models, which are fitted once per run.
start = time.perf_counter()
ocsvm_.fit(X)
end = time.perf_counter()

# evaluate the scores of the fitted model against the binary labels
auroc = roc_auc_score(y, ocsvm_.decision_scores_)
aupr = average_precision_score(y, ocsvm_.decision_scores_)
time_to_run = end - start

print("OCSVM:")
print("--------")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")
print(f"Time: {time_to_run:.4f}")

OCSVM:
--------
AUROC: 0.6636
AUPR: 0.6550
Time: 4.6073


In [72]:
# pyod DIF model: one fit per seed, collecting AUROC, AUPR and fit time
dif_aurocs, dif_auprs, dif_times = [], [], []

for seed in seeds:
    dif_ = DIF(contamination=contamination, random_state=seed)

    # only the fit call is timed
    start = time.time()
    dif_.fit(X)
    end = time.time()
    time_to_run = end - start

    # evaluate the scores of the fitted model against the binary labels
    auroc = roc_auc_score(y, dif_.decision_scores_)
    aupr = average_precision_score(y, dif_.decision_scores_)

    dif_aurocs.append(auroc)
    dif_auprs.append(aupr)
    dif_times.append(time_to_run)

# turn the per-seed lists into arrays
dif_aurocs, dif_auprs, dif_times = map(
    np.array, (dif_aurocs, dif_auprs, dif_times)
)

print(
    "DIForest:",
    "--------",
    f"AUROC {np.mean(dif_aurocs):.4f}",
    f"AUPR {np.mean(dif_auprs):.4f}",
    f"Time {np.mean(dif_times):.4f}s",
    sep="\n",
)


DIForest:
--------
AUROC 0.6470
AUPR 0.6226
Time 23.1105s
