In [1]:
import sys
import os

# import parent directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [2]:
import time
import numpy as np
import pandas as pd
import zipfile

from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import roc_auc_score, average_precision_score

from forests.i_forest import IForest
from forests.ei_forest import EIForest
from forests.sci_forest import SCIForest
from forests.fc_forest import FCForest
from forests.rrc_forest import RRCForest

from pyod.models.ocsvm import OCSVM
from pyod.models.dif import DIF

In [3]:
with zipfile.ZipFile("./zip/arrhythmia.zip", "r") as zip_f:
    zip_f.extractall("./zip/arrhythmia")

In [13]:
df = pd.read_csv("./zip/arrhythmia/arrhythmia.data", header=None)
df = df.replace("?", 0)
df = df.to_numpy()

In [14]:
# get the data and standardize it
X = df[:, :-1].astype(np.float32)
X_mask = np.array([len(np.unique(X[:, i])) > 1 for i in range(X.shape[1])])
X = X[:, X_mask]
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# get the labels
y = df[:, -1].astype(int)

  X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)


In [33]:
# get the percentage of each class in the dataset
y_bins_norm = np.bincount(y) / y.shape[0]
print(f"Class percentage: {y_bins_norm}")

# sort the classes by their percentage
y_bins_norm_sorted = np.sort(y_bins_norm)
y_bins_norm_sorted_idx = np.argsort(y_bins_norm)

# compute the cumulative sum of the distribution percentage
y_bins_cum_sum = np.cumsum(y_bins_norm_sorted)

Class percentage: [0.         0.5420354  0.09734513 0.03318584 0.03318584 0.02876106
 0.05530973 0.00663717 0.00442478 0.0199115  0.11061947 0.
 0.         0.         0.00884956 0.01106195 0.04867257]


In [34]:
target = 0.15
i = 0

while y_bins_cum_sum[i] < target:
    i += 1

y_outliers = y_bins_norm_sorted_idx[:i]
y_inliers = y_bins_norm_sorted_idx[i:]

print(f"Classes used for outliers: {y_outliers}")
print(f"Classes used for inliers: {y_inliers}")

Classes used for outliers: [ 0 13 12 11  8  7 14 15  9  5  3  4]
Classes used for inliers: [16  6  2 10  1]


In [35]:
for i, cls in enumerate(y):
    if cls in y_inliers:
        y[i] = 0
    else:
        y[i] = 1 

# compute the contamination rate
contamination = np.round(np.sum(y == 1) / y.shape[0], 4)
print(f"Contamination rate: {contamination}%")

Contamination rate: 0.146%
