In [1]:
from feature_select import *
from pyod.utils.data import evaluate_print
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import BertTokenizer, BertModel
import experiment_config

In [2]:
# Resolve the configured dataset files and their display names.
dataset_path, dataset_name = experiment_config.get_path_and_name()
num_dataset = len(dataset_path)

# Read each JSON-lines file, then pull the text and label columns out as plain lists.
df = [pd.read_json(dataset_path[idx], lines=True) for idx in range(num_dataset)]
texts = [frame['text'].tolist() for frame in df]
labels = [frame['label'].tolist() for frame in df]

# One summary line per dataset: "<name> <number of texts>".
for i in range(num_dataset):
    print(dataset_name[i], len(texts[i]))
    

clickbait_nonclickbait 13201
Corona_NLP 44955
movie_review 30000


In [3]:

# Load the saved feature matrix of every dataset from ./feature/<name>_feature.npy.
loaded = []
for idx in range(num_dataset):
    loaded.append(np.load('./feature/' + dataset_name[idx] + '_feature.npy'))
features = loaded

# Report "<name> <matrix shape>" for each dataset.
for i in range(num_dataset):
    print(dataset_name[i], features[i].shape)
    
    

clickbait_nonclickbait (13201, 768)
Corona_NLP (44955, 768)
movie_review (30000, 768)


In [4]:
# Hold out 33% of every dataset with a fixed seed; each list is indexed by dataset.
X_train, X_test, y_train, y_test = [], [], [], []
for i in range(num_dataset):
    parts = train_test_split(features[i], labels[i], test_size=0.33, random_state=42)
    # parts is ordered (train features, test features, train labels, test labels).
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)

# Show the resulting sizes next to each dataset name.
for i in range(num_dataset):
    sizes = (X_train[i].shape, X_test[i].shape, len(y_train[i]), len(y_test[i]))
    print(dataset_name[i], end='  ')
    print(*sizes)

clickbait_nonclickbait  (8844, 768) (4357, 768) 8844 4357
Corona_NLP  (30119, 768) (14836, 768) 30119 14836
movie_review  (20100, 768) (9900, 768) 20100 9900


In [5]:
# KNN
from pyod.models.knn import KNN
def KNN_benchmark(X_train, X_test, y_train, y_test, KNN_Hyerparameters=None):
    """Benchmark pyod's KNN detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
        KNN_Hyerparameters: Optional dict of keyword arguments forwarded to
            ``KNN``. ``None`` (the default) keeps the library defaults.
    """
    clf_name = 'KNN'
    # Previously the hyperparameter argument was accepted but never used.
    clf = KNN(**(KNN_Hyerparameters or {}))
    clf.fit(X_train)

    # Raw outlier scores: stored on the fitted detector for the training set,
    # computed on demand for the test set.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
KNN_hyerparameters = None
for i in range(num_dataset):
    print(dataset_name[i]+':')
    KNN_benchmark(X_train[i], X_test[i], y_train[i], y_test[i], KNN_hyerparameters)
    print()

clickbait_nonclickbait:

On Training Data:
KNN ROC:0.1872, precision @ rank n:0.0918

On Test Data:
KNN ROC:0.1864, precision @ rank n:0.0912

Corona_NLP:

On Training Data:
KNN ROC:0.5542, precision @ rank n:0.1661

On Test Data:
KNN ROC:0.5597, precision @ rank n:0.1711

movie_review:

On Training Data:
KNN ROC:0.4197, precision @ rank n:0.121

On Test Data:
KNN ROC:0.4273, precision @ rank n:0.1117



In [6]:
from pyod.models.abod import ABOD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
def ABOD_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's ABOD detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'ABOD'
    clf = ABOD()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the binary predictions (and the extra
    # scoring pass that predict() would trigger on the test set) are skipped.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for i in range(num_dataset):
    print(dataset_name[i]+':')
    ABOD_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

clickbait_nonclickbait:

On Training Data:
ABOD ROC:0.2279, precision @ rank n:0.0894

On Test Data:
ABOD ROC:0.2309, precision @ rank n:0.0855

Corona_NLP:

On Training Data:
ABOD ROC:0.5484, precision @ rank n:0.1587

On Test Data:
ABOD ROC:0.5547, precision @ rank n:0.1691

movie_review:


In [None]:
from pyod.models.alad import ALAD
from pyod.utils.data import evaluate_print
def ALAD_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's ALAD detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    contamination = 0.1  # outlier fraction passed to the detector
    clf_name = 'ALAD'
    clf = ALAD(epochs=100, latent_dim=2,
               learning_rate_disc=0.0001,
               learning_rate_gen=0.0001,
               dropout_rate=0.2,
               add_recon_loss=False,
               lambda_recon_loss=0.05,
               add_disc_zz_loss=True,
               dec_layers=[75, 100],
               enc_layers=[100, 75],
               disc_xx_layers=[100, 75],
               disc_zz_layers=[25, 25],
               disc_xz_layers=[100, 75],
               spectral_normalization=False,
               activation_hidden_disc='tanh', activation_hidden_gen='tanh',
               preprocessing=True, batch_size=200, contamination=contamination)
    clf.fit(X_train)

    # Only raw scores are evaluated; binary predictions were computed and
    # discarded before, costing an extra scoring pass over the test set.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for i in range(num_dataset):
    print(dataset_name[i]+':')
    ALAD_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import evaluate_print
def auto_encoder_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's AutoEncoder detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    contamination = 0.1  # outlier fraction passed to the detector
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for i in range(num_dataset):
    print(dataset_name[i]+':')
    auto_encoder_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.cblof import CBLOF
from pyod.utils.data import evaluate_print
def cblof_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's CBLOF detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'CBLOF'
    clf = CBLOF(random_state=42)  # fixed seed, as in the original cell
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for i in range(num_dataset):
    print(dataset_name[i]+':')
    cblof_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.cd import CD
from pyod.utils.data import evaluate_print
def cd_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's Cook's-distance (CD) detector on one dataset split.

    The previous body constructed and reported ``CBLOF`` even though this cell
    imports ``CD``, so CD was never actually benchmarked.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Labels for ``X_train``; passed to ``fit`` and used for evaluation.
        y_test: Labels for ``X_test``; appended for scoring and used for evaluation.
    """
    import numpy as np  # local import keeps this cell self-contained

    clf_name = 'CD'
    clf = CD()
    clf.fit(X_train, y_train)

    y_train_scores = clf.decision_scores_

    # NOTE(review): pyod's CD example scores new data by appending the target
    # as the last column of X. This follows that convention for the
    # fit(X, y) API -- confirm against the installed pyod version.
    X_test_with_target = np.append(
        X_test, np.asarray(y_test).reshape(-1, 1), axis=1)
    y_test_scores = clf.decision_function(X_test_with_target)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for i in range(num_dataset):
    print(dataset_name[i]+':')
    cd_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.cof import COF
from pyod.utils.data import evaluate_print
def cof_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's COF detector (30 neighbours) on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'COF'
    clf = COF(n_neighbors=30)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    cof_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.utils.data import evaluate_print
from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization, median
from pyod.utils.utility import standardizer
def comb_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark score-combination schemes over a bank of kNN detectors.

    Standardises the features, fits one ``KNN(method='largest')`` detector per
    neighbourhood size, standardises the resulting score matrices and prints
    ``evaluate_print`` metrics for average, maximization, median, AOM and MOA
    combinations on both splits.

    Args:
        X_train: Samples used to fit the detectors.
        X_test: Held-out samples scored by the fitted detectors.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Neighbourhood sizes 10, 20, ..., 200; the detector count follows the list
    # so the two can no longer drift apart.
    k_list = list(range(10, 201, 10))
    n_clf = len(k_list)

    # One column of scores per base detector.
    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for col, k in enumerate(k_list):
        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, col] = clf.decision_scores_
        test_scores[:, col] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)

    # (label, combiner) pairs, evaluated in the same order as before:
    # train first, then test, for each scheme.
    combiners = [
        ('Average', average),
        ('Maximization', maximization),
        ('Median', median),
        ('AOM', lambda scores: aom(scores, n_buckets=5)),
        ('MOA', lambda scores: moa(scores, n_buckets=5)),
    ]
    for label, combine in combiners:
        evaluate_print('Combination by {} train'.format(label),
                       y_train, combine(train_scores_norm))
        evaluate_print('Combination by {} test'.format(label),
                       y_test, combine(test_scores_norm))

for i in range(num_dataset):
    print(dataset_name[i]+':')
    comb_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.copod import COPOD
from pyod.utils.data import evaluate_print
def copod_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's COPOD detector on one dataset split.

    This function was previously also named ``cof_benchmark``, which silently
    replaced the COF benchmark defined in an earlier cell.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'COPOD'
    clf = COPOD()

    # you could try parallel version as well.
    # clf = COPOD(n_jobs=2)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    copod_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.deep_svdd import DeepSVDD
from pyod.utils.data import evaluate_print
def deepsvdd_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's DeepSVDD detector on one dataset split.

    This function was previously also named ``cof_benchmark``, which silently
    replaced the benchmark functions of that name defined in earlier cells.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    use_ae = False  # hyperparameter for use ae architecture instead of simple NN
    random_state = 10  # if C is set to None use random_state
    contamination = 0.1  # outlier fraction passed to the detector
    clf_name = 'DeepSVDD'
    clf = DeepSVDD(use_ae=use_ae, epochs=5, contamination=contamination,
                   random_state=random_state)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    deepsvdd_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.dif import DIF
from pyod.utils.data import evaluate_print
def dif_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's DIF detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'DIF'
    clf = DIF()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    dif_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.ecod import ECOD
from pyod.utils.data import evaluate_print
def ecod_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's ECOD detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'ECOD'
    clf = ECOD()

    # you could try parallel version as well.
    # clf = ECOD(n_jobs=2)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    ecod_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.feature_bagging import FeatureBagging
from pyod.utils.data import evaluate_print
def FeatureBagging_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's FeatureBagging detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'FeatureBagging'
    clf = FeatureBagging(check_estimator=False)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    FeatureBagging_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.gmm import GMM
from pyod.utils.data import evaluate_print
def gmm_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's GMM detector (4 components) on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = "GMM"
    clf = GMM(n_components=4)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    gmm_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.hbos import HBOS
from pyod.utils.data import evaluate_print
def hbos_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's HBOS detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'HBOS'
    clf = HBOS()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    hbos_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.iforest import IForest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt


def IForest_benchmark_pca(X_train, X_test, y_train, y_test):
    """Benchmark pyod's IForest on a 2-component PCA projection of one split.

    The features are standardised and projected with a scaler and PCA that are
    both fit on ``X_train`` only; the detector is then fit and scored on the
    projected data and ``evaluate_print`` metrics are printed for both splits.

    Args:
        X_train: Samples used to fit the scaler, the PCA and the detector.
        X_test: Held-out samples, transformed with the training-fit scaler/PCA.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # Normalize the data
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)

    # Reduce to two principal components; the detector is trained on this
    # projection (nothing is plotted in this cell).
    pca = PCA(n_components=2)
    X_train_pca = pca.fit_transform(X_train_norm)
    X_test_pca = pca.transform(X_test_norm)

    # Train IForest detector
    clf_name = 'IForest'
    clf = IForest()  # Add hyperparameters if necessary
    clf.fit(X_train_pca)

    # Only raw scores are evaluated, so the two predict() calls whose results
    # were discarded have been dropped.
    y_train_scores = clf.decision_function(X_train_pca)
    y_test_scores = clf.decision_function(X_test_pca)

    # Evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Assuming X_train, X_test, y_train, y_test are defined and properly preprocessed
IForest_Hyerparameters = None
for i in range(num_dataset):
    print(dataset_name[i]+':')
    IForest_benchmark_pca(X_train[i], X_test[i], y_train[i], y_test[i])
    print()
    


In [None]:
from pyod.models.inne import INNE

from pyod.utils.data import evaluate_print
def inne_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's INNE detector (``max_samples=4``) on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    contamination = 0.1  # outlier fraction passed to the detector
    clf_name = 'INNE'
    clf = INNE(contamination=contamination, max_samples=4)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    inne_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.kde import KDE
from pyod.utils.data import evaluate_print
def kde_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's KDE detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = 'kde'
    clf = KDE()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    kde_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()


In [None]:
from pyod.models.knn import KNN
from pyod.utils.data import evaluate_print
def knn_mahalanobis_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's KNN detector with a Mahalanobis metric on one split.

    The covariance of ``X_train`` (columns as variables) is handed to the
    metric, the detector is fit without labels, and ``evaluate_print`` metrics
    are printed for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to estimate the covariance and fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train kNN detector with mahalanobis distance
    clf_name = 'KNN (mahalanobis distance)'
    # calculate covariance for mahalanobis distance
    X_train_cov = np.cov(X_train, rowvar=False)
    clf = KNN(algorithm='auto', metric='mahalanobis',
              metric_params={'V': X_train_cov})
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions (and the
    # extra neighbour search predict() would run on the test set) are skipped.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    knn_mahalanobis_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.kpca import KPCA
from pyod.utils.data import evaluate_print
def kpca_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's KPCA detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    clf_name = "KPCA"
    clf = KPCA()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    kpca_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.lmdd import LMDD
from pyod.utils.data import evaluate_print
def imdd_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LMDD detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train LMDD detector (the stray debug print before fit() was removed)
    clf_name = 'LMDD'
    clf = LMDD(random_state=42)
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


for i in range(num_dataset):
    print(dataset_name[i]+':')
    imdd_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.loci import LOCI
from pyod.utils.data import evaluate_print
def loci_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LOCI detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train LOCI detector
    clf_name = 'LOCI'
    clf = LOCI()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    loci_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.loda import LODA
from pyod.utils.data import evaluate_print
def loda_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LODA detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train LODA detector
    clf_name = 'LODA'
    clf = LODA()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    loda_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.lof import LOF
from pyod.utils.data import evaluate_print
def lof_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LOF detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train LOF detector
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    lof_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.lscp import LSCP
from pyod.models.lof import LOF
from pyod.utils.data import evaluate_print
def lcsp_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LSCP ensemble of LOF detectors on one dataset split.

    Builds four LOF base detectors, fits LSCP on ``X_train`` without labels and
    prints the ``evaluate_print`` metrics for both splits.

    Args:
        X_train: Samples used to fit the ensemble.
        X_test: Held-out samples scored by the fitted ensemble.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    contamination = 0.1

    clf_name = 'LSCP'
    # Base detectors differ only in their neighbourhood size.
    detector_list = [LOF(n_neighbors=k) for k in (15, 20, 25, 35)]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # Labels and scores stored on the fitted ensemble for the training set.
    train_labels = clf.labels_
    train_scores = clf.decision_scores_

    # NOTE(review): the predict() call is kept ahead of decision_function()
    # because LSCP may draw from its random state while scoring -- confirm
    # before removing it, as the reported test numbers could change.
    test_labels = clf.predict(X_test)
    test_scores = clf.decision_function(X_test)

    # Print the metrics for the training split first, then the test split.
    reports = (
        ("\nOn Training Data:", y_train, train_scores),
        ("\nOn Test Data:", y_test, test_scores),
    )
    for header, truth, scores in reports:
        print(header)
        evaluate_print(clf_name, truth, scores)

for i in range(num_dataset):
    print(dataset_name[i] + ':')
    lcsp_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.lunar import LUNAR
from pyod.utils.data import evaluate_print
def lunar_benchmark(X_train, X_test, y_train, y_test):
    """Benchmark pyod's LUNAR detector on one dataset split.

    Fits on ``X_train`` without labels, then prints the ``evaluate_print``
    metrics for the training scores and for ``X_test``.

    Args:
        X_train: Samples used to fit the detector.
        X_test: Held-out samples scored by the fitted detector.
        y_train: Ground-truth labels for ``X_train`` (evaluation only).
        y_test: Ground-truth labels for ``X_test`` (evaluation only).
    """
    # train LUNAR detector
    clf_name = 'LUNAR'
    clf = LUNAR()
    clf.fit(X_train)

    # Only raw scores are evaluated, so the unused binary predictions are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

for i in range(num_dataset):
    print(dataset_name[i]+':')
    lunar_benchmark(X_train[i], X_test[i], y_train[i], y_test[i])
    print()

In [None]:
from pyod.models.mad import MAD
from pyod.utils.data import evaluate_print
def mad_benchmark(X_train, X_test, y_train, y_test):
    """Fit a MAD detector (threshold 3.5) and print train/test reports.

    PyOD's MAD detector only supports univariate data. When ``X_train`` has
    more than one feature column the benchmark is skipped with an explanatory
    message instead of letting ``fit`` abort the whole cell.

    Args:
        X_train: Feature matrix the detector is fitted on.
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results (or the skip message) are printed.
    """
    import numpy as np  # local import keeps this cell self-contained

    clf_name = 'MAD'

    # Guard against multivariate input, which MAD cannot handle.
    train_arr = np.asarray(X_train)
    n_features = 1 if train_arr.ndim < 2 else train_arr.shape[1]
    if n_features != 1:
        print('{} skipped: it only supports univariate data, got {} '
              'features.'.format(clf_name, n_features))
        return

    clf = MAD(threshold=3.5)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated; thresholded labels are not
    # needed, so ``labels_`` / ``predict`` are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the MAD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    mad_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.mcd import MCD
from pyod.utils.data import evaluate_print
def mcd_benchmark(X_train, X_test, y_train, y_test):
    """Fit an MCD detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train MCD detector; the seed matches the 42 used elsewhere in this
    # notebook so repeated runs report the same scores.
    clf_name = 'MCD'
    clf = MCD(random_state=42)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the MCD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    mcd_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.mo_gaal import MO_GAAL
from pyod.utils.data import evaluate_print
def mo_gaal_benchmark(X_train, X_test, y_train, y_test):
    """Fit a MO_GAAL detector and print train/test evaluation reports.

    The detector is built with ``k=3``, ``stop_epochs=2`` and a
    contamination of 0.1.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    contamination = 0.1

    # train MO_GAAL detector
    clf_name = 'MO_GAAL'
    clf = MO_GAAL(k=3, stop_epochs=2, contamination=contamination)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the MO_GAAL benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    mo_gaal_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.ocsvm import OCSVM
from pyod.utils.data import evaluate_print
def ocsvm_benchmark(X_train, X_test, y_train, y_test):
    """Fit a default one-class SVM detector and print train/test reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train one_class_svm detector
    clf_name = 'OneClassSVM'
    clf = OCSVM()
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the one-class SVM benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    ocsvm_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.pca import PCA
from pyod.utils.data import evaluate_print
def pca_benchmark(X_train, X_test, y_train, y_test):
    """Fit a 3-component PCA detector and print train/test reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train PCA detector; the seed (42, as used elsewhere in this notebook)
    # keeps results repeatable when a randomized SVD solver is selected.
    clf_name = 'PCA'
    clf = PCA(n_components=3, random_state=42)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the PCA benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    pca_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.qmcd import QMCD
from pyod.utils.data import evaluate_print
def qmcd_benchmark(X_train, X_test, y_train, y_test):
    """Fit a default QMCD detector and print train/test evaluation reports.

    The detector is fitted and evaluated on the feature matrices only. The
    test labels are deliberately NOT appended to ``X_test``: doing so would
    leak the ground truth into the scores, would give the test input one
    more column than the data the detector was fitted on, and fails outright
    when ``y_test`` is a plain list (no ``reshape``).

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train QMCD detector (unsupervised, like the other detectors here)
    clf_name = 'QMCD'
    clf = QMCD()
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the QMCD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    qmcd_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.rgraph import RGraph
from pyod.utils.data import evaluate_print
def rgraph_benchmark(X_train, X_test, y_train, y_test):
    """Fit an R-graph detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train R-graph detector
    clf_name = 'R-graph'
    clf = RGraph(n_nonzero=100, transition_steps=20, gamma=50, gamma_nz=False,
                 tau=1, preprocessing=True, active_support=False,
                 blocksize_test_data=20,
                 algorithm='lasso_lars', maxiter=100, verbose=1)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the R-graph benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    rgraph_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.rod import ROD
from pyod.utils.data import evaluate_print
def rod_benchmark(X_train, X_test, y_train, y_test):
    """Fit a default ROD detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train ROD detector
    clf_name = 'ROD'
    clf = ROD()
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the ROD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    rod_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.sampling import Sampling
from pyod.utils.data import evaluate_print
def sampling_benchmark(X_train, X_test, y_train, y_test):
    """Fit a Sampling detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train Sampling detector; it scores against a random subset of the
    # training data, so the seed (42, as used elsewhere in this notebook)
    # keeps the reported numbers repeatable.
    clf_name = "Sampling"
    clf = Sampling(random_state=42)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)


# Run the Sampling benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    sampling_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.so_gaal import SO_GAAL
from pyod.utils.data import evaluate_print
def so_gaal_benchmark(X_train, X_test, y_train, y_test):
    """Fit a SO_GAAL detector and print train/test evaluation reports.

    The detector is built with ``stop_epochs=2`` and a contamination of 0.1.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    contamination = 0.1

    # train SO_GAAL detector
    clf_name = 'SO_GAAL'
    clf = SO_GAAL(stop_epochs=2, contamination=contamination)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the SO_GAAL benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    so_gaal_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.so_gaal import SO_GAAL
from pyod.utils.data import evaluate_print
# NOTE(review): this cell re-defines so_gaal_benchmark, which the previous
# cell already defines with the same detector settings; one of the two
# cells can probably be deleted.
def so_gaal_benchmark(X_train, X_test, y_train, y_test):
    """Fit a SO_GAAL detector and print train/test evaluation reports.

    The detector is built with ``stop_epochs=2`` and a contamination of 0.1.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    contamination = 0.1

    # train SO_GAAL detector
    clf_name = 'SO_GAAL'
    clf = SO_GAAL(stop_epochs=2, contamination=contamination)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the SO_GAAL benchmark once per loaded dataset.
# NOTE(review): the previous cell already performs this same SO_GAAL run.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    so_gaal_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.sod import SOD
from pyod.utils.data import evaluate_print
def sod_benchmark(X_train, X_test, y_train, y_test):
    """Fit a default SOD detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train SOD detector
    clf_name = 'SOD'
    clf = SOD()
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the SOD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    sod_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.sos import SOS
from pyod.utils.data import evaluate_print
def sos_benchmark(X_train, X_test, y_train, y_test):
    """Fit a default SOS detector and print train/test evaluation reports.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train SOS detector
    clf_name = 'SOS'
    clf = SOS()
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the SOS benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    sos_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from pyod.utils.utility import standardizer
from pyod.utils.data import evaluate_print
def suod_benchmark(X_train, X_test, y_train, y_test):
    """Fit a SUOD ensemble and print train/test evaluation reports.

    The ensemble combines four LOF detectors (15/20/25/35 neighbours), one
    COPOD detector and two isolation forests (100/200 estimators), averaging
    their scores and using two parallel jobs.

    Args:
        X_train: Feature matrix the ensemble is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted ensemble.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train SUOD
    clf_name = 'SUOD'

    # group of base outlier detectors handed to SUOD for acceleration
    detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=20),
                     LOF(n_neighbors=25), LOF(n_neighbors=35),
                     COPOD(), IForest(n_estimators=100),
                     IForest(n_estimators=200)]

    # number of parallel processes and the score combination method;
    # omitting base_estimators would use SUOD's default detectors instead
    clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average',
               verbose=False)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the SUOD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    suod_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from pyod.models.vae import VAE
from pyod.utils.data import evaluate_print
def vae_benchmark(X_train, X_test, y_train, y_test):
    """Fit a VAE detector and print train/test evaluation reports.

    The detector is built with ``epochs=30``, ``gamma=0.8``,
    ``capacity=0.2`` and a contamination of 0.1.

    Args:
        X_train: Feature matrix the detector is fitted on (labels are not
            used for fitting).
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; evaluation only.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    contamination = 0.1

    # train VAE detector (Beta-VAE)
    clf_name = 'VAE'
    clf = VAE(epochs=30, contamination=contamination, gamma=0.8, capacity=0.2)
    clf.fit(X_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the VAE benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    vae_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_X_y
from scipy.io import loadmat

from pyod.models.xgbod import XGBOD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
def xgbod_benchmark(X_train, X_test, y_train, y_test):
    """Fit an XGBOD detector and print train/test evaluation reports.

    Unlike the other benchmarks in this notebook, the detector is fitted
    with the training labels (``clf.fit(X_train, y_train)``), so the
    training report is computed on data whose labels the model has seen.

    Args:
        X_train: Feature matrix the detector is fitted on.
        X_test: Feature matrix scored with the fitted detector.
        y_train: Ground-truth labels for ``X_train``; used for fitting and
            for evaluation.
        y_test: Ground-truth labels for ``X_test``; evaluation only.

    Returns:
        None. Results are printed through ``evaluate_print``.
    """
    # train XGBOD detector
    clf_name = 'XGBOD'
    clf = XGBOD(random_state=42)
    clf.fit(X_train, y_train)

    # Only raw outlier scores are evaluated, so the thresholded labels
    # (``labels_`` / ``predict``) are not computed.
    y_train_scores = clf.decision_scores_
    y_test_scores = clf.decision_function(X_test)

    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

# Run the XGBOD benchmark once per loaded dataset.
for ds_idx in range(num_dataset):
    print(f"{dataset_name[ds_idx]}:")
    xgbod_benchmark(
        X_train[ds_idx], X_test[ds_idx], y_train[ds_idx], y_test[ds_idx]
    )
    print()