In [None]:
# import packages
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

import numpy as np
import pandas as pd

# generate data
from utils.data_generator import generate_synthetic_clusters, generate_synthetic_d1, \
    generate_synthetic_d2, generate_synthetic_d3

# for outlier detection
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.models.knn import KNN
from outlier_detection.rocf import ROCF # import ROCF
from outlier_detection.cbof import CBOF

# for evaluation
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

# exploratory data analysis
from matplotlib import pyplot as plt
import seaborn as sns

# bayesian hyperparameter optimization
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
from hyperopt.pyll import scope
from time import time
from tqdm import tqdm

# Synthetic Dataset D1

## Generate Data

In [None]:
# Load synthetic dataset D1 and colour points labelled 1 red, all other points black.
X_d1, y_d1 = generate_synthetic_d1()
color_d1 = [("red" if label == 1 else "black") for label in y_d1]

# first and second feature of every point become the scatter coordinates
d1_xs = [point[0] for point in X_d1]
d1_ys = [point[1] for point in X_d1]

plt.figure(figsize=(5,5))
plt.scatter(d1_xs, d1_ys, c=color_d1, s=3)

## Replicate Parameters Suggested in Paper

In [None]:
# prepare model functions
n_samples = len(y_d1)

# dictionary of models & parameters to test
# LOF, CBOF, ROCF are replications of Table 1 of the paper
# For LOF, 'n' is the numerator of the contamination fraction (n / n_samples). The labels
# previously read n=10/20/30 while the detectors were built with 30/45/60; the labels now
# match the detectors that are actually fitted.
functions_dict = {
    'LOF (k=10, n=30)': { 'algo': 'LOF', 'k':10, 'n':30, 'f':LOF(n_neighbors=10, contamination=30/n_samples) },
    'LOF (k=10, n=45)': { 'algo': 'LOF', 'k':10, 'n':45, 'f':LOF(n_neighbors=10, contamination=45/n_samples) },
    'LOF (k=10, n=60)': { 'algo': 'LOF', 'k':10, 'n':60, 'f':LOF(n_neighbors=10, contamination=60/n_samples) },
    'CBOF (k=6, alpha=0.95)': {'algo': 'CBOF', 'k': 6, 'n':83, 'f':CBOF(k=6, contamination=0.05, pct=0.3, lofub=1) },
    'CBOF (k=6, alpha=0.90)': {'algo': 'CBOF', 'k': 6, 'n':165, 'f':CBOF(k=6, contamination=0.10, pct=0.3, lofub=1) },
    'CBOF (k=6, alpha=0.85)': {'algo': 'CBOF', 'k': 6, 'n':248, 'f':CBOF(k=6, contamination=0.15, pct=0.3, lofub=1) },
    'ROCF (k=4)': { 'algo': 'ROCF', 'k':4, 'n':None, 'f':ROCF(distance_metric="euclidean", k=4) }
}

In [None]:
# Fit every configured detector on D1 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d1_result_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d1)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d1, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'], 'k': f_dict['k'], 'n': f_dict['n'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d1_result_rows.append(row)

# create output dataframe
d1_results = pd.DataFrame(d1_result_rows,
                          columns=['algo', 'k', 'n', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
# print results
d1_results[["algo", "outlier_rate", "recall", "precision", "f1"]]

## Hyperparameter Tuning

In [None]:
def hyperopt(param_space, X, y, num_eval, classifier):
    '''
    Function that performs Bayesian hyperparameter optimisation (TPE)
    to find the optimal parameters for the outlier detection algorithm.
    Every candidate is fitted on X and scored against y on the same X,
    so all reported metrics are in-sample.

    Inputs:
        param_space (dict): A dictionary of the parameters and corresponding space to search.
        X (array): Features of the dataset.
        y (array): Labels of the dataset (0 = normal; 1 = anomaly).
        num_eval (int): Number of evaluation rounds.
        classifier (class): Outlier detection algorithm; instantiated as classifier(**params).

    Outputs (tuple of five):
        trials (hyperopt.Trials): Record of every evaluation.
        best_f1 (float): Best in-sample F1 score of class '1'.
        best_param_values (dict): param_space evaluated at the best point, keyed by the
            keys of param_space. hp.choice entries hold the chosen option itself
            (fmin alone reports only the option's index, keyed by hyperopt label).
        best_precision (float): Precision of class '1' for the best trial.
        best_recall (float): Recall of class '1' for the best trial.
    '''
    # local import: space_eval is not among the names imported at the top of the notebook
    from hyperopt import space_eval

    def objective_function(params):
        # initialise classifier
        clf = classifier(**params)
        # fit data
        clf.fit(X)
        # predict
        try:
            y_pred = clf.predict(X)
        except AttributeError: # ROCF algorithm
            y_pred = clf.get_outliers()
        # get F1 score
        report = classification_report(y_true=y, y_pred=y_pred, output_dict=True)['1']
        # objective is to maximize F1 i.e. minimize -F1
        return {'status': STATUS_OK, 'loss': -report['f1-score'], 'precision': report['precision'],
                'recall': report['recall']}

    trials = Trials()

    # minimise objective function
    # NOTE(review): newer hyperopt releases may expect a np.random.Generator
    # (np.random.default_rng) for rstate - confirm against the installed version.
    best_param = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval,
                      trials=trials, rstate= np.random.RandomState(1))

    # first trial with the lowest loss (ties resolve to the earliest trial)
    results = [trial['result'] for trial in trials.trials]
    best_result = min(results, key=lambda result: result['loss'])

    # translate hyperopt's label -> raw value mapping into actual parameter values
    best_param_values = space_eval(param_space, best_param)

    return trials, -best_result['loss'], best_param_values, best_result['precision'], best_result['recall']

In [None]:
# registry: algorithm name -> inputs passed to hyperopt() for that algorithm
hyperopt_inputs = {}

### Local Outlier Factor (LOF)

In [None]:
# LOF search space for D1; contamination is fixed to the actual outlier share (45 points)
LOF_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 5, 15, 1)),
    algorithm=hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']),
    leaf_size=scope.int(hp.quniform('leaf_size', 25, 35, 1)),
    contamination=45/n_samples,
)

# evaluation budget scales with the number of tuned parameters:
# num_eval = 3**(num_params_to_tune)
LOF_inputs = dict(classifier=LOF, param_space=LOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['LOF'] = LOF_inputs

### Cluster Based Outlier Factor (CBOF)

In [None]:
# CBOF search space for D1; contamination is fixed to the actual outlier share (45 points)
# (the hyperopt label for k is 'n_neighbors', so fmin reports it under that name)
CBOF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 2, 10, 1)),
    lofub=hp.uniform('lofub', 0.5, 5.0),
    pct=hp.uniform('pct', 0.2, 0.8),
    contamination=45/n_samples,
)

CBOF_inputs = dict(classifier=CBOF, param_space=CBOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['CBOF'] = CBOF_inputs

### Relative Outlier Cluster Factor (ROCF)

In [None]:
# ROCF search space for D1: only k is tuned (hyperopt label 'n_neighbors')
ROCF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 2, 10, 1)),
)

ROCF_inputs = dict(classifier=ROCF, param_space=ROCF_param_hyperopt, num_eval=3**1)
hyperopt_inputs['ROCF'] = ROCF_inputs

### k-Nearest Neighbours

In [None]:
# KNN search space for D1; contamination is fixed to the actual outlier share (45 points)
KNN_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 2, 15, 1)),
    method=hp.choice('method', ['largest', 'mean', 'median']),
    contamination=45/n_samples,
)

KNN_inputs = dict(classifier=KNN, param_space=KNN_param_hyperopt, num_eval=3**2)
hyperopt_inputs['KNN'] = KNN_inputs

### Isolation Forest (IForest)

In [None]:
# Isolation Forest search space for D1
IF_param_hyperopt = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 3, 20, 1)),
    max_samples=scope.int(hp.quniform('max_samples', 10, 20, 1)),
    max_features=2,              # X has only 2 features, so use both
    contamination=45/n_samples,  # actual outlier share (45 points)
)

IF_inputs = dict(classifier=IForest, param_space=IF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['IForest'] = IF_inputs

### One-Class Support Vector Machine (OCSVM)

In [None]:
# One-class SVM search space for D1; contamination is fixed to the actual outlier share
OCSVM_param_hyperopt = dict(
    kernel=hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    nu=hp.uniform('nu', 0.1, 0.9),
    contamination=45/n_samples,
)

OCSVM_inputs = dict(classifier=OCSVM, param_space=OCSVM_param_hyperopt, num_eval=3**2)
hyperopt_inputs['OCSVM'] = OCSVM_inputs

### Comparison of Algorithms

In [None]:
# Tune every registered algorithm on D1 and keep the best configuration of each.
# Rows are collected first and the frame is built once (DataFrame.append was removed in
# pandas 2.0).
d1_tuned_rows = []

for algo, algo_inputs in hyperopt_inputs.items():
    # run hyperopt
    algo_hyperopt = hyperopt(algo_inputs['param_space'], \
                             X_d1, y_d1, \
                             algo_inputs['num_eval'], algo_inputs['classifier'])
    # retrieve best parameters (copied, so the dict returned by hyperopt() is left untouched)
    algo_opt = dict(algo_hyperopt[2])
    algo_opt['f1'] = algo_hyperopt[1] # add f1 score
    algo_opt['precision'] = algo_hyperopt[3]
    algo_opt['recall'] = algo_hyperopt[4]
    algo_opt['algo'] = algo # add algo name
    d1_tuned_rows.append(algo_opt)

# metric columns first, then every parameter column in order of first appearance
d1_tuned_columns = ['algo', 'f1', 'precision', 'recall']
for tuned_row in d1_tuned_rows:
    for column in tuned_row:
        if column not in d1_tuned_columns:
            d1_tuned_columns.append(column)

d1_results_tuned = pd.DataFrame(d1_tuned_rows, columns=d1_tuned_columns)

In [None]:
d1_results_tuned[["algo", "recall", "precision", "f1"]]

In [None]:
d1_results_tuned

## Final Results with Tuned Parameters

In [None]:
# retrieve float parameters
# Rows are looked up by algorithm name instead of by position, so the values stay correct
# if algorithms are added to, removed from, or reordered in hyperopt_inputs.
d1_tuned_by_algo = d1_results_tuned.set_index("algo")
cbof_lofub = d1_tuned_by_algo.loc["CBOF", "lofub"]
cbof_pct = d1_tuned_by_algo.loc["CBOF", "pct"]
OCSVM_nu = d1_tuned_by_algo.loc["OCSVM", "nu"]

In [None]:
# prepare model functions
n_samples = len(y_d1)

# Detectors for D1 configured with tuned hyperparameters.
# NOTE(review): the integer / categorical settings below are hard-coded, presumably copied
# by hand from d1_results_tuned - re-check them whenever the tuning cells are re-run.
# Each contamination-based detector is built once per assumed outlier count; ROCF is built
# once. Insertion order: LOF, CBOF, ROCF, KNN, IForest, OCSVM.
d1_outlier_counts = (30, 45, 60)
functions_dict = {}

# LOF
functions_dict.update({
    'LOF (n={})'.format(n_out): { 'algo':'LOF', 'f':LOF(n_neighbors=9, algorithm="kd_tree", leaf_size=30,
                                                       contamination=n_out/n_samples) }
    for n_out in d1_outlier_counts
})

# CBOF
functions_dict.update({
    'CBOF (n={})'.format(n_out): { 'algo':'CBOF', 'f':CBOF(k=5, pct=cbof_pct, lofub=cbof_lofub,
                                                         contamination=n_out/n_samples) }
    for n_out in d1_outlier_counts
})

# ROCF
functions_dict['ROCF'] = { 'algo': 'ROCF', 'f':ROCF(distance_metric="euclidean", k=4) }

# KNN
functions_dict.update({
    'KNN (n={})'.format(n_out): { 'algo': 'KNN', 'f':KNN(method="largest", n_neighbors=5,
                                                        contamination=n_out/n_samples) }
    for n_out in d1_outlier_counts
})

# IFOREST
functions_dict.update({
    'IForest (n={})'.format(n_out): { 'algo': 'IForest', 'f':IForest(max_samples=15, n_estimators=15,
                                                                    contamination=n_out/n_samples) }
    for n_out in d1_outlier_counts
})

# OCSVM
functions_dict.update({
    'OCSVM (n={})'.format(n_out): { 'algo': 'OCSVM', 'f':OCSVM(kernel="rbf", nu=OCSVM_nu,
                                                              contamination=n_out/n_samples) }
    for n_out in d1_outlier_counts
})

In [None]:
# Fit every tuned detector on D1 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d1_final_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d1)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d1, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d1_final_rows.append(row)

# create output dataframe
d1_final_results = pd.DataFrame(d1_final_rows,
                                columns=['algo', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
d1_final_results

## Retrieve Max ROCF from Tuned Model

In [None]:
# Refit ROCF on D1 with k=4 and print the largest ROCF value it reports.
rocf = ROCF(k=4, distance_metric="euclidean")
rocf.fit(X_d1)
d1_rocfs = rocf.get_rocfs()
max_rocf = max(d1_rocfs)
print(max_rocf)

## Test Specification of Parameter k

In [None]:
# Sensitivity of ROCF to k on D1: refit for k = 1..30 and record the F1 of class '1'.
k_value, k_f1score = [], []

for k in tqdm(range(1, 31)):
    # build and fit ROCF for this k
    rocf = ROCF(k=k, distance_metric="euclidean")
    rocf.fit(X_d1)

    # predictions -> metrics of class '1'
    y_pred = rocf.get_outliers()
    report = classification_report(y_true=y_d1, y_pred=y_pred, output_dict=True)['1']

    k_value.append(k)
    k_f1score.append(report["f1-score"])

In [None]:
# F1 score as a function of k, drawn on the current axes
d1_k_axes = plt.gca()
d1_k_axes.scatter(k_value, k_f1score, marker=".", color="darkblue")
d1_k_axes.set_title('F1 Score against k (D1 dataset)')
d1_k_axes.set_xlabel('k, number of nearest neighbors')
d1_k_axes.set_ylabel('F1 Score')

# Synthetic Dataset D2

## Generate Data

In [None]:
# Load synthetic dataset D2 and colour points labelled 1 red, all other points black.
X_d2, y_d2 = generate_synthetic_d2()
color_d2 = [("red" if label == 1 else "black") for label in y_d2]

# first and second feature of every point become the scatter coordinates
d2_xs = [point[0] for point in X_d2]
d2_ys = [point[1] for point in X_d2]

plt.figure(figsize=(5,5))
plt.scatter(d2_xs, d2_ys, c=color_d2, s=3)

## Replicate Parameters Suggested in Paper

In [None]:
# prepare model functions
n_samples = len(y_d2)

# dictionary of models & parameters to test
# LOF, CBOF, ROCF are replications of Table 2 of the paper
functions_dict = {}

# LOF with k=15 at three outlier counts n (contamination = n / n_samples)
functions_dict.update({
    'LOF (k=15, n={})'.format(n_out): { 'algo': 'LOF', 'k':15, 'n':n_out,
                                        'f':LOF(n_neighbors=15, contamination=n_out/n_samples) }
    for n_out in (50, 79, 100)
})

# CBOF with k=15 at three contamination levels (the alpha in the label is 1 - contamination)
functions_dict.update({
    'CBOF (k=15, alpha={})'.format(alpha): {'algo': 'CBOF', 'k': 15, 'n':n_out,
                                            'f':CBOF(k=15, contamination=rate, pct=0.2, lofub=1) }
    for alpha, rate, n_out in (('0.95', 0.05, 54), ('0.90', 0.10, 108), ('0.85', 0.15, 162))
})

# ROCF takes no contamination / n
functions_dict['ROCF (k=10)'] = { 'algo': 'ROCF', 'k':10, 'n':None,
                                  'f':ROCF(distance_metric="euclidean", k=10) }

In [None]:
# Fit every configured detector on D2 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d2_result_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d2)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d2, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'], 'k': f_dict['k'], 'n': f_dict['n'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d2_result_rows.append(row)

# create output dataframe
d2_results = pd.DataFrame(d2_result_rows,
                          columns=['algo', 'k', 'n', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
d2_results[["algo", "outlier_rate", "recall", "precision", "f1"]]

## Hyperparameter Tuning

In [None]:
# fresh registry for D2: algorithm name -> inputs passed to hyperopt()
hyperopt_inputs = {}

### Local Outlier Factor (LOF)

In [None]:
# LOF search space for D2; contamination is fixed to the actual outlier share (79 points)
LOF_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 10, 20, 1)),
    algorithm=hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']),
    leaf_size=scope.int(hp.quniform('leaf_size', 20, 40, 1)),
    contamination=79/n_samples,
)

# evaluation budget scales with the number of tuned parameters:
# num_eval = 3**(num_params_to_tune)
LOF_inputs = dict(classifier=LOF, param_space=LOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['LOF'] = LOF_inputs

### Cluster Based Outlier Factor (CBOF)

In [None]:
# CBOF search space for D2; contamination is fixed to the actual outlier share (79 points)
# (the hyperopt label for k is 'n_neighbors', so fmin reports it under that name)
CBOF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 10, 20, 1)),
    lofub=hp.uniform('lofub', 0.5, 5.0),
    pct=hp.uniform('pct', 0.2, 0.8),
    contamination=79/n_samples,
)

CBOF_inputs = dict(classifier=CBOF, param_space=CBOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['CBOF'] = CBOF_inputs

### Relative Outlier Cluster Factor (ROCF)

In [None]:
# ROCF search space for D2: only k is tuned (hyperopt label 'n_neighbors')
ROCF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 5, 15, 1)),
)

ROCF_inputs = dict(classifier=ROCF, param_space=ROCF_param_hyperopt, num_eval=3**1)
hyperopt_inputs['ROCF'] = ROCF_inputs

### k-Nearest Neighbours

In [None]:
# KNN search space for D2; contamination is fixed to the actual outlier share (79 points)
KNN_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 5, 25, 1)),
    method=hp.choice('method', ['largest', 'mean', 'median']),
    contamination=79/n_samples,
)

KNN_inputs = dict(classifier=KNN, param_space=KNN_param_hyperopt, num_eval=3**2)
hyperopt_inputs['KNN'] = KNN_inputs

### Isolation Forest (IForest)

In [None]:
# Isolation Forest search space for D2
IF_param_hyperopt = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 10, 30, 1)),
    max_samples=scope.int(hp.quniform('max_samples', 10, 25, 1)),
    max_features=2,              # X has only 2 features, so use both
    contamination=79/n_samples,  # actual outlier share (79 points)
)

IF_inputs = dict(classifier=IForest, param_space=IF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['IForest'] = IF_inputs

### One-Class Support Vector Machine (OCSVM)

In [None]:
# One-class SVM search space for D2; contamination is fixed to the actual outlier share
OCSVM_param_hyperopt = dict(
    kernel=hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    nu=hp.uniform('nu', 0.1, 0.9),
    contamination=79/n_samples,
)

OCSVM_inputs = dict(classifier=OCSVM, param_space=OCSVM_param_hyperopt, num_eval=3**2)
hyperopt_inputs['OCSVM'] = OCSVM_inputs

### Comparison of Algorithms

In [None]:
# Tune every registered algorithm on D2 and keep the best configuration of each.
# Rows are collected first and the frame is built once (DataFrame.append was removed in
# pandas 2.0).
d2_tuned_rows = []

for algo, algo_inputs in hyperopt_inputs.items():
    # run hyperopt
    algo_hyperopt = hyperopt(algo_inputs['param_space'], \
                             X_d2, y_d2, \
                             algo_inputs['num_eval'], algo_inputs['classifier'])
    # retrieve best parameters (copied, so the dict returned by hyperopt() is left untouched)
    algo_opt = dict(algo_hyperopt[2])
    algo_opt['f1'] = algo_hyperopt[1] # add f1 score
    algo_opt['precision'] = algo_hyperopt[3]
    algo_opt['recall'] = algo_hyperopt[4]
    algo_opt['algo'] = algo # add algo name
    d2_tuned_rows.append(algo_opt)

# metric columns first, then every parameter column in order of first appearance
d2_tuned_columns = ['algo', 'f1', 'precision', 'recall']
for tuned_row in d2_tuned_rows:
    for column in tuned_row:
        if column not in d2_tuned_columns:
            d2_tuned_columns.append(column)

d2_results_tuned = pd.DataFrame(d2_tuned_rows, columns=d2_tuned_columns)

In [None]:
d2_results_tuned[["algo", "recall", "precision", "f1"]]

In [None]:
d2_results_tuned

## Final Results with Tuned Parameters

In [None]:
# retrieve float parameters
# Rows are looked up by algorithm name instead of by position, so the values stay correct
# if algorithms are added to, removed from, or reordered in hyperopt_inputs.
d2_tuned_by_algo = d2_results_tuned.set_index("algo")
cbof_lofub = d2_tuned_by_algo.loc["CBOF", "lofub"]
cbof_pct = d2_tuned_by_algo.loc["CBOF", "pct"]
OCSVM_nu = d2_tuned_by_algo.loc["OCSVM", "nu"]

In [None]:
# prepare model functions
n_samples = len(y_d2)

# Detectors for D2 configured with tuned hyperparameters.
# NOTE(review): the integer / categorical settings below are hard-coded, presumably copied
# by hand from d2_results_tuned - re-check them whenever the tuning cells are re-run.
# Each contamination-based detector is built once per assumed outlier count; ROCF is built
# once. Insertion order: LOF, CBOF, ROCF, KNN, IForest, OCSVM.
d2_outlier_counts = (50, 79, 100)
functions_dict = {}

# LOF
functions_dict.update({
    'LOF (n={})'.format(n_out): { 'algo':'LOF', 'f':LOF(n_neighbors=20, algorithm="brute", leaf_size=29,
                                                       contamination=n_out/n_samples) }
    for n_out in d2_outlier_counts
})

# CBOF
functions_dict.update({
    'CBOF (n={})'.format(n_out): { 'algo':'CBOF', 'f':CBOF(k=14, pct=cbof_pct, lofub=cbof_lofub,
                                                         contamination=n_out/n_samples) }
    for n_out in d2_outlier_counts
})

# ROCF
functions_dict['ROCF'] = { 'algo': 'ROCF', 'f':ROCF(distance_metric="euclidean", k=12) }

# KNN
functions_dict.update({
    'KNN (n={})'.format(n_out): { 'algo': 'KNN', 'f':KNN(method="mean", n_neighbors=22,
                                                        contamination=n_out/n_samples) }
    for n_out in d2_outlier_counts
})

# IFOREST
functions_dict.update({
    'IForest (n={})'.format(n_out): { 'algo': 'IForest', 'f':IForest(max_samples=11, n_estimators=23,
                                                                    contamination=n_out/n_samples) }
    for n_out in d2_outlier_counts
})

# OCSVM
functions_dict.update({
    'OCSVM (n={})'.format(n_out): { 'algo': 'OCSVM', 'f':OCSVM(kernel="rbf", nu=OCSVM_nu,
                                                              contamination=n_out/n_samples) }
    for n_out in d2_outlier_counts
})

In [None]:
# Fit every tuned detector on D2 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d2_final_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d2)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d2, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d2_final_rows.append(row)

# create output dataframe
d2_final_results = pd.DataFrame(d2_final_rows,
                                columns=['algo', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
d2_final_results

## Retrieve Max ROCF from Tuned Model

In [None]:
# Refit ROCF on D2 with k=12 and print the largest ROCF value it reports.
rocf = ROCF(k=12, distance_metric="euclidean")
rocf.fit(X_d2)
d2_rocfs = rocf.get_rocfs()
max_rocf = max(d2_rocfs)
print(max_rocf)

## Test Specification of Parameter k

In [None]:
# Sensitivity of ROCF to k on D2: refit for k = 1..30 and record the F1 of class '1'.
k_value, k_f1score = [], []

for k in tqdm(range(1, 31)):
    # build and fit ROCF for this k
    rocf = ROCF(k=k, distance_metric="euclidean")
    rocf.fit(X_d2)

    # predictions -> metrics of class '1'
    y_pred = rocf.get_outliers()
    report = classification_report(y_true=y_d2, y_pred=y_pred, output_dict=True)['1']

    k_value.append(k)
    k_f1score.append(report["f1-score"])

In [None]:
# F1 score as a function of k, drawn on the current axes
d2_k_axes = plt.gca()
d2_k_axes.scatter(k_value, k_f1score, marker=".", color="darkblue")
d2_k_axes.set_title('F1 Score against k (D2 dataset)')
d2_k_axes.set_xlabel('k, number of nearest neighbors')
d2_k_axes.set_ylabel('F1 Score')

# Synthetic Dataset D3

## Generate Data

In [None]:
# Load synthetic dataset D3 and colour points labelled 1 red, all other points black.
X_d3, y_d3 = generate_synthetic_d3()
color_d3 = [("red" if label == 1 else "black") for label in y_d3]

# first and second feature of every point become the scatter coordinates
d3_xs = [point[0] for point in X_d3]
d3_ys = [point[1] for point in X_d3]

plt.figure(figsize=(5,5))
plt.scatter(d3_xs, d3_ys, c=color_d3, s=3)

## Replicate Parameters Suggested in Paper

In [None]:
# prepare model functions
n_samples = len(y_d3)

# dictionary of models & parameters to test
# LOF, CBOF, ROCF are replications of Table 3 of the paper
functions_dict = {
    'LOF (k=10, n=10)': { 'algo': 'LOF', 'k':10, 'n':10, 'f':LOF(n_neighbors=10, contamination=10/n_samples) },
    'LOF (k=10, n=20)': { 'algo': 'LOF', 'k':10, 'n':20, 'f':LOF(n_neighbors=10, contamination=20/n_samples) },
    'LOF (k=10, n=30)': { 'algo': 'LOF', 'k':10, 'n':30, 'f':LOF(n_neighbors=10, contamination=30/n_samples) },
    'CBOF (k=5, alpha=0.99)': {'algo': 'CBOF', 'k': 5, 'n':3, 'f':CBOF(k=5, contamination=0.01, pct=0.2, lofub=1) },
    'CBOF (k=5, alpha=0.90)': {'algo': 'CBOF', 'k': 5, 'n':31, 'f':CBOF(k=5, contamination=0.10, pct=0.2, lofub=1) },
    'CBOF (k=5, alpha=0.85)': {'algo': 'CBOF', 'k': 5, 'n':46, 'f':CBOF(k=5, contamination=0.15, pct=0.2, lofub=1) },
    # The label and 'k' field previously said 10 while the detector was built with k=5;
    # they now describe the detector that is actually fitted.
    # NOTE(review): confirm against the paper that k=5 (not 10) is the intended D3 setting.
    'ROCF (k=5)': { 'algo': 'ROCF', 'k':5, 'n':None, 'f':ROCF(distance_metric="euclidean", k=5) }
}

In [None]:
# Fit every configured detector on D3 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d3_result_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d3)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d3, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'], 'k': f_dict['k'], 'n': f_dict['n'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d3_result_rows.append(row)

# create output dataframe
d3_results = pd.DataFrame(d3_result_rows,
                          columns=['algo', 'k', 'n', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
d3_results[["algo", "outlier_rate", "recall", "precision", "f1"]]

## Hyperparameter Tuning

In [None]:
# fresh registry for D3: algorithm name -> inputs passed to hyperopt()
hyperopt_inputs = {}

### Local Outlier Factor (LOF)

In [None]:
# LOF search space for D3; contamination is fixed to the actual outlier share (20 points)
LOF_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 5, 15, 1)),
    algorithm=hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']),
    leaf_size=scope.int(hp.quniform('leaf_size', 20, 40, 1)),
    contamination=20/n_samples,
)

# evaluation budget scales with the number of tuned parameters:
# num_eval = 3**(num_params_to_tune)
LOF_inputs = dict(classifier=LOF, param_space=LOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['LOF'] = LOF_inputs

### Cluster Based Outlier Factor (CBOF)

In [None]:
# CBOF search space for D3; contamination is fixed to the actual outlier share (20 points)
# (the hyperopt label for k is 'n_neighbors', so fmin reports it under that name)
CBOF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 3, 10, 1)),
    lofub=hp.uniform('lofub', 0.5, 5.0),
    pct=hp.uniform('pct', 0.2, 0.8),
    contamination=20/n_samples,
)

CBOF_inputs = dict(classifier=CBOF, param_space=CBOF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['CBOF'] = CBOF_inputs

### Relative Outlier Cluster Factor (ROCF)

In [None]:
# ROCF search space for D3: only k is tuned (hyperopt label 'n_neighbors')
ROCF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 5, 10, 1)),
)

ROCF_inputs = dict(classifier=ROCF, param_space=ROCF_param_hyperopt, num_eval=3**1)
hyperopt_inputs['ROCF'] = ROCF_inputs

### k-Nearest Neighbours

In [None]:
# KNN search space for D3; contamination is fixed to the actual outlier share (20 points)
KNN_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 5, 15, 1)),
    method=hp.choice('method', ['largest', 'mean', 'median']),
    contamination=20/n_samples,
)

KNN_inputs = dict(classifier=KNN, param_space=KNN_param_hyperopt, num_eval=3**2)
hyperopt_inputs['KNN'] = KNN_inputs

### Isolation Forest (IForest)

In [None]:
# Isolation Forest search space for D3
IF_param_hyperopt = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 3, 20, 1)),
    max_samples=scope.int(hp.quniform('max_samples', 3, 20, 1)),
    max_features=2,              # X has only 2 features, so use both
    contamination=20/n_samples,  # actual outlier share (20 points)
)

IF_inputs = dict(classifier=IForest, param_space=IF_param_hyperopt, num_eval=3**3)
hyperopt_inputs['IForest'] = IF_inputs

### One-Class Support Vector Machine (OCSVM)

In [None]:
# One-class SVM search space for D3; contamination is fixed to the actual outlier share
OCSVM_param_hyperopt = dict(
    kernel=hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    nu=hp.uniform('nu', 0.1, 0.9),
    contamination=20/n_samples,
)

OCSVM_inputs = dict(classifier=OCSVM, param_space=OCSVM_param_hyperopt, num_eval=3**2)
hyperopt_inputs['OCSVM'] = OCSVM_inputs

### Comparison of Algorithms

In [None]:
# Tune every registered algorithm on D3 and keep the best configuration of each.
# Rows are collected first and the frame is built once (DataFrame.append was removed in
# pandas 2.0).
d3_tuned_rows = []

for algo, algo_inputs in hyperopt_inputs.items():
    # run hyperopt
    algo_hyperopt = hyperopt(algo_inputs['param_space'], \
                             X_d3, y_d3, \
                             algo_inputs['num_eval'], algo_inputs['classifier'])
    # retrieve best parameters (copied, so the dict returned by hyperopt() is left untouched)
    algo_opt = dict(algo_hyperopt[2])
    algo_opt['f1'] = algo_hyperopt[1] # add f1 score
    algo_opt['precision'] = algo_hyperopt[3]
    algo_opt['recall'] = algo_hyperopt[4]
    algo_opt['algo'] = algo # add algo name
    d3_tuned_rows.append(algo_opt)

# metric columns first, then every parameter column in order of first appearance
d3_tuned_columns = ['algo', 'f1', 'precision', 'recall']
for tuned_row in d3_tuned_rows:
    for column in tuned_row:
        if column not in d3_tuned_columns:
            d3_tuned_columns.append(column)

d3_results_tuned = pd.DataFrame(d3_tuned_rows, columns=d3_tuned_columns)

In [None]:
d3_results_tuned[["algo", "recall", "precision", "f1"]]

## Final Results with Tuned Parameters

In [None]:
# retrieve float parameters
# Rows are looked up by algorithm name instead of by position, so the values stay correct
# if algorithms are added to, removed from, or reordered in hyperopt_inputs.
d3_tuned_by_algo = d3_results_tuned.set_index("algo")
cbof_lofub = d3_tuned_by_algo.loc["CBOF", "lofub"]
cbof_pct = d3_tuned_by_algo.loc["CBOF", "pct"]
OCSVM_nu = d3_tuned_by_algo.loc["OCSVM", "nu"]

In [None]:
d3_results_tuned

In [None]:
# prepare model functions
n_samples = len(y_d3)

# Detectors for D3 configured with tuned hyperparameters.
# NOTE(review): the integer / categorical settings below are hard-coded, presumably copied
# by hand from d3_results_tuned - re-check them whenever the tuning cells are re-run.
# Each contamination-based detector is built once per assumed outlier count; ROCF is built
# once. Insertion order: LOF, CBOF, ROCF, KNN, IForest, OCSVM.
d3_outlier_counts = (10, 20, 30)
functions_dict = {}

# LOF
functions_dict.update({
    'LOF (n={})'.format(n_out): { 'algo':'LOF', 'f':LOF(n_neighbors=9, algorithm="kd_tree", leaf_size=30,
                                                       contamination=n_out/n_samples) }
    for n_out in d3_outlier_counts
})

# CBOF
functions_dict.update({
    'CBOF (n={})'.format(n_out): { 'algo':'CBOF', 'f':CBOF(k=3, pct=cbof_pct, lofub=cbof_lofub,
                                                         contamination=n_out/n_samples) }
    for n_out in d3_outlier_counts
})

# ROCF
functions_dict['ROCF'] = { 'algo': 'ROCF', 'f':ROCF(distance_metric="euclidean", k=6) }

# KNN
functions_dict.update({
    'KNN (n={})'.format(n_out): { 'algo': 'KNN', 'f':KNN(method="largest", n_neighbors=7,
                                                        contamination=n_out/n_samples) }
    for n_out in d3_outlier_counts
})

# IFOREST
functions_dict.update({
    'IForest (n={})'.format(n_out): { 'algo': 'IForest', 'f':IForest(max_samples=19, n_estimators=3,
                                                                    contamination=n_out/n_samples) }
    for n_out in d3_outlier_counts
})

# OCSVM
functions_dict.update({
    'OCSVM (n={})'.format(n_out): { 'algo': 'OCSVM', 'f':OCSVM(kernel="rbf", nu=OCSVM_nu,
                                                              contamination=n_out/n_samples) }
    for n_out in d3_outlier_counts
})

In [None]:
# Fit every tuned detector on D3 and tabulate precision / recall / F1 of class '1'.
# Rows are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and re-allocates the whole frame on every call in older versions.
d3_final_rows = []

for name, f_dict in tqdm(functions_dict.items()):
    # initialise classifier
    clf = f_dict['f']

    # fit classifier on data
    clf.fit(X_d3)

    # retrieve predictions: prefer get_outliers() (e.g. ROCF); detectors without that
    # method fall back to their fitted labels_ attribute
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # derive evaluation metrics
    report = classification_report(y_true=y_d3, y_pred=y_pred, output_dict=True)['1']

    row = {
        'algo': f_dict['algo'],
        'precision': report['precision'], 'recall': report['recall'], 'f1': report['f1-score']
    }

    # retrieve outlier rate: detected rate when the detector reports one, otherwise the
    # contamination it was configured with
    try:
        row['outlier_rate'] = clf.get_outlier_rate()
    except AttributeError:
        row['outlier_rate'] = clf.contamination

    d3_final_rows.append(row)

# create output dataframe
d3_final_results = pd.DataFrame(d3_final_rows,
                                columns=['algo', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
d3_final_results

## Retrieve Max ROCF from Tuned Model

In [None]:
# Refit ROCF on D3 with k=6 and print the largest ROCF value it reports.
rocf = ROCF(k=6, distance_metric="euclidean")
rocf.fit(X_d3)
d3_rocfs = rocf.get_rocfs()
max_rocf = max(d3_rocfs)
print(max_rocf)