In [None]:
import os,sys,inspect
# Put the parent of the working directory on the import path; this is presumably where the
# local `outlier_detection` package imported below lives.
# Notebook cells are compiled under synthetic file names (a temp-directory path under recent
# ipykernel releases), so inspect.getfile() on the current frame does not reliably point at
# the notebook's folder. The kernel's working directory is used instead -- the relative data
# path "../data/bankcard.csv" used later relies on the same assumption.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# data science
import numpy as np
import pandas as pd

# for outlier detection
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.models.knn import KNN
from outlier_detection.rocf import ROCF
from outlier_detection.cbof import CBOF

# preprocessing
from sklearn import preprocessing

# for evaluation
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

# exploratory data analysis
from matplotlib import pyplot as plt
import seaborn as sns

# bayesian hyperparameter optimization
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
from hyperopt.pyll import scope
from time import time
from tqdm import tqdm

# Fraud Dataset
https://www.kaggle.com/ninads/kernel3b5cdd2865/data

## Prepare Dataset

### Load Dataset

In [None]:
# read the bank-card transactions; the first CSV column becomes the row index
fraud_data = pd.read_csv("../data/bankcard.csv", index_col=0)
print("Shape of data: ", fraud_data.shape)
# preview the first rows
fraud_data.head()

In [None]:
fraud_data.isna().sum().sum() # total number of missing values over all columns

### Clean Dataset

In [None]:
# drop the first remaining column (card number)
fraud_data = fraud_data.drop(columns=[fraud_data.columns[0]])

# Parse the timestamp once, then derive the label and the calendar features with vectorised
# column operations instead of a Python-level, row-by-row `apply`.
fraud_data['Date'] = pd.to_datetime(fraud_data['Date'])
fraud_data['CBK'] = (fraud_data['CBK'] == 'Yes').astype(int)  # binarize: 1 where CBK == 'Yes', else 0
fraud_data['Hour'] = fraud_data['Date'].dt.hour  # hour of day
fraud_data['DayName'] = fraud_data['Date'].dt.weekday  # numeric weekday (Monday = 0)
fraud_data['Day'] = fraud_data['Date'].dt.day  # day of month

# keep only the derived features, the amount and the label
fraud_data = fraud_data[['DayName', 'Day', 'Hour', 'Amount', 'CBK']]

### Exploratory Data Analysis

In [None]:
# column dtypes and non-null counts of the cleaned frame
fraud_data.info()

In [None]:
# summary statistics of the numeric columns
fraud_data.describe()

In [None]:
# figure size and grid style for the plots
sns.set(rc={'figure.figsize': (8, 5)})
sns.set_theme(style="whitegrid")

# bar chart of how often each class label occurs
label_freq = fraud_data['CBK'].value_counts()
ax = sns.barplot(x=label_freq.index, y=label_freq.values)

# annotate every bar with its absolute count and its share of all rows
n_rows = len(fraud_data['CBK'])
for bar in ax.patches:
    bar_height = bar.get_height()
    perc = round((bar_height / n_rows) * 100, 3)
    bar_label = str(int(bar_height)) + f' ({perc}%)'
    bar_top_centre = (bar.get_x() + bar.get_width() / 2., bar_height)
    ax.annotate(
        bar_label,
        bar_top_centre,
        ha='center',
        va='center',
        xytext=(0, 9),  # shift the text 9 points above the bar
        textcoords='offset points',
    )

# title and axis labels
plt.title("Distribution of Labels", size=14)
plt.xlabel("Class Label", size=12)
plt.ylabel("Count", size=12)

### Extract Dataset
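- The train-test split is based on date: transactions with a day of month below 25 form the train set, and those on or after the 25th form the test set. The optimal hyperparameters for each unsupervised algorithm are found on the train set before testing on the test set.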

In [None]:
# model inputs; 'Day' is only used to split the rows below
features = ['DayName', 'Hour', 'Amount']

# rows before the 25th of the month are used for training, rows on/after the 25th for testing
train_rows = fraud_data.loc[fraud_data['Day'] < 25]
test_rows = fraud_data.loc[fraud_data['Day'] >= 25]

# feature matrices as plain arrays, labels as pandas Series
X_train, y_train = train_rows[features].values, train_rows.CBK
X_test, y_test = test_rows[features].values, test_rows.CBK

print(f"Train: {len(X_train)}")
print(f"Test: {len(X_test)}")

## Hyperparameter Tuning

Besides the 3 methods compared in the paper (LOF, CBOF and ROCF), we also tested other outlier detection algorithms, namely K-Nearest Neighbors, Isolation Forest, and One-Class SVM.

For these 6 methods, we conducted hyperparameter tuning on the train set to find the optimal parameters of each outlier detection algorithm for the credit card fraud dataset. We used Bayesian hyperparameter optimisation, which uses Bayes' theorem to direct the hyperparameter search towards the minimum or maximum of an objective function.

The contamination factor was set to 0.00173, the true contamination factor of the train set, for all algorithms that take one; ROCF is configured with `k` only and receives no contamination factor.

In a later section of this notebook, we show how not knowing the contamination factor affects the outlier detection results.

### Bayesian Hyperparameter Optimisation Function

In [None]:
def hyperopt(param_space, X, y, num_eval, classifier):
    '''
    Function that performs Bayesian hyperparameter optimisation (TPE)
    to find the optimal parameters for the outlier detection algorithm.

    Inputs:
        param_space (dict): Keyword arguments for ``classifier``. Values are either
            hyperopt search expressions or fixed values.
        X (array): Features of the dataset.
        y (array): Labels of the dataset (0 = normal; 1 = anomaly).
        num_eval (int): Number of evaluation rounds (passed to ``fmin`` as ``max_evals``).
        classifier (callable): Outlier detection algorithm. It is instantiated with the
            sampled parameters and fitted on ``X``; the fitted object must expose its
            predictions either as a ``labels_`` attribute or through ``get_outliers()``.

    Outputs (tuple, in this order):
        trials (hyperopt.Trials): Record of every evaluation.
        best_f1 (float): Best in-sample F1 score of class '1'.
        best_param_values (dict): Best point as returned by ``fmin``, keyed by the
            hyperopt labels used in ``param_space``. For ``hp.choice`` dimensions
            hyperopt reports the index of the chosen option, not the option itself.
        best_precision (float): Precision of class '1' for the best trial.
        best_recall (float): Recall of class '1' for the best trial.
    '''

    def objective_function(params):
        # initialise the classifier with the sampled parameters and fit it
        clf = classifier(**params)
        clf.fit(X)
        # in-sample predictions; only a missing `labels_` attribute triggers the
        # fallback, any other error is allowed to surface
        try:
            y_pred = clf.labels_
        except AttributeError:  # ROCF algorithm
            y_pred = clf.get_outliers()
        # scores of class '1'
        report = classification_report(y_true=y, y_pred=y_pred, output_dict=True)['1']
        # objective is to maximize F1 i.e. minimize -F1
        return {'status': STATUS_OK, 'loss': -report['f1-score'], 'precision': report['precision'],
                'recall': report['recall']}

    trials = Trials()

    # minimise objective function
    # NOTE(review): hyperopt >= 0.2.7 appears to expect `np.random.default_rng(seed)` for
    # `rstate`; the legacy RandomState below only works with older releases -- confirm
    # against the installed version before changing it, as it affects reproducibility.
    best_param_values = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval,
                             trials=trials, rstate=np.random.RandomState(1))

    # first trial with the lowest loss (i.e. the highest F1)
    best_result = min((trial['result'] for trial in trials.trials), key=lambda res: res['loss'])

    return (trials, -best_result['loss'], best_param_values,
            best_result['precision'], best_result['recall'])

In [None]:
# registry of hyperopt inputs, filled in per algorithm by the cells below
hyperopt_inputs = {}

### Local Outlier Factor (LOF)

In [None]:
# LOF search space: three tuned parameters plus a fixed contamination
LOF_param_hyperopt = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 50, 100, 1)),
    algorithm=hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']),
    leaf_size=scope.int(hp.quniform('leaf_size', 10, 100, 1)),
    contamination=sum(y_train) / len(y_train),  # actual outlier share of the train labels
)

# evaluation budget grows with the number of tuned parameters: num_eval = 3 ** num_params
LOF_inputs = dict(classifier=LOF, param_space=LOF_param_hyperopt, num_eval=3 ** 3)
hyperopt_inputs['LOF'] = LOF_inputs

### k-Nearest Neighbors

In [None]:
# KNN search space: fixed contamination, tuned neighbourhood size and distance aggregation
KNN_param_hyperopt = dict(
    contamination=sum(y_train) / len(y_train),
    n_neighbors=scope.int(hp.quniform('n_neighbors', 50, 90, 1)),
    method=hp.choice('method', ['largest', 'mean', 'median']),
)

# two tuned parameters -> 3 ** 2 evaluations
KNN_inputs = dict(classifier=KNN, param_space=KNN_param_hyperopt, num_eval=3 ** 2)
hyperopt_inputs['KNN'] = KNN_inputs

### Isolation Forest (IForest)

In [None]:
# Isolation Forest search space: integer-valued tree count, sample size and feature count,
# with contamination fixed to the outlier share of the train labels
IF_param_hyperopt = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 1, 80, 1)),
    max_samples=scope.int(hp.quniform('max_samples', 10, 50, 1)),
    contamination=sum(y_train) / len(y_train),
    max_features=scope.int(hp.quniform('max_features', 1, 3, 1)),
)

# three tuned parameters -> 3 ** 3 evaluations
IF_inputs = dict(classifier=IForest, param_space=IF_param_hyperopt, num_eval=3 ** 3)
hyperopt_inputs['IForest'] = IF_inputs

### One-Class Support Vector Machine (OCSVM)

In [None]:
# One-Class SVM search space: kernel choice and nu, with contamination fixed
# to the outlier share of the train labels
OCSVM_param_hyperopt = dict(
    kernel=hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    nu=hp.uniform('nu', 0.4, 0.8),
    contamination=sum(y_train) / len(y_train),
)

# two tuned parameters -> 3 ** 2 evaluations
OCSVM_inputs = dict(classifier=OCSVM, param_space=OCSVM_param_hyperopt, num_eval=3 ** 2)
hyperopt_inputs['OCSVM'] = OCSVM_inputs

### Cluster-Based Outlier Factor (CBOF)

In [None]:
# CBOF search space; note that `k` is sampled under the hyperopt label 'n_neighbors',
# which is the key under which its best value is reported back by the optimiser
CBOF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 50, 100, 1)),
    lofub=hp.uniform('lofub', 0.5, 3.0),
    pct=hp.uniform('pct', 0.2, 0.8),
    contamination=sum(y_train) / len(y_train),  # actual outlier share of the train labels
)

# three tuned parameters -> 3 ** 3 evaluations
CBOF_inputs = dict(classifier=CBOF, param_space=CBOF_param_hyperopt, num_eval=3 ** 3)
hyperopt_inputs['CBOF'] = CBOF_inputs

### Relative Outlier Cluster Factor (ROCF)

In [None]:
# ROCF search space: only `k` is tuned (sampled under the hyperopt label 'n_neighbors');
# no contamination value is passed to this algorithm
ROCF_param_hyperopt = dict(
    k=scope.int(hp.quniform('n_neighbors', 10, 30, 1)),
)

# one tuned parameter -> 3 ** 1 evaluations
ROCF_inputs = dict(classifier=ROCF, param_space=ROCF_param_hyperopt, num_eval=3 ** 1)
hyperopt_inputs['ROCF'] = ROCF_inputs

### Comparison of Algorithms

In [None]:
# Tuning results are cached on disk: reuse the CSV when it exists, otherwise run the
# optimisation for every registered algorithm and write the cache.
RESULTS_CSV = "fraud_results_tuned.csv"

try:
    fraud_results_tuned = pd.read_csv(RESULTS_CSV)

# only a missing or empty cache triggers a re-run; other errors are allowed to surface
except (FileNotFoundError, pd.errors.EmptyDataError):
    tuned_records = []

    for algo, algo_inputs in hyperopt_inputs.items():
        # run hyperopt
        _, best_f1, best_params, best_precision, best_recall = hyperopt(
            algo_inputs['param_space'],
            X_train, y_train,
            algo_inputs['num_eval'], algo_inputs['classifier'])
        # one record per algorithm: best parameters, in-sample scores and the algorithm name
        # (a new dict is built so the dict returned by hyperopt is not modified)
        tuned_records.append({**best_params, 'f1': best_f1, 'precision': best_precision,
                              'recall': best_recall, 'algo': algo})

    # build the frame once from all records (DataFrame.append was removed in pandas 2.0)
    fraud_results_tuned = pd.DataFrame(tuned_records)

    # summary columns first, then the parameter columns in first-seen order
    summary_cols = ['algo', 'recall', 'precision', 'f1']
    param_cols = [col for col in fraud_results_tuned.columns if col not in summary_cols]
    fraud_results_tuned = fraud_results_tuned.reindex(columns=summary_cols + param_cols)

    fraud_results_tuned.to_csv(RESULTS_CSV, index=False)

In [None]:
# full table: one row per algorithm with its scores and best parameters
fraud_results_tuned

In [None]:
# In-sample scores, with the three methods from the paper first and the extra baselines after.
# Rows are picked by algorithm name instead of by position, so the table does not depend on
# the row order of the cached results.
display_order = ['LOF', 'CBOF', 'ROCF', 'KNN', 'IForest', 'OCSVM']
tuned_scores = fraud_results_tuned[['algo', 'recall', 'precision', 'f1']].set_index('algo')
tuned_scores.loc[[name for name in display_order if name in tuned_scores.index]].reset_index()

## Evaluation on Test Set

In [None]:
# Rebuild every algorithm with its tuned parameters, fit it on the test data and score
# the resulting predictions against the test labels.
algos_dict = {'LOF': LOF, 'CBOF': CBOF, 'ROCF': ROCF, 'KNN': KNN, 'IForest': IForest, 'OCSVM': OCSVM}

# contamination passed to the detectors: outlier share of the TRAIN labels
outlier_rate = sum(y_train) / len(y_train)

# hp.choice parameters are stored as option indices; these lists map them back and must
# stay in the same order as the corresponding search spaces defined above
algo_lst = ['ball_tree', 'kd_tree', 'brute']
method_lst = ['largest', 'mean', 'median']
kernel_lst = ['linear', 'poly', 'rbf', 'sigmoid']

test_records = []

# for each algorithm
for algo_name in algos_dict:

    # get best parameters from tuning
    algo_params = fraud_results_tuned.loc[fraud_results_tuned['algo'] == algo_name].iloc[0]
    outlier_rate_add = outlier_rate

    # initialise classifier; integer parameters are cast explicitly because they come
    # back as floats once the results table has been through a CSV round-trip
    if algo_name == 'LOF':
        clf = LOF(algorithm=algo_lst[int(algo_params['algorithm'])], contamination=outlier_rate,
                  leaf_size=int(algo_params['leaf_size']), n_neighbors=int(algo_params['n_neighbors']))

    elif algo_name == 'KNN':
        clf = KNN(n_neighbors=int(algo_params['n_neighbors']), contamination=outlier_rate,
                  method=method_lst[int(algo_params['method'])])

    elif algo_name == 'IForest':
        clf = IForest(max_features=int(algo_params['max_features']), contamination=outlier_rate,
                      max_samples=int(algo_params['max_samples']),
                      n_estimators=int(algo_params['n_estimators']))

    elif algo_name == 'CBOF':
        # CBOF's `k` was tuned under the hyperopt label 'n_neighbors'
        clf = CBOF(k=int(algo_params['n_neighbors']), contamination=outlier_rate,
                   lofub=algo_params['lofub'], pct=algo_params['pct'])

    elif algo_name == 'OCSVM':
        clf = OCSVM(kernel=kernel_lst[int(algo_params['kernel'])],
                    nu=algo_params['nu'], contamination=outlier_rate)

    elif algo_name == 'ROCF':
        clf = ROCF(k=int(algo_params['n_neighbors']))

    else:
        # fail loudly instead of silently reusing the classifier of the previous iteration
        raise ValueError(f"no constructor defined for algorithm {algo_name!r}")

    # fit classifier on TEST data
    clf.fit(X_test)

    # retrieve predictions on TEST data; detectors without get_outliers() expose labels_
    try:
        y_pred = clf.get_outliers()
    except AttributeError:
        y_pred = clf.labels_

    # ROCF is built without a contamination value; record the rate it reports itself
    if algo_name == 'ROCF':
        outlier_rate_add = clf.get_outlier_rate()

    report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)['1']
    test_records.append({'algo': algo_name, 'outlier_rate': outlier_rate_add,
                         'recall': report['recall'], 'precision': report['precision'],
                         'f1': report['f1-score']})

# build the frame once from all records (DataFrame.append was removed in pandas 2.0)
test_results = pd.DataFrame(test_records, columns=['algo', 'recall', 'precision', 'f1', 'outlier_rate'])

In [None]:
# test-set scores per algorithm, with the outlier rate recorded for each run
test_results[['algo', 'outlier_rate', 'recall', 'precision', 'f1']]

## Effect of "Top-n" parameter

In [None]:
# Refit every algorithm on the test data under several contamination ("top-n") settings
# to see how the assumed outlier rate changes the scores.
train_outlier = sum(y_train) / len(y_train)
test_outlier = sum(y_test) / len(y_test)

# settings compared: train rate, test rate, and the test rate raised by 0.02
outlier_rate_lst = [train_outlier, test_outlier, test_outlier + 0.02]
algos_dict = {'LOF': LOF, 'CBOF': CBOF, 'ROCF': ROCF, 'KNN': KNN, 'IForest': IForest, 'OCSVM': OCSVM}

# hp.choice parameters are stored as option indices; these lists map them back and must
# stay in the same order as the corresponding search spaces defined above
algo_lst = ['ball_tree', 'kd_tree', 'brute']
method_lst = ['largest', 'mean', 'median']
kernel_lst = ['linear', 'poly', 'rbf', 'sigmoid']

top_n_records = []

# for each algorithm
for algo_name in algos_dict:

    # get best parameters from tuning (the same for every outlier rate, so looked up once)
    algo_params = fraud_results_tuned.loc[fraud_results_tuned['algo'] == algo_name].iloc[0]

    # for each outlier rate
    for outlier_rate in outlier_rate_lst:
        outlier_rate_add = outlier_rate

        # initialise classifier; integer parameters are cast explicitly because they come
        # back as floats once the results table has been through a CSV round-trip
        if algo_name == 'LOF':
            clf = LOF(algorithm=algo_lst[int(algo_params['algorithm'])], contamination=outlier_rate,
                      leaf_size=int(algo_params['leaf_size']), n_neighbors=int(algo_params['n_neighbors']))

        elif algo_name == 'KNN':
            clf = KNN(n_neighbors=int(algo_params['n_neighbors']), contamination=outlier_rate,
                      method=method_lst[int(algo_params['method'])])

        elif algo_name == 'IForest':
            clf = IForest(max_features=int(algo_params['max_features']), contamination=outlier_rate,
                          max_samples=int(algo_params['max_samples']),
                          n_estimators=int(algo_params['n_estimators']))

        elif algo_name == 'CBOF':
            # CBOF's `k` was tuned under the hyperopt label 'n_neighbors'
            clf = CBOF(k=int(algo_params['n_neighbors']), contamination=outlier_rate,
                       lofub=algo_params['lofub'], pct=algo_params['pct'])

        elif algo_name == 'OCSVM':
            clf = OCSVM(kernel=kernel_lst[int(algo_params['kernel'])],
                        nu=algo_params['nu'], contamination=outlier_rate)

        elif algo_name == 'ROCF':
            # ROCF is built without a contamination value, so it is only run for one
            # of the settings instead of repeating the same fit
            if outlier_rate != test_outlier:
                continue
            clf = ROCF(k=int(algo_params['n_neighbors']))

        else:
            # fail loudly instead of silently reusing the classifier of a previous iteration
            raise ValueError(f"no constructor defined for algorithm {algo_name!r}")

        # fit classifier on data
        clf.fit(X_test)

        # retrieve predictions; detectors without get_outliers() expose labels_
        try:
            y_pred = clf.get_outliers()
        except AttributeError:
            y_pred = clf.labels_

        # for ROCF record the outlier rate it reports itself
        if algo_name == 'ROCF':
            outlier_rate_add = clf.get_outlier_rate()

        report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)['1']
        top_n_records.append({'algo': algo_name, 'outlier_rate': outlier_rate_add,
                              'recall': report['recall'], 'precision': report['precision'],
                              'f1': report['f1-score']})

# build the frame once from all records (DataFrame.append was removed in pandas 2.0)
top_n_results = pd.DataFrame(top_n_records, columns=['algo', 'outlier_rate', 'recall', 'precision', 'f1'])

In [None]:
# one row per (algorithm, outlier rate) combination that was evaluated
top_n_results

## Evaluation of ROCF 0.1 Threshold

In [None]:
# fit ROCF on the test features and look at the largest ROCF value it produced
# NOTE(review): k=9 lies outside the 10-30 range searched during tuning -- confirm it is intended
rocf_algo = ROCF(k=9)
rocf_algo.fit(X_test)
rocf_values = rocf_algo.get_rocfs()
max(rocf_values)  # shown as the cell output