In [1]:
from __future__ import division
from __future__ import print_function
import pandas as pd
import numpy as np
import os
import sys
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import re
from sklearn.preprocessing import RobustScaler
import scipy.stats as ss
# from sklearn.metrics import f1_score
# from sklearn.metrics import accuracy_score
from sklearn import metrics
import scipy.io
import os
import sys
from time import time
import scipy.stats as ss
from sklearn.preprocessing import RobustScaler
# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Import all models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.copod import COPOD
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP
from IPython.display import display
import time
import sys


# dataset path
# NOTE(review): this absolute, machine-specific path is not used by the loader
# below, which walks the relative "../datasets_resend/" directory instead.
data_path = "/Users/kadima/experiment_any/anomaly-detection/datasets_resend/"


# Dataset name -> 2-D float array parsed from the whitespace-separated data file.
training_dict = dict()
# Dataset name -> 1-D integer array of labels (see the inversion note below).
label_dict = dict()

for root, path, files in os.walk("../datasets_resend/"):
    for file in files:
        if not file.endswith("txt"):
            continue
        # os.path.join rather than root + file: os.walk yields sub-directory
        # roots without a trailing separator, so plain concatenation would
        # produce a non-existent path for any nested file.
        file_path = os.path.join(root, file)
        if "label" not in file:
            with open(file_path, 'r') as d:
                # One sample per line, features separated by whitespace.
                data = [[float(i) for i in line.split()] for line in d]
            # Strip the 9-character suffix (presumably "_data.txt" -- confirm
            # against the dataset naming scheme) to get the dataset name.
            training_dict[file[:-9]] = np.asarray(data).astype(float)
        else:
            with open(file_path, 'r') as d:
                # Only the first character of each line is read and the value
                # is flipped (0 <-> 1). Presumably the files mark inliers with
                # 1 so that after flipping 1 means outlier -- TODO confirm.
                label = np.asarray([1 - int(line[0]) for line in d])
            # Strip the 10-character suffix (presumably "_label.txt").
            label_dict[file[:-10]] = label

In [121]:
# Shared random generator handed to the detectors that take a random_state.
random_state = np.random.RandomState(10)
# Contamination value passed to the detectors that are given one explicitly.
outliers_fraction = 0.4

# Base detectors for LSCP: LOF with n_neighbors = 5, 10, ..., 50.
detector_list = [LOF(n_neighbors=k) for k in range(5, 51, 5)]


# Heterogeneous detector pool, keyed by display name.
# Disabled entries, kept for reference:
#     'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction)
#     'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction)
classifiers = {
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                n_estimators=280),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
    # COPOD and LSCP are built with their default contamination.
    "COPOD": COPOD(),
    'Locally Selective Combinatio (LSCP)': LSCP(detector_list),
}

# Homogeneous pool: one LOF per neighbourhood size, keyed 'lof_5' ... 'lof_50'.
classifiers2 = {'lof_' + str(k): LOF(n_neighbors=k) for k in range(5, 51, 5)}

# Display names of the heterogeneous pool, in insertion order.
names = list(classifiers.keys())



# Data Check

In [115]:
# Per-dataset summary: in-memory size of the feature matrix and the share of
# samples whose label is 1.
summary_columns = {"FileName": [], "Memory_in_MB": [], "OutliersRate": []}

for fname, data in training_dict.items():
    n_bytes = pd.DataFrame(data).memory_usage(deep=True).sum()
    labels = label_dict[fname]
    outlier_pct = sum(labels) / len(labels) * 100

    summary_columns["FileName"].append(fname)
    # Bytes -> megabytes (factor 1e-6), rounded to 4 decimals.
    summary_columns["Memory_in_MB"].append(round(n_bytes * 1e-6, 4))
    summary_columns["OutliersRate"].append(str(round(outlier_pct, 4)) + "%")

pd.DataFrame(summary_columns)

Unnamed: 0,FileName,Memory_in_MB,OutliersRate
0,Shuttle_withoutdupl_v01,0.0731,1.2833%
1,Stamps_withoutdupl_09,0.0246,9.1176%
2,InternetAds_withoutdupl_norm_19,24.4572,18.7182%
3,WBC_v01,0.0328,2.2026%
4,Pima_withoutdupl_35,0.0493,34.8958%
5,Hepatitis_withoutdupl_16,0.0123,16.25%
6,thyroid,0.1812,2.4655%
7,mnist,6.0825,9.2069%
8,cover,22.884,0.9603%
9,breastw,0.0493,34.9927%


In [116]:
def get_detectors_scores(X, y, clfs):
    """Fit each detector on ``X`` and collect its robust-scaled scores.

    Parameters
    ----------
    X : array-like
        Data handed unchanged to every detector's ``fit`` and
        ``decision_function``.
    y : any
        Not used by this function; kept in the signature for callers.
    clfs : dict
        Maps a detector name to an object exposing ``fit(X)`` and
        ``decision_function(X)``. The detectors are fitted in place.

    Returns
    -------
    pandas.DataFrame
        One column per detector (named by its key in ``clfs``) holding the
        ``decision_function`` output after ``RobustScaler`` scaling.

    The detector name, the elapsed whole minutes and a separator line are
    printed for every detector.
    """
    scaled_by_name = {}
    for clf_name in clfs:
        detector = clfs[clf_name]
        print(clf_name)
        tick = time.time()

        detector.fit(X)
        raw_scores = detector.decision_function(X)
        # RobustScaler works on 2-D input: scale as a single column, then
        # flatten back to 1-D for the DataFrame.
        as_column = np.reshape(raw_scores, (-1, 1))
        scaled_by_name[clf_name] = RobustScaler().fit_transform(as_column).reshape(-1)

        elapsed = time.time() - tick
        print("cost", elapsed // 60, 'minutes')
        print('-' * 20)
    return pd.DataFrame.from_dict(scaled_by_name)


def softmax(scores_array):
    """Return the softmax of a 1-D score array.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow; the returned weights are non-negative and sum to 1.
    """
    exponentials = np.exp(scores_array - max(scores_array))
    return exponentials / exponentials.sum()

def get_disagreement_matrix(rank_series, ranked_score_matrix):
    """Turn one sample's per-detector ranks into softmax detector weights.

    Parameters
    ----------
    rank_series : 1-D array-like
        0-based rank of the sample under each detector; cast to ``int`` and
        used as row indices into ``ranked_score_matrix``.
    ranked_score_matrix : 2-D array
        Indexed as ``[rank, detector]``. The caller in this file passes the
        column-wise sorted score matrix.

    Returns
    -------
    numpy.ndarray
        One weight per detector, produced by ``softmax``.

    For detectors ``i`` and ``j`` the gap ``g[i, j]`` is the absolute
    difference between the value in column ``j`` at detector ``i``'s rank and
    the value in column ``j`` at detector ``j``'s own rank, floored at 1e-4.
    Detector ``i``'s raw weight is (sum of row ``i``) / (sum of column ``i``).
    """
    ranks = np.asarray(rank_series).astype(int)
    detector_idx = np.arange(len(ranks))

    # at_own_rank[j]: column j read at the rank detector j assigned.
    at_own_rank = ranked_score_matrix[ranks, detector_idx]
    # at_other_rank[i, j]: column j read at the rank detector i assigned.
    at_other_rank = ranked_score_matrix[ranks[:, None], detector_idx[None, :]]

    # Floor at 1e-4 so the column totals used as divisors are never zero.
    gaps = np.maximum(1e-4, np.abs(at_other_rank - at_own_rank[None, :]))

    row_totals = gaps.sum(axis=1)
    # Column totals, taken as row sums of a contiguous transposed copy so the
    # floating-point reduction is performed the same way as for the rows.
    col_totals = gaps.T.copy().sum(axis=1)

    return softmax(row_totals / col_totals)

def form_rank_matrix(score_matrix):
    """Return a copy of ``score_matrix`` with every column sorted ascending.

    Columns are sorted independently of each other, so row ``r`` of the result
    holds the ``r``-th smallest value of each column. The input array is left
    unmodified.
    """
    return np.sort(score_matrix, axis=0)


def ensemble_disagreement_score(score_matrix, num_detectors):
    """Append a disagreement-weighted ``"ensemble"`` column to a score table.

    Parameters
    ----------
    score_matrix : pandas.DataFrame
        One row per sample, one column per detector.
    num_detectors : int
        Number of detector columns. The shape assertion below fails unless
        the weight matrix has the same shape as the score table.

    Returns
    -------
    pandas.DataFrame
        A copy of ``score_matrix`` with an extra ``"ensemble"`` column: the
        per-row sum of the scores multiplied by that row's detector weights.
        The input frame is not modified.
    """
    values = score_matrix.to_numpy()
    n_samples = len(values)

    # 0-based ordinal rank of every sample within each detector column.
    rank_matrix = np.zeros([n_samples, num_detectors])
    for col in range(num_detectors):
        rank_matrix[:, col] = ss.rankdata(values[:, col], 'ordinal') - 1

    # Column-wise sorted scores, looked up by rank when weighting.
    sorted_values = form_rank_matrix(values)

    # One weight vector per sample: every row of ranks is handed to
    # get_disagreement_matrix together with the sorted scores.
    weights = np.apply_along_axis(
        get_disagreement_matrix, 1, rank_matrix, sorted_values
    )
    assert weights.shape == values.shape

    with_ensemble = score_matrix.copy()
    with_ensemble["ensemble"] = np.sum(values * weights, axis=1)
    return with_ensemble
                
    

In [118]:
# Dataset name -> scaled scores of every LOF detector in classifiers2.
result_dict = {}
# Dataset name -> single-column frame of ROC AUC per score column
# (every detector plus "ensemble").
file_auc_dict = {}

n_base_detectors = len(classifiers2)
for fname, features in training_dict.items():
    print(fname)
    true_labels = label_dict[fname]

    detector_scores = get_detectors_scores(features, true_labels, classifiers2)
    result_dict[fname] = detector_scores

    scores_with_ensemble = ensemble_disagreement_score(detector_scores, n_base_detectors)
    auc_per_column = scores_with_ensemble.apply(
        lambda column: metrics.roc_auc_score(true_labels, column)
    )
    file_auc_dict[fname] = pd.DataFrame(auc_per_column)
    print("=" * 45)

Shuttle_withoutdupl_v01
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
Stamps_withoutdupl_09
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
InternetAds_withoutdupl_norm_19
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
WBC_v01
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
Pima_withoutdupl_3

cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
optdigits
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
vowels
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
vertebral
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
--------------------
lof_25
cost 0.0 minutes
--------------------
musk
lof_5
cost 0.0 minutes
--------------------
lof_10
cost 0.0 minutes
--------------------
lof_15
cost 0.0 minutes
--------------------
lof_20
cost 0.0 minutes
----------

In [120]:
# Count and report the datasets where the ensemble AUC is at least as high as
# the best individual detector's AUC.
cnt_ = 0
base_detector_names = list(classifiers2.keys())
for fname, auc_table in file_auc_dict.items():
    ensemble_auc = auc_table.loc['ensemble', 0]
    best_base_auc = auc_table.loc[base_detector_names, :].max()[0]
    if ensemble_auc >= best_base_auc:
        cnt_ += 1
        print(fname)
        print("ensemble AUC", ensemble_auc)
        print('Best classifier auc', best_base_auc)
        print(cnt_, "\n\n")

Shuttle_withoutdupl_v01
ensemble AUC 0.9498461538461539
Best classifier auc 0.9473076923076924
1 


cover
ensemble AUC 0.5730346873593692
Best classifier auc 0.5596620638531327
2 


satimage-2
ensemble AUC 0.577268215012335
Best classifier auc 0.575933970887432
3 


letter
ensemble AUC 0.91782
Best classifier auc 0.9127533333333333
4 


ALOI_withoutdupl
ensemble AUC 0.7811182984327345
Best classifier auc 0.7669462460155039
5 


vowels
ensemble AUC 0.9539402560455191
Best classifier auc 0.9467567567567567
6 




In [169]:
# Toy score matrix (5 samples x 3 detectors) for the worked example in the
# following cells. It is pinned to the literal values shown in the recorded
# output instead of an unseeded np.random.random((5,3)) draw: the later hand
# calculation hard-codes differences such as 0.8-0.78 and 0.77-0.56, which only
# hold for this exact matrix, so a fresh random draw could never reproduce it.
test_score = np.array([[0.74, 0.8 , 0.77],
                       [0.24, 0.64, 0.91],
                       [0.92, 0.5 , 0.56],
                       [0.76, 0.78, 0.03],
                       [0.3 , 0.94, 0.49]])

In [170]:
# 0-based ordinal rank of every toy sample within each of the 3 columns.
n_toy_rows, n_toy_cols = len(test_score), 3
rank_matrix = np.zeros([n_toy_rows, n_toy_cols])
weight_matrix = np.zeros([n_toy_rows, n_toy_cols])
for col in range(n_toy_cols):
    column_ranks = ss.rankdata(test_score[:, col], 'ordinal')
    rank_matrix[:, col] = column_ranks - 1

In [171]:
# Sort each column of the toy scores independently (ascending); row r then
# holds the r-th smallest value of every column.
sorted_test_score = form_rank_matrix(test_score)

In [172]:
# One weight vector per toy sample: each row of rank_matrix is passed to
# get_disagreement_matrix together with the column-sorted scores.
scaled_weights_matrix = np.apply_along_axis(
    get_disagreement_matrix, 1, rank_matrix, sorted_test_score
)
# Ensemble score per sample = sum over detectors of score * weight.
weighted_test_score = test_score * scaled_weights_matrix
output = weighted_test_score.sum(axis=1)

In [173]:
test_score

array([[0.74, 0.8 , 0.77],
       [0.24, 0.64, 0.91],
       [0.92, 0.5 , 0.56],
       [0.76, 0.78, 0.03],
       [0.3 , 0.94, 0.49]])

In [174]:
scaled_weights_matrix

array([[0.9878491 , 0.00864841, 0.00350249],
       [0.43749664, 0.32820231, 0.23430106],
       [0.26209926, 0.56147666, 0.17642408],
       [0.33441398, 0.51188672, 0.1536993 ],
       [0.17401764, 0.60687958, 0.21910278]])

In [175]:
rank_matrix

array([[2., 3., 3.],
       [0., 1., 4.],
       [4., 0., 2.],
       [3., 2., 0.],
       [1., 4., 1.]])

In [176]:
sorted_test_score

array([[0.24, 0.5 , 0.03],
       [0.3 , 0.64, 0.49],
       [0.74, 0.78, 0.56],
       [0.76, 0.8 , 0.77],
       [0.92, 0.94, 0.91]])

In [177]:
output

array([0.74062398, 0.52826263, 0.62066713, 0.65803724, 0.73003246])

In [183]:
# Hand-built 3x3 gap matrix for the first toy sample; entries that would be
# zero are written as the 1e-4 floor.
tmp_m = np.asarray([
    [1e-4,        0.8 - 0.78, 0.77 - 0.56],
    [0.76 - 0.74, 1e-4,       1e-4],
    [0.76 - 0.74, 1e-4,       1e-4],
])

In [184]:
tmp_m

array([[1.0e-04, 2.0e-02, 2.1e-01],
       [2.0e-02, 1.0e-04, 1.0e-04],
       [2.0e-02, 1.0e-04, 1.0e-04]])

In [187]:
# Hand check of the first sample's weights: row totals over column totals of
# the gap matrix (with the 1e-4 floor terms left out), then softmax.
hand_row_totals = np.asarray([0.02 + 0.21, 0.02, 0.02])
hand_col_totals = np.asarray([0.02 + 0.02, 0.02, 0.21])
softmax(hand_row_totals / hand_col_totals)

array([0.98799341, 0.00854782, 0.00345877])

In [188]:
# Same hand-computed weights, applied element-wise to the first toy sample's
# scores.
hand_row_totals = np.asarray([0.02 + 0.21, 0.02, 0.02])
hand_col_totals = np.asarray([0.02 + 0.02, 0.02, 0.21])
softmax(hand_row_totals / hand_col_totals) * test_score[0, :]

array([0.73111512, 0.00683825, 0.00266326])