In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import re
from sklearn.preprocessing import RobustScaler
import scipy.stats as ss
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import scipy.io
from __future__ import division
from __future__ import print_function
import os
import sys
from time import time
import scipy.stats as ss

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager
# Import all models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP # ensemble

In [2]:
# dataset path
data_path = "/Users/kadima/experiment_any/anomaly-detection/datasets/"

# load data

In [20]:
def getData(fileName):
    # check fileName:
    files = [x for x in os.listdir(data_path) if x.endswith(".mat")]
    mat = scipy.io.loadmat(data_path+fileName)
    X = mat["X"]
    y = mat["y"]
    return X, y

def read_data_parkinson():
    file_path = "/Users/kadima/experiment_any/anomaly-detection/Parkinson_withoutdupl_75.txt"
    with open(file_path,'r') as f:
        data = f.readlines()
    # print(data[28:-2])
    col_names = []
    list_ = []
    for id_, line in enumerate(data):
        if line.startswith("@ATTRIBUTE"):
            colName = re.findall("@ATTRIBUTE '(.*?)'", line)[0]
            col_names.append(colName)
        if id_ >= 28 and id_<= len(data) - 1 and line != "\n":
            new_line = line.split(",")
            new_line = [x.strip() for x in new_line]
            new_line = [float(new_line[x]) for x in range(len(new_line)-1)] + \
                       [new_line[-1].replace("'", "")]
            list_.append(new_line)

    df = pd.DataFrame(list_, columns=col_names)
    df['outlier'] = df.outlier.apply(lambda x: 1 if x =='yes' else 0)
    X = df.iloc[:, 1:-1]
    y = df.iloc[:, -1]
    return X, y




# thresholders

In [202]:
def sd_thresholder(scores, real_y, factor=2.5):
    high_limit = np.mean(scores) + factor*np.std(scores)
    y_predict = scores>=high_limit
    y_predict = [1 if j else 0 for j in y_predict]
    f1 = f1_score(real_y, y_predict)
    return y_predict, f1, high_limit


def mad_thresholder(scores, real_y):
    median_ = np.median(scores)
    mad = 1.4826*np.median(np.abs(scores-median_))
    y_predict = scores>= 3*mad
    y_predict = [1 if j else 0 for j in y_predict]
    f1 = f1_score(real_y, y_predict)
    return y_predict, f1, 3*mad

def iqr_thresholder(scores, real_y):
    iqr = np.percentile(scores,75) - np.percentile(scores,25)
    y_predict = scores >= (np.percentile(scores,75)+1.5*iqr)
    y_predict = [1 if j else 0 for j in y_predict]
    f1 = f1_score(real_y, y_predict)
    return y_predict, f1, np.percentile(scores,75)+1.5*iqr

def disagreement(score_matrix, num_detectors, real_y):
    rank_matrix = np.zeros([len(score_matrix), num_detectors])
    for i in range(num_detectors):
    # rank by each column and get its rank_position
        rank_matrix[:,i] = ss.rankdata(score_matrix[:, i])
    std_record = np.zeros(len(X))
    rank_record = []
    for i in range(len(score_matrix)):
        rank_rows = rank_matrix[i,:]
        min_ = np.min(rank_rows)
        max_ = np.max(rank_rows)
        rank_rows  = [x for x in rank_rows if x not in [min_,max_]]
        std_record[i] = np.std(rank_rows)
        rank_record.append(rank_rows)
    valid_rank_rows = []
    for i in range(len(score_matrix)):
        num_large = np.sum(np.array(rank_record[i]) >= len(score_matrix)//2)
        if num_large >= num_detectors//3:
            valid_rank_rows.append(i)
    std_max_rows = np.argsort(np.array(std_record)[valid_rank_rows])[-1:]
    std_median_scores = np.median(score_matrix[std_max_rows,:], axis=0)
    threshold_for_each_detector = std_median_scores
    
    f1_list = []
    for i in range(num_detectors):
        outliers_rows = score_matrix[:,i] >= threshold_for_each_detector[i]
        y_predict = [1 if j else 0 for j in outliers_rows]
        f1 = f1_score(real_y, y_predict)
        f1_list.append(f1)
    return f1_list, threshold_for_each_detector


In [203]:
def get_score_matrix(X, num_detectors):
    return np.zeros([X.shape[0], num_detectors])

def get_perform_matrix(num_thresholders, num_detectors):
    return np.zeros((num_thresholders, num_detectors))


def

# Detectors

In [204]:
random_state = np.random.RandomState(10)
outliers_fraction = 0.4
# initialize a set of detectors for LSCP
detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15),
                 LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30),
                 LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45),
                 LOF(n_neighbors=50)]
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction),
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state, n_estimators = 200),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction, random_state=random_state),
    'Locally Selective Combination (LSCP)': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state)
}

names = []
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    names.append(clf)
#     print('Model', i + 1, clf)

In [205]:
X,y = read_data_parkinson()

In [206]:
# Fit the models with the generated data and 
# compare model performances


def get_result(X, y, classifiers):
    threshold_records = list()
    # create matrix to store the performance
    score_matrix = get_score_matrix(X, len(classifiers.keys()))
    perform_table = get_perform_matrix(5, len(classifiers.keys()) )

    np.random.seed(5)
    clfs = []
    # Fit the model
    for i, (clf_name, clf) in enumerate(classifiers.items()):
#         print(i + 1, 'fitting', clf_name)
        # fit the data and tag outliers
        clf.fit(X)
        clfs.append(clf)
        scores_pred = clf.decision_function(X)
        score_matrix[:, i] = scores_pred

    for i, thresholder in enumerate([sd_thresholder, mad_thresholder, 
                                     iqr_thresholder]):
        kk = []
        for j in range(score_matrix.shape[1]):
            _,perform_table[i,j],b = thresholder(score_matrix[:,j], y)
            kk.append(b)
            
        threshold_records.append(kk)
            
    for i in range(score_matrix.shape[1]):
        perform_table[-2,i] = f1_score(y,clfs[i].predict(X))
        
    perform_table[-1,:], a = disagreement(score_matrix, len(classifiers), y)
    threshold_records.append(a)
    
    return (pd.DataFrame(perform_table, columns = names, index = ["sd",'mad','iqr','default','disagreement']), threshold_records)

In [207]:
df, records = get_result(X, y, classifiers)
records = pd.DataFrame([np.array(x) for x in records])

1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest




6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)




In [208]:
df

Unnamed: 0,Angle-based Outlier Detector (ABOD),Cluster-based Local Outlier Factor (CBLOF),Feature Bagging,Histogram-base Outlier Detection (HBOS),Isolation Forest,K Nearest Neighbors (KNN),Average KNN,Local Outlier Factor (LOF),Minimum Covariance Determinant (MCD),One-class SVM (OCSVM),Principal Component Analysis (PCA),Locally Selective Combination (LSCP)
sd,0.0,0.101911,0.101911,0.103226,0.090909,0.051948,0.065359,0.101911,0.04,0.0,0.065789,0.089744
mad,0.0,0.176471,0.859649,0.0,0.17284,0.243094,0.243094,0.859649,0.573991,0.02649,0.676259,0.134969
iqr,0.0,0.125,0.113208,0.196319,0.184049,0.147239,0.145455,0.113208,0.255814,0.0,0.115385,0.135802
default,0.526316,0.515556,0.5,0.657778,0.524444,0.411483,0.328205,0.479638,0.586667,0.524444,0.453333,0.466368
disagreement,0.232044,0.556962,0.642336,0.781022,0.681648,0.294737,0.223464,0.555102,0.81672,0.559322,0.386792,0.58498


In [209]:
df.apply(lambda x: np.mean(x), axis=1)

sd              0.067726
mad             0.330542
iqr             0.127623
default         0.497853
disagreement    0.526261
dtype: float64

In [227]:
result_dict = dict()
for datasets in os.listdir("/Users/kadima/experiment_any/anomaly-detection/datasets/"):
    print(datasets)
    X,y = getData(datasets)
    X = X.astype(np.float64)
    result_dict[datasets] = get_result(X, y, classifiers)
    print("\n\n")

pima.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest




6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



cardio.mat
1 fitting Angle-based Outlier Detector (ABOD)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)




10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)





letter.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



musk.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



thyroid.mat
1 fitting Angle-based Outlier Detector (ABOD)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)





vowels.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



satellite.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



lympho.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest




6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



speech.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)




3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)




10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



wbc.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



glass.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging
4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest




6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)



satimage-2.mat
1 fitting Angle-based Outlier Detector (ABOD)
2 fitting Cluster-based Local Outlier Factor (CBLOF)
3 fitting Feature Bagging




4 fitting Histogram-base Outlier Detection (HBOS)
5 fitting Isolation Forest
6 fitting K Nearest Neighbors (KNN)
7 fitting Average KNN
8 fitting Local Outlier Factor (LOF)
9 fitting Minimum Covariance Determinant (MCD)
10 fitting One-class SVM (OCSVM)
11 fitting Principal Component Analysis (PCA)
12 fitting Locally Selective Combination (LSCP)





In [211]:
result_dict['lympho.mat'][0]

Unnamed: 0,Angle-based Outlier Detector (ABOD),Cluster-based Local Outlier Factor (CBLOF),Feature Bagging,Histogram-base Outlier Detection (HBOS),Isolation Forest,K Nearest Neighbors (KNN),Average KNN,Local Outlier Factor (LOF),Minimum Covariance Determinant (MCD),One-class SVM (OCSVM),Principal Component Analysis (PCA),Locally Selective Combination (LSCP)
sd,0.0,0.4,0.909091,0.8,0.909091,0.6,0.6,0.666667,0.5,0.4,0.666667,0.666667
mad,0.0,0.077922,0.077922,0.0,0.909091,0.077922,0.077922,0.077922,0.153846,0.545455,0.077922,0.545455
iqr,0.0,0.375,0.75,0.857143,0.923077,0.5,0.533333,0.666667,0.5,0.545455,0.521739,0.75
default,0.166667,0.184615,0.193548,0.184615,0.184615,0.342857,0.5,0.184615,0.184615,0.184615,0.184615,0.181818
disagreement,0.088889,0.111111,0.218182,0.117647,0.102564,0.142857,0.155844,0.1875,0.115385,0.206897,0.11215,0.190476


In [212]:
pd.DataFrame(result_dict['lympho.mat'][1])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.006399,3.942939,1.185708,-10.533459,0.122819,3.669084,2.696982,1.21808,49.361686,7.909586,3974.61748,2.416171
1,0.004032,1.289103,0.104494,11.054582,0.107455,0.949257,0.779551,0.08698,19.574496,7.561092,967.079504,1.283248
2,0.003421,3.390803,1.125916,-12.414658,0.09587,3.260276,2.429143,1.142611,46.282708,6.747274,3021.188029,1.589623
3,-0.006173,1.877498,1.034748,-24.005871,-0.031235,2.44949,1.779796,1.025868,12.519862,0.613233,2000.995182,-0.243285


In [213]:
result_dict['lympho.mat'][0].apply(lambda x: np.mean(x), axis=1)

sd              0.593182
mad             0.218448
iqr             0.576868
default         0.223100
disagreement    0.145792
dtype: float64

In [214]:
result_dict['wbc.mat'][0].apply(lambda x: np.mean(x), axis=1)

sd              0.412609
mad             0.238732
iqr             0.478953
default         0.256351
disagreement    0.422098
dtype: float64

In [215]:
result_dict['wbc.mat'][0]

Unnamed: 0,Angle-based Outlier Detector (ABOD),Cluster-based Local Outlier Factor (CBLOF),Feature Bagging,Histogram-base Outlier Detection (HBOS),Isolation Forest,K Nearest Neighbors (KNN),Average KNN,Local Outlier Factor (LOF),Minimum Covariance Determinant (MCD),One-class SVM (OCSVM),Principal Component Analysis (PCA),Locally Selective Combination (LSCP)
sd,0.0,0.424242,0.424242,0.611111,0.529412,0.432432,0.388889,0.470588,0.344828,0.413793,0.5,0.411765
mad,0.0,0.222222,0.105263,0.173913,0.528302,0.124629,0.132075,0.105263,0.3,0.491803,0.162791,0.518519
iqr,0.0,0.566038,0.474576,0.612245,0.538462,0.54902,0.528302,0.491228,0.373333,0.535714,0.56,0.518519
default,0.222222,0.244186,0.251497,0.244186,0.244186,0.256098,0.391304,0.254545,0.244186,0.232558,0.244186,0.247059
disagreement,0.153846,0.447761,0.47619,0.507937,0.432432,0.46875,0.483871,0.461538,0.373333,0.418605,0.416667,0.424242
