In [18]:
import os 
import sys
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
from warnings import filterwarnings
filterwarnings('ignore')

import pyod packages and methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging




import performance metrics package

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score


define data file and read X and y

In [21]:
mat_file_list=['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat',
              'letter.mat','lympho.mat','mnist.mat','musk.mat','optdigits.mat'
              ,'pendigits.mat','pima.mat','satellite.mat','satimage-2.mat',
              'shuttle.mat','vertebral.mat','vowels.mat','wbc.mat']
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [5]:
data=loadmat('cardio.mat')
data 


{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [6]:
len(data)

5

In [7]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [8]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

In [9]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

In [10]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

define nine outliers to be compared 

In [11]:
df_columns=['Data','#Samples','#Dimeensions','Outlier Prec',
           'ABOD','CBLOF','FB','IForest','KNN','LOF','MCD',
           'OCSVM','PCA']


In [12]:
roc_df = pd.DataFrame(columns= df_columns)
roc_df

Unnamed: 0,Data,#Samples,#Dimeensions,Outlier Prec,ABOD,CBLOF,FB,IForest,KNN,LOF,MCD,OCSVM,PCA


PRECISION_N_SCORES -Performance evaluation table 

In [13]:
prn_df =pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,#Dimeensions,Outlier Prec,ABOD,CBLOF,FB,IForest,KNN,LOF,MCD,OCSVM,PCA


TIME DATAFRAME

In [14]:
time_df =pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,#Dimeensions,Outlier Prec,ABOD,CBLOF,FB,IForest,KNN,LOF,MCD,OCSVM,PCA


EXPLORING ALL MAT FILES

In [22]:
#Exploring/Accessing all mat files
from time import time
random_state = np.random.RandomState(42)


for mat_file in mat_file_list:
    print("\n... Processing", mat_file, "...")
    mat = loadmat(os.path.join(mat_file))
    
    X = mat["X"]
    y = mat["y"].ravel()
    
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage = round(outliers_fraction*100, ndigits=4)
    
    #construct containers for saving results
    roc_list = [mat_file[:-4],X.shape[0],X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4],X.shape[0],X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4],X.shape[0],X.shape[1], outliers_percentage]
    
    #60% data for training and 40% for testing
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state = 2)
    
    # STandardaizing data for preprocessing
    X_train_norm,  X_test_norm = standardizer(X_train,X_test)
    
    classifiers = {
                   "Angle based outlier detector(ABOD)":ABOD(contamination = outliers_fraction), #contamination based on what factors the data is considered as outliers
                   "Cluster-based Loacl Outlier Factor":CBLOF(contamination = outliers_fraction,check_estimator= False, random_state=random_state),
                   "Feature Bagging":FeatureBagging(contamination = outliers_fraction,random_state=random_state),
                   "Histogram base outlier detection": HBOS(contamination = outliers_fraction),
                   "Isolation Forest": IForest(contamination = outliers_fraction,random_state=random_state),
                   "k Nearest Neighbours (kNN)": KNN(contamination = outliers_fraction),
                   "Local Outlier Factor(LOF)": LOF(contamination = outliers_fraction),
                   "Minimun Covariance Determinant (MCD)": MCD(contamination = outliers_fraction, random_state=random_state),
                   "One-Class SVM (OCSVM)": OCSVM(contamination = outliers_fraction),
                   "Principal Component Analysis": PCA(contamination = outliers_fraction,random_state=random_state)
                  }
                   
    for clf_name,clf in classifiers.items():
        t0 =time ()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1-t0, ndigits=4)
        time_list.append(duration)
                        
        roc = round(roc_auc_score(y_test,test_scores),ndigits = 4)
        prn = round(precision_n_scores(y_test,test_scores),ndigits = 4)
                        
        print("{clf_name} ROC:{roc}, precision @ rank n:{prn}, execution time: {duration}s".
                              format(clf_name=clf_name, roc =roc, prn = prn, duration = duration))
        roc_list.append(roc)
        prn_list.append(prn)
                        
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df_columns = df_columns
    time_df = pd.concat([time_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df_columns = df_columns
    roc_df = pd.concat([roc_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df_columns = df_columns
    prn_df = pd.concat([prn_df,temp_df],axis=0)


... Processing arrhythmia.mat ...
Angle based outlier detector(ABOD) ROC:0.7246, precision @ rank n:0.2857, execution time: 0.1637s
Cluster-based Loacl Outlier Factor ROC:0.7283, precision @ rank n:0.2857, execution time: 0.1358s
Feature Bagging ROC:0.712, precision @ rank n:0.2857, execution time: 0.5761s
Histogram base outlier detection ROC:0.8077, precision @ rank n:0.4643, execution time: 0.0543s
Isolation Forest ROC:0.7972, precision @ rank n:0.4286, execution time: 0.4295s
k Nearest Neighbours (kNN) ROC:0.7148, precision @ rank n:0.2857, execution time: 0.0893s
Local Outlier Factor(LOF) ROC:0.7171, precision @ rank n:0.3214, execution time: 0.0703s
Minimun Covariance Determinant (MCD) ROC:0.725, precision @ rank n:0.3214, execution time: 0.6803s
One-Class SVM (OCSVM) ROC:0.7124, precision @ rank n:0.2857, execution time: 0.0583s
Principal Component Analysis ROC:0.7134, precision @ rank n:0.2857, execution time: 0.06s

... Processing cardio.mat ...
Angle based outlier detector(AB

Angle based outlier detector(ABOD) ROC:0.689, precision @ rank n:0.0882, execution time: 1.6201s
Cluster-based Loacl Outlier Factor ROC:0.8356, precision @ rank n:0.25, execution time: 0.2229s
Feature Bagging ROC:0.4959, precision @ rank n:0.0588, execution time: 4.0437s
Histogram base outlier detection ROC:0.9243, precision @ rank n:0.3235, execution time: 0.008s
Isolation Forest ROC:0.9422, precision @ rank n:0.3971, execution time: 0.6217s
k Nearest Neighbours (kNN) ROC:0.7524, precision @ rank n:0.1029, execution time: 0.5739s
Local Outlier Factor(LOF) ROC:0.4846, precision @ rank n:0.0441, execution time: 0.5352s
Minimun Covariance Determinant (MCD) ROC:0.8448, precision @ rank n:0.1176, execution time: 2.0516s
One-Class SVM (OCSVM) ROC:0.9332, precision @ rank n:0.4118, execution time: 0.9364s
Principal Component Analysis ROC:0.9343, precision @ rank n:0.3824, execution time: 0.0082s

... Processing pima.mat ...
Angle based outlier detector(ABOD) ROC:0.6924, precision @ rank n:0.

In [24]:
roc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CBLOF,Data,FB,IForest,KNN,LOF,MCD,OCSVM,Outlier Prec,PCA
0,arrhythmia,452,274,14.6018,0.7246,0.7283,0.712,0.8077,0.7972,0.7148,...,,,,,,,,,,
0,cardio,1831,21,9.6122,0.5928,0.7221,0.6003,0.8556,0.9243,0.7663,...,,,,,,,,,,
0,glass,214,9,4.2056,0.8571,0.8373,0.8571,0.7288,0.745,0.8427,...,,,,,,,,,,
0,ionosphere,351,33,35.8974,0.9312,0.9103,0.9132,0.6263,0.8597,0.937,...,,,,,,,,,,
0,letter,1600,32,6.25,0.8804,0.7622,0.8536,0.5871,0.6212,0.861,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,satimage-2,5803,36,1.2235,0.8338,0.9999,0.4576,0.9865,0.9972,0.9644,...,,,,,,,,,,
0,shuttle,49097,9,7.1511,0.6157,0.664,0.4715,0.9856,0.9958,0.6551,...,,,,,,,,,,
0,vertebral,240,6,12.5,0.358,0.385,0.3232,0.351,0.3981,0.3667,...,,,,,,,,,,
0,vowels,1456,12,3.4341,0.9667,0.913,0.9603,0.7511,0.8054,0.9669,...,,,,,,,,,,


In [26]:
prn_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CBLOF,Data,FB,IForest,KNN,LOF,MCD,OCSVM,Outlier Prec,PCA
0,arrhythmia,452,274,14.6018,0.2857,0.2857,0.2857,0.4643,0.4286,0.2857,...,,,,,,,,,,
0,cardio,1831,21,9.6122,0.275,0.3,0.2,0.525,0.55,0.3875,...,,,,,,,,,,
0,glass,214,9,4.2056,0.2857,0.1429,0.1429,0,0.1429,0.1429,...,,,,,,,,,,
0,ionosphere,351,33,35.8974,0.8305,0.8644,0.7966,0.5085,0.7119,0.8983,...,,,,,,,,,,
0,letter,1600,32,6.25,0.3778,0.2667,0.3556,0.0889,0.1333,0.3111,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,satimage-2,5803,36,1.2235,0.1875,0.9375,0.0938,0.6562,0.875,0.4062,...,,,,,,,,,,
0,shuttle,49097,9,7.1511,0.1819,0.2326,0.079,0.9806,0.9486,0.2062,...,,,,,,,,,,
0,vertebral,240,6,12.5,0,0,0,0.0714,0.0714,0,...,,,,,,,,,,
0,vowels,1456,12,3.4341,0.5263,0.2105,0.3684,0.1579,0.3684,0.6316,...,,,,,,,,,,


In [27]:
time_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CBLOF,Data,FB,IForest,KNN,LOF,MCD,OCSVM,Outlier Prec,PCA
0,arrhythmia,452,274,14.6018,4.4181,2.7594,0.6101,1.7519,0.5789,0.0901,...,,,,,,,,,,
0,cardio,1831,21,9.6122,0.4035,0.1632,0.7619,0.0101,0.3635,0.1358,...,,,,,,,,,,
0,glass,214,9,4.2056,0.0599,0.0559,0.0479,0.004,0.3156,0.008,...,,,,,,,,,,
0,ionosphere,351,33,35.8974,0.0959,0.0679,0.0882,0.012,0.3543,0.02,...,,,,,,,,,,
0,letter,1600,32,6.25,0.3893,0.0919,0.7204,0.008,0.3795,0.1355,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,satimage-2,5803,36,1.2235,1.6607,0.3081,5.3415,0.0163,0.7006,0.8657,...,,,,,,,,,,
0,shuttle,49097,9,7.1511,13.1617,1.1538,40.3906,0.0181,3.4099,7.0776,...,,,,,,,,,,
0,vertebral,240,6,12.5,0.0718,0.078,0.0462,0,0.346,0.012,...,,,,,,,,,,
0,vowels,1456,12,3.4341,0.3723,0.1432,0.3997,0.004,0.4236,0.0839,...,,,,,,,,,,
