In [72]:
import os 
import sys 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [73]:

from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN

from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD

from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [74]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [75]:
# Dataset files handled by the evaluation loop, in processing order.
# Only the stems are spelled out; every file shares the '.mat' extension.
mat_file_list = [
    stem + '.mat'
    for stem in (
        'arrhythmia', 'cardio', 'glass', 'ionosphere', 'letter', 'lympho',
        'mnist', 'musk', 'optdigits', 'pendigits', 'pima', 'satellite',
        'satimage-2', 'shuttle', 'vertebral', 'vowels', 'wbc',
    )
]



In [76]:
# Load a single dataset file to inspect what loadmat returns before
# processing the whole list; the bare `data` expression displays the result.
data=loadmat('cardio.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [77]:
# Number of top-level entries in the loaded object.
len(data)

5

In [78]:
# Names of the top-level entries; 'X' and 'y' are the ones read by the evaluation loop.
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [79]:
# Values stored under those entries (metadata first, then the 'X' and 'y' arrays).
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

## Input (independent) features: type and shape as loaded from the .mat file

In [80]:
# Container type and (rows, columns) shape of the array stored under 'X'.
type(data['X']), data['X'].shape

(numpy.ndarray, (1831, 21))

## Dependent (target/output) feature: type and shape

In [81]:
# Container type and shape of the array stored under 'y'
# (the evaluation loop flattens it with ravel() before use).
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [82]:
# Column layout shared by the ROC, precision @ rank n and timing tables:
# four descriptive columns followed by one column per detector.
df_columns = (
    ['Data', '#Samples', '# Dimensions', 'Outlier Perc']
    + ['ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest',
       'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA']
)





### ROC performance evaluation table 

In [83]:
# Empty ROC table with the shared column layout; populated by the evaluation loop further down.
roc_df=pd.DataFrame(columns=df_columns)

### Precision @ rank n performance evaluation table

In [84]:
# Empty precision @ rank n table with the shared column layout; populated by the evaluation loop further down.
prn_df=pd.DataFrame(columns=df_columns)

## Execution time evaluation table

In [85]:
# Empty timing table with the shared column layout; populated by the evaluation loop further down.
time_df=pd.DataFrame(columns=df_columns)

### Evaluating every detector on all data files

In [102]:
from time import time

# Benchmark every detector on every dataset in mat_file_list and record, per
# dataset, the ROC AUC, the precision @ rank n and the fit + scoring wall time.
# The results end up in roc_df, prn_df and time_df: one row per dataset, with
# the columns given by df_columns.

# A single seeded generator is shared by the train/test split and by every
# detector that accepts random_state; it is re-created on each run of the cell.
random_state = np.random.RandomState(42)

# Collect one row per dataset and build each DataFrame once after the loop.
# Concatenating inside the loop (without ignore_index) gave every row the index
# label 0 and kept appending to the existing tables whenever the cell was re-run.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print("\n ......Processing", mat_file, '...')
    mat = loadmat(mat_file)

    X = mat['X']
    y = mat['y'].ravel()  # flatten the label array to 1-D
    # Non-zero labels are counted as outliers.
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Every result row starts with the same four descriptive values:
    # dataset name (file name without extension), #samples, #dimensions, outlier %.
    dataset_info = [os.path.splitext(mat_file)[0], X.shape[0], X.shape[1], outliers_percentage]
    roc_list = list(dataset_info)
    prn_list = list(dataset_info)
    time_list = list(dataset_info)

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Insertion order matters: results are appended in this order and must
    # line up with the detector columns of df_columns.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, random_state=random_state),
        'feature Bagging': FeatureBagging(
            contamination=outliers_fraction, random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
        'Isolation Forest': IForest(
            contamination=outliers_fraction, random_state=random_state),
        'K nearest neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'MCD': MCD(contamination=outliers_fraction, random_state=random_state),
        'OCSVM': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        # Timed span: fitting on the training split plus scoring the test split.
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precison @ rank n:{prn},'
               'esection time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)

    roc_rows.append(roc_list)
    prn_rows.append(prn_list)
    time_rows.append(time_list)

# One row per dataset, indexed 0..n-1, with numeric columns kept numeric.
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)
time_df = pd.DataFrame(time_rows, columns=df_columns)
    

        


 ......Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precison @ rank n:0.3571,esection time: 0.1845s
Cluster-based Local Outlier Factor ROC:0.7684, precison @ rank n:0.4643,esection time: 0.1536s
feature Bagging ROC:0.7799, precison @ rank n:0.5,esection time: 0.7201s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precison @ rank n:0.5714,esection time: 0.0733s
Isolation Forest ROC:0.8527, precison @ rank n:0.5714,esection time: 0.4986s
K nearest neighbors (KNN) ROC:0.782, precison @ rank n:0.5,esection time: 0.1052s
Local Outlier Factor (LOF) ROC:0.7787, precison @ rank n:0.4643,esection time: 0.0882s




MCD ROC:0.8228, precison @ rank n:0.4286,esection time: 0.6907s
OCSVM ROC:0.7986, precison @ rank n:0.5,esection time: 0.0558s
Principal Component Analysis (PCA) ROC:0.7997, precison @ rank n:0.5,esection time: 0.0728s

 ......Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precison @ rank n:0.1875,esection time: 0.4768s
Cluster-based Local Outlier Factor ROC:0.8221, precison @ rank n:0.4844,esection time: 0.191s
feature Bagging ROC:0.4879, precison @ rank n:0.1406,esection time: 1.1345s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precison @ rank n:0.4688,esection time: 0.011s
Isolation Forest ROC:0.9414, precison @ rank n:0.5,esection time: 0.4786s
K nearest neighbors (KNN) ROC:0.6959, precison @ rank n:0.2812,esection time: 0.1745s
Local Outlier Factor (LOF) ROC:0.4715, precison @ rank n:0.125,esection time: 0.1167s




MCD ROC:0.8778, precison @ rank n:0.3906,esection time: 0.6328s
OCSVM ROC:0.9507, precison @ rank n:0.5938,esection time: 0.1033s
Principal Component Analysis (PCA) ROC:0.9638, precison @ rank n:0.6875,esection time: 0.005s

 ......Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precison @ rank n:0.25,esection time: 0.0529s
Cluster-based Local Outlier Factor ROC:0.8506, precison @ rank n:0.25,esection time: 0.0618s
feature Bagging ROC:0.7043, precison @ rank n:0.25,esection time: 0.0439s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precison @ rank n:0.0,esection time: 0.004s
Isolation Forest ROC:0.7195, precison @ rank n:0.25,esection time: 0.3561s
K nearest neighbors (KNN) ROC:0.7805, precison @ rank n:0.25,esection time: 0.011s
Local Outlier Factor (LOF) ROC:0.7774, precison @ rank n:0.25,esection time: 0.004s
MCD ROC:0.7165, precison @ rank n:0.0,esection time: 0.0419s
OCSVM ROC:0.6189, precison @ rank n:0.25,esection time: 0.002s
Principal Component 



Angle-based Outlier Detector (ABOD) ROC:0.7813, precison @ rank n:0.3562,esection time: 8.943s
Cluster-based Local Outlier Factor ROC:0.8447, precison @ rank n:0.4007,esection time: 1.2477s
feature Bagging ROC:0.7259, precison @ rank n:0.3664,esection time: 61.3186s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precison @ rank n:0.1199,esection time: 0.0858s
Isolation Forest ROC:0.7801, precison @ rank n:0.2979,esection time: 2.6451s
K nearest neighbors (KNN) ROC:0.8409, precison @ rank n:0.4144,esection time: 7.6673s
Local Outlier Factor (LOF) ROC:0.7085, precison @ rank n:0.339,esection time: 7.4682s




MCD ROC:0.863, precison @ rank n:0.3973,esection time: 5.5381s
OCSVM ROC:0.8417, precison @ rank n:0.3801,esection time: 5.9202s
Principal Component Analysis (PCA) ROC:0.8396, precison @ rank n:0.3767,esection time: 0.2015s

 ......Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precison @ rank n:0.0333,esection time: 3.7789s
Cluster-based Local Outlier Factor ROC:1.0, precison @ rank n:1.0,esection time: 0.5076s
feature Bagging ROC:0.5228, precison @ rank n:0.1667,esection time: 17.21s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precison @ rank n:0.9667,esection time: 0.0688s
Isolation Forest ROC:0.9996, precison @ rank n:0.9333,esection time: 1.5359s
K nearest neighbors (KNN) ROC:0.7348, precison @ rank n:0.2333,esection time: 2.1084s
Local Outlier Factor (LOF) ROC:0.5323, precison @ rank n:0.1333,esection time: 1.9468s
MCD ROC:1.0, precison @ rank n:0.9667,esection time: 13.5548s
OCSVM ROC:1.0, precison @ rank n:1.0,esection time: 1.3863s
Principal Co



MCD ROC:0.3486, precison @ rank n:0.0,esection time: 1.4451s
OCSVM ROC:0.4972, precison @ rank n:0.0,esection time: 1.5868s
Principal Component Analysis (PCA) ROC:0.504, precison @ rank n:0.0,esection time: 0.056s

 ......Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precison @ rank n:0.0308,esection time: 1.6666s
Cluster-based Local Outlier Factor ROC:0.9609, precison @ rank n:0.3077,esection time: 0.3141s
feature Bagging ROC:0.4687, precison @ rank n:0.0462,esection time: 5.1682s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precison @ rank n:0.2615,esection time: 0.011s
Isolation Forest ROC:0.9422, precison @ rank n:0.2769,esection time: 0.7759s
K nearest neighbors (KNN) ROC:0.7602, precison @ rank n:0.0462,esection time: 0.6991s
Local Outlier Factor (LOF) ROC:0.481, precison @ rank n:0.0462,esection time: 0.6413s
MCD ROC:0.8271, precison @ rank n:0.0615,esection time: 2.1502s
OCSVM ROC:0.93, precison @ rank n:0.2923,esection time: 1.0123s
Princi





MCD ROC:0.9903, precison @ rank n:0.7534,esection time: 13.3842s
OCSVM ROC:0.9922, precison @ rank n:0.9553,esection time: 51.4505s
Principal Component Analysis (PCA) ROC:0.9902, precison @ rank n:0.9503,esection time: 0.0389s

 ......Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precison @ rank n:0.0,esection time: 0.0559s
Cluster-based Local Outlier Factor ROC:0.3908, precison @ rank n:0.0,esection time: 0.0568s
feature Bagging ROC:0.3027, precison @ rank n:0.0,esection time: 0.0598s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precison @ rank n:0.0,esection time: 0.004s
Isolation Forest ROC:0.3576, precison @ rank n:0.0,esection time: 0.3281s
K nearest neighbors (KNN) ROC:0.318, precison @ rank n:0.0,esection time: 0.01s
Local Outlier Factor (LOF) ROC:0.318, precison @ rank n:0.0,esection time: 0.004s
MCD ROC:0.3308, precison @ rank n:0.0,esection time: 0.0429s
OCSVM ROC:0.4087, precison @ rank n:0.0,esection time: 0.002s
Principal Component Ana

In [103]:
roc_list

['wbc',
 378,
 30,
 5.5556,
 0.9232,
 0.9063,
 0.9415,
 0.9592,
 0.9451,
 0.9437,
 0.9352,
 0.8986,
 0.9408,
 0.9324]

In [104]:
prn_list

['wbc', 378, 30, 5.5556, 0.3, 0.6, 0.5, 0.7, 0.5, 0.5, 0.4, 0.4, 0.5, 0.6]

In [105]:
time_list

['wbc',
 378,
 30,
 5.5556,
 0.0937,
 0.0808,
 0.0837,
 0.01,
 0.349,
 0.0209,
 0.009,
 0.0618,
 0.008,
 0.003]