In [1]:
import numpy as np
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat 

In [2]:
# Import the PyOD outlier-detection models

from pyod.models.pca import PCA  # Pca model
from pyod.models.mcd import MCD   # Mcd model
from pyod.models.ocsvm import OCSVM # Ocsvm model
from pyod.models.lof import LOF # lof model
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



In [3]:
# Import metrics and utility functions

from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [4]:
# File names of the benchmark datasets (MATLAB .mat files) to evaluate
mat_file_list = [
    "arrhythmia.mat",
    "cardio.mat",
    "glass.mat",
    "ionosphere.mat",
    "letter.mat",
    "lympho.mat",
    "mnist.mat",
    "musk.mat",
    "optdigits.mat",
    "pendigits.mat",
    "pima.mat",
    "satellite.mat",
    "satimage-2.mat",
    "shuttle.mat",
    "vertebral.mat",
    "vowels.mat",
    "wbc.mat",
]

In [5]:
# Example: load a single MAT file. loadmat returns a dict that maps the
# stored variable names (plus a few '__...__' metadata entries) to their values.
data = loadmat('data/cardio.mat')

In [6]:
# Show the whole loaded dict: metadata entries followed by the 'X' and 'y' arrays
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [7]:
# List the entry names; 'X' and 'y' are the arrays used in the rest of the notebook
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [8]:
# Show only the values, in the same order as the keys listed by data.keys()
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

In [9]:
# Inspect the container type and the (rows, columns) shape of the 'X' array
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

In [10]:
# Inspect the container type and shape of the 'y' array (a single column, one entry per row of 'X')
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [11]:
# Column layout shared by the result tables: four dataset-metadata columns
# followed by one column per detector.
# NOTE: the '#Dimensuions' spelling is kept as-is because it is the column
# label that appears in the recorded outputs.
df_columns = [
    # dataset metadata
    'data', '#samples', '#Dimensuions', 'Outlier',
    # one score column per detector
    'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest',
    'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA',
]

In [12]:
# Start with an empty table (column headers only) for the ROC scores
roc_df = pd.DataFrame(columns=df_columns)  # ROC performance evaluation table
roc_df

Unnamed: 0,data,#samples,#Dimensuions,Outlier,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [13]:
# Start with an empty table (column headers only) for the precision @ rank n scores
prn_df = pd.DataFrame(columns=df_columns)  # precision_n_scores performance evaluation table
prn_df

Unnamed: 0,data,#samples,#Dimensuions,Outlier,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [14]:
# Start with an empty table (column headers only) for the execution times
time_df = pd.DataFrame(columns=df_columns) # execution-time table (seconds)
time_df

Unnamed: 0,data,#samples,#Dimensuions,Outlier,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Exploring all Mat files

In [15]:
from time import time

# A single RandomState is shared by the train/test split and all seeded
# detectors, so results are reproducible when this cell is run from the start.
random_state = np.random.RandomState(42)

# Collect one row per dataset for each metric and build the three tables once
# after the loop.  The previous version concatenated inside the loop and
# assigned the ROC and precision results to `time_df`, which left `roc_df`
# and `prn_df` empty and overwrote the timing table on every iteration.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print("\n... Processing",mat_file, '...')
    mat = loadmat(os.path.join('data',mat_file))

    X = mat['X']
    y = mat['y'].ravel()  # flatten the labels to 1-D for the split and the metrics
    # Share of non-zero labels; passed to every detector as `contamination`
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage = round(outliers_fraction * 100,ndigits=4)

    # Every result row starts with the same four metadata cells:
    # dataset name (file name without extension), #samples, #dimensions,
    # outlier percentage.
    dataset_name = os.path.splitext(mat_file)[0]
    row_header = [dataset_name, X.shape[0], X.shape[1], outliers_percentage]
    roc_list = list(row_header)
    prn_list = list(row_header)
    time_list = list(row_header)

    # 60% data for training and 40% for testing
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=random_state)

    # standardizing data for processing
    X_train_norm,X_test_norm = standardizer(X_train,X_test)

    # The insertion order of this dict must match the detector columns in
    # `df_columns` (ABOD, CBLOF, FB, HBOS, IForest, KNN, LOF, MCD, OCSVM, PCA),
    # because scores are appended to the rows in iteration order.
    classifier = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                  'Cluster-based Local Outlier Factor':CBLOF(contamination=outliers_fraction, check_estimator=False,random_state=random_state),
                 'Feature Bagging': FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                 'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                 'K Nearest Neighbors (KNN)':KNN(contamination=outliers_fraction),
                 'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
                 }
    for clf_name,clf in classifier.items():
        # The measured duration covers fitting on the training split plus
        # scoring the test split.
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1-t0,ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn = round(precision_n_scores(y_test,test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @rank n:{prn},'
              'execution time: {duration}s'.format(clf_name = clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    # Store each finished row under its own metric
    time_rows.append(time_list)
    roc_rows.append(roc_list)
    prn_rows.append(prn_list)

# Build each result table from its own rows (one row per dataset)
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @rank n:0.3571,execution time: 6.3046s
Cluster-based Local Outlier Factor ROC:0.7684, precision @rank n:0.4643,execution time: 4.5968s
Feature Bagging ROC:0.7799, precision @rank n:0.5,execution time: 1.3308s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @rank n:0.5714,execution time: 3.5003s
Isolation Forest ROC:0.8527, precision @rank n:0.5714,execution time: 0.9718s
K Nearest Neighbors (KNN) ROC:0.782, precision @rank n:0.5,execution time: 0.1981s
Local Outlier Factor (LOF) ROC:0.7787, precision @rank n:0.4643,execution time: 0.1664s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @rank n:0.4286,execution time: 2.0785s
One-class SVM (OCSVM) ROC:0.7986, precision @rank n:0.5,execution time: 0.1424s
Principal Component Analysis (PCA) ROC:0.7997, precision @rank n:0.5,execution time: 0.1611s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @rank n:0.1875,execution time: 0.9383s
Cluster-based Local Outlier Factor ROC:0.8221, precision @rank n:0.4844,execution time: 0.3689s
Feature Bagging ROC:0.4879, precision @rank n:0.1406,execution time: 1.851s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @rank n:0.4688,execution time: 0.0157s
Isolation Forest ROC:0.9414, precision @rank n:0.5,execution time: 0.9181s
K Nearest Neighbors (KNN) ROC:0.6959, precision @rank n:0.2812,execution time: 0.3733s
Local Outlier Factor (LOF) ROC:0.4715, precision @rank n:0.125,execution time: 0.2068s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @rank n:0.3906,execution time: 1.2304s
One-class SVM (OCSVM) ROC:0.9507, precision @rank n:0.5938,execution time: 0.1865s
Principal Component Analysis (PCA) ROC:0.9638, precision @rank n:0.6875,execution time: 0.0598s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @rank n:0.25,execution time: 0.1294s
Cluster-based Local Outlier Factor ROC:0.8506, precision @rank n:0.25,execution time: 0.0981s
Feature Bagging ROC:0.7043, precision @rank n:0.25,execution time: 0.0713s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @rank n:0.0,execution time: 0.0091s
Isolation Forest ROC:0.7195, precision @rank n:0.25,execution time: 0.6575s
K Nearest Neighbors (KNN) ROC:0.7805, precision @rank n:0.25,execution time: 0.028s
Local Outlier Factor (LOF) ROC:0.7774, precision @rank n:0.25,execution time: 0.0093s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @rank n:0.0,execution time



Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @rank n:0.3562,execution time: 14.9514s
Cluster-based Local Outlier Factor ROC:0.8447, precision @rank n:0.4007,execution time: 2.3043s
Feature Bagging ROC:0.7259, precision @rank n:0.3664,execution time: 68.6613s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @rank n:0.1199,execution time: 0.0524s
Isolation Forest ROC:0.7801, precision @rank n:0.2979,execution time: 2.1405s
K Nearest Neighbors (KNN) ROC:0.8409, precision @rank n:0.4144,execution time: 6.6632s
Local Outlier Factor (LOF) ROC:0.7085, precision @rank n:0.339,execution time: 6.5764s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @rank n:0.3973,execution time: 2.9452s
One-class SVM (OCSVM) ROC:0.8417, precision @rank n:0.3801,execution time: 4.7469s
Principal Component Analysis (PCA) ROC:0.8396, precision @rank n:0.3767,execution time: 0.2233s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @rank n:0.0333,execution time: 2.3592s
Cluster-based Local Outlier Factor ROC:1.0, precision @rank n:1.0,execution time: 0.3529s
Feature Bagging ROC:0.5228, precision @rank n:0.1667,execution time: 14.0471s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @rank n:0.9667,execution time: 0.0978s
Isolation Forest ROC:0.9996, precision @rank n:0.9333,execution time: 1.3118s
K Nearest Neighbors (KNN) ROC:0.7348, precision @rank n:0.2333,execution time: 1.8465s
Local Outlier Factor (LOF) ROC:0.5323, precision @rank n:0.1333,execution time: 1.7778s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @rank n:0.9667,execu



Minimum Covariance Determinant (MCD) ROC:0.3486, precision @rank n:0.0,execution time: 2.4473s
One-class SVM (OCSVM) ROC:0.4972, precision @rank n:0.0,execution time: 3.1354s
Principal Component Analysis (PCA) ROC:0.504, precision @rank n:0.0,execution time: 0.0929s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @rank n:0.0308,execution time: 3.7422s
Cluster-based Local Outlier Factor ROC:0.9609, precision @rank n:0.3077,execution time: 0.6062s
Feature Bagging ROC:0.4687, precision @rank n:0.0462,execution time: 10.4853s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @rank n:0.2615,execution time: 0.0234s
Isolation Forest ROC:0.9422, precision @rank n:0.2769,execution time: 1.5323s
K Nearest Neighbors (KNN) ROC:0.7602, precision @rank n:0.0462,execution time: 1.4373s
Local Outlier Factor (LOF) ROC:0.481, precision @rank n:0.0462,execution time: 1.284s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @rank n:0.0615,ex





Minimum Covariance Determinant (MCD) ROC:0.9903, precision @rank n:0.7534,execution time: 24.2093s
One-class SVM (OCSVM) ROC:0.9922, precision @rank n:0.9553,execution time: 95.9656s
Principal Component Analysis (PCA) ROC:0.9902, precision @rank n:0.9503,execution time: 0.0725s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @rank n:0.0,execution time: 0.1257s
Cluster-based Local Outlier Factor ROC:0.3908, precision @rank n:0.0,execution time: 0.1046s
Feature Bagging ROC:0.3027, precision @rank n:0.0,execution time: 0.0753s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @rank n:0.0,execution time: 0.0053s
Isolation Forest ROC:0.3576, precision @rank n:0.0,execution time: 0.6297s
K Nearest Neighbors (KNN) ROC:0.318, precision @rank n:0.0,execution time: 0.0239s
Local Outlier Factor (LOF) ROC:0.318, precision @rank n:0.0,execution time: 0.0065s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @rank n:0.0,execution time: