In [4]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat


In [14]:
# Import the PyOD outlier-detection models used in this benchmark

from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS

from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [15]:
#Performance methods

from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [17]:
# File names of the benchmark datasets (.mat) that will be processed, one per line
mat_file_list = [
    "arrhythmia.mat",
    "cardio.mat",
    "glass.mat",
    "ionosphere.mat",
    "letter.mat",
    "lympho.mat",
    "mnist.mat",
    "musk.mat",
    "optdigits.mat",
    "pendigits.mat",
    "pima.mat",
    "satellite.mat",
    "satimage-2.mat",
    "shuttle.mat",
    "vertebral.mat",
    "vowels.mat",
    "wbc.mat",
]
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [20]:
# Load a single dataset (cardio) to inspect what a .mat file contains.
# The folder is named once and joined with os.path.join, the same way the
# evaluation loop further down builds its file paths.
DATA_DIR = "E:\\python\\projectday1"

data = loadmat(os.path.join(DATA_DIR, "cardio.mat"))
data  # dictionary of MATLAB header entries plus "X" (inputs) and "y" (output)

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [21]:
# Number of entries in the dictionary returned by loadmat
len(data)

5

In [23]:
# Keys of the loaded dictionary
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [25]:
# Values of the loaded dictionary (note: this displays the stored arrays in full)
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

In [27]:
# Type and shape of the input (independent) feature array stored under "X"
type(data["X"]),data["X"].shape

(numpy.ndarray, (1831, 21))

In [31]:
# Type and shape of the output (dependent) label array stored under "y"
type(data["y"]),data["y"].shape

(numpy.ndarray, (1831, 1))

In [32]:
# Header shared by the result tables: four dataset descriptors, then one
# column per detector.
df_columns = [
    "Data",
    "#Samples",
    "# Dimensions",
    "Outlier Perc",
    "ABOD",
    "CBLOF",
    "FB",
    "HBOS",
    "IForest",
    "KNN",
    "LOF",
    "MCD",
    "OCSVM",
    "PCA",
]
df_columns

['Data',
 '#Samples',
 '# Dimensions',
 'Outlier Perc',
 'ABOD',
 'CBLOF',
 'FB',
 'HBOS',
 'IForest',
 'KNN',
 'LOF',
 'MCD',
 'OCSVM',
 'PCA']

In [33]:
# ROC performance table: empty frame with the shared result columns; the
# evaluation loop later in the notebook records roc_auc_score values for it.
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [34]:
# Precision @ rank n performance table: empty frame with the shared result
# columns; the evaluation loop later records precision_n_scores values for it.

prn_df =  pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [35]:
# Execution-time table: empty frame with the shared result columns; the
# evaluation loop later records the measured durations (in seconds) for it.
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [59]:
# Benchmark every detector on every .mat file and collect ROC AUC,
# precision @ rank n and execution time into roc_df, prn_df and time_df.
from time import time

# Folder that holds the benchmark .mat files; named once so it is easy to relocate.
DATA_DIR = "E:\\python\\projectday1"

# A single RandomState instance, created once here, is passed to every detector
# below that takes a random_state argument.
random_state = np.random.RandomState(42)

# Result rows are gathered in plain lists and converted to DataFrames once,
# after the loop. The previous version assigned `temp_df_columns = df_columns`
# (a brand-new variable) instead of `temp_df.columns = df_columns`, so each
# per-dataset frame kept integer column labels and pd.concat could not line
# them up with the named columns of the result tables.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, "...")
    mat = loadmat(os.path.join(DATA_DIR, mat_file))

    X = mat["X"]
    y = mat["y"].ravel()  # flatten the label column to 1-D

    # Share of non-zero labels, as a fraction and as a percentage.
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage = round(outliers_fraction*100, ndigits=4)

    # Leading descriptor columns: dataset name (file name without extension),
    # number of samples, number of dimensions, outlier percentage.
    dataset_info = [os.path.splitext(mat_file)[0], X.shape[0], X.shape[1], outliers_percentage]

    # Containers for this dataset's results; each starts with the descriptors.
    roc_list = list(dataset_info)
    prn_list = list(dataset_info)
    time_list = list(dataset_info)

    # 60% of the data for training and 40% for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)

    # Standardize the features with PyOD's standardizer before fitting.
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Insertion order must match the detector columns in df_columns
    # (ABOD, CBLOF, FB, HBOS, IForest, KNN, LOF, MCD, OCSVM, PCA), because the
    # scores are appended to the rows in this order.
    # contamination is set to this dataset's labelled outlier fraction.
    classifiers = {
        "Angle based outlier detector(ABOD)": ABOD(contamination=outliers_fraction),
        "Cluster-based Loacl Outlier Factor": CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state),
        "Feature Bagging": FeatureBagging(contamination=outliers_fraction, random_state=random_state),
        "Histogram base outlier detection": HBOS(contamination=outliers_fraction),
        "Isolation Forest": IForest(contamination=outliers_fraction, random_state=random_state),
        "k Nearest Neighbours (kNN)": KNN(contamination=outliers_fraction),
        "Local Outlier Factor(LOF)": LOF(contamination=outliers_fraction),
        "Minimun Covariance Determinant (MCD)": MCD(contamination=outliers_fraction, random_state=random_state),
        "One-Class SVM (OCSVM)": OCSVM(contamination=outliers_fraction),
        "Principal Component Analysis": PCA(contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        # The timed section covers fitting on the training split and scoring
        # the test split.
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1-t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print("{clf_name} ROC:{roc}, precision @ rank n:{prn}, execution time: {duration}s".
              format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)

    time_rows.append(time_list)
    roc_rows.append(roc_list)
    prn_rows.append(prn_list)

# Build each result table once, with the shared column names applied directly.
# Re-running this cell rebuilds the tables instead of appending duplicate rows.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)
                
                
    
    


... Processing arrhythmia.mat ...
Angle based outlier detector(ABOD) ROC:0.7246, precision @ rank n:0.2857, execution time: 0.307s
Cluster-based Loacl Outlier Factor ROC:0.7283, precision @ rank n:0.2857, execution time: 0.2533s
Feature Bagging ROC:0.712, precision @ rank n:0.2857, execution time: 1.1521s
Histogram base outlier detection ROC:0.8077, precision @ rank n:0.4643, execution time: 0.1289s
Isolation Forest ROC:0.7972, precision @ rank n:0.4286, execution time: 0.8136s
k Nearest Neighbours (kNN) ROC:0.7148, precision @ rank n:0.2857, execution time: 0.1721s
Local Outlier Factor(LOF) ROC:0.7171, precision @ rank n:0.3214, execution time: 0.1514s




Minimun Covariance Determinant (MCD) ROC:0.725, precision @ rank n:0.3214, execution time: 1.1814s
One-Class SVM (OCSVM) ROC:0.7124, precision @ rank n:0.2857, execution time: 0.0944s
Principal Component Analysis ROC:0.7134, precision @ rank n:0.2857, execution time: 0.1124s

... Processing cardio.mat ...
Angle based outlier detector(ABOD) ROC:0.5928, precision @ rank n:0.275, execution time: 0.7708s
Cluster-based Loacl Outlier Factor ROC:0.7221, precision @ rank n:0.3, execution time: 0.2768s
Feature Bagging ROC:0.6003, precision @ rank n:0.2, execution time: 1.6223s
Histogram base outlier detection ROC:0.8556, precision @ rank n:0.525, execution time: 0.0166s
Isolation Forest ROC:0.9243, precision @ rank n:0.55, execution time: 0.7662s
k Nearest Neighbours (kNN) ROC:0.7663, precision @ rank n:0.3875, execution time: 0.3168s
Local Outlier Factor(LOF) ROC:0.6062, precision @ rank n:0.2, execution time: 0.1961s




Minimun Covariance Determinant (MCD) ROC:0.8685, precision @ rank n:0.45, execution time: 0.9555s
One-Class SVM (OCSVM) ROC:0.9404, precision @ rank n:0.55, execution time: 0.1593s
Principal Component Analysis ROC:0.9542, precision @ rank n:0.6375, execution time: 0.003s

... Processing glass.mat ...
Angle based outlier detector(ABOD) ROC:0.8571, precision @ rank n:0.2857, execution time: 0.0833s
Cluster-based Loacl Outlier Factor ROC:0.8373, precision @ rank n:0.1429, execution time: 0.0877s
Feature Bagging ROC:0.8571, precision @ rank n:0.1429, execution time: 0.0694s
Histogram base outlier detection ROC:0.7288, precision @ rank n:0.0, execution time: 0.0066s
Isolation Forest ROC:0.745, precision @ rank n:0.1429, execution time: 0.5471s
k Nearest Neighbours (kNN) ROC:0.8427, precision @ rank n:0.1429, execution time: 0.0153s
Local Outlier Factor(LOF) ROC:0.8499, precision @ rank n:0.1429, execution time: 0.0077s
Minimun Covariance Determinant (MCD) ROC:0.7975, precision @ rank n:0.14



Minimun Covariance Determinant (MCD) ROC:0.8541, precision @ rank n:0.3077, execution time: 5.1221s
One-Class SVM (OCSVM) ROC:0.864, precision @ rank n:0.4281, execution time: 8.9496s
Principal Component Analysis ROC:0.8635, precision @ rank n:0.4114, execution time: 0.311s

... Processing musk.mat ...
Angle based outlier detector(ABOD) ROC:0.1312, precision @ rank n:0.0588, execution time: 4.3484s
Cluster-based Loacl Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.7069s
Feature Bagging ROC:0.4629, precision @ rank n:0.1471, execution time: 24.4637s
Histogram base outlier detection ROC:1.0, precision @ rank n:0.9697, execution time: 0.1108s
Isolation Forest ROC:0.9995, precision @ rank n:0.9118, execution time: 2.4532s
k Nearest Neighbours (kNN) ROC:0.7337, precision @ rank n:0.1471, execution time: 3.5664s
Local Outlier Factor(LOF) ROC:0.4544, precision @ rank n:0.1176, execution time: 3.3084s
Minimun Covariance Determinant (MCD) ROC:0.9999, precision @ rank n:0.9706



Minimun Covariance Determinant (MCD) ROC:0.3681, precision @ rank n:0.0, execution time: 2.2581s
One-Class SVM (OCSVM) ROC:0.5012, precision @ rank n:0.0, execution time: 2.7376s
Principal Component Analysis ROC:0.5033, precision @ rank n:0.0, execution time: 0.0975s

... Processing pendigits.mat ...
Angle based outlier detector(ABOD) ROC:0.689, precision @ rank n:0.0882, execution time: 2.9752s
Cluster-based Loacl Outlier Factor ROC:0.8356, precision @ rank n:0.25, execution time: 0.527s
Feature Bagging ROC:0.4959, precision @ rank n:0.0588, execution time: 8.5505s
Histogram base outlier detection ROC:0.9243, precision @ rank n:0.3235, execution time: 0.0215s
Isolation Forest ROC:0.9422, precision @ rank n:0.3971, execution time: 1.3633s
k Nearest Neighbours (kNN) ROC:0.7524, precision @ rank n:0.1029, execution time: 1.2703s
Local Outlier Factor(LOF) ROC:0.4846, precision @ rank n:0.0441, execution time: 1.1429s
Minimun Covariance Determinant (MCD) ROC:0.8448, precision @ rank n:0.11







Minimun Covariance Determinant (MCD) ROC:0.9889, precision @ rank n:0.7362, execution time: 18.1867s
One-Class SVM (OCSVM) ROC:0.9896, precision @ rank n:0.9514, execution time: 85.7344s
Principal Component Analysis ROC:0.9879, precision @ rank n:0.9486, execution time: 0.0703s

... Processing vertebral.mat ...
Angle based outlier detector(ABOD) ROC:0.358, precision @ rank n:0.0, execution time: 0.105s
Cluster-based Loacl Outlier Factor ROC:0.385, precision @ rank n:0.0, execution time: 0.092s
Feature Bagging ROC:0.3232, precision @ rank n:0.0, execution time: 0.0665s
Histogram base outlier detection ROC:0.351, precision @ rank n:0.0714, execution time: 0.004s
Isolation Forest ROC:0.3981, precision @ rank n:0.0714, execution time: 0.5593s
k Nearest Neighbours (kNN) ROC:0.3667, precision @ rank n:0.0, execution time: 0.02s
Local Outlier Factor(LOF) ROC:0.2892, precision @ rank n:0.0, execution time: 0.008s
Minimun Covariance Determinant (MCD) ROC:0.4051, precision @ rank n:0.0, executio



Angle based outlier detector(ABOD) ROC:0.9667, precision @ rank n:0.5263, execution time: 0.5602s
Cluster-based Loacl Outlier Factor ROC:0.913, precision @ rank n:0.2105, execution time: 0.2127s
Feature Bagging ROC:0.9603, precision @ rank n:0.3684, execution time: 0.562s
Histogram base outlier detection ROC:0.7511, precision @ rank n:0.1579, execution time: 0.008s
Isolation Forest ROC:0.8054, precision @ rank n:0.3684, execution time: 0.7169s
k Nearest Neighbours (kNN) ROC:0.9669, precision @ rank n:0.6316, execution time: 0.1379s
Local Outlier Factor(LOF) ROC:0.9595, precision @ rank n:0.4737, execution time: 0.07s
Minimun Covariance Determinant (MCD) ROC:0.7261, precision @ rank n:0.0, execution time: 1.3467s
One-Class SVM (OCSVM) ROC:0.8672, precision @ rank n:0.4211, execution time: 0.0811s
Principal Component Analysis ROC:0.7061, precision @ rank n:0.2105, execution time: 0.004s

... Processing wbc.mat ...
Angle based outlier detector(ABOD) ROC:0.8776, precision @ rank n:0.0, exe