In [28]:
import numpy as np
from sklearn.metrics import roc_auc_score
from RShash import RShash

#from scipy.io import arff

#### Apply the rs-hash implementation to three outlier detection benchmark datasets
This RS-Hash method is an implementation of the algorithm described by Sathe and Aggarwal in "Subspace Outlier Detection in Linear Time with Randomized Hashing".

In that paper, the datasets Optdigits, Musk2 and Waveform were used, among others, for performance benchmarking.
The implementation is verified by checking that the same metrics are achieved (because the approach is non-deterministic, the results vary from one execution to the next).

Below, the implementation is applied to:
1) Optdigits (http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits)

2) Musk-2 (http://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z)

3) Waveform (http://archive.ics.uci.edu/ml/machine-learning-databases/waveform)

In [23]:
# Optdigits: fetch the training and test splits from the UCI archive.
# Each remote file is downloaded and parsed only once; columns 0-63 are used
# as features and column 64 as the class label.
url_path = "http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits"
raw_tra = np.loadtxt(url_path+"/optdigits.tra",delimiter=',',usecols=range(65),ndmin=2)
raw_tes = np.loadtxt(url_path+"/optdigits.tes",delimiter=',',usecols=range(65),ndmin=2)
data_tra, label_tra = raw_tra[:, :64], raw_tra[:, 64]
data_tes, label_tes = raw_tes[:, :64], raw_tes[:, 64]
data = np.concatenate( (data_tra, data_tes))
label = np.concatenate( (label_tra, label_tes))

# downsample the outliers (0-labelled observations) to 150 as in paper
# NOTE: the rows to drop are drawn from NumPy's global RNG without a fixed
# seed, so the retained subset (and the score below) changes on every run.
zero_idx = np.where(label==0)[0]
indices_to_drop = np.random.choice(zero_idx, size=len(zero_idx)-150, replace=False)
data = np.delete(data,indices_to_drop, 0)
label = np.delete(label, indices_to_drop, 0)
outlier_label = (label==0)

# apply rs-hash and check roc-metric when comparing to true outlier labels
# NOTE(review): the score is negated before computing the ROC AUC, presumably
# because RShash assigns lower scores to outliers -- confirm against RShash.
rsh = RShash()
rsh_score = rsh.score(data)
roc = roc_auc_score(outlier_label, -rsh_score)
print("rs-hash outlier scoring on optdigits dataset gets a roc score of: ", roc)

rs-hash outlier scoring on optdigits dataset gets a roc score of:  0.6697328595867879


In [26]:
# Musk-2 ("http://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z"),
# read from a locally extracted copy of the archive.
musk_file = "musk_clean2_data"
# columns 2..167 are the numeric features; column 0 holds the name strings matched below
data_musk = np.loadtxt(musk_file, delimiter=',', usecols=range(2, 168))
label_musk = np.genfromtxt(musk_file, dtype='str', delimiter=',', usecols=range(1))

# prepare the data set as in the paper: three NON-MUSK names form the inliers,
# two MUSK names form the outliers
inlier_names = ['NON-MUSK-j146', 'NON-MUSK-j147', 'NON-MUSK-252']
outlier_names = ['MUSK-213', 'MUSK-211']
musk_inliers = np.flatnonzero(np.isin(label_musk, inlier_names))
musk_outliers = np.flatnonzero(np.isin(label_musk, outlier_names))

# rows handed to the algorithm: all inliers first, then all outliers,
# with a boolean ground-truth vector in the same order
selected_rows = np.concatenate((musk_inliers, musk_outliers), axis=0)
data = data_musk[selected_rows]
outlier_label = np.concatenate(
    (np.zeros(len(musk_inliers), dtype=bool), np.ones(len(musk_outliers), dtype=bool)),
    axis=0,
)

# score with rs-hash and compare against the true outlier labels via ROC AUC
rsh = RShash()
rsh_score = rsh.score(data)
roc = roc_auc_score(outlier_label, -rsh_score)
print("rs-hash outlier scoring on Musk-2 dataset gets a roc score of: ", roc)

rs-hash outlier scoring on Musk-2 dataset gets a roc score of:  0.999993046017976


In [29]:
# Waveform, from "http://archive.ics.uci.edu/ml/machine-learning-databases/waveform"
# load from local extracted copy
# The file is parsed once; columns 0-20 are used as features and column 21 as
# the class label.
wave_raw = np.loadtxt("waveform_data",delimiter=',',usecols=range(22),encoding='utf-8',ndmin=2)
data_wave = wave_raw[:, :21]
label = wave_raw[:, 21]

# downsample the outliers (0-labelled observations) to 10% as in paper
# NOTE: the rows to drop are drawn from NumPy's global RNG without a fixed
# seed, so the retained subset (and the score below) changes on every run.
zero_idx = np.where(label==0)[0]
# builtin int() replaces the np.int alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (np.int now raises AttributeError)
n_drop = int(np.round(0.9*len(zero_idx)))
indices_to_drop = np.random.choice(zero_idx, size=n_drop, replace=False)
data = np.delete(data_wave,indices_to_drop, 0)
label = np.delete(label, indices_to_drop, 0)
outlier_label = (label==0)

# apply rs-hash and check roc-metric when comparing to true outlier labels
# NOTE(review): the score is negated before computing the ROC AUC, presumably
# because RShash assigns lower scores to outliers -- confirm against RShash.
rsh = RShash()
rsh_score = rsh.score(data)
roc = roc_auc_score(outlier_label, -rsh_score)
print("rs-hash outlier scoring on waveform dataset gets a roc score of: ", roc)

rs-hash outlier scoring on waveform dataset gets a roc score of:  0.7563061098717334
