In [1]:
import copy
import math
import time
import scipy
import pickle
import random
import numpy as np
import pandas as pd
import scipy.io as sio

from sklearn import metrics
from numpy import linalg as li
from math import log, ceil, floor
from sklearn.externals import joblib
from os.path import dirname, join as pjoin
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score



In [2]:
#load dataset
def DataPreprocessing(dataset):
    """Load a MATLAB ``.mat`` dataset and split its sample indices by label.

    The file is read from ``'./'`` joined with ``dataset`` and must contain a
    feature matrix under the key ``'X'`` and labels under the key ``'y'``
    (0 = inlier, 1 = outlier).

    Parameters
    ----------
    dataset : str
        Path of the ``.mat`` file.

    Returns
    -------
    tuple
        ``(X, y, inliers, outliers)`` where ``y`` is flattened to 1-D and
        ``inliers`` / ``outliers`` are arrays of indices into ``y``.
    """
    mat_fname = pjoin('./', dataset)
    print('Loading', mat_fname)
    contents = sio.loadmat(mat_fname)
    X = contents['X']
    # loadmat returns 2-D arrays; flatten the labels to one dimension.
    y = contents['y'].reshape(-1)
    # Label convention: 1 = outliers, 0 = inliers.
    inliers = np.where(y == 0)[0]
    outliers = np.where(y == 1)[0]
    print('inliers', inliers, 'outliers', outliers, len(inliers), 'inliers', len(outliers), 'outliers')
    return X, y, inliers, outliers

#Threshold selection algorithm in OC-HDC
def thresholdSelection(x, a, b):
    """Return the decision threshold ``mean(x) * a - std(x) * b``.

    Parameters
    ----------
    x : array-like
        Similarity (confidence) values the threshold is derived from.
    a : number
        Weight applied to the mean of ``x``.
    b : number
        Weight applied to the (population) standard deviation of ``x``.
    """
    center = np.mean(x)
    spread = np.std(x)
    return center*a - spread*b

In [3]:
# HDC model
def getlevelList(totalLevel, minimum, maximum):
    """Return the ``totalLevel + 1`` boundaries of equally wide levels.

    The boundaries start at ``minimum``, advance in steps of
    ``(maximum - minimum) / totalLevel`` and end with exactly ``maximum``
    (appended verbatim so the last boundary carries no rounding error).
    """
    gap = (maximum - minimum) / totalLevel
    levelList = [minimum + level*gap for level in range(totalLevel)]
    levelList.append(maximum)
    return levelList

def numToKey(value, levelList):
    """Map a scalar to the index of the quantization level containing it.

    ``levelList`` holds ascending level boundaries; level ``k`` covers
    ``levelList[k] <= value < levelList[k + 1]`` and the last level also
    includes the upper boundary ``levelList[-1]``.

    Values outside the covered range are clamped to the first / last level.
    This matters because a plain binary search over the boundaries cannot
    terminate for a value above ``levelList[-1]`` (or for NaN): no interval
    ever satisfies the containment test.

    Parameters
    ----------
    value : number
        The value to quantize.
    levelList : sequence of numbers
        Ascending boundaries, one more entry than there are levels.

    Returns
    -------
    int
        Level index in ``[0, len(levelList) - 2]``.

    Raises
    ------
    ValueError
        If ``value`` is NaN, which belongs to no level.
    """
    from bisect import bisect_right

    # NaN is the only value that is not equal to itself.
    if value != value:
        raise ValueError('cannot map NaN to a level')
    lastKey = len(levelList) - 2
    # bisect_right counts the boundaries <= value; the level starts one before.
    key = bisect_right(levelList, value) - 1
    return max(0, min(key, lastKey))

def checkVector(classHV, inputHV, threshold):
    """Score ``inputHV`` against ``classHV`` and flag it when the score is low.

    Returns
    -------
    tuple
        ``(guess, confidence)`` where ``confidence`` is
        ``associateSearch(classHV, inputHV)`` and ``guess`` is 1 when that
        value is strictly below ``threshold``, otherwise 0.
    """
    confidence = associateSearch(classHV, inputHV)
    guess = 1 if confidence < threshold else 0
    return guess, confidence

def standardization(X, MAX, MIN):
    """Rescale ``X`` so that ``MIN`` maps to -1 and ``MAX`` maps to +1.

    The midpoint of ``[MIN, MAX]`` is subtracted and the result is divided by
    the absolute half-width of that interval. ``MAX``, ``MIN`` and the
    midpoint are printed as a side effect.
    """
    half_range = (MAX - MIN)/2
    t = MIN + half_range
    print(MAX, MIN, t)
    return (X - t)/np.abs(half_range)

def associateSearch(HV1, HV2):
    """Return the cosine similarity of two hypervectors.

    Computed as the dot product divided by the product of the Euclidean
    norms of ``HV1`` and ``HV2``.
    """
    similarity = np.dot(HV1, HV2)
    scale = li.norm(HV1) * li.norm(HV2) + 0.0
    return similarity/scale

class HyperDimensionalComputing(object):
    """One-class hyperdimensional-computing (HDC) model.

    Feature values are quantized into ``totalLevel`` levels spanning
    ``buffer[0]``..``buffer[1]``. Every level owns a random level hypervector;
    a sample is encoded as the sum of the level hypervectors of its features,
    each rotated by the feature's position. Summing the encoded training
    samples yields a single class hypervector, and a sample is predicted as an
    outlier (1) when its cosine similarity to that class hypervector falls
    below a threshold.
    """

    def __init__(self, dimension, totalLevel, datatype, buffer, threshold, bip, *string, cuda = False):
        """Create the level hypervectors and the level boundaries.

        Parameters
        ----------
        dimension : int
            Length of every hypervector.
        totalLevel : int
            Number of quantization levels.
        datatype : numpy dtype
            dtype of the encoded sample hypervectors.
        buffer : sequence
            ``(minimum, maximum)`` of the value range that is quantized.
        threshold : float
            Stored on the instance as ``self.threshold``.
        bip : number
            Fill value of the initial level hypervector; level hypervector
            entries are ``bip`` or ``-bip``.
        *string, cuda
            Accepted for interface compatibility; not used by this class.
        """
        self.Q = totalLevel
        self.dim = dimension
        self.token = 0
        self.buffer = buffer
        self.datatype = datatype
        self.threshold = threshold
        self.levelVector = self.genLevelVector(self.Q, bip, self.dim)
        self.levelList = getlevelList(totalLevel, self.buffer[0], self.buffer[1])

    def genBaseHVs(self, totalPos, baseVal, dimension):
        """Return ``totalPos`` random hypervectors keyed ``0..totalPos-1``.

        Each hypervector is filled with ``baseVal`` and then has
        ``int(dimension/2)`` randomly chosen entries set to 1.
        """
        baseHVs = dict()
        indexVector = range(dimension)
        change = int(dimension/2)
        for pos in range(totalPos):
            base = np.full(dimension, baseVal)
            base[np.random.permutation(indexVector)[:change]] = 1
            baseHVs[pos] = base
        return baseHVs

    def genLevelVector(self, totalLevel, baseVal, dimension):
        """Return the level hypervectors keyed ``0..totalLevel-1``.

        Level 0 is a vector filled with ``baseVal`` in which
        ``int(dimension/2)`` random entries have their sign flipped. Every
        following level is derived from the previous one by flipping the sign
        of ``int(dimension/2/totalLevel)`` randomly chosen entries, so
        neighbouring levels stay similar.
        """
        levelHVs = dict()
        indexVector = range(dimension)
        nextLevel = int(dimension/2/totalLevel)
        change = int(dimension/2)
        base = np.full(dimension, baseVal)
        for level in range(totalLevel):
            flips = change if level == 0 else nextLevel
            # The permutation prefix holds distinct indices, so each selected
            # entry is flipped exactly once per level.
            toFlip = np.random.permutation(indexVector)[:flips]
            base[toFlip] *= -1
            # Store a snapshot; `base` keeps evolving for the next level.
            levelHVs[level] = base.copy()
        return levelHVs

    def genHV(self, Xtr, Xts):
        """Encode every row of ``Xtr`` and ``Xts`` into a hypervector.

        Returns
        -------
        tuple
            ``(HV_train, HV_test)`` as NumPy arrays with one encoded row per
            input row.
        """
        HV_train = [self.encoding(self.dim, Xtr[i], self.levelVector)
                    for i in range(Xtr.shape[0])]
        HV_test = [self.encoding(self.dim, Xts[i], self.levelVector)
                   for i in range(Xts.shape[0])]
        return np.array(HV_train), np.array(HV_test)

    def encoding(self, dimension, label, levelHVs):
        """Encode one sample as a hypervector.

        Parameters
        ----------
        dimension : int
            Length of the resulting hypervector.
        label : sequence of numbers
            The sample's feature values (parameter name kept for
            compatibility with existing callers).
        levelHVs : dict
            Level hypervectors keyed by level index.

        Returns
        -------
        numpy.ndarray
            Sum over all features of the feature's level hypervector rotated
            (``np.roll``) by the feature's position.
        """
        HDVector = np.zeros(dimension, dtype = self.datatype)
        for position, featureValue in enumerate(label):
            key = numToKey(featureValue, self.levelList)
            HDVector = HDVector + np.roll(levelHVs[key], position)
        return HDVector

    def genClassHV(self, classHV, inputHVs):
        """Return ``classHV`` plus the sum of all ``inputHVs`` (the OC-HV).

        ``classHV`` itself is not modified.
        """
        for hv in inputHVs:
            classHV = classHV + np.array(hv)
        return classHV

    def fit(self, classHV, trainHV):
        """Return the similarity of every training hypervector to ``classHV``.

        Each row of ``trainHV`` is checked against a threshold of 0; if any
        row has a negative similarity, ``'ERROR'`` is printed. With no rows
        nothing is printed and an empty list is returned.
        """
        prelimit = []
        anyFlagged = False
        for index in range(trainHV.shape[0]):
            fitting, conf = checkVector(classHV, trainHV[index], 0)
            anyFlagged = anyFlagged or bool(fitting)
            prelimit.append(conf)
        if anyFlagged: print('ERROR')
        return prelimit

    def reconfirm(self, classHV, trainHV, epoches, threshold, lr, a, b):
        """Retrain the class hypervector on poorly matching training samples.

        For ``epoches`` passes over ``trainHV``, every sample whose similarity
        to the current class hypervector is below ``threshold`` is added to
        the class hypervector, scaled by ``lr``. After each pass the threshold
        is re-estimated with ``thresholdSelection`` from that pass's
        similarities and printed.

        Parameters
        ----------
        classHV : numpy.ndarray
            Starting class hypervector; it is not modified.
        trainHV : numpy.ndarray
            Encoded training samples, one per row.
        epoches : int
            Number of passes over the training samples.
        threshold : float
            Similarity threshold used during the first pass.
        lr : number
            Learning rate. It may be a float even when the hypervectors are
            integer typed; the result is then promoted to float.
        a, b : number
            Weights forwarded to ``thresholdSelection``.

        Returns
        -------
        tuple
            ``(retrained class hypervector, final threshold)``.
        """
        classHV_retrain = copy.deepcopy(classHV)
        for epoch in range(epoches):
            confidence = []
            for i in range(trainHV.shape[0]):
                _, conf = checkVector(classHV_retrain, trainHV[i], threshold)
                confidence.append(conf)
                if conf < threshold:
                    # Out-of-place add: an in-place `+=` on an integer array
                    # raises a casting error as soon as `lr` is a float.
                    classHV_retrain = classHV_retrain + trainHV[i]*lr
            threshold = thresholdSelection(confidence, a, b)
            print('epoch', epoch, 'mean distance', threshold)
        return classHV_retrain, threshold

    def predict(self, classHV, testHV, testLabel, threshold_predict):
        """Classify every row of ``testHV`` against ``classHV``.

        Returns
        -------
        tuple
            ``(accuracy, guessList, confList)``: the fraction of guesses equal
            to ``testLabel``, the 0/1 guesses as a NumPy array (1 = similarity
            below ``threshold_predict``), and the list of similarities.
        """
        guessList, confList = [], []
        for index in range(testHV.shape[0]):
            guess, conf = checkVector(classHV, testHV[index], threshold_predict)
            guessList.append(guess)
            confList.append(conf)
        guessList = np.array(guessList)
        accuracy = np.mean(guessList == testLabel)
        return accuracy, guessList, confList

In [4]:
dataset = './Dataset/satimage-2.mat'
X_set, y_set, inner, outer = DataPreprocessing(dataset)
# Seed NumPy's global RNG so the inlier sample drawn below is reproducible.
# This must be a call: assigning `np.random.seed = 5` only overwrites the
# seeding function with an int and seeds nothing.
np.random.seed(5)
# Test set: every outlier plus three times as many randomly drawn inliers.
p = outer.tolist() + np.random.choice(inner, 3*len(outer), replace=False).tolist()
# Training set: all remaining inliers (set lookup keeps the filter linear).
held_out = set(p)
position = [x for x in inner if x not in held_out]
Xtrain, Xtest, ytrain, ytest = X_set[position], X_set[p], y_set[position], y_set[p]
# Rescale both splits to [-1, 1].
# NOTE(review): the test split is scaled with its own min/max rather than
# the training statistics -- confirm this is intended.
Xtrain = standardization(Xtrain, Xtrain.max(), Xtrain.min())
Xtest  = standardization(Xtest, Xtest.max(), Xtest.min())

Loading ././Dataset/satimage-2.mat
inliers [   0    1    2 ... 5729 5730 5731] outliers [5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745
 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759
 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773
 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787
 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801
 5802] 5732 inliers 71 outliers
139.0 29.0 84.0
157.0 27.0 92.0


In [5]:
# Model hyper-parameters: hypervector length, number of quantization levels,
# hypervector dtype, quantized value range and initial threshold.
dimension = 10000
totalLevel = 100
datatype = np.int64
buffer = [-1.0, 1.0]
threshold = 0.9
OCHDC = HyperDimensionalComputing(dimension, totalLevel, datatype, buffer, threshold, -1)
# Optional: restore previously saved level hypervectors instead of the
# freshly generated random ones.
# with open('levelVector.pickle', 'rb') as f2:
#     OCHDC.levelVector = pickle.load(f2)
ClassHV = np.zeros(OCHDC.dim, dtype=OCHDC.datatype)

# Encode both splits, then sum the training hypervectors into the
# one-class hypervector; time the two stages separately.
s1 = time.time()
trainHVs, testHVs = OCHDC.genHV(Xtrain, Xtest)
s2 = time.time()
oneClassHV = OCHDC.genClassHV(ClassHV, trainHVs)
s3 = time.time()
print('Encoding time' , s2 - s1, 'training time', s3 - s2)

Encoding time 7.2196924686431885 training time 0.08177304267883301


In [6]:
testLabels = ytest
# Similarity of every training hypervector to the one-class hypervector;
# the decision threshold is mean * a - std * b of these values.
prelimitation = OCHDC.fit(oneClassHV, trainHVs)
a = b = 1
threshold = thresholdSelection(prelimitation, a, b)
print(threshold)
# Oneshot: inference without retraining
Oneshot_accuracy, guessList, confList = OCHDC.predict(oneClassHV, testHVs, testLabels, threshold)
acc = Oneshot_accuracy
# NOTE(review): AP and ROC AUC are computed from the hard 0/1 predictions;
# ranking metrics are normally fed continuous scores (e.g. derived from
# confList) -- confirm this is intended.
ap = average_precision_score(testLabels, guessList)
f1score = f1_score(testLabels, guessList)
rocauc = roc_auc_score(testLabels, guessList)
print("OCHDC oneshot")
for _name, _score in (('ACC = ', acc), ('AP = ', ap), ('F1-score = ', f1score), ('ROC AUC = ', rocauc)):
    print(_name, _score)

0.9033249966657074
OCHDC oneshot
ACC =  0.8309859154929577
AP =  0.5804690528492054
F1-score =  0.7333333333333334
ROC AUC =  0.863849765258216


In [7]:
# Retrain the one-class hypervector for 10 epochs with learning rate 5,
# starting from the one-shot threshold, and time the whole procedure.
s = time.time()
oneClass_retrain, threshold_retrain = OCHDC.reconfirm(
    oneClassHV, trainHVs, epoches=10, threshold=threshold, lr=5, a=a, b=b)
t = time.time()
print('retrain', t - s)

epoch 0 mean distance 0.9063327459188816
epoch 1 mean distance 0.9078270335802591
epoch 2 mean distance 0.9078620892339405
epoch 3 mean distance 0.9076075349929692
epoch 4 mean distance 0.9073178865618124
epoch 5 mean distance 0.9070711425910504
epoch 6 mean distance 0.9068474554016746
epoch 7 mean distance 0.9066594486818853
epoch 8 mean distance 0.9064931290378152
epoch 9 mean distance 0.9063498386974556
retrain 2.9959843158721924


In [8]:
# Retraining: Check all training HV before inference process
# The threshold is re-derived from the similarities of the training
# hypervectors to the retrained one-class hypervector.
prelimitation_retrain = OCHDC.fit(oneClass_retrain, trainHVs)
threshold_retrain = thresholdSelection(prelimitation_retrain, a, b)
retrain_accuracy, guessList, confList = OCHDC.predict(oneClass_retrain, testHVs, testLabels, threshold_retrain)
acc2 = retrain_accuracy
# NOTE(review): AP and ROC AUC are computed from the hard 0/1 predictions;
# ranking metrics are normally fed continuous scores (e.g. derived from
# confList) -- confirm this is intended.
ap2 = average_precision_score(testLabels, guessList)
f1score2 = f1_score(testLabels, guessList)
rocauc2 = roc_auc_score(testLabels, guessList)
print("OCHDC retrain")
for _name, _score in (('ACC = ', acc2), ('AP = ', ap2), ('F1-score = ', f1score2), ('ROC AUC = ', rocauc2)):
    print(_name, _score)

OCHDC retrain
ACC =  0.8943661971830986
AP =  0.6939270028774799
F1-score =  0.8170731707317074
ROC AUC =  0.9107981220657277
