In [1]:
from dsio.anomaly_detectors import AnomalyMixin
from dsio.update_formulae import decision_rule
import numpy as np
import random

"""
Half Space Trees (HS-Trees) anomaly detector for streaming data.

Currently works only for one-dimensional data, because a single variable
(sensor) is used at a time to detect an anomaly.
To support multiple dimensions, the maximum and minimum values must be
copied (one value per dimension) when the trees are built.

TODO: make it work for both lists and numpy arrays
"""


class Node:
    """One node of a Half Space Tree.

    Attributes:
        left, right: child nodes, or None when the node has no children.
        split_attribute: index of the dimension the node splits on.
        splitvalue: value separating the left half-space from the right one.
        depth: depth of the node inside its tree.
        r: mass counter incremented for samples of the reference window.
        l: mass counter incremented for samples of the latest window.
    """

    def __init__(self, left, right, split_attribute, splitvalue, depth, r, l):
        # tree structure
        self.left, self.right = left, right
        # split definition
        self.split_attribute, self.splitvalue = split_attribute, splitvalue
        self.depth = depth
        # mass profiles (reference window, latest window)
        self.r, self.l = r, l


class HSTress(AnomalyMixin):
    """Streaming Half Space Trees (HS-Trees) anomaly detector for one-dimensional data.

    An ensemble of ``n_estimators`` binary trees is built over a randomly
    perturbed work range around the training data. Every node keeps two mass
    counters: ``r`` (reference window) and ``l`` (latest window). Samples are
    scored against the reference masses; after every ``batch_size`` streamed
    samples the latest masses become the new reference masses.

    ``score_anomaly`` returns ``1 - raw / max_score_r``, where ``raw`` is the
    sum of the per-tree scores, so ``max_score_r`` and ``max_score_l`` are
    always tracked on that same summed scale.
    """

    def __init__(self, n_estimators=10, batch_size=1000, max_depth=15, threshold=0.90, seed=None):
        """
        :param n_estimators: the number of Half Space Trees to be used
        :param batch_size: the number of streamed samples before the window changes
        :param max_depth: the maximum depth of each tree
        :param threshold: threshold handed to decision_rule by flag_anomaly
        :param seed: value passed to random.seed (controls the random work ranges)
        """

        self.__name__ = "half_space_tree"
        self.n_estimators = n_estimators
        self.batch_size = batch_size
        self.max_depth = max_depth
        self.threshold = threshold

        random.seed(seed)

        # minimum number of samples required to split, TODO default value
        self.min_samples_split = int(batch_size / 20)

        # list of the roots of the Half Space Trees
        self.trees = []
        # number of streamed samples seen since the last window change
        self.count = 0
        # max summed score achieved in the reference and in the latest window
        self.max_score_r = 0
        self.max_score_l = 0

    def build_single_hs_tree(self, minimum, maximum, depth_level):
        """
        Recursively build a full binary tree by halving the work range.

        :param minimum: lower bound of the work range of this node
        :param maximum: upper bound of the work range of this node
        :param depth_level: current depth level
        :return: the root Node of the (sub-)tree
        """

        # ">=" instead of "==" so a negative max_depth cannot recurse forever
        if depth_level >= self.max_depth:
            return Node(None, None, None, None, depth_level, 0, 0)

        # TODO multiple dimensions: randomly select a dimension q and split
        # on (maximum[q] + minimum[q]) / 2; currently there is only one.
        # TODO maybe a better implementation would be to find the mean?
        split_value = (maximum + minimum) / 2

        # Build two nodes (Left & Right) from a split into two equal-volume half-spaces
        left = self.build_single_hs_tree(minimum, split_value, depth_level + 1)
        right = self.build_single_hs_tree(split_value, maximum, depth_level + 1)

        # TODO multiple dimensions: here split_attribute is always 0
        return Node(left, right, 0, split_value, depth_level, 0, 0)

    def update_mass(self, x, node, reference_window):
        """
        Increment the mass counter of every node on the path that x follows.

        :param x: instance
        :param node: node in the HS-Tree where the walk starts (normally a root)
        :param reference_window: boolean, true -> reference window (r), false -> latest window (l)
        :return: None
        """

        while True:
            if reference_window:
                node.r += 1
            else:
                node.l += 1

            # stop at nodes without children or at the maximum depth
            if node.left is None or node.depth >= self.max_depth:
                return
            node = node.left if x < node.splitvalue else node.right

    def node_score(self, node):
        """
        :param node: node in the HS-Tree
        :return: reference mass of the node scaled by 2 ** depth
        """
        # The running window maximum is maintained by get_raw_score_anomaly,
        # because the normaliser must be on the summed-over-trees scale.
        return node.r * pow(2, node.depth)

    def score(self, x, node):
        """
        Score x in a single tree.

        Walks down from ``node`` and stops at a node without children, or at
        the last node whose next child on the path holds fewer than
        ``min_samples_split`` reference samples.

        :param x: instance
        :param node: root of the HS-Tree
        :return: node_score of the node where the walk stopped
        """
        while node.left is not None:
            child = node.left if x < node.splitvalue else node.right
            # insufficient number of elements in the sub-tree
            if child.r < self.min_samples_split:
                break
            node = child

        return self.node_score(node)

    def update_model(self, node):
        """
        Turn the latest-window masses into reference masses for a whole tree.

        :param node: root of the HS-Tree
        """
        pending = [node]
        while pending:
            current = pending.pop()
            current.r = current.l
            current.l = 0

            if current.left is not None:
                # if the left sub-tree is not None, neither is the right one
                pending.append(current.left)
                pending.append(current.right)

    def initialize_hs_trees(self, training_sample):
        """
        (Re)build the ensemble and record the reference masses.

        :param training_sample: non-empty sequence of one-dimensional values
        :raises ValueError: if training_sample is empty
        """
        if len(training_sample) == 0:
            raise ValueError("training_sample must contain at least one value")

        minimum = min(training_sample)
        maximum = max(training_sample)

        # start from scratch so that repeated calls do not stack extra trees
        self.trees = []

        for _ in range(self.n_estimators):
            # rand is already a point inside [minimum, maximum], so the work
            # range is centred on rand itself (not on minimum + rand) and
            # extends twice the larger distance to the data bounds, which
            # guarantees that the whole training range is covered.
            rand = random.uniform(minimum, maximum)
            half_width = 2 * max(rand - minimum, maximum - rand)
            tree_maximum = rand + half_width
            tree_minimum = rand - half_width
            root_node = self.build_single_hs_tree(tree_minimum, tree_maximum, 0)
            self.trees.append(root_node)
            for x in training_sample:
                self.update_mass(x, root_node, True)

    def streaming(self, streaming_data):
        """
        Add streamed samples to the latest window and roll the windows over
        after every ``batch_size`` samples.

        :param streaming_data: iterable of one-dimensional values
        """
        for x in streaming_data:
            for tree in self.trees:
                # accumulate latest-window mass
                self.update_mass(x, tree, False)

            self.count += 1
            if self.count >= self.batch_size:
                # update reference and latest window values for all nodes in all the trees
                for root_node in self.trees:
                    self.update_model(root_node)

                self.count = 0
                # Keep the previous normaliser when nothing was scored in this
                # window; overwriting it with 0 would make score_anomaly
                # divide by zero.
                if self.max_score_l > 0:
                    self.max_score_r = self.max_score_l
                self.max_score_l = 0

    def update(self, streaming_data):
        """Alias of streaming."""
        self.streaming(streaming_data)

    def flag_anomaly(self, x):
        """
        :param x: iterable of one-dimensional values
        :return: result of decision_rule applied to score_anomaly(x)
        """
        # custom decision rule
        return decision_rule(self.score_anomaly(x), threshold=self.threshold, two_sided=False)

    def fit(self, x):
        """
        Build the trees from x and initialise the score normaliser.

        :param x: non-empty sequence of one-dimensional training values
        """
        self.initialize_hs_trees(x)
        # a refit starts a fresh streaming window
        self.count = 0

        # initialize max_score_r
        self.max_score_r = np.max(self.get_raw_score_anomaly(x))
        # scoring the training data must not count towards the latest window
        self.max_score_l = 0

    def get_raw_score_anomaly(self, streaming_data):
        """
        :param streaming_data: iterable of one-dimensional values
        :return: numpy array with, per value, the sum of its scores over all trees
        """
        scores = []
        for x in streaming_data:
            anomaly_score = 0
            for tree in self.trees:
                # accumulate scores
                anomaly_score += self.score(x, tree)
            scores.append(anomaly_score)

        raw_scores = np.array(scores)
        # Track the window maximum on the summed scale, the same scale fit
        # uses for max_score_r, so the normaliser stays consistent after a
        # window change.
        if raw_scores.size:
            self.max_score_l = max(self.max_score_l, raw_scores.max())

        return raw_scores

    def score_anomaly(self, streaming_data):
        """
        :param streaming_data: iterable of one-dimensional values
        :return: numpy array ``1 - raw_score / max_score_r``; all zeros when no
                 normaliser is available yet (max_score_r == 0)
        """
        raw_scores = self.get_raw_score_anomaly(streaming_data)
        if self.max_score_r == 0:
            # no reference scores available: avoid 0/0 producing NaN or inf
            return np.zeros(len(raw_scores))
        # float() forces true division for integer score arrays
        return 1 - raw_scores / float(self.max_score_r)


In [2]:

import pandas as pd
from dsio.main import restream_dataframe
from functools import partial

# Location of the failed-authentication data set that is replayed as a stream.
DATA_PATH = 'C:/Users/Sotiris/Desktop/project/data/failed_auth_dsio.csv'
dataframe = pd.read_csv(DATA_PATH, sep=',')

# Pre-bind the hyper-parameters; restream_dataframe receives the factory itself.
detector = partial(HSTress, max_depth=5, n_estimators=10)
# A functools.partial object carries no __name__ of its own, so one is attached
# explicitly (the original note says it is needed).
detector.__name__ = 'half_space_tree'

restream_dataframe(
    dataframe,
    detector,
    sensors=['attempts'],
    cols=2,
    speed=50000,
)

data found from 2018-03-27 17:50:00 to 2018-04-20 16:50:00
Converting to milliseconds ...
Done
Adding time offset of -364562.48 seconds
Setting speed to 50000x
Done



Writing 27 rows dated 2018-03-23 11:33:57.520000 to 2018-03-23 11:34:00.520000
.