In [18]:
"""
    Implementation inspired by https://datasciencelab.wordpress.com/2014/01/15/improved-seeding-for-clustering-with-k-means/
"""

import numpy as np
import random
import scipy.spatial.distance
import sys


class KPlusPlusGaussian:
    """
    Randomised seeding of cluster centers, picking every center from the data itself.

    The first center is drawn uniformly; each further center is drawn with a
    probability proportional to the smallest inverse Euclidean distance between
    a point and the centers chosen so far.

    NOTE(review): inverse-distance weights favour points *close* to existing
    centers (the center itself gets the largest mass), which differs from the
    D^2 weighting of classic k-means++ -- confirm this is intended.
    """

    def __init__(self, number_of_centers, x, stochastic=False, stochastic_n_samples=10000, random_seed=42):
        """
        :param number_of_centers: how many centers to select (must be > 0 and <= len(x))
        :param x: 2-D array of data points, one point per row
        :param stochastic: When stochastic is True for faster calculation only keep a smaller subset
                            of the data of size stochastic_n_samples
        :param stochastic_n_samples: size of the subset (rows are sampled with replacement)
        :param random_seed: seed used both for the subsampling and for the center selection
        """
        assert len(x) >= number_of_centers
        assert number_of_centers > 0

        self.number_of_centers = number_of_centers
        self.random_seed = random_seed
        if stochastic and stochastic_n_samples < len(x):
            # a dedicated, seeded generator keeps the subsample reproducible
            # without touching numpy's global random state
            idx = np.random.RandomState(random_seed).randint(len(x), size=stochastic_n_samples)
            self.x = np.asarray(x)[idx, :]
        else:
            self.x = x

        # divisor for the clamp in _choose_next_center: even if every weight is
        # clamped, their sum still fits in a float64
        self.overflow_avoid = len(x) + 1
        self.centers = []

    def _distances(self, center):
        """Return the inverse Euclidean distance of every point to ``center``.

        A point that coincides with the center has distance 0 and therefore
        weight ``inf``; it is clamped later in ``_choose_next_center``.
        """
        dists = np.array(
            [scipy.spatial.distance.euclidean(point, center) for point in self.x],
            dtype=np.float64,
        )
        # division by zero is expected for the center itself
        with np.errstate(divide='ignore'):
            return 1.0 / dists

    def _dist_from_centers_initialize(self):
        """Initialise the weights from the most recently added (first) center."""
        self.distances = self._distances(self.centers[-1])

    def _dist_from_centers(self):
        """Fold the most recently added center into the per-point minimum weight."""
        self.distances = np.minimum(self._distances(self.centers[-1]), self.distances)

    def _choose_next_center(self):
        """Draw one data point with probability proportional to its current weight."""
        # avoid overflow: clamp (possibly infinite) weights so that their sum stays finite
        cap = np.finfo(np.float64).max / self.overflow_avoid
        np.minimum(self.distances, cap, out=self.distances)

        self.probabilities = self.distances / self.distances.sum()
        self.cumulativeProbabilities = self.probabilities.cumsum()
        r = random.random()
        # first index whose cumulative probability reaches r; the extra min()
        # protects against the cumulative sum rounding to slightly below r
        ind = int(np.searchsorted(self.cumulativeProbabilities, r, side='left'))
        ind = min(ind, len(self.x) - 1)
        return self.x[ind]

    def init_centers(self, verbose=0):
        """
        Select ``number_of_centers`` centers and store them in ``self.centers``.

        :param verbose: when > 0 print the running number of centers found
        """
        random.seed(self.random_seed)
        # start from scratch so that repeated calls do not accumulate centers
        self.centers = []

        # randrange excludes the upper bound, so the index is always valid
        first = random.randrange(len(self.x))
        self.centers.append(self.x[first])
        if verbose > 0:
            print('Centers found:', len(self.centers))
        self._dist_from_centers_initialize()
        while len(self.centers) < self.number_of_centers:
            self.centers.append(self._choose_next_center())
            if verbose > 0:
                print('Centers found:', len(self.centers))
            if len(self.centers) < self.number_of_centers:
                self._dist_from_centers()


In [17]:
from math import log
from dsio.anomaly_detectors import AnomalyMixin
from sklearn.cluster import KMeans
import scipy.stats.distributions
import numpy as np


def gaussian(x, l, s):
    """
    Density of a point under independent per-dimension normal distributions.

    :param x: coordinates of the point
    :param l: per-dimension location (mean)
    :param s: per-dimension scale (standard deviation, as expected by scipy.stats.norm)
    :return: product of the one-dimensional pdf values; 1 when the inputs are empty.
             Only as many dimensions as the shortest of the three inputs are used.
    """
    density = 1
    for coordinate, location, scale in zip(x, l, s):
        density = density * scipy.stats.norm.pdf(coordinate, loc=location, scale=scale)
    return density


class OnlineEMGaussian(AnomalyMixin):
    """
    Online EM for a mixture of axis-aligned Gaussians with per-host bookkeeping.

    Every input row holds the feature values followed by a host identifier in
    the last column. Besides the mixture parameters the model tracks, globally
    and per host, "hard" (arg-max component) and "soft" (participation
    weighted) transition matrices between mixture components, and it groups
    hosts with k-means on their average participation vectors.
    """

    def __init__(self, gammas, lambdas, segment_length, sigmas=None, update_power=1.0, verbose=0, n_clusters=4):
        """
        :param gammas: initial mixture weights, one per component
        :param lambdas: initial component locations (means), one row per component
        :param segment_length: number of rows handed to each online update by ``fit``
        :param sigmas: initial per-dimension scales, same layout as ``lambdas``; defaults to 10 everywhere
        :param update_power: the power that determines the update factor in each iteration of the online algorithm
        :param verbose: when > 0 progress messages are printed
        :param n_clusters: the different profiles to create for the kind of users (number of k-means clusters).
                           NOTE(review): the original code referenced this name without declaring it;
                           the default of 4 is a placeholder -- confirm the intended value.
        """
        # gammas and lambdas are the initialization
        self.gammas = np.array(gammas)
        self.lambdas = np.vstack(lambdas)
        self.segment_length = segment_length

        assert self.lambdas.ndim > 1

        if sigmas is not None:
            self.sigmas = np.vstack(sigmas)
        else:
            self.sigmas = np.full(self.lambdas.shape, 10.0)

        assert len(gammas) == len(lambdas)
        # compare against the resolved array: ``sigmas`` itself may be None
        assert len(gammas) == len(self.sigmas)

        # number of gaussian mixture components
        self.m = len(gammas)
        # the dimension of the gaussian distributions
        self.dim = len(self.lambdas[0])

        # number of current iteration
        self.iteration_k = 1

        self.update_power = update_power

        # a dictionary containing for each host valuable information
        self.hosts = {}

        self.n_clusters = n_clusters

        self.verbose = verbose

        # HMM matrices and the (weighted) number of transitions observed out of each component
        self.hard_transition_matrix = np.eye(self.m)
        self.soft_transition_matrix = np.eye(self.m)
        self.hard_points_per_EM_cluster = np.zeros(self.m)
        self.soft_points_per_EM_cluster = np.zeros(self.m)

        # k-means over the hosts' participation vectors and its running statistics
        self.kMeans = KMeans(n_clusters=n_clusters)
        self.hosts_per_kMeans_cluster = np.zeros(n_clusters)
        self.probabilities_per_kMean_cluster = np.zeros(shape=(n_clusters, self.m))

    def _component_densities(self, x):
        """Return ``gamma_j * gaussian(x, lambda_j, sigma_j)`` for every component j."""
        return self.gammas * np.array(
            [gaussian(x, lambda_i, sigma_i) for lambda_i, sigma_i in zip(self.lambdas, self.sigmas)])

    @staticmethod
    def _running_row_update(matrix, counts, row, observation, weight):
        """Update ``matrix[row]`` in place as a weighted running mean and add ``weight`` to ``counts[row]``."""
        total = counts[row] + weight
        # a zero total means nothing has been observed yet: leave the row untouched instead of computing 0/0
        if total > 0:
            matrix[row] = (matrix[row] * counts[row] + observation * weight) / total
        counts[row] = total

    def calculate_participation(self, data):
        """
        :param data: n array of the data to train
        :return: an (n, m) array of the participation of each data point to each gaussian distribution
                m is the number of distributions
        """
        f = np.zeros(shape=(len(data), self.m))
        for i, x in enumerate(data):
            participation = self._component_densities(x)
            total_x = np.sum(participation)

            # TODO
            # every weighted density is zero: fall back to a uniform participation
            if total_x == 0:
                participation = np.array([1 / self.m] * self.m)
                total_x = 1
            f[i] = participation / total_x
        return f

    # TODO take into account the size of the batch
    def calculate_likelihood(self, data):
        """
        Naive log-likelihood of ``data`` under the current mixture.

        :raises ValueError: from ``math.log`` when a point has zero mixture density
        """
        new_likelihood = 0
        for x in data:
            new_likelihood = new_likelihood + log(np.sum(self._component_densities(x)))
        return new_likelihood

    def update_parameters(self, batch):
        """
        :param batch: 2-D array; feature columns followed by the host column
        updates gammas, lambdas, sigmas and the per-host statistics
        """
        # drop the host column; numeric conversion keeps the matrix algebra below well defined
        data = np.asarray(batch[:, :-1], dtype=float)

        self.iteration_k += 1
        n = len(data)
        if n <= 0:
            return
        assert len(data[0]) == len(self.lambdas[0])

        f = self.calculate_participation(data)

        # total participation received by every component
        temp_sum = f.sum(axis=0)

        update_factor = 1 / (pow(self.iteration_k, self.update_power))

        self.gammas = (1 - update_factor) * self.gammas + update_factor * (temp_sum / n)

        # components that received no participation keep their previous
        # parameters instead of being divided by zero
        active = temp_sum > 0
        weights = temp_sum[active][:, np.newaxis]

        # update lambdas: participation-weighted mean of the batch
        new_lambdas = np.array(self.lambdas, dtype=float)
        new_lambdas[active] = f.T.dot(data)[active] / weights

        self.lambdas = (1 - update_factor) * self.lambdas + update_factor * new_lambdas

        # update sigmas: participation-weighted squared deviation from the
        # (already updated) lambdas; shape (n, m, dim) reduced over the batch
        squared_deviation = np.power(data[:, np.newaxis, :] - self.lambdas[np.newaxis, :, :], 2)
        weighted_squares = (f[:, :, np.newaxis] * squared_deviation).sum(axis=0)
        new_sigmas = np.array(self.sigmas, dtype=float)
        # sigmas are passed to scipy.stats.norm as the scale (a standard
        # deviation), so take the square root of the variance estimate
        new_sigmas[active] = np.sqrt(weighted_squares[active] / weights)

        self.sigmas = (1 - update_factor) * self.sigmas + update_factor * new_sigmas

        # upon initialization self.hosts should not contain a key for host
        # TODO memory intensive
        for point in batch:
            self.update_host(point)

    def get_new_batch(self, data, pos):
        """
        Return the next batch of at most ``segment_length`` rows starting at ``pos``.

        :return: tuple of (batch, position just after the batch)
        """
        n = len(data)

        assert self.segment_length <= n

        if self.segment_length + pos <= n:
            return data[pos: pos + self.segment_length], pos + self.segment_length

        return data[pos:], n

    def closest_centers(self, data):
        """Return the participation of ``data`` in every component, averaged over its rows."""
        n = len(data)

        f = self.calculate_participation(data)

        return f.sum(axis=0) / n

    def update_host(self, point):
        """
        Fold one observation into the statistics of its host and into the transition matrices.

        :param point: one row; the last element is the host identifier
        """
        host = point[-1]
        # the host column is still part of ``point``; ``gaussian`` zips the
        # point against ``dim`` parameters and therefore never reads it
        point_center = self.closest_centers([point])
        closest_center = np.argmax(point_center)

        if host in self.hosts:
            info = self.hosts[host]
            host_points = info['n_points']

            # running mean of the host's participation vector
            info['group'] = (point_center + info['group'] * host_points) / (host_points + 1)

            # the number of data points for the host
            info['n_points'] += 1

            ###
            # update transition matrices
            previous_point = info['hard_previous']

            new_transpose = np.zeros(self.m)
            new_transpose[closest_center] = 1

            # hard transition: previous arg-max component -> current arg-max component
            self._running_row_update(self.hard_transition_matrix, self.hard_points_per_EM_cluster,
                                     previous_point, new_transpose, 1)
            self._running_row_update(info['transition_matrix'], info['points_per_cluster'],
                                     previous_point, new_transpose, 1)

            # soft transition: every previous component contributes with its participation weight
            for i, previous in enumerate(info['soft_previous']):
                self._running_row_update(self.soft_transition_matrix, self.soft_points_per_EM_cluster,
                                         i, point_center, previous)
                # update only row i of the host matrix, mirroring the global update
                self._running_row_update(info['soft_transition_matrix'], info['soft_points_per_cluster'],
                                         i, point_center, previous)

            info['hard_previous'] = closest_center
            info['soft_previous'] = point_center

        else:
            self.hosts[host] = {
                # a self.m array containing the proportion of participation of this host in every component
                'group': point_center,
                'hard_previous': closest_center,
                'soft_previous': point_center,
                # the number of data points for the host
                'n_points': 1,
                # Host specific HMM
                'transition_matrix': np.eye(self.m),
                'points_per_cluster': np.zeros(self.m),
                'soft_transition_matrix': np.eye(self.m),
                'soft_points_per_cluster': np.zeros(self.m),
            }

    def fit(self, x):
        """
        Run the online updates over ``x`` in batches of ``segment_length`` rows,
        then cluster the hosts with k-means on their participation vectors.

        :param x: n times p array; the p-th (last) column holds the host
        """
        if len(x) <= 0:
            return

        # the starting position of the current batch in the data
        pos = 0
        while pos < len(x):
            batch, pos = self.get_new_batch(x, pos)

            if self.verbose > 0:
                print('Running for data till position', pos, 'from total', len(x))

            self.update_parameters(batch)

        if self.verbose > 0:
            print('Running clustering algorithm')

        closest_centers = [info['group'] for info in self.hosts.values()]

        self.kMeans.fit(closest_centers)

        for info in self.hosts.values():
            category = self.kMeans.predict([info['group']])[0]
            info['category'] = category
            points_in_cluster = self.hosts_per_kMeans_cluster[category]

            # running mean of the participation vectors assigned to this k-means cluster
            self.probabilities_per_kMean_cluster[category] = \
                (self.probabilities_per_kMean_cluster[category] * points_in_cluster + info['group']) / \
                (points_in_cluster + 1)

            self.hosts_per_kMeans_cluster[category] += 1

    def update(self, x):
        """
        Apply one online update with a new batch.

        :param x: 2-D array laid out like the input of ``fit`` (features followed by the host column)
        """
        # TODO (or another way to get the host name)
        #                assumes the data has the appropriate length for batch processing

        if len(x) <= 0:
            return

        # update_parameters removes the host column itself and already calls
        # update_host for every row, so the full batch is passed exactly once
        self.update_parameters(x)

        # kMeans center should be updated every a number of batch updates??

    def score_anomaly_for_category(self, x, category=None, host=None):
        """Not implemented yet."""
        pass

    # TODO
    def score_anomaly(self, x):
        """Not implemented yet."""
        pass

    # TODO
    def flag_anomaly(self, x):
        """Not implemented yet."""
        pass

    def get_bic(self, data):
        """
        :return a tuple of the bic avg_log_likelihoods and the log likelihood of the whole data
        """
        # the likelihood is expensive; compute it once and reuse it
        likelihood = self.calculate_likelihood(data)
        bic = ((-2) / self.iteration_k) * likelihood + log(len(data)) * (2 * self.m - 1)
        return bic, likelihood


In [16]:
import scipy.stats.distributions

# Scratch check: CDF of a normal distribution with loc=100 and scale=10, evaluated at 90.
scipy.stats.norm(100, 10).cdf(90)

a = np.array([1,2])
b = np.array([1,4])

# Element-wise squared difference of the two arrays; being the last expression,
# this is the value the cell displays.
np.power(a - b, 2)

array([0, 4], dtype=int32)