In [1]:
from numpy.linalg import norm
import random as ran
import numpy as np
from math import log


def kl_distance(P, Q):
    """Accumulate ``P[i] * log(P[i] / Q[i])`` over the indices of ``P``.

    Any index where either ``P[i]`` or ``Q[i]`` equals zero contributes
    nothing, so the result stays finite even when the supports differ.

    Parameters
    ----------
    P, Q : indexable sequences of numbers
        ``Q`` is indexed with every index of ``P``.

    Returns
    -------
    The accumulated sum (the integer ``0`` when every index is skipped).
    """
    divergence = 0
    for idx in range(len(P)):
        p, q = P[idx], Q[idx]
        # Only indices where both entries are non-zero take part in the sum.
        if q != 0 and p != 0:
            divergence += p * log(p / q)

    return divergence

def solveStationary(A):
    """Solve ``x = xA`` with ``sum(x) = 1`` in the least-squares sense.

    The two conditions are stacked into one over-determined linear system

        (I - A)^T x = 0
        1^T       x = 1

    and handed to ``np.linalg.lstsq``.

    Parameters
    ----------
    A : array-like of shape (n, n)
        Square matrix; anything ``np.asarray`` can convert is accepted.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The least-squares solution ``x``. The result is always 1-D, also
        for a 1x1 input, so callers can safely take ``len()`` of it.
    """
    A = np.asarray(A, dtype=float)
    n = A.shape[0]

    # n rows for the balance equations plus one normalisation row.
    lhs = np.vstack(((np.eye(n) - A).T, np.ones(n)))
    rhs = np.zeros(n + 1)
    rhs[-1] = 1.0

    # rcond=None selects NumPy's documented machine-precision cutoff
    # explicitly, which also avoids the FutureWarning older releases emit
    # when the argument is omitted.
    return np.linalg.lstsq(lhs, rhs, rcond=None)[0]

def distance(A1, A2, weighted=False):
    """Directed distance from matrix ``A1`` to matrix ``A2``.

    Each pair of corresponding rows contributes ``2 * kl_distance(row1, row2)``.

    Parameters
    ----------
    A1, A2 : indexable collections of rows
        ``A2`` is indexed with every row index of ``A1``.
    weighted : bool, optional
        When False (the default, and the behaviour callers have always had)
        the row terms are summed with equal weight. When True, row ``i`` is
        weighted by entry ``i`` of ``solveStationary(A1)``.

    Returns
    -------
    The (optionally weighted) sum of the row terms.
    """
    n_states = len(A1)
    row_terms = [2 * kl_distance(A1[i], A2[i]) for i in range(n_states)]

    if not weighted:
        # The unweighted sum does not need the stationary distribution, so
        # the least-squares solve is skipped entirely on this path.
        return sum(row_terms)

    # atleast_1d keeps the indexing below valid even if the solver hands
    # back a 0-d array for a single-state matrix.
    stationary = np.atleast_1d(solveStationary(A1))
    return sum([stationary[i] * row_terms[i] for i in range(n_states)])

def hmm_distance(A1, A2):
    """Symmetrised distance: the mean of ``distance`` taken in both directions."""
    forward = distance(A1, A2)
    backward = distance(A2, A1)
    return (forward + backward) / 2

class kMeans:
    """K-means style clustering of per-host transition matrices.

    Every host in ``em.hosts`` carries a matrix under the key
    ``'transitiion_matrix'`` (the misspelling is the key the stored data
    uses, so it must not be "corrected" here). Hosts are grouped by
    ``hmm_distance`` to the cluster centroids, and a centroid is the
    element-wise mean of its members' matrices.

    Parameters
    ----------
    em : object
        Must expose ``hosts`` (mapping host -> dict with a
        ``'transitiion_matrix'`` entry) and ``m`` (the matrix side length).
    n_clusters : int
        Number of clusters.
    initial_centers : optional
        Stored on the instance but currently not used by the algorithm.
    n_iters : int
        Maximum number of assignment/update iterations per run.
    n_runs : int
        Number of random restarts performed by ``run``.

    Attributes
    ----------
    centers, assignments
        Set by ``_base_kmeans``; after ``run`` they hold the lowest-cost run.
    """

    def __init__(self, em, n_clusters=3, initial_centers=None, n_iters=20, n_runs=10):
        self.em = em
        self.initial_centers = initial_centers
        self.n_clusters = n_clusters
        self.n_iters = n_iters
        self.n_runs = n_runs

    def _transition_matrices_for_cluster(self, k, assignments):
        """Return the stacked matrices of all hosts assigned to cluster ``k``."""
        return np.array([self.em.hosts[host]['transitiion_matrix']
                         for host, cluster in assignments.items()
                         if cluster == k])

    def _compute_centroids(self, assignments, previous=None):
        """Return the per-cluster mean matrix for ``assignments``.

        A cluster without members keeps its centroid from ``previous`` when
        that is given; otherwise it is left as an all-zero matrix.
        """
        C = np.zeros(shape=(self.n_clusters, self.em.m, self.em.m), dtype='d')

        for k in range(self.n_clusters):
            matrices = self._transition_matrices_for_cluster(k, assignments)

            if len(matrices) == 0:
                # An all-zero centroid is not a meaningful cluster centre;
                # reuse the last known one so the cluster can regain members.
                if previous is not None:
                    C[k] = previous[k]
                continue

            C[k] = np.mean(matrices, axis=0)
        return C

    def _cost(self, C, assignments):
        """Sum of ``hmm_distance`` between every host matrix and its centroid."""
        cost = 0
        for k in range(self.n_clusters):
            for transition_matrix in self._transition_matrices_for_cluster(k, assignments):
                cost += hmm_distance(transition_matrix, C[k])
        return cost

    def run(self):
        """Run ``n_runs`` random restarts and keep the lowest-cost clustering.

        Returns
        -------
        (best_C, best_assignment)
            Centroids and host -> cluster mapping of the best run, or
            ``(None, None)`` when ``n_runs`` is 0. ``self.centers`` and
            ``self.assignments`` are set to the same best result.
        """
        min_cost = float('+inf')
        best_C = None
        best_assignment = None

        # Use the model this instance was built with, not a notebook global.
        hosts = list(self.em.hosts)

        for _ in range(self.n_runs):
            print('Starting run')
            # Randomly assign each host to a cluster to seed the centroids.
            initial = dict(zip(hosts, np.random.randint(0, self.n_clusters, len(hosts))))

            C = self._compute_centroids(initial)

            C, assignments = self._base_kmeans(C)
            clust_cost = self._cost(C, assignments)

            if best_C is None or clust_cost < min_cost:
                # Record the new best cost; without this every later run
                # would overwrite the best result regardless of its cost.
                min_cost = clust_cost
                best_C = C.copy()
                best_assignment = assignments.copy()

        if best_C is not None:
            # _base_kmeans leaves the *last* run on the instance; expose the
            # best one so classify() and external readers agree with the
            # returned values.
            self.centers = best_C
            self.assignments = best_assignment

        return best_C, best_assignment

    def _assign_hosts(self, C):
        """Map every host to the index of its nearest centroid in ``C``."""
        assignments = {}
        for host in self.em.hosts:
            matrix = self.em.hosts[host]['transitiion_matrix']
            distances = np.array([hmm_distance(matrix, C[i])
                                  for i in range(self.n_clusters)])
            assignments[host] = np.argmin(distances)
        return assignments

    def _base_kmeans(self, C):
        """Iterate assignment and centroid updates starting from centroids ``C``.

        Stops after ``n_iters`` iterations or as soon as two consecutive
        assignments are identical. Stores the outcome in ``self.centers`` and
        ``self.assignments`` and returns ``(centroids, assignments)``.
        """
        C_final = C
        assignments = None
        prev_assignments = None

        iters = self.n_iters
        converged = False

        while iters != 0 and not converged:
            # Assign elements to their nearest cluster.
            assignments = self._assign_hosts(C_final)

            # Converged when nothing moved; otherwise recompute the centroids.
            if prev_assignments is not None and prev_assignments == assignments:
                converged = True
                print('converged')
            else:
                C_final = self._compute_centroids(assignments, previous=C_final)
                print('not converged')

            prev_assignments = assignments
            iters -= 1

        if assignments is None:
            # n_iters == 0: still return a usable assignment for the cost step.
            assignments = self._assign_hosts(C_final)

        self.assignments = assignments
        self.centers = C_final
        return C_final, assignments

    def classify(self, data_point):
        """Return the index of the centroid nearest to ``data_point``."""
        return min(range(self.n_clusters), key=lambda p: hmm_distance(data_point, self.centers[p]))

In [2]:
# Prefer the standalone joblib package: the sklearn.externals.joblib shim was
# deprecated and later removed from scikit-learn. Fall back to it only on old
# installations that do not ship joblib separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles arbitrary Python objects and can execute
# code while doing so; only load files that come from a trusted source.
onlineEM = joblib.load('onlineEM_60_500_13_host_specific.pkl')
groupped_data = joblib.load('groupped_data_60_500.pkl')
groupped_data1 = joblib.load('groupped_data1_60_500.pkl')

In [94]:
# Cluster the hosts of onlineEM into 3 groups: 10 random restarts, each capped
# at 20 assignment/update iterations.
kmeans = kMeans(onlineEM, n_clusters=3, initial_centers=None, n_iters=20, n_runs=10)
C,assign = kmeans.run()

# Score the fitted clustering on columns 0, 1 and 3 of groupped_data.
# calculate_likelihood_em is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
calculate_likelihood_em(groupped_data.values[:,[0,1,3]], kmeans)

Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not conver

KeyboardInterrupt: 

In [9]:
from math import log
import scipy.stats.distributions


def poisson(x, l):
    """Product of Poisson pmf values over paired entries of ``x`` and ``l``.

    ``x`` supplies the observed values and ``l`` the matching rates. Pairing
    is done with ``zip``, so trailing entries of the longer argument are
    ignored. With no pairs at all the result is the integer ``1``.
    """
    pmf = scipy.stats.distributions.poisson.pmf
    joint = 1
    for observed, rate in zip(x, l):
        joint *= pmf(observed, rate)
    return joint

def calculate_likelihood_em(data, kmeans, take_mean=False, weight=0.5):
    """Mean per-point log-likelihood of ``data`` under the clustered model.

    For every point the likelihood is the sum, over the entries of
    ``onlineEM.lambdas``, of (transition probability out of the host's
    previous state) * ``poisson(point, lambda_i)``. The log of that sum is
    recorded and the mean of all recorded logs is returned.

    Parameters
    ----------
    data : iterable of points
        The last element of each point (``point[-1]``) is the host key.
        The whole point, host included, is passed to ``poisson``.
        NOTE(review): presumably each ``lambda_i`` is shorter than the point
        so the host entry is dropped by ``zip`` - confirm.
    kmeans : object
        Must expose ``centers`` and ``assignments`` (host -> cluster index).
    take_mean : bool
        When True, the transition row is a blend of the cluster centroid row
        and the host's own ``'transitiion_matrix'`` row.
    weight : float
        Share given to the centroid row in that blend; the host row gets
        ``1 - weight``. Ignored when ``take_mean`` is False.

    Notes
    -----
    Reads the notebook-level ``onlineEM`` object. Prints a running mean every
    10000 points.
    """
    # Start every host from its stored 'hard_previous' state so that repeated
    # calls begin from the same point.
    last_state = {host: onlineEM.hosts[host]['hard_previous']
                  for host in onlineEM.hosts}

    log_likelihoods = []

    for count, point in enumerate(data, start=1):
        if count % 10000 == 0:
            print(count, sum(log_likelihoods) / len(log_likelihoods))

        host = point[-1]
        state_before = last_state[host]

        # The state chosen for this point becomes the host's previous state
        # for its next point.
        last_state[host] = np.argmax(onlineEM.closest_centers([point]))

        cluster_row = kmeans.centers[kmeans.assignments[host]][state_before]
        if take_mean:
            host_row = onlineEM.hosts[host]['transitiion_matrix'][state_before]
            probabilities = cluster_row * weight + host_row * (1 - weight)
        else:
            probabilities = cluster_row

        emissions = np.array([poisson(point, lambda_i) for lambda_i in onlineEM.lambdas])
        log_likelihoods.append(log(np.sum(probabilities * emissions)))

    return sum(log_likelihoods) / len(log_likelihoods)

In [9]:
# Fit one clustering for every cluster count from 2 to 7 (5 random restarts,
# at most 15 iterations each) and keep each fitted model in kmeans_all.
kmeans_all = []
for n_clusters in range(2, 8):
    kmeans = kMeans(onlineEM, n_clusters=n_clusters, initial_centers=None, n_iters=15, n_runs=5)
    C, assign = kmeans.run()
    
    print('======================================================================')
    print('n_clusters', n_clusters)
    # Value returned for the first 189400 rows (columns 0, 1 and 3) of groupped_data1.
    print(calculate_likelihood_em(groupped_data1.values[:189400,[0,1,3]], kmeans))
    # Disabled: would print the clustering cost of the returned result.
    #print(kmeans._cost(C, assign))
    kmeans_all.append(kmeans)

Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
not converged
Starting run
not converged
not converged
not converged
not converged
not converged
not converged
not converg

KeyboardInterrupt: 

In [4]:
# One-off save of the fitted models, left commented out (presumably so that
# re-running this cell does not overwrite the stored file).
# joblib.dump(kmeans_all, 'kmeans_all_500_1_to_8_clusters__hmm_distance.pkl')

# Reload the stored models instead of refitting them.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
# Unpickling presumably needs the kMeans class to be defined first - confirm.
kmeans_all = joblib.load('kmeans_all_500_1_to_8_clusters__hmm_distance.pkl')

In [None]:
# Score every stored model using only the cluster centroid's transition row
# (take_mean=False) on the first 189400 rows of groupped_data1.
for kmeans in kmeans_all:
    print('For number of clusters', kmeans.n_clusters)
    print(calculate_likelihood_em(groupped_data1.values[:189400,[0,1,3]], kmeans, take_mean=False))

In [16]:
# Score every stored model with a blended transition row: weight 0.2 on the
# cluster centroid row and 0.8 on the host's own transition row.
for kmeans in kmeans_all:
    print('For number of clusters', kmeans.n_clusters)
    print(calculate_likelihood_em(groupped_data1.values[:189400,[0,1,3]], kmeans, take_mean=True, weight=0.2))

For number of clusters 1
10000 -3.5686028688481133
20000 -3.6353186820733177
30000 -3.660732577743154
40000 -3.680953896114238
50000 -3.7368592025754492
60000 -3.739507561721263
70000 -3.7387808708597903
80000 -3.750069394202289
90000 -3.747276533991048
100000 -3.735839695255946
110000 -3.7400657883494257
120000 -3.7295334966988505
130000 -3.71879402978936
140000 -3.7183607669537957
150000 -3.713751176625391
160000 -3.711128811906811
170000 -3.715513832479572
180000 -3.7174182176717667
-3.7185271668825983
For number of clusters 2
10000 -3.5658752270079646
20000 -3.633751853329529
30000 -3.6589692955947424
40000 -3.6796192311580893
50000 -3.736641532924678
60000 -3.7403358064861085
70000 -3.740075164202061
80000 -3.751739306091957
90000 -3.7491237965892372
100000 -3.7378046692782783
110000 -3.742140855867991
120000 -3.731580477807037
130000 -3.7207240096364593
140000 -3.720269042899402
150000 -3.7156724780053114
160000 -3.712945806208319
170000 -3.717551787187719
180000 -3.7195314576605

KeyboardInterrupt: 

In [17]:
# Score every stored model with a blended transition row: equal weight (0.5)
# on the cluster centroid row and on the host's own transition row.
for kmeans in kmeans_all:
    print('For number of clusters', kmeans.n_clusters)
    print(calculate_likelihood_em(groupped_data1.values[:189400,[0,1,3]], kmeans, take_mean=True, weight=0.5))

For number of clusters 1
10000 -3.620677222455439
20000 -3.6838785162212107
30000 -3.7062074132723066
40000 -3.7234737331151706
50000 -3.775355560476942
60000 -3.7773153464093827
70000 -3.777064351710199
80000 -3.7876366572709945
90000 -3.7854355778202593
100000 -3.7745135274657646
110000 -3.778547740907677
120000 -3.7685553359603796
130000 -3.758453519618155
140000 -3.757946088835174
150000 -3.7532973582796743
160000 -3.750568073675632
170000 -3.754177264203289
180000 -3.755827562018364
-3.7566040137456946
For number of clusters 2
10000 -3.6106902511159875
20000 -3.6756207035175197
30000 -3.6977329693475545
40000 -3.7159255523838195
50000 -3.7699149089882935
60000 -3.7737173205672523
70000 -3.774120714731806
80000 -3.7853276686286716
90000 -3.783343042271286
100000 -3.7725489650811594
110000 -3.7767318593161847
120000 -3.7666382266819425
130000 -3.7563840685985865
140000 -3.75588187140704
150000 -3.751271403296186
160000 -3.7485455149498206
170000 -3.7525999016886455
180000 -3.7544509

KeyboardInterrupt: 