In [1]:
# scikit-learn deprecated its vendored copy of joblib in 0.21 and removed it
# in 0.23. Prefer the standalone package; fall back for old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load pickles that come from a trusted source.
onlineEM = joblib.load('pkl/onlineEM_15_days_7_alligned.pkl')

In [2]:
import numpy as np

# Size of the square smoothing matrix, read from the loaded model.
m = onlineEM.m

# Smoothing prior: weight 1 on the diagonal and 0.5 everywhere else, with each
# row divided by its total (1 + 0.5 * (m - 1)) so that every row sums to 1.
initialize = np.full((m, m), 0.5)
np.fill_diagonal(initialize, 1.0)
initialize /= (1 + 0.5 * (m - 1))

# Blend every row of every host's transition matrix with the prior, in place.
# A row backed by `points` observations keeps weight points / (points + 1);
# the prior row receives the remaining 1 / (points + 1).
for host in onlineEM.hosts:
    host_state = onlineEM.hosts[host]
    transition = host_state['transition_matrix']
    for row, points in enumerate(host_state['points_per_cluster']):
        data_weight = points / (points + 1)
        transition[row] = data_weight * transition[row] + initialize[row] / (points + 1)

In [3]:
from numpy.linalg import norm
import random as ran
import numpy as np
from math import log
from scipy.linalg import eig 
import sys

def my_kl_distance2(P, Q):
    """Kullback-Leibler divergence D(P || Q) of two discrete distributions.

    The natural logarithm is used. Entries with P[i] == 0 contribute nothing
    (the usual 0 * log(0) = 0 convention).

    If Q[i] == 0 while P[i] is non-zero the divergence is infinite, and
    float('inf') is returned. Previously this case raised ZeroDivisionError
    for plain Python floats.

    Parameters:
        P, Q: indexable sequences of probabilities; Q must be at least as
            long as P.

    Returns:
        The divergence as a float (0 when P is empty or all zeros).
    """
    total_distance = 0
    for i in range(len(P)):
        p, q = P[i], Q[i]
        if p == 0:
            continue
        if q == 0:
            # P puts mass where Q has none: the divergence is unbounded.
            return float('inf')
        total_distance += p * log(p / q)
    return total_distance

def my_solveStationary(A):
    """Least-squares solution x of x = xA subject to sum(x) = 1.

    The condition x (I - A) = 0 is transposed to (I - A)^T x = 0 and the
    normalisation row 1^T x = 1 is appended, giving an overdetermined
    (n + 1) x n linear system that is handed to np.linalg.lstsq.

    Parameters:
        A: square 2-D array (n x n).

    Returns:
        The solution with singleton dimensions squeezed out.
    """
    size = A.shape[0]
    # First `size` rows encode (I - A)^T x = 0; the last row encodes sum(x) = 1.
    system = np.vstack(((np.eye(size) - A).T, np.ones(size)))
    rhs = np.zeros(size + 1)
    rhs[-1] = 1
    solution = np.linalg.lstsq(system, rhs)[0]
    return np.squeeze(np.asarray(solution))

def my_distance(A1, A2):
    """Directed distance from matrix A1 to matrix A2.

    For every row i, twice the KL divergence of A1[i] from A2[i] is weighted
    by entry i of my_solveStationary(A1); the weighted terms are summed.
    """
    weights = my_solveStationary(A1)
    row_terms = []
    for row, weight in enumerate(weights):
        row_terms.append(weight * (2 * my_kl_distance2(A1[row], A2[row])))
    # An unweighted variant (same sum without the stationary weights) was
    # tried earlier and is intentionally left disabled.
    return sum(row_terms)

def my_hmm_distance(A1, A2, K=7):
    """Symmetric distance between two flattened K x K matrices.

    Both inputs are reshaped to (K, K); the result is the mean of
    my_distance taken in each direction.
    """
    first = np.reshape(A1, (K, K))
    second = np.reshape(A2, (K, K))
    forward = my_distance(first, second)
    backward = my_distance(second, first)
    return (forward + backward) / 2

In [6]:
# One flattened transition matrix per host, in onlineEM.hosts iteration order.
X = [
    onlineEM.hosts[host]['transition_matrix'].ravel()
    for host in onlineEM.hosts
]

In [5]:
from sklearn.neighbors import LocalOutlierFactor 

# Local Outlier Factor over the flattened transition matrices, using the
# custom matrix distance as the metric.
lof = LocalOutlierFactor(
    n_neighbors=10,
    algorithm='auto',
    leaf_size=30,
    metric=my_hmm_distance,
    contamination=0.005,
)

# One label per row of X; the cell that prints hosts treats -1 as an outlier.
y_pred = lof.fit_predict(X)

In [6]:
# Print every host whose label is -1. y_pred is assumed to follow the
# iteration order of onlineEM.hosts (the order X was built in) — TODO confirm.
for position, host in enumerate(onlineEM.hosts):
    if y_pred[position] == -1:
        print(host)

C5732
C22271
C5739
C15086
C20690
C12366
C4436
C21986
C2217
C25525
C2685
C4473


In [7]:
# [host, score] pairs; each score is the one-element array returned by the
# fitted model's (private) _decision_function for that host's flattened matrix.
distances = [
    [host, lof._decision_function([onlineEM.hosts[host]['transition_matrix'].ravel()])]
    for host in onlineEM.hosts
]

# Ten highest scores first.
sorted(distances, key=lambda entry: -entry[1])[:10]

[['C27137', array([-0.88837353])],
 ['C12473', array([-0.89410751])],
 ['C6469', array([-0.89654995])],
 ['C6353', array([-0.89800753])],
 ['C4643', array([-0.89949698])],
 ['C6229', array([-0.90027119])],
 ['C144', array([-0.9009236])],
 ['C23382', array([-0.90177968])],
 ['C4593', array([-0.90525202])],
 ['C4745', array([-0.90541913])]]

In [8]:
# Ten lowest scores first.
sorted(distances, key=lambda entry: entry[1])[:10]

[['C2685', array([-22360400.31721617])],
 ['C5732', array([-14373039.841265])],
 ['C2217', array([-8551388.23352829])],
 ['C20690', array([-3587592.5357136])],
 ['C22271', array([-2044197.29270694])],
 ['C4473', array([-1927185.73624539])],
 ['C15086', array([-1732276.38833306])],
 ['C5739', array([-1517944.372956])],
 ['C12366', array([-1501514.32207707])],
 ['C4436', array([-931478.59931947])]]

neighbors = 20

0.02

C6177
C24135
C5732
C6016
C18263
C3873
C23304
C17185
C2972
C18261
C4207
C11573
C3268
C473
C14014
C2040
C16029
C6144
C25059
C21072
C22967
C6383
C979
C743
C1059
C5089
C5688
C5653
C18057
C9156
C16712
C21986
C2217
C5730
C1592
C3871
C5624
C1968
C23769
C5741
C5753
C4979
C17874
C25466
C24136
C6339

0.005

C18263
C3873
C23304
C2972
C18261
C4207
C14014
C21072
C5089
C5653
C18057
C16712

neighbors = 15

0.005

[['C23304', array([-21953.71514166])],
 ['C5653', array([-15267.28601233])],
 ['C5732', array([-6990.14215418])],
 ['C2685', array([-6002.80012367])],
 ['C2217', array([-5479.61569629])],
 ['C21986', array([-5057.10818874])],
 ['C22271', array([-4537.11154405])],
 ['C2040', array([-2438.24801088])],
 ['C20690', array([-1487.46729654])],
 ['C15459', array([-1273.65808549])]]
 
neighbors = 10

0.005

[['C2685', array([-22360400.31721617])],
 ['C5732', array([-14373039.841265])],
 ['C2217', array([-8551388.23352829])],
 ['C20690', array([-3587592.5357136])],
 ['C22271', array([-2044197.29270694])],
 ['C4473', array([-1927185.73624539])],
 ['C15086', array([-1732276.38833306])],
 ['C5739', array([-1517944.372956])],
 ['C12366', array([-1501514.32207707])],
 ['C4436', array([-931478.59931947])]]

In [10]:
import pandas as pd

# Load the four pickled chunks and stack them, in this order, into one frame.
# NOTE(review): joblib.load unpickles its input; only use trusted files.
test1 = joblib.load('pkl/groupped_data_15_days_00001_04320.pkl')
test2 = joblib.load('pkl/groupped_data_15_days_04321_08640.pkl')
test3 = joblib.load('pkl/groupped_data_15_days_08641_12960.pkl')
test4 = joblib.load('pkl/groupped_data_15_days_12961_17280.pkl')
# pd.concat keeps each chunk's own index labels (ignore_index is not set).
groupped_data = pd.concat([test1, test2, test3, test4])

In [11]:
# RANDOM COMBINATION OF FLOWS AND NUMBER OF BYTES
import random as ran
import random

# Distinct values observed in each of the two columns, sampled independently.
random_flows = list(set(groupped_data['number of flows'].values))
random_average_bytes = list(set(groupped_data['mean(byte count)'].values))

anomalous_host = 'A50'
epochs = 1500

# Every synthetic point pairs an independently drawn flow count with an
# independently drawn mean byte count. (An earlier variant drew both values
# with ran.randint(1, 50) instead.)
data = [
    [random.choice(random_flows), random.choice(random_average_bytes), anomalous_host]
    for _ in range(epochs)
]

# Feed the synthetic points to the model one at a time.
for data_point in data:
    onlineEM.update_host(data_point)

  previous)


In [14]:
#RANDOM VALUE ENCOUNTERED IN THE SET OF GROUPED_DATA
import random as ran
import random

size = 10000

test_df = groupped_data.head(size)
# Distinct pairs taken from the first two columns of the first `size` rows.
# The set is materialised as a list because np.vstack requires a real
# sequence; recent NumPy releases reject a bare set.
unique_values = np.vstack(list({tuple(row) for row in test_df.values[:, :2]}))

anomalous_host = 'A61'
epochs = 1500

# Each synthetic point is one distinct observed pair, drawn uniformly.
data = []
for i in range(epochs):
    values = random.choice(unique_values)
    data.append([values[0], values[1], anomalous_host])

# Feed the synthetic points to the model one at a time.
for data_point in data:
    onlineEM.update_host(data_point)

  previous)


In [17]:
#RANDOM VALUE ENCOUNTERED IN GROUPED_DATA
import random as ran
import random

size = 10000

test_df = groupped_data.head(size)
# Build the candidate rows once; the original rebuilt test_df.values[:, :2]
# on every loop iteration even though it never changes.
candidate_rows = test_df.values[:, :2]

anomalous_host = 'A72'
epochs = 1500

# Each synthetic point is one row drawn uniformly, duplicates included, so
# frequent pairs are drawn proportionally more often.
data = []
for i in range(epochs):
    values = random.choice(candidate_rows)
    data.append([values[0], values[1], anomalous_host])

# Feed the synthetic points to the model one at a time.
for data_point in data:
    onlineEM.update_host(data_point)

In [18]:
# Blend the synthetic host's transition matrix with the smoothing prior, in
# place: a row with `points` observations keeps weight points / (points + 1).
host_state = onlineEM.hosts[anomalous_host]
tm = host_state['transition_matrix']
for i, points in enumerate(host_state['points_per_cluster']):
    data_weight = points / (points + 1)
    tm[i] = data_weight * tm[i] + initialize[i] / (points + 1)

In [13]:
# Score whichever host `anomalous_host` currently names against the fitted model.
print('For host', anomalous_host)
anomalous_row = onlineEM.hosts[anomalous_host]['transition_matrix'].ravel()
lof._decision_function([anomalous_row])

For host A50


array([-189.37802117])

In [16]:
# Score whichever host `anomalous_host` currently names against the fitted model.
print('For host', anomalous_host)
anomalous_row = onlineEM.hosts[anomalous_host]['transition_matrix'].ravel()
lof._decision_function([anomalous_row])

For host A61


array([-1.68110665])

In [19]:
# Score whichever host `anomalous_host` currently names against the fitted model.
print('For host', anomalous_host)
anomalous_row = onlineEM.hosts[anomalous_host]['transition_matrix'].ravel()
lof._decision_function([anomalous_row])

For host A72


array([-2.06129155])

In [20]:
import random as ran

def random_vector(size=7):
    """Return `size` uniform random draws, scaled so that they sum to 1."""
    draws = []
    for _ in range(size):
        draws.append(ran.random())
    total = sum(draws)
    return np.array(draws) / total

def random_transition_matrix(size=7):
    """Stack `size` independent random_vector(size) rows into a square matrix."""
    rows = []
    for _ in range(size):
        rows.append(random_vector(size=size))
    return np.vstack(rows)

In [25]:
# Score one freshly drawn random matrix for comparison.
random_matrix = random_transition_matrix()
lof._decision_function([random_matrix.ravel()])

array([-6.3504255])

In [45]:
def percentage_anomalous(sorted_distances, value):
    """Fraction of entries whose second field is strictly below `value`.

    Parameters:
        sorted_distances: sequence of [label, score] pairs. Only the score
            (index 1) is read, and the ordering of the pairs does not affect
            the result.
        value: threshold the scores are compared against.

    Returns:
        (number of scores < value) / (number of entries).
    """
    total = len(sorted_distances)
    table = np.vstack(np.array(sorted_distances, dtype=object))
    scores = table[:, 1]
    below = np.where(scores < value)[0]
    return len(below) / total

# Share of hosts whose score falls below -1.6.
percentage_anomalous(sorted(distances, key=lambda entry: -entry[1]), -1.6)

0.2896825396825397

In [8]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the flattened transition matrices in X and keep
# the projected coordinates in Y.
pca = PCA(n_components=2)
Y = pca.fit_transform(X)

In [9]:
import matplotlib.pyplot as plt
%matplotlib inlin

array([ 0.63973451, -0.33119189])