In [9]:
from hmmlearn.hmm import GaussianHMM
import pandas as pd
import numpy as np
import pickle

## IP Addresses

### Infected hosts
    - 147.32.84.165
    - 147.32.84.191
    - 147.32.84.192
    - 147.32.84.193
    - 147.32.84.204
    - 147.32.84.205
    - 147.32.84.206
    - 147.32.84.207
    - 147.32.84.208
    - 147.32.84.209
        
### Normal hosts:
    - 147.32.84.170
    - 147.32.84.134
    - 147.32.84.164
    - 147.32.87.36
    - 147.32.80.9
    - 147.32.87.11


In [10]:
with open('discretized_data/all_discretized_protocol_bytes.pkl', 'rb') as f:
    data = pickle.load(f)
infected_ip = '147.32.84.165'
data[(data['src_ip']== infected_ip) | (data['dst_ip']== infected_ip)]

Unnamed: 0,date,duration,protocol,src_ip,src_port,dst_ip,dst_port,flags,tos,packets,bytes,flows,label,protocol_num,flags_num,bytes_num,encoded
19115,2011-08-18 10:37:58.448,0.000,UDP,147.32.84.165,1025,147.32.80.9,53,INT,0,1,64,1,Botnet,1,5,0,4.0
19116,2011-08-18 10:37:58.448,0.000,UDP,147.32.80.9,53,147.32.84.165,1025,INT,0,1,139,1,Botnet,1,5,0,4.0
19203,2011-08-18 10:38:00.695,0.000,UDP,147.32.84.165,1025,147.32.80.9,53,INT,0,1,87,1,Botnet,1,5,0,4.0
19205,2011-08-18 10:38:00.705,0.000,UDP,147.32.80.9,53,147.32.84.165,1025,INT,0,1,503,1,Botnet,1,5,1,5.0
19206,2011-08-18 10:38:00.706,0.043,TCP,147.32.84.165,1027,74.125.232.206,80,SRPA_,0,4,629,1,Botnet,0,15,1,1.0
19207,2011-08-18 10:38:00.714,0.032,TCP,74.125.232.206,80,147.32.84.165,1027,SPA_,0,3,253,1,Botnet,0,3,0,0.0
19227,2011-08-18 10:38:03.255,0.000,UDP,147.32.84.165,1025,147.32.80.9,53,INT,0,1,76,1,Botnet,1,5,0,4.0
19228,2011-08-18 10:38:03.266,0.000,UDP,147.32.80.9,53,147.32.84.165,1025,INT,0,1,403,1,Botnet,1,5,1,5.0
19229,2011-08-18 10:38:03.267,2.990,UDP,147.32.84.165,123,65.55.56.40,123,INT,0,2,180,1,Botnet,1,5,0,4.0
26082,2011-08-18 10:44:50.523,0.000,TCP,147.32.96.45,2097,147.32.84.165,21,S_,0,1,74,1,Botnet,0,10,0,0.0


In [11]:
# the infected host flows that we will profile
chosen = data[(data['src_ip'] == infected_ip) | (data['dst_ip'] == infected_ip)]
# rest of the hosts split between benign and malicious for testing purposes
normal =  ['147.32.84.170', '147.32.84.134', '147.32.84.164', '147.32.87.36', '147.32.80.9', '147.32.87.11']
infected = ['147.32.84.191','147.32.84.192','147.32.84.193' , '147.32.84.204', '147.32.84.205',
            '147.32.84.206', '147.32.84.207','147.32.84.208','147.32.84.209']


In [12]:
def get_windows(data, window_size):
    size = len(data) - window_size
    # create sliding window data
    win_data = np.zeros((size,window_size),dtype=np.int32)
    for i in range(size):
        win_data[i] = np.array([flow for flow in data['encoded'][i:i+window_size]])
    return win_data

In [13]:
# define sliding window size
win = 5

win_data = get_windows(chosen, win)

# learn a Gaussian Hidden Markov Model with 4 states from the infected host data
hmm = GaussianHMM(n_components=4)
hmm.fit(win_data)
# store the log-likelihood of the host that trained the model
log_likelihood = hmm.decode(win_data)[0]

In [14]:
log_likelihood

176224.03608105314

In [15]:
hosts_log_likelihood = {}
win = 5
# compute log-likelihood of data sequence of normal IPs
for ip in normal:
    # get the flows of that host only
    host_data = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
    size = len(host_data) - win
    # if host has enough flows for creating a window
    if size > 0:
        # create sliding windows sequences
        normal_data = get_windows(host_data, win)
        # get the log-likelihood of the sequential data
        hosts_log_likelihood[ip] = hmm.decode(normal_data)[0]
    else:
        hosts_log_likelihood[ip] = 0

# repeat procedure for all infected IPs
for ip in infected:
    # get the flows of that host only
    host_data = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
    size = len(host_data) - win
    # if host has enough flows for creating a window
    if size > 0:
        # create sliding windows sequences
        infected_data = get_windows(host_data, win)
        # get the log-likelihood of the sequential data
        hosts_log_likelihood[ip] = hmm.decode(infected_data)[0]
    else:
        hosts_log_likelihood[ip] = 0

In [16]:
hosts_log_likelihood

{'147.32.84.170': -247565.52165472033,
 '147.32.84.134': -140377.41615853438,
 '147.32.84.164': -132427.65920189212,
 '147.32.87.36': -63014.78718325766,
 '147.32.80.9': -112623.64246361004,
 '147.32.87.11': 0,
 '147.32.84.191': 193534.12346751493,
 '147.32.84.192': 196883.0322018428,
 '147.32.84.193': 186180.06645540014,
 '147.32.84.204': 209701.7253157743,
 '147.32.84.205': 227326.40712409894,
 '147.32.84.206': 213948.40606197485,
 '147.32.84.207': 198066.68492343766,
 '147.32.84.208': 212495.93040175017,
 '147.32.84.209': 179933.08780984755}

In [18]:
# evaluate results using the log-likelihood distance of hosts from the one who trained the model
TP = 0
TN = 0
FP = 0
FN = 0
positives = []
negatives = []

# 
dist = {}
for ip in hosts_log_likelihood.keys():
    # absolute log-likelihood distance
    dist[ip] = abs(hosts_log_likelihood[ip] - log_likelihood)
    # threshold is half log-likelihood
    if dist[ip] > log_likelihood / 2:
        negatives.append(ip)
    else:
        positives.append(ip)

# evaluate all potentially malicious hosts
for i in positives:
    if i in infected:
        TP += 1
    else:
        print(i)
        FP += 1

# evaluate all potentially benign hosts
for i in negatives:
    if i in normal:
        TN += 1
    else:
        FN += 1

precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TP + TN + FP + FN)
print('True Positives : {}'.format(TP))
print('False Positives : {}'.format(FP))
print('True Negatives : {}'.format(TN))
print('False Negatives : {}'.format(FN))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('Accuracy: {}'.format(accuracy))

True Positives : 9
False Positives : 0
True Negatives : 6
False Negatives : 0
Precision: 1.0
Recall: 1.0
Accuracy: 1.0
