In [1]:
import random
import pandas as pd
import collections
import operator
import mmh3
import math

aa= pd.read_csv('capture20110811.pcap.netflow.labeled',delimiter=' ')
columns=['Date','Duration','Protocol','Source','Dest','Label']
lst=[]
with open('capture20110811.pcap.netflow.labeled') as fp:  
    for cnt, line in enumerate(fp):
        if cnt!=0:
            dat=line.split("\t")
            lst.append([dat[0],dat[1],dat[2],dat[3].split(':')[0],dat[5].split(':')[0],dat[11].split('\n')[0]])
dataset=pd.DataFrame(lst, columns=columns)

In [2]:
# only malware
infected_host='147.32.84.165'
infected_dataset=dataset.loc[(dataset['Source']==infected_host) | (dataset['Dest']==infected_host)]
infected_dataset=infected_dataset.reset_index()

### First we compute the 10 most frequently connected ips with the selected host

which will be our ground truth form now on

In [3]:
def return_k_frequent_ips(dataset, k):
    ips=[]

    for i in range (0,len(dataset)):
        if (dataset.Source[i] == infected_host):
            ips.append(dataset.Dest[i])
        else:
            ips.append(dataset.Source[i])

    counter=collections.Counter(ips)
    return(counter.most_common(k), ips)

In [4]:
#find the 10 most frequently used ips except for the host
most_freq,ips = return_k_frequent_ips(infected_dataset, 10)
print(most_freq)

[('193.23.181.44', 6442), ('174.128.246.102', 4101), ('174.37.196.55', 3707), ('173.236.31.226', 3410), ('184.154.89.154', 3344), ('67.19.72.206', 3135), ('72.20.15.61', 2966), ('46.4.36.120', 1627), ('212.117.171.138', 967), ('147.32.80.9', 787)]


In [5]:
# print the number of connections and the percentage they posses in the whole dataset
for t in most_freq:
    print('number of connections:',t[1], 'percentile', t[1]*100.0/len(infected_dataset))

('number of connections:', 6442, 'percentile', 12.434133065683568)
('number of connections:', 4101, 'percentile', 7.9156131174120326)
('number of connections:', 3707, 'percentile', 7.155127487502171)
('number of connections:', 3410, 'percentile', 6.581868015209713)
('number of connections:', 3344, 'percentile', 6.454477021366944)
('number of connections:', 3135, 'percentile', 6.05107220753151)
('number of connections:', 2966, 'percentile', 5.724874056631087)
('number of connections:', 1627, 'percentile', 3.1403810148815845)
('number of connections:', 967, 'percentile', 1.8664710764538979)
('number of connections:', 787, 'percentile', 1.5190410932463472)


## The 10 most frequent ips with MIN-WISE sampling for different reservoir sizes

In [6]:
#MIN-WISE sampling
#select the k smallest values

for k in range(100, 10100, 100):
    for index, row in infected_dataset.iterrows():
        #begin by setting a random value at each row
        a = random.uniform(0,1)
        infected_dataset.set_value(index,'rn',a)

    sort_infections = infected_dataset.sort_values(['rn'], ascending=[True])
    sel_k = sort_infections[0:k]
    sel_k=sel_k.reset_index(level=0, drop=True)
    most_freq_minwise, ips_minwise = return_k_frequent_ips(sel_k, 10)
    print('=======> Value of reservoir:',k)
    print('Top 10:', most_freq_minwise)
    for t in most_freq_minwise:
        print('Element',t,'percentage', t[1]*100.0/k, '%')


  


('Top 10:', [('193.23.181.44', 11), ('67.19.72.206', 10), ('174.37.196.55', 8), ('72.20.15.61', 7), ('174.128.246.102', 5), ('147.32.80.9', 4), ('212.117.171.138', 4), ('46.4.36.120', 3), ('173.192.170.88', 3), ('173.236.31.226', 2)])
('Element', ('193.23.181.44', 11), 'percentage', 11.0, '%')
('Element', ('67.19.72.206', 10), 'percentage', 10.0, '%')
('Element', ('174.37.196.55', 8), 'percentage', 8.0, '%')
('Element', ('72.20.15.61', 7), 'percentage', 7.0, '%')
('Element', ('174.128.246.102', 5), 'percentage', 5.0, '%')
('Element', ('147.32.80.9', 4), 'percentage', 4.0, '%')
('Element', ('212.117.171.138', 4), 'percentage', 4.0, '%')
('Element', ('46.4.36.120', 3), 'percentage', 3.0, '%')
('Element', ('173.192.170.88', 3), 'percentage', 3.0, '%')
('Element', ('173.236.31.226', 2), 'percentage', 2.0, '%')
('Top 10:', [('193.23.181.44', 28), ('174.128.246.102', 15), ('67.19.72.206', 15), ('72.20.15.61', 12), ('174.37.196.55', 12), ('173.236.31.226', 9), ('184.154.89.154', 9), ('147.32.

('Top 10:', [('193.23.181.44', 112), ('174.128.246.102', 99), ('173.236.31.226', 80), ('67.19.72.206', 67), ('72.20.15.61', 59), ('184.154.89.154', 56), ('174.37.196.55', 56), ('46.4.36.120', 28), ('212.117.171.138', 21), ('217.163.21.36', 19)])
('Element', ('193.23.181.44', 112), 'percentage', 11.2, '%')
('Element', ('174.128.246.102', 99), 'percentage', 9.9, '%')
('Element', ('173.236.31.226', 80), 'percentage', 8.0, '%')
('Element', ('67.19.72.206', 67), 'percentage', 6.7, '%')
('Element', ('72.20.15.61', 59), 'percentage', 5.9, '%')
('Element', ('184.154.89.154', 56), 'percentage', 5.6, '%')
('Element', ('174.37.196.55', 56), 'percentage', 5.6, '%')
('Element', ('46.4.36.120', 28), 'percentage', 2.8, '%')
('Element', ('212.117.171.138', 21), 'percentage', 2.1, '%')
('Element', ('217.163.21.36', 19), 'percentage', 1.9, '%')
('Top 10:', [('193.23.181.44', 124), ('184.154.89.154', 86), ('174.37.196.55', 74), ('174.128.246.102', 70), ('173.236.31.226', 70), ('67.19.72.206', 69), ('72.2

('Top 10:', [('193.23.181.44', 199), ('174.37.196.55', 150), ('174.128.246.102', 147), ('173.236.31.226', 132), ('67.19.72.206', 123), ('184.154.89.154', 120), ('72.20.15.61', 105), ('46.4.36.120', 61), ('212.117.171.138', 37), ('173.192.170.88', 26)])
('Element', ('193.23.181.44', 199), 'percentage', 10.473684210526315, '%')
('Element', ('174.37.196.55', 150), 'percentage', 7.894736842105263, '%')
('Element', ('174.128.246.102', 147), 'percentage', 7.7368421052631575, '%')
('Element', ('173.236.31.226', 132), 'percentage', 6.947368421052632, '%')
('Element', ('67.19.72.206', 123), 'percentage', 6.473684210526316, '%')
('Element', ('184.154.89.154', 120), 'percentage', 6.315789473684211, '%')
('Element', ('72.20.15.61', 105), 'percentage', 5.526315789473684, '%')
('Element', ('46.4.36.120', 61), 'percentage', 3.210526315789474, '%')
('Element', ('212.117.171.138', 37), 'percentage', 1.9473684210526316, '%')
('Element', ('173.192.170.88', 26), 'percentage', 1.368421052631579, '%')
('Top

('Top 10:', [('193.23.181.44', 337), ('174.128.246.102', 222), ('174.37.196.55', 204), ('173.236.31.226', 183), ('67.19.72.206', 171), ('184.154.89.154', 168), ('72.20.15.61', 148), ('46.4.36.120', 77), ('173.192.170.88', 52), ('212.117.171.138', 50)])
('Element', ('193.23.181.44', 337), 'percentage', 12.035714285714286, '%')
('Element', ('174.128.246.102', 222), 'percentage', 7.928571428571429, '%')
('Element', ('174.37.196.55', 204), 'percentage', 7.285714285714286, '%')
('Element', ('173.236.31.226', 183), 'percentage', 6.535714285714286, '%')
('Element', ('67.19.72.206', 171), 'percentage', 6.107142857142857, '%')
('Element', ('184.154.89.154', 168), 'percentage', 6.0, '%')
('Element', ('72.20.15.61', 148), 'percentage', 5.285714285714286, '%')
('Element', ('46.4.36.120', 77), 'percentage', 2.75, '%')
('Element', ('173.192.170.88', 52), 'percentage', 1.8571428571428572, '%')
('Element', ('212.117.171.138', 50), 'percentage', 1.7857142857142858, '%')
('Top 10:', [('193.23.181.44', 3

('Top 10:', [('193.23.181.44', 439), ('174.128.246.102', 319), ('174.37.196.55', 287), ('173.236.31.226', 253), ('184.154.89.154', 248), ('72.20.15.61', 235), ('67.19.72.206', 217), ('46.4.36.120', 130), ('212.117.171.138', 54), ('147.32.80.9', 51)])
('Element', ('193.23.181.44', 439), 'percentage', 11.864864864864865, '%')
('Element', ('174.128.246.102', 319), 'percentage', 8.621621621621621, '%')
('Element', ('174.37.196.55', 287), 'percentage', 7.756756756756757, '%')
('Element', ('173.236.31.226', 253), 'percentage', 6.837837837837838, '%')
('Element', ('184.154.89.154', 248), 'percentage', 6.702702702702703, '%')
('Element', ('72.20.15.61', 235), 'percentage', 6.351351351351352, '%')
('Element', ('67.19.72.206', 217), 'percentage', 5.864864864864865, '%')
('Element', ('46.4.36.120', 130), 'percentage', 3.5135135135135136, '%')
('Element', ('212.117.171.138', 54), 'percentage', 1.4594594594594594, '%')
('Element', ('147.32.80.9', 51), 'percentage', 1.3783783783783783, '%')
('Top 10

('Top 10:', [('193.23.181.44', 581), ('174.128.246.102', 388), ('174.37.196.55', 323), ('184.154.89.154', 293), ('67.19.72.206', 291), ('173.236.31.226', 268), ('72.20.15.61', 237), ('46.4.36.120', 147), ('212.117.171.138', 92), ('147.32.80.9', 82)])
('Element', ('193.23.181.44', 581), 'percentage', 12.630434782608695, '%')
('Element', ('174.128.246.102', 388), 'percentage', 8.434782608695652, '%')
('Element', ('174.37.196.55', 323), 'percentage', 7.021739130434782, '%')
('Element', ('184.154.89.154', 293), 'percentage', 6.369565217391305, '%')
('Element', ('67.19.72.206', 291), 'percentage', 6.326086956521739, '%')
('Element', ('173.236.31.226', 268), 'percentage', 5.826086956521739, '%')
('Element', ('72.20.15.61', 237), 'percentage', 5.1521739130434785, '%')
('Element', ('46.4.36.120', 147), 'percentage', 3.1956521739130435, '%')
('Element', ('212.117.171.138', 92), 'percentage', 2.0, '%')
('Element', ('147.32.80.9', 82), 'percentage', 1.7826086956521738, '%')
('Top 10:', [('193.23.

('Top 10:', [('193.23.181.44', 664), ('174.128.246.102', 437), ('174.37.196.55', 414), ('173.236.31.226', 360), ('67.19.72.206', 352), ('184.154.89.154', 351), ('72.20.15.61', 300), ('46.4.36.120', 183), ('212.117.171.138', 93), ('147.32.80.9', 79)])
('Element', ('193.23.181.44', 664), 'percentage', 12.072727272727272, '%')
('Element', ('174.128.246.102', 437), 'percentage', 7.945454545454545, '%')
('Element', ('174.37.196.55', 414), 'percentage', 7.527272727272727, '%')
('Element', ('173.236.31.226', 360), 'percentage', 6.545454545454546, '%')
('Element', ('67.19.72.206', 352), 'percentage', 6.4, '%')
('Element', ('184.154.89.154', 351), 'percentage', 6.381818181818182, '%')
('Element', ('72.20.15.61', 300), 'percentage', 5.454545454545454, '%')
('Element', ('46.4.36.120', 183), 'percentage', 3.327272727272727, '%')
('Element', ('212.117.171.138', 93), 'percentage', 1.690909090909091, '%')
('Element', ('147.32.80.9', 79), 'percentage', 1.4363636363636363, '%')
('Top 10:', [('193.23.18

('Top 10:', [('193.23.181.44', 792), ('174.128.246.102', 512), ('174.37.196.55', 453), ('173.236.31.226', 450), ('184.154.89.154', 423), ('67.19.72.206', 370), ('72.20.15.61', 366), ('46.4.36.120', 192), ('212.117.171.138', 114), ('147.32.80.9', 92)])
('Element', ('193.23.181.44', 792), 'percentage', 12.375, '%')
('Element', ('174.128.246.102', 512), 'percentage', 8.0, '%')
('Element', ('174.37.196.55', 453), 'percentage', 7.078125, '%')
('Element', ('173.236.31.226', 450), 'percentage', 7.03125, '%')
('Element', ('184.154.89.154', 423), 'percentage', 6.609375, '%')
('Element', ('67.19.72.206', 370), 'percentage', 5.78125, '%')
('Element', ('72.20.15.61', 366), 'percentage', 5.71875, '%')
('Element', ('46.4.36.120', 192), 'percentage', 3.0, '%')
('Element', ('212.117.171.138', 114), 'percentage', 1.78125, '%')
('Element', ('147.32.80.9', 92), 'percentage', 1.4375, '%')
('Top 10:', [('193.23.181.44', 839), ('174.128.246.102', 509), ('174.37.196.55', 455), ('184.154.89.154', 427), ('67.1

('Top 10:', [('193.23.181.44', 916), ('174.128.246.102', 606), ('173.236.31.226', 524), ('174.37.196.55', 510), ('184.154.89.154', 460), ('67.19.72.206', 436), ('72.20.15.61', 418), ('46.4.36.120', 239), ('212.117.171.138', 133), ('147.32.80.9', 129)])
('Element', ('193.23.181.44', 916), 'percentage', 12.547945205479452, '%')
('Element', ('174.128.246.102', 606), 'percentage', 8.301369863013699, '%')
('Element', ('173.236.31.226', 524), 'percentage', 7.178082191780822, '%')
('Element', ('174.37.196.55', 510), 'percentage', 6.986301369863014, '%')
('Element', ('184.154.89.154', 460), 'percentage', 6.301369863013699, '%')
('Element', ('67.19.72.206', 436), 'percentage', 5.972602739726027, '%')
('Element', ('72.20.15.61', 418), 'percentage', 5.726027397260274, '%')
('Element', ('46.4.36.120', 239), 'percentage', 3.2739726027397262, '%')
('Element', ('212.117.171.138', 133), 'percentage', 1.821917808219178, '%')
('Element', ('147.32.80.9', 129), 'percentage', 1.7671232876712328, '%')
('Top

('Top 10:', [('193.23.181.44', 1025), ('174.128.246.102', 662), ('174.37.196.55', 609), ('67.19.72.206', 538), ('184.154.89.154', 505), ('173.236.31.226', 498), ('72.20.15.61', 464), ('46.4.36.120', 271), ('212.117.171.138', 140), ('147.32.80.9', 129)])
('Element', ('193.23.181.44', 1025), 'percentage', 12.5, '%')
('Element', ('174.128.246.102', 662), 'percentage', 8.073170731707316, '%')
('Element', ('174.37.196.55', 609), 'percentage', 7.426829268292683, '%')
('Element', ('67.19.72.206', 538), 'percentage', 6.560975609756097, '%')
('Element', ('184.154.89.154', 505), 'percentage', 6.158536585365853, '%')
('Element', ('173.236.31.226', 498), 'percentage', 6.073170731707317, '%')
('Element', ('72.20.15.61', 464), 'percentage', 5.658536585365853, '%')
('Element', ('46.4.36.120', 271), 'percentage', 3.3048780487804876, '%')
('Element', ('212.117.171.138', 140), 'percentage', 1.7073170731707317, '%')
('Element', ('147.32.80.9', 129), 'percentage', 1.5731707317073171, '%')
('Top 10:', [('1

('Top 10:', [('193.23.181.44', 1101), ('174.128.246.102', 730), ('174.37.196.55', 649), ('173.236.31.226', 586), ('184.154.89.154', 582), ('67.19.72.206', 546), ('72.20.15.61', 504), ('46.4.36.120', 301), ('212.117.171.138', 162), ('147.32.80.9', 127)])
('Element', ('193.23.181.44', 1101), 'percentage', 12.233333333333333, '%')
('Element', ('174.128.246.102', 730), 'percentage', 8.11111111111111, '%')
('Element', ('174.37.196.55', 649), 'percentage', 7.211111111111111, '%')
('Element', ('173.236.31.226', 586), 'percentage', 6.511111111111111, '%')
('Element', ('184.154.89.154', 582), 'percentage', 6.466666666666667, '%')
('Element', ('67.19.72.206', 546), 'percentage', 6.066666666666666, '%')
('Element', ('72.20.15.61', 504), 'percentage', 5.6, '%')
('Element', ('46.4.36.120', 301), 'percentage', 3.3444444444444446, '%')
('Element', ('212.117.171.138', 162), 'percentage', 1.8, '%')
('Element', ('147.32.80.9', 127), 'percentage', 1.4111111111111112, '%')
('Top 10:', [('193.23.181.44', 1

('Top 10:', [('193.23.181.44', 1229), ('174.128.246.102', 804), ('174.37.196.55', 716), ('184.154.89.154', 633), ('173.236.31.226', 622), ('67.19.72.206', 621), ('72.20.15.61', 555), ('46.4.36.120', 307), ('212.117.171.138', 182), ('147.32.80.9', 142)])
('Element', ('193.23.181.44', 1229), 'percentage', 12.414141414141413, '%')
('Element', ('174.128.246.102', 804), 'percentage', 8.121212121212121, '%')
('Element', ('174.37.196.55', 716), 'percentage', 7.232323232323233, '%')
('Element', ('184.154.89.154', 633), 'percentage', 6.393939393939394, '%')
('Element', ('173.236.31.226', 622), 'percentage', 6.282828282828283, '%')
('Element', ('67.19.72.206', 621), 'percentage', 6.2727272727272725, '%')
('Element', ('72.20.15.61', 555), 'percentage', 5.606060606060606, '%')
('Element', ('46.4.36.120', 307), 'percentage', 3.101010101010101, '%')
('Element', ('212.117.171.138', 182), 'percentage', 1.8383838383838385, '%')
('Element', ('147.32.80.9', 142), 'percentage', 1.4343434343434343, '%')
('

## The 10 most frequent ips with COUNT-MIN sketching for different w, d

In [7]:
#start the min-count sketch
class CountMin:
    
    def __init__(self, w, d):
        self.size = w*d
        self.w = w
        self.hash_count = d
        self.cm_array =  [[0]*w for i in range(d)]
        
    def add(self, string):
        for seed in range(self.hash_count):
            result = mmh3.hash(string, seed) % self.w
            self.cm_array[seed][result] += 1
        
    def point(self, string):
        min = 9999999
        for seed in range(self.hash_count):
            result = mmh3.hash(string, seed) % self.w
            if self.cm_array[seed][result]<min:
                min = self.cm_array[seed][result]
        return min

In [23]:
e = 2.718281828

# compute the frequnt ips for different values of d and w
for epsilon in [0.0001, 0.001, 0.005, 0.01, 0.1]:
    for delta in [0.0001, 0.001, 0.005, 0.01, 0.1]:
        
        # calculate the w, d
        w = round(e/epsilon)
        d = round(math.log(1/delta))

        # construct the matrix with the correct dimensions
        count_min_matrix = CountMin(int(w), int(d))

        # add each ip to the matrix
        for ip in ips:
                count_min_matrix.add(ip)
                
        # find frequency and store it to cm_res
        count_min = {}
        for ip in ips:
            count_min[ip] = count_min_matrix.point(ip)

        # sort them according to their value to find the 10 most frequent ones
        sorted_count_min = sorted(count_min.items(), key=operator.itemgetter(1), reverse = True)

        difference_w_ground = 0
        for i in range(10):
            #find the difference between the count min and the ground truth
            difference_w_ground+= abs(sorted_count_min[i][1] - most_freq[i][1])

        print('For w=', w, 'and d=', d, 'accuracy', 100-(difference_w_ground*100.0/51809),'%')
        print('epsilon= ',epsilon, 'and delta=', delta)

('For w=', 27183.0, 'and d=', 9.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.0001, 'and delta=', 0.0001)
('For w=', 27183.0, 'and d=', 7.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.0001, 'and delta=', 0.001)
('For w=', 27183.0, 'and d=', 5.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.0001, 'and delta=', 0.005)
('For w=', 27183.0, 'and d=', 5.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.0001, 'and delta=', 0.01)
('For w=', 27183.0, 'and d=', 2.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.0001, 'and delta=', 0.1)
('For w=', 2718.0, 'and d=', 9.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.001, 'and delta=', 0.0001)
('For w=', 2718.0, 'and d=', 7.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.001, 'and delta=', 0.001)
('For w=', 2718.0, 'and d=', 5.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.001, 'and delta=', 0.005)
('For w=', 2718.0, 'and d=', 5.0, 'accuracy', 100.0, '%')
('epsilon= ', 0.001, 'and delta=', 0.01)
('For w=', 2718.0, 'and d=', 2.0, 'accuracy', 99.99613966685325, '%')
('epsilon= ', 0.001, '

### Compare the run time for the two implemented methods

In [9]:
import timeit

start_min_wise = timeit.default_timer()

k=5000
for index, row in infected_dataset.iterrows():
    #begin by setting a random value at each row
    a = random.uniform(0,1)
    infected_dataset.set_value(index,'rn',a)

sort_infections = infected_dataset.sort_values(['rn'], ascending=[True])
sel_k = sort_infections[0:k]
sel_k=sel_k.reset_index(level=0, drop=True)
most_freq_minwise, ips_minwise = return_k_frequent_ips(sel_k, 10)


stop_min_wise = timeit.default_timer()
min_wise_time = stop_min_wise - start_min_wise


start_count_min = timeit.default_timer()

w = 2718
d = 4
# construct the matrix with the correct dimensions
count_min_matrix = CountMin(int(w), int(d))

# add each ip to the matrix
for ip in ips:
        count_min_matrix.add(ip)

# find frequency and store it to cm_res
count_min = {}
for ip in ips:
    count_min[ip] = count_min_matrix.point(ip)

# sort them according to their value to find the 10 most frequent ones
sorted_count_min = sorted(count_min.items(), key=operator.itemgetter(1), reverse = True)

stop_count_min = timeit.default_timer()

count_min_time = stop_count_min - start_count_min

  if __name__ == '__main__':


In [10]:
print(min_wise_time, count_min_time)

(4.7074267864227295, 0.34471797943115234)
