In [19]:
import operator
import random
from utils import read_data
import pandas as pd

In [20]:
infected_host = '147.32.84.165'

# Cache guard: parsing the raw capture with read_data() is only done when the
# pickled frame is missing, so the notebook also runs on a fresh checkout.
try:
    # NOTE(review): unpickling can execute arbitrary code -- only load a
    # data.pkl that was produced locally by this notebook.
    data = pd.read_pickle('./data.pkl')
except FileNotFoundError:
    data = read_data()
    data.to_pickle('./data.pkl')

# Keep every row in which the infected host is either source or destination.
infected_dataset = data.loc[(data['src'] == infected_host) | (data['dst'] == infected_host)]
print('Rows with infected host: {}'.format(infected_dataset.shape[0]))

Rows with infected host: 21760


In [21]:

def compute_10_most_frequent(infected_dataset, host=None):
    """Return the ten peers that exchanged the most rows with ``host``.

    Parameters
    ----------
    infected_dataset : pandas.DataFrame
        Frame with at least the columns ``'src'`` and ``'dst'``.
    host : optional
        Address whose peers are counted. When omitted, the notebook-level
        ``infected_host`` is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        At most ten rows with the columns ``'IP'``, ``'num_of_connections'``
        and ``'percentage'``, sorted by descending count. ``percentage`` is
        the count relative to *all* rows of ``infected_dataset`` (rounded to
        two decimals), including rows that do not involve ``host``.
    """
    if host is None:
        # Backward-compatible default: fall back to the notebook global.
        host = infected_host

    connections = {}
    # Walking the two columns directly avoids building a Series per row,
    # which is what made iterrows() slow; visiting order is unchanged.
    for src, dst in zip(infected_dataset['src'], infected_dataset['dst']):
        if src == host:
            peer = dst
        elif dst == host:
            peer = src
        else:
            # Row does not involve the host at all.
            continue
        connections[peer] = connections.get(peer, 0) + 1

    # sorted() is stable, so peers with equal counts stay in first-seen order.
    sorted_connections = sorted(connections.items(), key=operator.itemgetter(1), reverse=True)
    total_connections = len(infected_dataset)

    connection_df = pd.DataFrame(sorted_connections, columns=['IP', 'num_of_connections'])
    connection_df['percentage'] = (100 * connection_df['num_of_connections'] / total_connections).round(2)
    return connection_df.head(10)

In [22]:
def reservoir_sampling(infected_dataset, k):
    """Draw a uniform random sample of ``k`` rows using reservoir sampling.

    The first ``k`` rows fill the reservoir. Afterwards the row at 0-based
    position ``p`` (the ``p + 1``-th row seen) replaces a uniformly chosen
    slot with probability ``k / (p + 1)``, so every row ends up in the
    sample with equal probability.

    Parameters
    ----------
    infected_dataset : pandas.DataFrame
        Frame to sample from.
    k : int
        Reservoir size. If it is at least the number of rows, all rows are
        returned in their original order.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``infected_dataset``.
    """
    reservoir = []
    # Sample row *positions* rather than index labels so the result does not
    # depend on the frame having a default (0..n-1) index.
    for position in range(len(infected_dataset)):
        if position < k:
            reservoir.append(position)
        else:
            # Uniform slot in [0, position]; only the first k slots exist, so
            # the replacement happens with probability k / (position + 1).
            # Slot 0 is a valid target too.
            slot = random.randrange(position + 1)
            if slot < k:
                reservoir[slot] = position
    # Select from the frame that was passed in, not from a notebook global.
    return infected_dataset.iloc[reservoir]


In [23]:
# Baseline: top-10 peers computed on the full (unsampled) infected dataset.
normal_top = compute_10_most_frequent(infected_dataset)
print(normal_top)
# Plain list of the baseline addresses, compared against each sample later on.
normar_top_ips = list(normal_top['IP'])

                IP  num_of_connections  percentage
0      147.32.80.9                9774       44.92
1   184.173.217.40                2287       10.51
2  212.117.171.138                1725        7.93
3     65.55.37.104                 391        1.80
4    65.54.188.110                 198        0.91
5    94.63.149.150                 157        0.72
6     74.125.39.27                 143        0.66
7    205.188.103.1                 127        0.58
8     65.55.92.152                 120        0.55
9     74.125.93.27                 115        0.53


In [24]:
# Reservoir sizes to evaluate against the baseline top-10.
k_values = [100, 200, 500, 1000, 2000, 5000, 10000]
for k in k_values:
    # Top-10 peers estimated from a reservoir sample of size k.
    res = reservoir_sampling(infected_dataset, k)
    reservoir_top = compute_10_most_frequent(res)
    reservoir_top_ips = reservoir_top['IP'].tolist()

    header = '---------- k = {} ----------\n'.format(k)
    print(header)
    print(reservoir_top)
    print('\n')

    # Baseline addresses that the sampled top-10 failed to recover.
    missing_from_sample = list(set(normar_top_ips) - set(reservoir_top_ips))
    print('Different IPs: {}'.format(missing_from_sample))
    print('\n')
    

---------- k = 100 ----------

                IP  num_of_connections  percentage
0      147.32.80.9                  45        45.0
1   184.173.217.40                   7         7.0
2  212.117.171.138                   6         6.0
3     65.55.37.104                   3         3.0
4    66.94.237.139                   3         3.0
5    67.195.168.31                   2         2.0
6    64.12.175.136                   1         1.0
7    212.52.84.180                   1         1.0
8       64.4.56.23                   1         1.0
9      66.111.4.72                   1         1.0


Different IPs: ['205.188.103.1', '94.63.149.150', '74.125.39.27', '74.125.93.27', '65.55.92.152', '65.54.188.110']


---------- k = 200 ----------

                IP  num_of_connections  percentage
0      147.32.80.9                  86        43.0
1   184.173.217.40                  21        10.5
2  212.117.171.138                  16         8.0
3    65.54.188.110                   5         2.5
4  