# Rerervoir sampling task

In [1]:
import operator
import random
from utils import read_data
import pandas as pd
import time
import math

## Read the dataset and keep only the flows that contain the infected host

In [2]:
#initialize random seed to get always the same results
random.seed(0)
infected_host = '147.32.84.165'

#uncomment to read the data
data = read_data('datasets/CTU-Malware-Capture-Botnet-54')
# data.to_pickle('./data.pkl')

# load the dataset
# data = pd.read_pickle('./data.pkl')
infected_dataset = data.loc[(data['src_ip'] == infected_host) | (data['dst_ip'] == infected_host)]
print('Flows with infected host: {}'.format(infected_dataset.shape[0]))

Flows with infected host: 21760


## Helper functions

### Finds the frequency and the number of flows for each IP in the infected dataset

In [3]:
def compute_most_frequent(infected_dataset):
    connections = {}
    # compute the number of flows for each ip
    for index, row in infected_dataset.iterrows():
        src = row['src_ip']
        dst = row['dst_ip']
        if src == infected_host:
            if dst in connections:
                connections[dst] += 1
            else:
                connections[dst] = 1
        elif dst == infected_host:
            if src in connections:
                connections[src] += 1
            else:
                connections[src] = 1
    # sor the results
    sorted_connections = sorted(connections.items(), key=operator.itemgetter(1), reverse=True)
    total_connections = len(infected_dataset)
    
    # create a dataframe with the frequency ans the number of connections for each ip
    connection_df = pd.DataFrame(sorted_connections, columns=['IP', 'num_of_connections'])
    connection_df['frequency'] = round(100 * connection_df['num_of_connections'] / total_connections, 2)
    return connection_df

### Performs tha Reservoir Sampling

In [4]:
def reservoir_sampling(infected_dataset, k):
    result = []
    for i, (index, row) in enumerate(infected_dataset.iterrows()):
        i += 1
        if len(result) < k:
            result.append(index)
        else:
            s = random.randint(1, i)
            if s < k:
                result[s] = index
    return data.iloc[result]


### Finds the difference in the frequence for the top 10 IPs , between the true sequence and the one obtained from Reservoir sampling

In [5]:
def compute_mse(normal_top, sampled):
    sampled = sampled[:10]
    diff = []
    for index, row in normal_top.iterrows():
        if row['IP'] in sampled['IP'].values:
            normal_freq = row['frequency']
            sampled_freq = sampled.loc[sampled['IP'] == row['IP']].iloc[0]['frequency']
            diff += [abs(normal_freq - sampled_freq)**2]
        else:
            diff += [row['frequency']**2]
    mse = math.sqrt(sum(diff))
    return mse

## Find the 10 most frequent IPs of the stream

In [6]:
normal_top = compute_most_frequent(infected_dataset)[:10]
print(normal_top)
normar_top_ips = normal_top['IP'].tolist()

                IP  num_of_connections  frequency
0      147.32.80.9                9774      44.92
1   184.173.217.40                2287      10.51
2  212.117.171.138                1725       7.93
3     65.55.37.104                 391       1.80
4    65.54.188.110                 198       0.91
5    94.63.149.150                 157       0.72
6     74.125.39.27                 143       0.66
7    205.188.103.1                 127       0.58
8     65.55.92.152                 120       0.55
9     74.125.93.27                 115       0.53


## Find the 10 most frequent IPs of the stream by performing Reservoir Sampling for several reservoir values

In [7]:
k_values = [100, 500, 1000, 5000, 10000, 20000]
for k in k_values:
    start = time.time()
    res = reservoir_sampling(infected_dataset, k)
    reservoir_top = compute_most_frequent(r)[:10]
    stop = time.time()
    reservoir_top_ips = reservoir_top['IP'].tolist()
    print('---------- k = {} ----------\n'.format(k))
    print(reservoir_top)
    print('\nDifferent IPs: {}'.format(len(set(normar_top_ips) - set(reservoir_top_ips))))
    mse = compute_mse(normal_top, compute_most_frequent(res))
    print("Frequency difference: %0.3f" %mse)
    print('Execution time: ',stop - start, '\n')
    

---------- k = 100 ----------

                IP  num_of_connections  frequency
0      147.32.80.9                  44       44.0
1  212.117.171.138                  10       10.0
2   184.173.217.40                   9        9.0
3    66.94.238.147                   3        3.0
4    65.54.165.139                   2        2.0
5    209.86.93.227                   2        2.0
6    216.104.161.5                   2        2.0
7    64.12.175.136                   1        1.0
8    65.54.188.110                   1        1.0
9      195.3.96.71                   1        1.0

Different IPs: 6
Frequency difference: 3.540
Execution time:  2.5542798042297363 

---------- k = 500 ----------

                IP  num_of_connections  frequency
0      147.32.80.9                 209       41.8
1   184.173.217.40                  57       11.4
2  212.117.171.138                  38        7.6
3     65.55.37.104                  13        2.6
4    65.54.188.110                   8        1.6
5  2