# Rerervoir sampling task

In [2]:
import operator
import random
from utils import read_data
import pandas as pd
import time

## Read the dataset and keep only the flows that contain the infected host

In [3]:
infected_host = '147.32.84.165'

#uncomment to read the data
# data = read_data()
# data.to_pickle('./data.pkl')

# load the dataset
data = pd.read_pickle('./data.pkl')
infected_dataset = data.loc[(data['src'] == infected_host) | (data['dst'] == infected_host)]
print('Flows with infected host: {}'.format(infected_dataset.shape[0]))

Flows with infected host: 21760


## Helper functions

### Finds the frequency and the number of flows for each IP in the infected dataset

In [4]:
def compute_most_frequent(infected_dataset):
    connections = {}
    # compute the number of flows for each ip
    for index, row in infected_dataset.iterrows():
        src = row['src']
        dst = row['dst']
        if src == infected_host:
            if dst in connections:
                connections[dst] += 1
            else:
                connections[dst] = 1
        elif dst == infected_host:
            if src in connections:
                connections[src] += 1
            else:
                connections[src] = 1
    # sor the results
    sorted_connections = sorted(connections.items(), key=operator.itemgetter(1), reverse=True)
    total_connections = len(infected_dataset)
    
    # create a dataframe with the frequency ans the number of connections for each ip
    connection_df = pd.DataFrame(sorted_connections, columns=['IP', 'num_of_connections'])
    connection_df['percentage'] = round(100 * connection_df['num_of_connections'] / total_connections, 2)
    return connection_df

### Performs tha Reservoir Sampling

In [5]:
def reservoir_sampling(infected_dataset, k):
    result = []
    for i, (index, row) in enumerate(infected_dataset.iterrows()):
        i += 1
        if len(result) < k:
            result.append(index)
        else:
            s = random.randint(1, i)
            if s < k:
                result[s] = index
    return data.iloc[result]


### Finds the difference in the frequence for the top 10 IPs , between the true sequence and the one obtained from Reservoir sampling

In [6]:
def find_differences(normal_top, sampled):
    diff = 0
    for index, row in normal_top.iterrows():
        if row['IP'] in sampled['IP'].values:
            normal_freq = row['percentage']
            sampled_freq = sampled.loc[sampled['IP'] == row['IP']].iloc[0]['percentage']
            diff += abs(normal_freq - sampled_freq)
        else:
            diff += row['percentage']
    return diff

## Find the 10 most frequent IPs of the stream

In [7]:
normal_top = compute_most_frequent(infected_dataset)[:10]
print(normal_top)
normar_top_ips = normal_top['IP'].tolist()

                IP  num_of_connections  percentage
0      147.32.80.9                9774       44.92
1   184.173.217.40                2287       10.51
2  212.117.171.138                1725        7.93
3     65.55.37.104                 391        1.80
4    65.54.188.110                 198        0.91
5    94.63.149.150                 157        0.72
6     74.125.39.27                 143        0.66
7    205.188.103.1                 127        0.58
8     65.55.92.152                 120        0.55
9     74.125.93.27                 115        0.53


## Find the 10 most frequent IPs of the stream by performing Reservoir Sampling for several reservoir values

In [17]:
k_values = [100, 200, 500, 1000, 2000, 5000, 10000]
for k in k_values:
    res = reservoir_sampling(infected_dataset, k)
    reservoir_top = compute_most_frequent(res)[:10]
    reservoir_top_ips = reservoir_top['IP'].tolist()
    print('---------- k = {} ----------\n'.format(k))
    print(reservoir_top)
    print('\n')
    print('Different IPs: {}'.format(len(set(normar_top_ips) - set(reservoir_top_ips))))
    diff = find_differences(normal_top, compute_most_frequent(res))
    print("Frequency difference: %0.3f" %diff)
    

---------- k = 100 ----------

                IP  num_of_connections  percentage
0      147.32.80.9                  39        39.0
1   184.173.217.40                  15        15.0
2  212.117.171.138                   5         5.0
3    65.54.188.110                   3         3.0
4    64.12.175.136                   2         2.0
5     194.25.134.9                   2         2.0
6    98.137.54.237                   2         2.0
7     65.55.92.152                   2         2.0
8   209.191.88.254                   2         2.0
9     65.55.37.104                   2         2.0


Different IPs: 4
Frequency difference: 19.570
---------- k = 200 ----------

                IP  num_of_connections  percentage
0      147.32.80.9                  82        41.0
1  212.117.171.138                  18         9.0
2   184.173.217.40                  18         9.0
3     65.55.37.104                   6         3.0
4    65.54.188.110                   4         2.0
5     74.125.93.27     

## Run reservoir sampling for reservoir size k=100 and measure execution time

In [18]:
start = time.time()
k = 100
res = reservoir_sampling(infected_dataset, k)
reservoir_top = compute_most_frequent(res)[:10]
reservoir_top_ips = reservoir_top['IP'].tolist()
print('---------- k = {} ----------\n'.format(k))
print(reservoir_top)
print('\n')
print('Different IPs: {}'.format(len(set(normar_top_ips) - set(reservoir_top_ips))))
diff = find_differences(normal_top, compute_most_frequent(res))
print("Frequency difference: %0.3f" %diff)
# stop time recording
stop = time.time()
print('Execution time: ',stop - start)
# run_times[it][i] = stop - start

---------- k = 100 ----------

                IP  num_of_connections  percentage
0      147.32.80.9                  49        49.0
1   184.173.217.40                  11        11.0
2  212.117.171.138                   9         9.0
3     74.125.39.27                   2         2.0
4   205.188.59.193                   2         2.0
5     76.96.30.116                   2         2.0
6    94.63.149.150                   2         2.0
7    64.12.175.136                   1         1.0
8        68.1.17.9                   1         1.0
9       64.18.5.11                   1         1.0


Different IPs: 5
Frequency difference: 12.630
Execution time:  1.5446557998657227
