In [1]:
import utils.data as data
import utils.stats as stats
import numpy as np
import matplotlib.pyplot as plt

In [10]:
def flowsize_dist(dataset, filename, flow_tuple):
    """Print a flow-size summary for the first tenth of a trace.

    Loads the trace with ``data.load_data``, keeps only the rows whose
    ``time`` is below one tenth of the largest ``time`` value, groups the
    remaining rows by ``flow_tuple`` and prints:

    * the number of flows (groups),
    * for several thresholds, how many flows have fewer rows than that
      threshold (count and percentage),
    * the row count and time span of the largest flow, plus the sum of its
      ``pkt`` column for the "ugr16", "cidds" and "ton" datasets.

    Args:
        dataset: Dataset name forwarded to ``data.load_data``. It also picks
            the format of the "max flow" line: "ugr16"/"cidds"/"ton" report
            records and packets, "caida"/"dc" report packets only. Any other
            name prints no "max flow" line.
        filename: File name forwarded to ``data.load_data``.
        flow_tuple: Column name(s) to group by.

    Returns:
        None. Everything is reported via ``print``.
    """
    df = data.load_data(dataset, filename, verbose=False)

    # NOTE(review): the cutoff is max(time) / 10, which is "the first tenth of
    # the trace" only if timestamps start at (about) 0 -- confirm that
    # load_data normalizes them.
    total_duration = df["time"].max() / 10
    df = df[df["time"] < total_duration]
    print("Truncated to first 1/10 duration: {:.3f}".format(total_duration))

    dfg = df.groupby(flow_tuple)
    flowsizes = dfg.size()
    num_flows = len(flowsizes)

    print("Grouped by {}, total number of flows: {}".format(
        flow_tuple,
        num_flows))

    # Nothing to summarize (and no largest flow to look up) when the
    # truncated window contains no rows.
    if num_flows == 0:
        print("no flows in the truncated trace")
        print()
        return

    for threshold in [2, 3, 5, 10, 50, 100]:
        num = (flowsizes < threshold).sum()
        print("#flow w/ size < {}: {}, {:.2f}%".format(
            threshold, num, num / num_flows * 100))

    # idxmax returns the group label of the largest flow directly, so there
    # is no need to sort the sizes and index positionally into the group keys.
    max_raw_flow = dfg.get_group(flowsizes.idxmax())
    max_flow_span = max_raw_flow["time"].max() - max_raw_flow["time"].min()

    if dataset in ["ugr16", "cidds", "ton"]:
        # Each row is a flow record carrying its own packet count in "pkt".
        print("max flow has {} records, {} packets, lasts {:.3f} seconds".format(
            len(max_raw_flow),
            max_raw_flow["pkt"].sum(),
            max_flow_span))
    elif dataset in ["caida", "dc"]:
        # Here the row count itself is reported as the packet count.
        print("max flow has {} packets, lasts {:.3f} seconds".format(
            len(max_raw_flow),
            max_flow_span))
    print()

In [13]:
# Flow key: the five-tuple fields at positions 0 and 2
# (['srcip', 'dstip'] according to this cell's recorded output).
flow_tuple = [stats.five_tuple[0], stats.five_tuple[2]]
for dataset_name in ["caida", "dc", "ugr16", "cidds", "ton"]:
    flowsize_dist(dataset_name, "raw.csv", flow_tuple)


Loading data from:
	data/caida/raw.csv
Number of packets: 998912
Trace duration: 2.3428690433502197 seconds
Truncated to first 1/10 duration: 0.234
Grouped by ['srcip', 'dstip'], total number of flows: 17891
#flow w/ size < 2: 9728, 54.37%
#flow w/ size < 3: 12571, 70.26%
#flow w/ size < 5: 14674, 82.02%
#flow w/ size < 10: 16277, 90.98%
#flow w/ size < 50: 17641, 98.60%
#flow w/ size < 100: 17799, 99.49%
max flow has 1882 packets, lasts 0.190 seconds

Loading data from:
	data/dc/raw.csv
Number of packets: 1000000
Trace duration: 273.1515350341797 seconds
Truncated to first 1/10 duration: 27.315
Grouped by ['srcip', 'dstip'], total number of flows: 129
#flow w/ size < 2: 19, 14.73%
#flow w/ size < 3: 31, 24.03%
#flow w/ size < 5: 51, 39.53%
#flow w/ size < 10: 66, 51.16%
#flow w/ size < 50: 93, 72.09%
#flow w/ size < 100: 97, 75.19%
max flow has 20199 packets, lasts 27.309 seconds

Loading data from:
	data/ugr16/raw.csv
Number of packets: 1000000
Trace duration: 1342.636 seconds
Trunca

In [12]:
# Flow key: the first five fields of stats.five_tuple, in order.
flow_tuple = [stats.five_tuple[i] for i in range(5)]
for dataset_name in ["caida", "dc", "ugr16", "cidds", "ton"]:
    flowsize_dist(dataset_name, "raw.csv", flow_tuple)

Loading data from:
	data/caida/raw.csv
Number of packets: 998912
Trace duration: 2.3428690433502197 seconds
Truncated to first 1/10 duration: 0.234
Grouped by ['srcip', 'srcport', 'dstip', 'dstport', 'proto'], total number of flows: 20095
#flow w/ size < 2: 11235, 55.91%
#flow w/ size < 3: 14601, 72.66%
#flow w/ size < 5: 16932, 84.26%
#flow w/ size < 10: 18568, 92.40%
#flow w/ size < 50: 19854, 98.80%
#flow w/ size < 100: 20011, 99.58%
max flow has 1876 packets, lasts 0.183 seconds

Loading data from:
	data/dc/raw.csv
Number of packets: 1000000
Trace duration: 273.1515350341797 seconds
Truncated to first 1/10 duration: 27.315
Grouped by ['srcip', 'srcport', 'dstip', 'dstport', 'proto'], total number of flows: 6756
#flow w/ size < 2: 3658, 54.14%
#flow w/ size < 3: 3806, 56.34%
#flow w/ size < 5: 4237, 62.71%
#flow w/ size < 10: 5689, 84.21%
#flow w/ size < 50: 6582, 97.42%
#flow w/ size < 100: 6662, 98.61%
max flow has 19469 packets, lasts 27.303 seconds

Loading data from:
	data/ugr1