In [1]:
import pickle
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import random
import json
import os
import re
import seaborn as sns
import math
import sys
import copy

In [21]:
def read_total_size_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida20180517_10_caida20180816_0.pcap', window_size=200):
    """Collect the per-window total flow sizes recorded for one sketch configuration.

    For every epoch, each sub-directory of the configuration folder is visited
    in sorted order and the integers stored one per line in
    ``window_<window_size>/total_flow_size.txt`` are appended to the result.

    Returns:
        list[int]: all values, in directory order and then file order.
    """
    window_folder = "window_" + str(window_size)
    totals = []

    for epoch in epochs:
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for entry in sorted(os.listdir(path)):
            run_dir = os.path.join(path, entry)
            # Plain files next to the run directories are ignored.
            if not os.path.isdir(run_dir):
                continue

            size_file = os.path.join(run_dir, window_folder, 'total_flow_size.txt')
            with open(size_file, 'r') as handle:
                for text in handle:
                    totals.append(int(text))

    return totals

# read_total_size_data()

In [2]:
def read_fsd_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200):
    """Load every flow-size histogram stored under ``randk_summation``.

    Each file in ``<run>/window_<window_size>/randk_summation`` holds
    whitespace-separated ``<flow size> <frequency>`` lines and becomes one
    ``{flow_size: frequency}`` dict.

    Returns:
        list[dict[int, int]]: one histogram per file, in sorted run-directory
        order and then sorted file order.
    """
    histograms = []

    for epoch in epochs:
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for entry in sorted(os.listdir(path)):
            run_dir = os.path.join(path, entry)
            if not os.path.isdir(run_dir):
                continue

            summation_dir = os.path.join(run_dir, "window_" + str(window_size), "randk_summation")
            for name in sorted(os.listdir(summation_dir)):
                histogram = {}
                with open(os.path.join(summation_dir, name), 'r') as handle:
                    for text in handle:
                        fields = text.strip().split()
                        frequency = int(fields[1])
                        histogram[int(fields[0])] = frequency
                histograms.append(histogram)

    return histograms

# read_fsd_data()

In [3]:
def read_single_window_fsd_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200):
    """Load the histograms stored under ``single_window_randk_summation``.

    Files contain ``<flow size> <frequency>`` lines; entries whose flow size
    is 0 are dropped. Each file yields one ``{flow_size: frequency}`` dict.

    Returns:
        list[dict[int, int]]: one histogram per file, in sorted run-directory
        order and then sorted file order.
    """
    window_folder = "window_" + str(window_size)
    collected = []

    for epoch in epochs:
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for entry in sorted(os.listdir(path)):
            run_dir = os.path.join(path, entry)
            if not os.path.isdir(run_dir):
                continue

            fsd_dir = os.path.join(run_dir, window_folder, "single_window_randk_summation")
            for name in sorted(os.listdir(fsd_dir)):
                histogram = {}
                with open(os.path.join(fsd_dir, name), 'r') as handle:
                    for text in handle:
                        fields = text.strip().split()
                        flow_size = int(fields[0])
                        # Size-0 entries are not part of the distribution.
                        if flow_size == 0:
                            continue
                        histogram[flow_size] = int(fields[1])
                collected.append(histogram)

    return collected

# read_fsd_data()

In [226]:
def read_single_window_gt_fsd_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200):
    """Load the histograms stored under ``single_window_randk_gt_summation``.

    Same file format and zero-size filtering as the non-``gt`` single-window
    reader: ``<flow size> <frequency>`` lines, flow size 0 skipped.

    Returns:
        list[dict[int, int]]: one histogram per file, in sorted run-directory
        order and then sorted file order.
    """
    def _parse(file_path):
        # Build one {flow_size: frequency} dict, ignoring flow size 0.
        histogram = {}
        with open(file_path, 'r') as handle:
            for text in handle:
                fields = text.strip().split()
                flow_size = int(fields[0])
                if flow_size != 0:
                    histogram[flow_size] = int(fields[1])
        return histogram

    res = []
    for epoch in epochs:
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for entry in sorted(os.listdir(path)):
            run_dir = os.path.join(path, entry)
            if os.path.isdir(run_dir):
                gt_dir = os.path.join(run_dir, "window_" + str(window_size),
                                      "single_window_randk_gt_summation")
                for name in sorted(os.listdir(gt_dir)):
                    res.append(_parse(os.path.join(gt_dir, name)))

    return res

# read_fsd_data()

In [4]:
def read_gt_fsd_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200):
    """Load the histograms stored under ``randk_gt_summation``.

    Each file holds ``<flow size> <frequency>`` lines and becomes one dict
    whose entries are ordered by ascending flow size.

    Returns:
        list[dict[int, int]]: one key-sorted histogram per file, in sorted
        run-directory order and then sorted file order.
    """
    window_folder = "window_" + str(window_size)
    ordered_histograms = []

    for epoch in epochs:
        path = f"../SketchPatternQuery/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for entry in sorted(os.listdir(path)):
            run_dir = os.path.join(path, entry)
            if not os.path.isdir(run_dir):
                continue

            gt_dir = os.path.join(run_dir, window_folder, "randk_gt_summation")
            for name in sorted(os.listdir(gt_dir)):
                raw = {}
                with open(os.path.join(gt_dir, name), 'r') as handle:
                    for text in handle:
                        fields = text.strip().split()
                        frequency = int(fields[1])
                        raw[int(fields[0])] = frequency

                # Re-insert in ascending flow-size order.
                ordered_histograms.append({size: raw[size] for size in sorted(raw)})

    return ordered_histograms

# read_fsd_data()

In [176]:
def parse_flow_line(key, line):
    """Split a flow record into its key text and its first integer value.

    The text before the first ``") "`` (with the parenthesis restored) is the
    flow key. From the segment that follows, square brackets are removed and
    the first space-separated token is parsed as an integer.

    Args:
        key: accepted for signature compatibility; not used.
        line: one record line, e.g. ``"(10.0.0.1) [42 7]"``.

    Returns:
        tuple[str, int]: ``(string_key, estimate)``.
    """
    segments = line.split(") ")
    flow_id = segments[0] + ")"

    payload = segments[1].replace("]", "").replace("[", "")
    first_token = payload.split(" ")[0]

    return flow_id, int(first_token)

In [227]:
def prepare_profiler_final_fsd_dict(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
                                epochs=['10']):
    """Build ``{profiler trace: {flow key: estimate}}`` from each profiler's ``flowkey.txt``.

    For every epoch and every profiler trace, each run directory under the
    ``../SketchPadding`` configuration folder is scanned and its
    ``flowkey.txt`` is parsed with ``parse_flow_line``. The first line of the
    file is treated as a header.

    Note:
        The per-trace map is reset for every run directory (and epoch), so
        when several exist only the last one read is kept.

    Returns:
        dict[str, dict[str, int]]: flow-key to estimate maps keyed by trace
        file name.
    """
    profiler_file = ["caida0517-500w_10_.pcap",
                    "caida0816-600w_10_.pcap",
                    "zipf2a-150w_10_.pcap",
                    "zipf2b-400w_10_.pcap"
                    ]

    fs_freq_map = {}
    for epoch in epochs:
        for file in profiler_file:
            path = f"../SketchPadding/{algo}/{file}/"\
                    f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

            for dir in sorted(os.listdir(path)):
                p = os.path.join(path, dir)
                if not os.path.isdir(p):
                    continue

                flowkey_path = os.path.join(p, "flowkey.txt")
                fs_freq_map[file] = {}

                # Context manager so the handle is released even if a line fails to parse.
                with open(flowkey_path) as f:
                    key = f.readline().strip()  # header line, forwarded to the parser
                    for line in f:
                        string_key, estimate = parse_flow_line(key, line.strip())
                        fs_freq_map[file][string_key] = estimate

    return fs_freq_map


In [211]:
def get_pdf(fsd):
    """Normalise a histogram so that its values sum to 1.

    Args:
        fsd: mapping of flow size to frequency.

    Returns:
        dict: same keys, each value divided by the sum of all values.
        An empty input yields an empty dict.
    """
    total_flows = sum(fsd.values())
    return {size: freq / total_flows for size, freq in fsd.items()}
    

In [7]:
def get_profiler_sampled_fsd(profiler_fs_freq_map, randomk_key):
    """Histogram the profiler flow sizes of the sampled keys.

    Args:
        profiler_fs_freq_map: mapping of flow key to flow size.
        randomk_key: iterable of sampled flow keys; keys missing from the
            map are ignored, repeated keys are counted each time.

    Returns:
        dict: ``{flow_size: number of sampled keys with that size}``,
        ordered by ascending flow size.
    """
    counts = {}
    for flow in randomk_key:
        if flow not in profiler_fs_freq_map.keys():
            continue
        size = profiler_fs_freq_map[flow]
        counts[size] = counts.get(size, 0) + 1

    return dict(sorted(counts.items()))
            

In [184]:
def get_sampled_flowkey(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
                            epochs=['10'], dataset = "caida0517-125w_10_.pcap", window_size=200, predict_time=1):
    """Read the flow keys recorded for each window of a trace.

    For every run directory of the ``../SketchPadding`` configuration, the
    files in ``level_00/key_window_<window_size>/`` are read in sorted order.
    The first line of each file is treated as a header; every following line
    is parsed with ``parse_flow_line`` and only the key string is kept.

    Args:
        predict_time: when non-zero, at most
            ``int(predict_time*1000/window_size)`` files are read per run
            directory; ``0`` reads every file.

    Returns:
        list[list[str]]: one list of flow-key strings per window file.
    """
    sampled_flowkey = []
    for epoch in epochs:

        path = f"../SketchPadding/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for dir in sorted(os.listdir(path)):
            p = os.path.join(path, dir)
            if not os.path.isdir(p):
                continue

            # Only sketch level 0 is read.
            key_window_dir = '%s/level_%02d/key_window_%d/' % (p, 0, window_size)
            if not os.path.isdir(key_window_dir):
                continue

            cnt = 0  # number of window files consumed for this run directory
            for file in sorted(os.listdir(key_window_dir)):
                key_window_path = os.path.join(key_window_dir, file)

                # Context manager so the handle is released even if a line fails to parse.
                with open(key_window_path) as f:
                    key = f.readline().strip()
                    key_list = [parse_flow_line(key, line.strip())[0] for line in f]

                sampled_flowkey.append(key_list)

                cnt += 1
                if predict_time != 0 and cnt >= int(predict_time*1000/window_size):
                    break

    return sampled_flowkey


### Prepare Dataset

In [37]:
## parameters

# Base trace names, grouped by family.
caida0517 = ["caida0517-500w", "caida0517-250w", "caida0517-125w"]
caida0816 = ["caida0816-600w", "caida0816-300w", "caida0816-150w"]
zipf2a = ["zipf2a-150w", "zipf2a-75w", "zipf2a-35w"]
zipf2b = ["zipf2b-400w", "zipf2b-200w", "zipf2b-100w"]
zipf4 = ["zipf4-60w", "zipf4-30w", "zipf4-15w"]

# Pairs of numbers embedded after the first and second trace name of a mixed
# pcap file name (presumably the seconds taken from each trace; verify
# against the traffic generator). Only the 6/4 split is enabled.
lens = [
        # ["5", "5"],
        ["6", "4"],
        # ["7", "3"],
        # ["8", "2"],
        ]

# Single-trace datasets.
pcap_file = [f"{trace}_10_.pcap" for trace in caida0517 + caida0816 + zipf2a + zipf2b]

# Mixed datasets: (first trace family, second trace family), in generation order.
mixed_pairs = [
    # same dist, caida
    (caida0517, caida0816),
    (caida0816, caida0517),
    # same dist, zipf
    (zipf2a, zipf2b),
    (zipf2b, zipf2a),
    # diff dist, caida + zipf2a
    (caida0517, zipf2a),
    (caida0816, zipf2a),
    # diff dist, caida + zipf2b
    (caida0517, zipf2b),
    (caida0816, zipf2b),
    # diff dist, zipf2a + caida (each zipf2a trace paired with 0517 then 0816)
    (zipf2a, caida0517 + caida0816),
    # diff dist, zipf2b + caida (each zipf2b trace paired with 0517 then 0816)
    (zipf2b, caida0517 + caida0816),
]

for firsts, seconds in mixed_pairs:
    for a in firsts:
        for b in seconds:
            for l in lens:
                pcap_file.append(f'{a}_{l[0]}_{b}_{l[1]}.pcap')

print(pcap_file)
print(f'Total Pcap File Number: {len(pcap_file)}')
# widths = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
widths = [4096]
# widths = [1024]



['caida0517-500w_10_.pcap', 'caida0517-250w_10_.pcap', 'caida0517-125w_10_.pcap', 'caida0816-600w_10_.pcap', 'caida0816-300w_10_.pcap', 'caida0816-150w_10_.pcap', 'zipf2a-150w_10_.pcap', 'zipf2a-75w_10_.pcap', 'zipf2a-35w_10_.pcap', 'zipf2b-400w_10_.pcap', 'zipf2b-200w_10_.pcap', 'zipf2b-100w_10_.pcap', 'caida0517-500w_6_caida0816-600w_4.pcap', 'caida0517-500w_6_caida0816-300w_4.pcap', 'caida0517-500w_6_caida0816-150w_4.pcap', 'caida0517-250w_6_caida0816-600w_4.pcap', 'caida0517-250w_6_caida0816-300w_4.pcap', 'caida0517-250w_6_caida0816-150w_4.pcap', 'caida0517-125w_6_caida0816-600w_4.pcap', 'caida0517-125w_6_caida0816-300w_4.pcap', 'caida0517-125w_6_caida0816-150w_4.pcap', 'caida0816-600w_6_caida0517-500w_4.pcap', 'caida0816-600w_6_caida0517-250w_4.pcap', 'caida0816-600w_6_caida0517-125w_4.pcap', 'caida0816-300w_6_caida0517-500w_4.pcap', 'caida0816-300w_6_caida0517-250w_4.pcap', 'caida0816-300w_6_caida0517-125w_4.pcap', 'caida0816-150w_6_caida0517-500w_4.pcap', 'caida0816-150w_6_caida

### Predict TFS changes by adjacent TFS variation

In [19]:
def _first_tfs_outlier(series, n, ws):
    """Return the index of the first sample that is an outlier of its own trailing window, or None."""
    for i in range(len(series) - ws + 1):
        window = series[i:ws + i]
        newest = series[ws + i - 1]
        if abs(newest - np.mean(window)) > (n * np.std(window)):
            return ws + i - 1
    return None


def tfs_changes_occured(var, sec_var, n, ws=20):
    """Detect the first outlier in the first and second TFS variation series.

    A sliding window of ``ws`` samples is moved over each series; the newest
    sample of a window is an outlier when it is more than ``n`` standard
    deviations away from that window's mean. Scanning stops at the first hit.

    Reported times are ``(offset + index - 1) / 5`` with offset 1 for ``var``
    and 2 for ``sec_var``.
    NOTE(review): the division by 5 presumably converts 200 ms window indices
    to seconds, and the offsets presumably compensate for the leading entries
    the caller strips off - confirm before using another window size.

    Args:
        var: list of first-order variations.
        sec_var: list of second-order variations.
        n: outlier threshold in standard deviations.
        ws: sliding-window length in samples.

    Returns:
        tuple: ``(var_hit, time_var, sec_var_hit, time_sec_var, val, max_time)``.
        Hit flags are 0/1 and times are ``-1.0`` when there is no hit.
        ``val`` is the outlying ``sec_var`` sample on a hit. Without a hit it
        is ``max(sec_var)`` and ``max_time`` is the time of that maximum. For
        an empty ``sec_var`` they are ``0`` and ``-1.0``.
    """
    var_idx = _first_tfs_outlier(var, n, ws)
    sec_idx = _first_tfs_outlier(sec_var, n, ws)

    time_var = -1.0 if var_idx is None else (1 + var_idx - 1) / 5
    time_sec_var = -1.0
    val = 0
    max_time = -1.0

    if sec_idx is not None:
        time_sec_var = (2 + sec_idx - 1) / 5
        # Keep the outlying value even when it is 0; the hit itself, not the
        # value, decides whether the max fallback applies.
        val = sec_var[sec_idx]
    elif len(sec_var) > 0:
        # No outlier found: report the largest second variation instead.
        val = max(sec_var)
        max_time = (2 + sec_var.index(val) - 1) / 5

    var_hit = 0 if var_idx is None else 1
    sec_var_hit = 0 if sec_idx is None else 1

    return var_hit, time_var, sec_var_hit, time_sec_var, val, max_time
    
    

In [66]:
def predict_tfs(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='zipf2a_3_caida20180517_7.pcap', window_size=200, dev=3, ws=25):
    """Detect a total-flow-size (TFS) change from its second-order variation.

    The per-window totals are differenced once and then once more (absolute
    value); ``tfs_changes_occured`` is run on both series and only the verdict
    on the second variation is reported.

    Args:
        dataset: trace file name; the integer after its first ``_`` is
            returned as ``answer``.
        dev: outlier threshold in standard deviations.
        ws: sliding-window length in samples.

    Returns:
        tuple: ``(change, changing_time, answer)`` where ``change`` is 0/1 and
        ``changing_time`` is ``0.0`` when no change was detected.
    """
    answer = int(dataset.split('_')[1])

    # total flow size per window
    totals = read_total_size_data(algo, row, width, level, seed, count, flowkey, epochs, dataset, window_size)

    first_variation = [later - earlier for earlier, later in zip(totals, totals[1:])]
    second_variation = [abs(later - earlier)
                        for earlier, later in zip(first_variation, first_variation[1:])]

    _, _, sec_var_hit, time_sec_var, _, _ = tfs_changes_occured(first_variation, second_variation, dev, ws)

    if sec_var_hit == 1:
        return 1, time_sec_var, answer
    return 0, 0.0, answer


In [67]:
# Example run: TFS change detection on the first trace of pcap_file at the first configured width.
predict_tfs(width=widths[0], dataset=pcap_file[0], dev=3)

(1, 7.6, 10)

### Predict TFS Value

In [112]:
def predict_tfs_val(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='zipf2a_3_caida20180517_7.pcap', window_size=200, start_time=30, predict_length=2):
    """Extrapolate a total flow size from the growth seen in a short observation period.

    The per-window totals are sliced from index ``start_time`` onward, the
    window-to-window increments of the first
    ``int(predict_length*1000/window_size)`` windows are summed, and the sum
    is scaled by ``10/predict_length``.
    NOTE(review): the factor 10 presumably is the trace length in seconds -
    confirm.

    Args:
        start_time: index into the list of per-window totals (a window index,
            not seconds).
        predict_length: observation length used for the extrapolation.

    Returns:
        int: the extrapolated total flow size.
    """
    totals = read_total_size_data(algo, row, width, level, seed, count, flowkey, epochs, dataset, window_size)[start_time:]

    # Window-to-window growth of the running total.
    increments = [later - earlier for earlier, later in zip(totals, totals[1:])]

    observed_windows = max(0, int(predict_length*1000/window_size))
    observed_growth = sum(increments[:observed_windows])

    return int(observed_growth * (10/predict_length))
    

In [113]:
# Example run: extrapolate the TFS of pcap_file[13], starting at window index 30.
predict_tfs_val( dataset=pcap_file[13], start_time=30)

2907745

### Predict FSD changes by adjacent MRD variation

In [10]:
def prepare_profiler_bins():
    """Compute percentile bin edges for every profiler flow-size distribution.

    Each file in ``../traffic_generator/fs_dist/`` holds
    ``<flow size> <frequency>`` lines. Its cumulative distribution is walked
    in ascending flow-size order and bin index ``i`` is assigned the first
    flow size whose rounded cumulative percentage exceeds ``i``.

    Returns:
        dict[str, dict[int, int]]: ``{file name without its last 4
        characters: {bin index: flow size}}``.
    """
    profiler_path = "../traffic_generator/fs_dist/"
    bins_by_profiler = {}

    for file in sorted(os.listdir(profiler_path)):
        # Read the profiler's flow size distribution and its total flow count.
        histogram = {}
        flow_total = 0
        with open(os.path.join(profiler_path, file), 'r') as handle:
            for text in handle:
                fields = text.strip().split()
                frequency = int(fields[1])
                histogram[int(fields[0])] = frequency
                flow_total += frequency

        # Walk the CDF and fill the bins up to the current rounded percentage.
        edges = {}
        next_bin = 0
        cumulative = 0.0
        for flow_size, frequency in sorted(histogram.items()):
            cumulative += (frequency/flow_total)
            while next_bin < round(cumulative * 100):
                edges[next_bin] = flow_size
                next_bin += 1

        bins_by_profiler[file[:-4]] = edges

    return bins_by_profiler

# prepare_profiler_bins()
    
# prepare_profiler_bins()

In [11]:
def get_quantized_mapping(fsd, bins):
    """Re-bucket a flow-size histogram onto the given bin edges.

    Flow sizes are visited in ascending order; each is assigned to the first
    edge (continuing from the previous position) that is not smaller than it,
    or to the last edge when it exceeds them all.

    Args:
        fsd: mapping of flow size to frequency.
        bins: edges indexable by ``0 .. len(bins)-1``.

    Returns:
        dict: ``{edge: summed frequency}``.
    """
    last = len(bins) - 1
    quantized = {}
    pos = 0

    for flow_size, frequency in sorted(fsd.items()):
        # Advance to the first edge that can hold this flow size.
        while flow_size > bins[pos] and pos < last:
            pos += 1

        edge = bins[pos]
        quantized[edge] = quantized.get(edge, 0) + frequency

    return quantized

In [12]:
def plot_mrd_variation(mrd_var, name, window_size=200, typ='Var'):
    """Print and plot one MRD-derived series against time.

    Args:
        mrd_var: sequence of values, one per window.
        name: label printed before the data and used in the plot title.
        window_size: window length in milliseconds, used to label the x axis.
        typ: series type shown in the legend; ``"MRD"`` also switches the
            y-axis label to ``MRD``.
    """
    series = [mrd_var]

    print(name)
    print(mrd_var)

    plt.figure(figsize=(12, 4))
    for curve in series:
        plt.plot(curve, label=f'MRD {typ}')

    # Axis labels and title
    plt.xlabel('Time (sec)')
    plt.ylabel('MRD' if typ == "MRD" else 'Value')
    plt.title('%s MRD Variation' % (name))

    # One tick per second over a 10-second span, labelled in seconds.
    per_second = int(1000 / window_size)
    tick_positions = list(range(int(10*1000/window_size) + 1))[0::per_second]
    tick_labels = [tick * (window_size / 1000) for tick in tick_positions]
    plt.xticks(tick_positions, tick_labels)
    plt.legend(loc='upper left', ncol=math.ceil(len(series)/4))

    plt.show()
    

In [23]:
def fsd_changes_occured(sec_var, n, ws=20):
    """Detect the first outlier in the second-order MRD variation series.

    A sliding window of ``ws`` samples is moved over ``sec_var``; the newest
    sample of a window is an outlier when it is more than ``n`` standard
    deviations away from that window's mean. Scanning stops at the first hit.

    Reported times are ``(4 + index - 1) / 5``.
    NOTE(review): the division by 5 presumably converts 200 ms window indices
    to seconds and the offset 4 presumably compensates for the leading
    entries the caller strips off - confirm before using another window size.

    Args:
        sec_var: list of second-order variations.
        n: outlier threshold in standard deviations.
        ws: sliding-window length in samples.

    Returns:
        tuple: ``(sec_var_hit, time_sec_var, val, max_time)``. On a hit,
        ``val`` is the outlying sample and ``max_time`` is ``-1.0``. Without
        a hit, ``time_sec_var`` is ``-1.0``, ``val`` is ``max(sec_var)`` and
        ``max_time`` is the time of that maximum. An empty ``sec_var`` gives
        ``(0, -1.0, 0, -1.0)``.
    """
    hit_idx = None
    for i in range(len(sec_var) - ws + 1):
        window = sec_var[i:ws + i]
        if abs(sec_var[ws + i - 1] - np.mean(window)) > (n * np.std(window)):
            hit_idx = ws + i - 1
            break

    if hit_idx is not None:
        # Keep the outlying value even when it is 0; the hit itself, not the
        # value, decides whether the max fallback applies.
        return 1, (4 + hit_idx - 1) / 5, sec_var[hit_idx], -1.0

    if len(sec_var) == 0:
        return 0, -1.0, 0, -1.0

    # No outlier found: report the largest second variation instead.
    peak = max(sec_var)
    return 0, -1.0, peak, (4 + sec_var.index(peak) - 1) / 5
    
    

In [14]:
def calculate_mrd(fsd1, fsd2):
    """Compute the MRD between two flow-size histograms.

    ``MRD = sum_i |fsd1[i] - fsd2[i]| / sum_i (fsd1[i] + fsd2[i]) / 2`` over
    the flow sizes ``i`` from 1 to the largest key of ``fsd1``; missing
    entries count as 0. Flow sizes that only appear in ``fsd2`` above that
    bound are not compared.

    Degenerate inputs no longer raise:
      * an empty ``fsd1`` is compared over the key range of ``fsd2``, so an
        empty histogram against a non-empty one gives 2.0;
      * when nothing can be compared (zero denominator) the result is 0.0.

    Args:
        fsd1: reference histogram, integer flow size to value.
        fsd2: histogram to compare, integer flow size to value.

    Returns:
        float: the MRD, between 0.0 and 2.0 for non-negative values.
    """
    if fsd1:
        upper = max(fsd1.keys())
    else:
        upper = max(fsd2.keys(), default=0)

    # Only populated flow sizes contribute; visiting them in ascending order
    # gives the same sums as iterating over every integer in the range.
    populated = sorted(size for size in set(fsd1) | set(fsd2) if 1 <= size <= upper)

    MRD_nom = 0
    MRD_denom = 0
    for size in populated:
        true = fsd1.get(size, 0)
        est = fsd2.get(size, 0)
        MRD_nom += abs(true - est)
        MRD_denom += float(true + est)/2

    if MRD_denom == 0:
        return 0.0
    return MRD_nom/MRD_denom

In [68]:
def predict_fsd(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200, dev=3.4, ws=25):
    """Detect a flow-size-distribution (FSD) change from the MRD between adjacent windows.

    The MRD of each pair of consecutive single-window histograms is computed,
    then its absolute first and second differences. ``fsd_changes_occured``
    decides whether the second difference contains an outlier.

    Args:
        dataset: trace file name; the integer after its first ``_`` is
            returned as ``answer``.
        dev: outlier threshold in standard deviations.
        ws: sliding-window length in samples.

    Returns:
        tuple: ``(change, changing_time, answer)`` where ``change`` is 0/1 and
        ``changing_time`` is ``0.0`` when no change was detected.
    """
    answer = int(dataset.split('_')[1])

    windows = read_single_window_fsd_data(algo, row, width, level, seed, count, flowkey, epochs, dataset, window_size)
    windows = windows[:-1] # ignore last window (less than 200ms)

    adjacent_mrd = [calculate_mrd(earlier, later)
                    for earlier, later in zip(windows, windows[1:])]
    mrd_variation = [abs(later - earlier)
                     for earlier, later in zip(adjacent_mrd, adjacent_mrd[1:])]
    mrd_second_variation = [abs(later - earlier)
                            for earlier, later in zip(mrd_variation, mrd_variation[1:])]

    sec_var_hit, time_sec_var, _, _ = fsd_changes_occured(mrd_second_variation, dev, ws)

    if sec_var_hit == 1:
        return 1, time_sec_var, answer
    return 0, 0.0, answer

In [69]:
# Example run: FSD change detection on the first trace of pcap_file.
predict_fsd(dataset=pcap_file[0], window_size=200, dev=3.4, ws=25)

(0, 0.0, 10)

### Predict distribution

In [183]:
def plot_mrd(file_name, mrd_list, window_size, predict_time, ans):
    """Plot, per profiler, how the MRD evolves over the windows.

    Args:
        file_name: name shown in the plot title.
        mrd_list: one list of MRD values per window; position ``i`` in every
            inner list belongs to the same profiler.
        window_size: window length in milliseconds, used to label the x axis.
        predict_time: time span covered by the x-axis ticks.
        ans: label of the curve to highlight in thick red.

    NOTE(review): curve labels come from the sorted listing of a hard-coded
    directory (``zipf4`` entries skipped); verify that this order matches the
    order of the values inside ``mrd_list``.
    """
    # Transpose: one curve per profiler instead of one row per window.
    curves = [[window[i] for window in mrd_list] for i in range(len(mrd_list[0]))]

    profiler_folder_path = "/home/ming/SketchMercator/pattern_detection/traffic_generator/pcap_file/"
    labels = [name.split("_")[0]
              for name in sorted(os.listdir(profiler_folder_path))
              if name.split("-")[0] != "zipf4"]

    plt.figure(figsize=(8, 4))

    for i, curve in enumerate(curves):
        highlight = {'color': 'tab:red', 'linewidth': 3.0} if labels[i] == ans else {}
        plt.plot(curve, label=labels[i], **highlight)

    # Axis labels and title
    plt.xlabel('Time (sec)')
    plt.ylabel('MRD')
    plt.title('%s ,Random K MRD Variation' % (file_name))

    # One tick per second, labelled in seconds.
    per_second = int(1000 / window_size)
    tick_positions = list(range(int(predict_time*1000/window_size) + 1))[0::per_second]
    tick_labels = [tick * (window_size / 1000) for tick in tick_positions]
    plt.xticks(tick_positions, tick_labels)
    plt.legend(loc='upper left')

    plt.show()

In [212]:
def find_distribution(profiler_fsd_dict, flowkey_list, unknown_fsd):
    """Pick the profiler whose sampled distribution is closest to an unknown one.

    For every profiler, the flow sizes of ``flowkey_list`` are histogrammed
    and normalised, then compared by MRD with the normalised ``unknown_fsd``.

    Args:
        profiler_fsd_dict: ``{profiler name: {flow key: flow size}}``.
        flowkey_list: sampled flow keys.
        unknown_fsd: histogram to classify.

    Returns:
        tuple: ``(name of the profiler with the smallest MRD, list of MRDs in
        profiler order)``. The name is ``""`` when there are no profilers.
    """
    # Normalised per-profiler distribution restricted to the sampled keys.
    profiler_pdfs = {
        label: get_pdf(get_profiler_sampled_fsd(freq_map, flowkey_list))
        for label, freq_map in profiler_fsd_dict.items()
    }

    best_label = ""
    best_mrd = sys.float_info.max
    mrds = []
    for label, pdf in profiler_pdfs.items():
        if pdf == {}:
            # None of the sampled keys exist in this profiler: use the unknown
            # distribution as the reference side of the comparison.
            mrd = abs(calculate_mrd(get_pdf(unknown_fsd), pdf))
        else:
            mrd = calculate_mrd(pdf, get_pdf(unknown_fsd))

        mrds.append(mrd)
        if mrd < best_mrd:
            best_mrd = mrd
            best_label = label

    return best_label, mrds
    

In [228]:
def predict_distribution(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200, start_time=0, predict_length=2):
    """Classify a trace's distribution by majority vote over single windows.

    Starting at window index ``start_time``, each of the first
    ``int(predict_length*1000/window_size)`` windows votes for the profiler
    closest to it (see ``find_distribution``). The profiler name with its
    last 9 characters removed is used as the vote key.

    Returns:
        str: the vote key with the most votes, or ``"NONE"`` when no window
        could be evaluated.
    """
    # Parsed only to reject dataset names without an integer second field.
    int(dataset.split('_')[1])

    windows = read_single_window_gt_fsd_data(algo, row, width, level, seed, count, flowkey, epochs, dataset, window_size)[start_time:]

    # NOTE(review): both helpers below run with their default sketch
    # parameters (algo/row/width/... and window_size are not forwarded) -
    # confirm this is intended.
    profiler_fsd_dict = prepare_profiler_final_fsd_dict()
    sampled_flowkey = get_sampled_flowkey(dataset=dataset, predict_time=0)[start_time:]

    print(len(windows), len(sampled_flowkey))

    usable = min(len(windows), len(sampled_flowkey), int(predict_length * 1000 / window_size))
    vote = {}
    for j in range(usable):
        dist_name, _ = find_distribution(profiler_fsd_dict, sampled_flowkey[j], windows[j])
        label = dist_name[:-9]
        vote[label] = vote.get(label, 0) + 1

    if not vote:
        return "NONE"
    return max(vote, key=vote.get)
    

---

In [202]:
import sys
sys.path.append('/home/ming/SketchMercator/')

from sw_dp_simulator.file_io.py.read_cm import load_cm
from sw_dp_simulator.hash_module.py.hash import compute_hash
from sw_dp_simulator.file_io.py.common import parse_line

def counter_estimate(key, sketch_array, index_hash_sub_list, d, w, hash, level):
    """Return the smallest of the ``d`` counters that ``key`` maps to.

    For row ``i`` the column is ``compute_hash(key, hash, index_hash_sub_list[i], w)``
    and the counter is read from the flat array at ``i * w + column``.

    Args:
        key: flow key to look up.
        sketch_array: flat sequence of counters, row after row.
        index_hash_sub_list: per-row hash parameters.
        d: number of rows.
        w: number of counters per row.
        hash: hash name forwarded to ``compute_hash``.
        level: not used.
    """
    return min(
        sketch_array[r * w + compute_hash(key, hash, index_hash_sub_list[r], w)]
        for r in range(0, d)
    )

def _read_counter_file(path, n_counters):
    """Read the first ``n_counters`` lines of ``path`` as integers."""
    # Context manager so the handle is released even when a line is missing
    # or malformed and int() raises.
    with open(path) as f:
        return [int(f.readline().strip()) for _ in range(n_counters)]


def get_counter_value(full_dir, row, width, level, window_size):
    """Load the sketch counters of every window plus the final sketch.

    Only level 0 is read. Every file in
    ``<full_dir>/level_00/window_<window_size>/`` (sorted by name) and then
    ``<full_dir>/level_00/sketch_counter.txt`` contributes its first
    ``row * width`` lines, parsed as integers.

    Args:
        full_dir: run directory.
        row: number of sketch rows.
        width: number of counters per row.
        level: not used; only level 0 is read.
        window_size: window length used in the directory name.

    Returns:
        list: ``[level_0]`` where ``level_0`` is a list of counter lists, one
        per window file followed by one for the final sketch.

    Raises:
        ValueError: if a file has fewer than ``row * width`` integer lines.
    """
    counter_list = []
    n_counters = row * width

    # load counter value
    for l in range(0, 1):
        window_dir = '%s/level_%02d/window_%d/' % (full_dir, l, window_size)
        print(window_dir)

        level_list = [
            _read_counter_file(os.path.join(window_dir, file), n_counters)
            for file in sorted(os.listdir(window_dir))
        ]

        # The final (whole-trace) sketch is appended after the windows.
        final_counter_path = '%s/level_%02d/sketch_counter.txt' % (full_dir, l)
        level_list.append(_read_counter_file(final_counter_path, n_counters))

        print(f'There are {len(level_list)} windows')

        counter_list.append(level_list)

    return counter_list

def get_topk_flowkey(full_dir, row, width, level, window_size, k):
    """Load the flow keys of every window plus a random sample of the final key list.

    Only level 0 is read. Every file in
    ``<full_dir>/level_00/key_window_<window_size>/`` (sorted by name)
    yields one list of flow keys. The keys of ``<full_dir>/flowkey.txt`` are
    appended last: shuffled and cut to ``k`` entries when ``k != 0``,
    otherwise complete and in file order. The first line of each file is
    treated as a header and every other line is parsed with ``parse_line``.

    Note:
        Despite the name, the ``k`` keys are a random sample
        (``random.shuffle``), so results depend on the ``random`` module's
        state.

    Args:
        full_dir: run directory.
        row, width, level: not used.
        window_size: window length used in the directory name.
        k: sample size for the final key list; ``0`` keeps every key.

    Returns:
        list: ``[level_0]`` where ``level_0`` is a list of flow-key lists,
        one per window file followed by the final one.
    """
    def _read_flowkeys(path):
        # Context manager so the handle is released even if a line fails to parse.
        with open(path) as f:
            key = f.readline().strip()
            return [parse_line(key, line.strip())[2] for line in f]

    key_list = []

    for l in range(0, 1):
        key_window_dir = '%s/level_%02d/key_window_%d/' % (full_dir, l, window_size)
        print(key_window_dir)

        level_list = [
            _read_flowkeys(os.path.join(key_window_dir, file))
            for file in sorted(os.listdir(key_window_dir))
        ]

        final_keys = _read_flowkeys('%s/flowkey.txt' % (full_dir))

        # random sample k flow key
        if k != 0:
            random.shuffle(final_keys)
            level_list.append(final_keys[:k])
        else:
            level_list.append(final_keys)

        print(f'There are {len(level_list)} windows')

        key_list.append(level_list)

    return key_list

def read_period_fsd_data(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP',
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200, start=30, length=3):
    """Build the histogram of per-flow counter growth over a period of windows.

    For each run directory, the sketch counters at window ``start`` and at
    window ``start + int(length*1000/window_size)`` are loaded. For every
    flow key recorded at the end window, the difference of its two counter
    estimates is computed; non-zero differences are histogrammed.

    Both the end counter window and the end key window are clamped to the
    last available entry, so a period that runs past the end of the trace
    uses the final sketch and the final key list instead of raising
    ``IndexError``.

    Args:
        start: index of the first window of the period.
        length: period length; converted to a window count with
            ``int(length*1000/window_size)``.

    Returns:
        dict[int, int]: ``{counter growth: number of flows}``, ordered by
        ascending growth and accumulated over all run directories and epochs.
    """
    fs_dist = {}
    span = int(length*1000/window_size)

    for epoch in epochs:

        path = f"../SketchPadding/{algo}/{dataset}/"\
                f"{flowkey}/row_{row}_width_{width}_level_{level}_epoch_{epoch}_count_{count}_seed_{seed}/"

        for dir in sorted(os.listdir(path)):
            p = os.path.join(path, dir)
            if not os.path.isdir(p):
                continue

            result = load_cm(p, width, row)
            index_hash_list = result["index_hash_list"]

            counter_windows = get_counter_value(p, row, width, level, window_size)[0]
            cArray_start = counter_windows[start]
            cArray_end = counter_windows[min(len(counter_windows)-1, start + span)]

            key_windows = get_topk_flowkey(p, row, width, level, window_size, 5000)[0]
            # Clamp like the counter window above.
            end_keys = key_windows[min(len(key_windows)-1, start + span)]

            for key in end_keys:
                est_start = counter_estimate(key, cArray_start, index_hash_list[0], row, width, "crc_hash", 0)
                est_end = counter_estimate(key, cArray_end, index_hash_list[0], row, width, "crc_hash", 0)

                var = est_end - est_start
                # Flows whose estimate did not change over the period are skipped.
                if var == 0:
                    continue
                fs_dist[var] = fs_dist.get(var, 0) + 1

            fs_dist = dict(sorted(fs_dist.items()))

    return fs_dist



In [204]:
def predict_period_distribution(algo='cm', row=3, width=4096, level=1, seed=1, count=1, flowkey='srcIP', 
              epochs=['10'], dataset='caida0517-125w_10_.pcap', window_size=200, start_time=0, predict_length=2):
    """Name the profiler distribution that best matches a period of the trace.

    The flow-size distribution is measured over the period that begins at
    window ``start_time`` and lasts ``predict_length`` (via
    ``read_period_fsd_data``) and is matched against the profiler
    distributions with ``find_distribution``.

    Parameters
    ----------
    algo, row, width, level, seed, count, flowkey, epochs, dataset, window_size :
        Forwarded unchanged to ``read_period_fsd_data``; ``dataset`` also
        selects the sampled flow keys.
    start_time : int
        Index of the first window of the period.
    predict_length : int or float
        Period length, presumably in seconds with ``window_size`` in
        milliseconds (converted with ``predict_length * 1000 / window_size``)
        -- TODO confirm.

    Returns
    -------
    str
        The distribution name reported by ``find_distribution``, or the
        sentinel ``"NONE"`` when ``start_time`` is past the last window that
        has sampled flow keys.
    """
    # The original also called read_single_window_fsd_data() and parsed an
    # `answer` out of the dataset name here; both results were overwritten or
    # never read, so that work has been dropped.
    profiler_fsd_dict = prepare_profiler_final_fsd_dict()
    sampled_flowkey = get_sampled_flowkey(dataset=dataset, predict_time=0)
    if start_time >= len(sampled_flowkey):
        return "NONE"

    # Clamp the end of the period to the last window that has sampled keys.
    end_window = min(len(sampled_flowkey) - 1, start_time + int(predict_length * 1000 / window_size))
    end_sampled_flowkey = sampled_flowkey[end_window]

    period_fsd = read_period_fsd_data(algo, row, width, level, seed, count, flowkey, epochs,
                                      dataset, window_size, start_time, predict_length)

    dist_name, _ = find_distribution(profiler_fsd_dict, end_sampled_flowkey, period_fsd)
    return dist_name
    

In [223]:
# Spot check on the first width / first trace. The period-based variant is kept for comparison:
# predict_period_distribution(width=widths[0], dataset=pcap_file[0], window_size=200, start_time=30, predict_length=2)
predict_distribution(
    width=widths[0],
    dataset=pcap_file[0],
    window_size=200,
    start_time=0,
    predict_length=2,
)

48 47


'caida0517-500w'

### Predict PCD - TFS
---
- Start by detecting changes in TFS:
    - If TFS changed ⇒ Measure current DIST (2s).
    - If TFS unchanged ⇒ Check if FSD has changed:
        - If FSD changed ⇒ Measure current DIST (2s).
- Once the current DIST is known:
    - Select an appropriate profiler based on the collected predicted TFS (2s).

In [229]:
# `answer` value that the scoring below treats as "no change expected".
# Presumably the full trace length in seconds -- TODO confirm.
_NO_CHANGE_ANSWER = 10


def _seconds_to_window_index(seconds, window_size):
    """Convert a time in seconds to a window index (``window_size`` taken as milliseconds)."""
    # Round away float noise before truncating: 5.6 / 0.2 evaluates to
    # 27.999999999999996, which a bare int() would turn into window 27.
    return int(round(seconds * 1000 / window_size, 6))


def _record_change_detection(tag, dataset, name, change, changing_time, answer,
                             window_size, change_flags, change_windows):
    """Print the verdict of one change detector and store its flag / window index."""
    if change == 0:
        if answer == _NO_CHANGE_ANSWER:
            print(f'{dataset} [{tag} correct] {answer}')
        else:
            print(f"{dataset} [{tag} error] {changing_time} {answer}")
        change_flags[name] = 0
        return

    # Any non-zero flag counts as a detected change. The original left the
    # entry unset for flags other than 0/1 when a change was expected, which
    # surfaced later as a KeyError.
    missed = answer == _NO_CHANGE_ANSWER or abs(changing_time - answer) >= 1.0
    verdict = "error" if missed else "correct"
    print(f"{dataset} [{tag} {verdict}] {changing_time} {answer}")
    change_flags[name] = 1
    change_windows[name] = _seconds_to_window_index(changing_time, window_size)


def predict_pcd_tfs(algo='cm', row=3, width=[4096], level=1, seed=1, count=1, flowkey='srcIP', 
              epochs=['10'], datasets=['caida0517-125w_10_.pcap'], window_size=200, ws=25,
              tfs_dev=3, fsd_dev=3.4, predict_length=3, use_detected_time=False):
    """Run change detection and profiler selection for every (dataset, width) pair and print a score.

    For each pair the TFS and FSD change detectors (``predict_tfs`` /
    ``predict_fsd``) are run and their verdicts printed. If either reports a
    change, ``predict_tfs_val`` and ``predict_distribution`` are called to
    pick a profiler. Afterwards the predicted distribution is compared with
    the trace named in the dataset file name and the hit rates are printed.

    Parameters
    ----------
    algo, row, level, seed, count, flowkey, epochs, window_size, ws :
        Forwarded unchanged to the detector / predictor helpers.
    width : list of int
        Sketch widths to evaluate.
    datasets : list of str
        Trace file names. A name that splits into four ``_``-separated parts
        is read as ``<origin>_<t>_<after>_<t>.pcap``; otherwise the trace is
        taken to keep its origin distribution throughout.
    tfs_dev, fsd_dev :
        Passed to ``predict_tfs`` / ``predict_fsd`` in the slot after
        ``window_size`` (previously hard-coded to 3 and 3.4).
    predict_length :
        Passed to ``predict_tfs_val`` / ``predict_distribution`` as their
        last argument (previously hard-coded to 3).
    use_detected_time : bool
        ``False`` (default, original behaviour): predictions start at the
        ground-truth change point ``answer``. ``True``: they start at the
        earliest change window reported by the detectors.

    Returns
    -------
    None
        All results are printed.
    """
    TFS_CHANGE = {}
    TFS_TIME = {}
    TFS_VAL = {}

    FSD_CHANGE = {}
    FSD_TIME = {}
    FSD_VAL = {}

    for d in datasets:
        for w in width:
            # d[:-5] strips the trailing ".pcap"
            name = f'{d[:-5]}_{w}'

            # detect TFS changes
            tfs_change, tfs_changing_time, tfs_answer = predict_tfs(
                algo, row, w, level, seed, count, flowkey, epochs, d, window_size, tfs_dev, ws)
            _record_change_detection("tfs", d, name, tfs_change, tfs_changing_time, tfs_answer,
                                     window_size, TFS_CHANGE, TFS_TIME)

            # detect FSD changes
            fsd_change, fsd_changing_time, fsd_answer = predict_fsd(
                algo, row, w, level, seed, count, flowkey, epochs, d, window_size, fsd_dev, ws)
            _record_change_detection("fsd", d, name, fsd_change, fsd_changing_time, fsd_answer,
                                     window_size, FSD_CHANGE, FSD_TIME)

            # neither detector fired -> keep the current profiler
            if TFS_CHANGE[name] != 1 and FSD_CHANGE[name] != 1:
                print("no need to re-profiling")
                continue

            if use_detected_time:
                # once a changing point is detected => make a prediction from the earliest one
                detected = []
                if TFS_CHANGE[name] == 1:
                    detected.append(TFS_TIME[name])
                if FSD_CHANGE[name] == 1:
                    detected.append(FSD_TIME[name])
                T = min(detected)
            else:
                # ground-truth change point, as returned by the FSD detector
                T = _seconds_to_window_index(fsd_answer, window_size)

            TFS_VAL[name] = predict_tfs_val(algo, row, w, level, seed, count, flowkey, epochs, d, window_size, T, predict_length)
            FSD_VAL[name] = predict_distribution(algo, row, w, level, seed, count, flowkey, epochs, d, window_size, T, predict_length)

    print("-- result --")
    correct = 0
    half_correct = 0
    cnt = 0
    for d in datasets:
        parts = d.split("_")
        origin = parts[0]
        after = parts[2] if len(parts) == 4 else origin
        # "half correct" compares only the part before the first "-"
        origin_family = origin.split("-")[0]
        after_family = after.split("-")[0]

        for w in width:
            name = f'{d[:-5]}_{w}'

            print(cnt)
            cnt += 1
            print(f'\tORIGIN: {origin}')
            print(f'\tAFTER : {after}')

            if TFS_CHANGE[name] == 1 or FSD_CHANGE[name] == 1:
                predicted = FSD_VAL[name]
                print(f'\tpredict TFS = {TFS_VAL[name]}, predict FSD = {predicted}')

                # "NONE" counts as a hit only when the trace never changed.
                if predicted == after or (origin == after and predicted == "NONE"):
                    correct += 1
                if predicted.split("-")[0] == after_family or (origin_family == after_family and predicted == "NONE"):
                    half_correct += 1
            else: # both TFS and FSD no change => no trigger re-profiling
                print("\tno need to re-profiling")
                if origin == after:
                    correct += 1
                if origin_family == after_family:
                    half_correct += 1

    total = len(datasets) * len(width)
    # Guard the percentage: an empty dataset or width list used to divide by zero.
    correct_pct = correct * 100 / total if total else 0.0
    half_correct_pct = half_correct * 100 / total if total else 0.0
    print(f'   correct   predict: {correct}/{total} {correct_pct}%')
    print(f'half_correct predict: {half_correct}/{total} {half_correct_pct}%')

In [230]:
# Evaluate one batch of traces per run. Other batches used with this cell:
#   pcap_file[0:12], pcap_file[30:48], pcap_file[48:66],
#   pcap_file[66:84], pcap_file[84:102], pcap_file[102:120]
predict_pcd_tfs(
    width=widths,
    datasets=pcap_file[12:30],
)

caida0517-500w_6_caida0816-600w_4.pcap [tfs error] 9.4 6
caida0517-500w_6_caida0816-600w_4.pcap [fsd error] 0.0 6
18 17
caida0517-500w_6_caida0816-300w_4.pcap [tfs correct] 5.6 6
caida0517-500w_6_caida0816-300w_4.pcap [fsd correct] 5.6 6
18 17
caida0517-500w_6_caida0816-150w_4.pcap [tfs correct] 5.4 6
caida0517-500w_6_caida0816-150w_4.pcap [fsd correct] 5.6 6
18 17
caida0517-250w_6_caida0816-600w_4.pcap [tfs correct] 5.4 6
caida0517-250w_6_caida0816-600w_4.pcap [fsd correct] 5.4 6
18 17
caida0517-250w_6_caida0816-300w_4.pcap [tfs error] 9.4 6
caida0517-250w_6_caida0816-300w_4.pcap [fsd correct] 5.4 6
18 17
caida0517-250w_6_caida0816-150w_4.pcap [tfs correct] 5.6 6
caida0517-250w_6_caida0816-150w_4.pcap [fsd correct] 5.6 6
18 17
caida0517-125w_6_caida0816-600w_4.pcap [tfs correct] 5.4 6
caida0517-125w_6_caida0816-600w_4.pcap [fsd correct] 5.4 6
18 17
caida0517-125w_6_caida0816-300w_4.pcap [tfs correct] 5.4 6
caida0517-125w_6_caida0816-300w_4.pcap [fsd correct] 5.4 6
18 17
caida0517-125w