In [1]:
import os
import pandas as pd 
import re 
import matplotlib.pyplot as plt
from matplotlib_venn import venn2 
import socket
import struct
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.spatial.distance import cosine 
from scipy.spatial import distance 

In [2]:
# --- Notebook configuration -------------------------------------------------
# Every path used below is built relative to the 'traffictracer' folder that
# sits next to this notebook.
tshark_directory = os.path.join(os.curdir, 'traffictracer')

# Sub-folder names under tshark_directory.
ori_directory = ['WLAN', 'Meta']
statistic_directory = ['WLAN_statistics', 'Meta_statistics']
port_directory = 'tshark_port'
conn_directory = 'conn_in_out'
evaluation_directory = 'evaluation'

# File-name prefixes; the string that follows conn_prefix in a file name
# starts with data_date (see the startswith() checks further down).
wlan_prefix = 'WLAN-'
meta_prefix = 'Meta-'
conn_prefix = 'conn-in-out-'

# Capture date selecting which files this notebook run processes.
data_date = '24-11-07'

In [3]:
# Load the SNI table for data_date and keep only the two columns the rest of
# the notebook reads: the capture name and its filtered SNI value.
sni_csv_path = os.path.join(
    tshark_directory,
    evaluation_directory,
    'tt_stream_distribution',
    'sni',
    data_date + '.csv',
)
df_sni = pd.read_csv(sni_csv_path)

# 'FilteredSNI' is re-exposed under the shorter key 'SNI'; both values are
# pandas Series taken straight from df_sni.
new_sni_dict = {
    'Name': df_sni['Name'],
    'SNI': df_sni['FilteredSNI'],
}

In [4]:
def connSni():
    """Filter every connection CSV of `data_date` down to its known SNIs.

    For each file in the `conn_directory` folder whose name starts with
    `conn_prefix + data_date`, the rows whose 'Server Name' is contained in
    the capture's entry of `new_sni_dict['SNI']` are written, under the same
    file name, to evaluation/tt_stream_distribution/conn_sni.

    Raises:
        ValueError: if a capture name has no entry in `new_sni_dict['Name']`.
    """
    conn_dir = os.path.join(tshark_directory, conn_directory)
    out_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni')

    # Loop-invariant lookups: build them once instead of once per file.
    # Plain lists also make the access below purely positional, so it does not
    # depend on the index labels of the underlying Series.
    name_list = list(new_sni_dict['Name'])
    sni_list = list(new_sni_dict['SNI'])

    for conn_info in os.listdir(conn_dir):
        if not conn_info.startswith(conn_prefix + data_date):
            continue
        df_conn = pd.read_csv(os.path.join(conn_dir, conn_info))

        # Capture name = file name without the prefix and the trailing
        # 4 characters (the '.csv' extension).
        file_name = conn_info[len(conn_prefix):-4]
        try:
            index = name_list.index(file_name)
        except ValueError:
            raise ValueError(f'no SNI entry found for capture {file_name!r}') from None

        # NOTE(security): eval() executes whatever expression is stored in the
        # CSV cell. If the stored value is always a plain literal (set/list of
        # strings), ast.literal_eval would be the safer choice - confirm the
        # file format before switching.
        sni_set = eval(sni_list[index])

        df_conn_sni = df_conn[df_conn['Server Name'].isin(sni_set)]
        df_conn_sni.to_csv(os.path.join(out_dir, conn_info))

In [5]:
def flowCounts():
    """Write the value distribution of 'W Flows' for each filtered capture.

    Reads every file of `data_date` from tt_stream_distribution/conn_sni,
    counts how often each 'W Flows' value occurs (sorted by value) and stores
    the resulting one-column frame under the same file name in
    tt_stream_distribution/flow_counts.
    """
    distribution_root = os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution')
    wanted_prefix = conn_prefix + data_date

    selected = [
        entry
        for entry in os.listdir(os.path.join(distribution_root, 'conn_sni'))
        if entry.startswith(wanted_prefix)
    ]
    for entry in selected:
        filtered = pd.read_csv(os.path.join(distribution_root, 'conn_sni', entry))
        counts_by_value = filtered['W Flows'].value_counts().sort_index()
        counts_by_value.to_frame().to_csv(os.path.join(distribution_root, 'flow_counts', entry))

In [12]:
# flowCounts()

In [6]:
# Row counts per capture: before ('OriConn') and after ('FilteredConn') the
# SNI filtering. Filled in by lengthComparison().
length_dict = {'OriConn': [], 'FilteredConn': []}


def lengthComparison():
    """Append the original and SNI-filtered row counts of each capture.

    For every file of `data_date` in the `conn_directory` folder, the number
    of rows of that file and of its same-named counterpart in
    tt_stream_distribution/conn_sni are appended to `length_dict`.
    """
    original_dir = os.path.join(tshark_directory, conn_directory)
    filtered_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni')

    for entry in os.listdir(original_dir):
        if not entry.startswith(conn_prefix + data_date):
            continue
        original_rows = len(pd.read_csv(os.path.join(original_dir, entry)))
        filtered_rows = len(pd.read_csv(os.path.join(filtered_dir, entry)))
        length_dict['OriConn'].append(original_rows)
        length_dict['FilteredConn'].append(filtered_rows)
            

In [14]:
# lengthComparison()

In [16]:
# length_csv = pd.DataFrame(length_dict) 
# length_csv.to_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'length_comparison', data_date + '.csv'), index=False)

In [7]:
def ip_to_int(ip):
    """Return the unsigned 32-bit integer value of a dotted IPv4 string.

    The address is packed with socket.inet_aton and the four resulting bytes
    are read in network (big-endian) byte order.
    """
    packed = socket.inet_aton(ip)
    (value,) = struct.unpack("!L", packed)
    return value

def tuple_to_vector(t):
    """Convert an 8-element sequence of IP octets into a list of ints.

    `t` must unpack into exactly eight items (four octets of the first
    address followed by four octets of the second); any other length raises
    ValueError. Ports and protocol are not part of the vector.
    """
    a1, a2, a3, a4, b1, b2, b3, b4 = t
    return [int(octet) for octet in (a1, a2, a3, a4, b1, b2, b3, b4)]

def tuple_to_vector2(t):
    """Convert a 4-element sequence of IP octets into a list of ints.

    `t` must unpack into exactly four items; any other length raises
    ValueError.
    """
    first, second, third, fourth = t
    return [int(octet) for octet in (first, second, third, fourth)]

def split_ip(ip_address):
    """Split a dot-separated address string into a list of integer parts."""
    parts = ip_address.split('.')
    return list(map(int, parts))

def ip_distance(ip1: list, ip2: list):
    """Return the absolute difference between two addresses.

    When either argument is a list or tuple (e.g. the octets produced by
    split_ip), the result is the list of element-wise absolute differences.
    Any other operands (plain numbers, numpy arrays, ...) are handled as
    before and yield `abs(ip1 - ip2)`.

    Raises:
        ValueError: if two sequences of different lengths are given.
    """
    if isinstance(ip1, (list, tuple)) or isinstance(ip2, (list, tuple)):
        # Python lists do not support '-', so subtract element by element.
        if len(ip1) != len(ip2):
            raise ValueError('ip1 and ip2 must have the same number of elements')
        return [abs(a - b) for a, b in zip(ip1, ip2)]
    return abs(ip1 - ip2)

def calculate_mean(lst):
    """Return the arithmetic mean of `lst`, or NaN when `lst` is empty.

    An empty input used to raise ZeroDivisionError; NaN is returned instead
    so that a capture without any rows does not abort a whole batch.
    """
    count = len(lst)
    if count == 0:
        return float('nan')
    return sum(lst) / count

In [9]:
# def get5Tuples(conn_info: str): 
#     similarities = []
#     if conn_info.startswith(conn_prefix + data_date): 
#         df_conn = pd.read_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni', conn_info)) 
#         before_list = df_conn.apply(lambda row: 
#                                     split_ip(row['inRemoteIP']) + [row['inRemotePort']] + split_ip(row['inLocIP']) + [row['inLocPort'], 1], axis=1).to_list() 
#         after_list = df_conn.apply(lambda row: 
#                                     split_ip(row['outLocIP']) + [row['outLocPort']] + split_ip(row['outRemoteIP']) + [row['outRemotePort'], 1], axis=1).to_list() 
#         for before, after in zip(before_list, after_list): 
#             vec1 = tuple_to_vector(before) 
#             vec2 = tuple_to_vector(after) 
#             sim = cosine_similarity([vec1], [vec2])[0][0] 
#             # sim = 1 - cosine(vec1, vec2)
#             similarities.append(sim) 
#     return similarities 

In [8]:
def _ip_octet_distances(conn_info, before_cols, after_cols):
    """Per-row Euclidean distance between two groups of IPv4 columns.

    Reads tt_stream_distribution/conn_sni/<conn_info>, turns the addresses in
    `before_cols` (and likewise `after_cols`) into one flat vector of octets
    per row, and returns the list of distances between the two vectors of
    each row. Returns [] when `conn_info` does not belong to `data_date` or
    when the file has no rows.
    """
    # Local import: the helper then does not depend on the module-level name
    # `distance`, which other cells of this notebook may rebind.
    from scipy.spatial.distance import euclidean

    if not conn_info.startswith(conn_prefix + data_date):
        return []
    df_conn = pd.read_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni', conn_info))

    def octet_vectors(columns):
        # Iterating over the columns (instead of a row-wise DataFrame.apply)
        # also works for a frame without rows and simply yields no vectors.
        expected_len = 4 * len(columns)
        vectors = []
        for addresses in zip(*(df_conn[column] for column in columns)):
            vector = [octet for address in addresses for octet in split_ip(address)]
            if len(vector) != expected_len:
                raise ValueError(f'expected 4 octets per address, got {addresses!r}')
            vectors.append(vector)
        return vectors

    before_vectors = octet_vectors(before_cols)
    after_vectors = octet_vectors(after_cols)
    return [euclidean(before, after) for before, after in zip(before_vectors, after_vectors)]


def get2Tuples(conn_info: str):
    """Euclidean distance between the (source, destination) IP pair on the
    inbound side and on the outbound side, one value per row.

    Inbound vector: inRemoteIP + inLocIP; outbound vector: outLocIP +
    outRemoteIP, each split into dotted-decimal octets.
    """
    return _ip_octet_distances(conn_info, ['inRemoteIP', 'inLocIP'], ['outLocIP', 'outRemoteIP'])


def getSrcIPDistance(conn_info: str):
    """Euclidean distance between the octets of inRemoteIP and outLocIP,
    one value per row."""
    return _ip_octet_distances(conn_info, ['inRemoteIP'], ['outLocIP'])


def getDstIPDistance(conn_info: str):
    """Euclidean distance between the octets of inLocIP and outRemoteIP,
    one value per row."""
    return _ip_octet_distances(conn_info, ['inLocIP'], ['outRemoteIP'])

In [11]:
# Per-capture results collected by getSimilarity().
similarity_dict = {
    'Name': [],
    'Similarity': [],
    'AverageSimilarity': []
}
def getSimilarity():
    """Collect the per-row IP distances of every capture of `data_date`.

    For each matching file in tt_stream_distribution/conn_sni the values
    returned by get2Tuples and their mean are appended to `similarity_dict`;
    the accumulated table is then written once to
    tt_similarity/cosine_similarity/<data_date>.csv.

    Files that do not belong to `data_date` are skipped (they used to add a
    name and then fail on the mean of an empty list), and a capture without
    rows gets NaN as its average.
    """
    conn_sni_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni')
    processed_any = False
    for conn_info in os.listdir(conn_sni_dir):
        if not conn_info.startswith(conn_prefix + data_date):
            continue
        similarity = get2Tuples(conn_info)
        # Capture name = file name without the prefix and the trailing
        # 4 characters (the '.csv' extension).
        similarity_dict['Name'].append(conn_info[len(conn_prefix):-4])
        similarity_dict['Similarity'].append(similarity)
        average = calculate_mean(similarity) if len(similarity) else float('nan')
        similarity_dict['AverageSimilarity'].append(average)
        processed_any = True

    if processed_any:
        # One write after the loop produces the same final file as rewriting
        # it after every capture.
        # NOTE: the folder is still called 'cosine_similarity' although
        # get2Tuples returns Euclidean distances; the path is kept unchanged
        # so existing outputs stay where they are.
        similarity_csv = pd.DataFrame(similarity_dict)
        similarity_csv.to_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_similarity', 'cosine_similarity', data_date + '.csv'))

In [9]:
def euclidean_distance_scalar(x, y):
    """Return the absolute difference between the two values `x` and `y`."""
    delta = x - y
    return abs(delta)

In [15]:
# Column-name prefixes that distinguish the two measurement sides of a
# feature inside a conn_sni CSV (e.g. 'M Packets' / 'W Packets').
M_prefix = 'M '
W_prefix = 'W '

# Features compared between the two sides.
features = ['Packets', 'Bytes', 'Rel Start', 'Duration', 'Flows']

# Result table: one list per output column. Every feature contributes an
# absolute-distance column and a '<feature> Rate' column, followed by the two
# bandwidth columns. (Address/port columns are intentionally not included.)
distance_dict = {'Name': []}
distance_dict.update({
    column: []
    for feature_name in features
    for column in (feature_name, feature_name + ' Rate')
})
distance_dict.update({'Bandwidth': [], 'Bandwith Rate': []})

# for conn_info in os.listdir(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni')): 
#     M_f_dict = {} 
#     W_f_dict = {} 
#     if conn_info.startswith(conn_prefix + data_date): 
#         df_conn = pd.read_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni', conn_info)) 
#         distance_dict['Name'].append(data_date)
#         for feature in features: # 初始化特征字典
#             M_f = M_prefix + feature 
#             W_f = W_prefix + feature 
#             M_f_dict[M_f] = df_conn[M_f] 
#             W_f_dict[W_f] = df_conn[W_f] 
#             distance = 0
#             distance_rate = 0
#             for i in range(df_conn.shape[0]): 
#                 distance_rate += euclidean_distance_scalar(M_f_dict[M_f][i], W_f_dict[W_f][i]) / M_f_dict[M_f][i] 
#                 distance += euclidean_distance_scalar(M_f_dict[M_f][i], W_f_dict[W_f][i]) 
#             aver_distance = distance / df_conn.shape[0] 
#             aver_rate = distance_rate / df_conn.shape[0] 
#             distance_dict[feature].append(aver_distance) 
#             distance_dict[feature + ' Rate'].append(aver_rate) 

# Fill distance_dict with one row per capture of data_date: the mean absolute
# M-vs-W difference and the mean relative difference for the bandwidth and
# for every entry of `features`.
conn_sni_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'conn_sni')
for conn_info in os.listdir(conn_sni_dir):
    if not conn_info.startswith(conn_prefix + data_date):
        continue
    df_conn = pd.read_csv(os.path.join(conn_sni_dir, conn_info))
    # Capture name = file name without the prefix and the trailing
    # 4 characters (the '.csv' extension).
    capture_name = conn_info[len(conn_prefix):-4]
    distance_dict['Name'].append(capture_name)

    # IP addresses
    # src_ip_distance = getSrcIPDistance(conn_info)

    # Bandwidth = Bytes / Duration; rows with a zero duration become NaN and
    # are therefore ignored by mean().
    M_T = M_prefix + 'Duration'
    W_T = W_prefix + 'Duration'
    M_Bytes = M_prefix + 'Bytes'
    W_Bytes = W_prefix + 'Bytes'
    non_zero_mask_M = df_conn[M_T] != 0
    non_zero_mask_W = df_conn[W_T] != 0
    M_bandwidth = df_conn[M_Bytes] / df_conn[M_T].where(non_zero_mask_M)
    W_bandwidth = df_conn[W_Bytes] / df_conn[W_T].where(non_zero_mask_W)
    bandwidth_gap = (M_bandwidth - W_bandwidth).abs()
    average_bandwidth = bandwidth_gap.mean()
    # Same zero guard as for the per-feature rates below: a zero M bandwidth
    # would otherwise turn the whole average into inf.
    average_bandwidth_rate = (bandwidth_gap / M_bandwidth.where(M_bandwidth != 0)).mean()
    distance_dict['Bandwidth'].append(round(average_bandwidth, 4))
    # NOTE: the key is misspelled ('Bandwith'), but it is the column name of
    # the exported CSV, so it is kept as is.
    distance_dict['Bandwith Rate'].append(round(average_bandwidth_rate, 4))
    print(capture_name, average_bandwidth, average_bandwidth_rate)

    for feature in features:  # remaining features
        M_f = M_prefix + feature
        W_f = W_prefix + feature

        # Mean absolute distance. Deliberately NOT named `distance`: that
        # would rebind the imported scipy.spatial.distance module for the
        # whole notebook and break every later distance.euclidean(...) call.
        abs_distance = (df_conn[M_f] - df_conn[W_f]).abs().mean()

        # Mean relative change, skipping rows whose M value is zero.
        non_zero_mask = df_conn[M_f] != 0
        distance_rate = ((df_conn[M_f][non_zero_mask] - df_conn[W_f][non_zero_mask]).abs() / df_conn[M_f][non_zero_mask]).mean()

        distance_dict[feature].append(round(abs_distance, 4))
        distance_dict[feature + ' Rate'].append(round(distance_rate, 4))

24-11-07--10-19-16 137009.38961392912 0.23093349225847198
24-11-07--10-25-29 96041.93084294569 0.274250625667509
24-11-07--10-32-20 39368.68434169867 42.677226690504106
24-11-07--11-08-34 15527.794982540598 0.8833970023308895
24-11-07--11-24-21 46523.03768274104 0.4980615513410874
24-11-07--11-31-59 28786.79411009544 0.8600966288105891
24-11-07--15-15-47 17060.18452255804 0.48002175737062713
24-11-07--15-36-37 8191.719005192092 0.3838886091300865
24-11-07--15-38-38 13741.716608357787 0.5107438963605363
24-11-07--15-41-25 18377.732590874028 0.723788348675093
24-11-07--15-42-58 39250.51078289472 0.6041286729035806
24-11-07--15-46-14 37418.23418782454 0.5557590251342105
24-11-07--15-52-05 15914.922088333922 0.520802869620752
24-11-07--15-58-43 69763.78678492561 0.2930839440694509
24-11-07--16-04-47 57374.03800105484 0.3881426094903079
24-11-07--16-55-07 8234.414731413744 0.32381105333181975
24-11-07--17-01-16 894.8757445672137 0.19819208759841647
24-11-07--17-06-23 7594.087931057845 0.337

In [14]:
# Export the collected distance table (without the index column) to
# tt_similarity/distance/<data_date>_improved.csv.
distance_csv = pd.DataFrame(distance_dict)
distance_csv.to_csv(
    os.path.join(
        tshark_directory,
        evaluation_directory,
        'tt_similarity',
        'distance',
        data_date + '_improved.csv',
    ),
    index=False,
)

In [None]:
import pandas as pd

# Stand-alone check of the bandwidth metrics on hard-coded M / W samples.
# NOTE(review): the origin of these 23 value pairs is not recorded in the
# notebook - presumably copied from one capture; confirm.
M_bandwidth = [
    8495.579972084035, 32221.253749456282, 262.9211543267581, 6422.117026185943,
    25225.642212450708, 62.818386203591864, 75.16341672337579, 367756.55562177,
    284.3320057461646, 281.97256627384604, 140.96004648149903, 142.82397872955653,
    334.28470092190815, 144.56722198218486, 141.42682978969415, 88.29053359294443,
    284.4787525001988, 286.15870663274006, 109.92803885641197, 142.84101090261063,
    141.67351078703774, 146.19965348009612, 837989.44153014
]

W_bandwidth = [
    8615.318404759457, 16776.915221674655, 238.85423431293208, 3610.5337520168378,
    120510.9743858279, 110.91967964762415, 109.50728024511643, 168826.8999510458,
    15011.117133419715, 15184.17705363903, 11289.896595646514, 10132.429868309957,
    294.39836037668397, 10292.259694018014, 10154.702068566543, 95.11148851389069,
    22068.942943309296, 19724.98123547115, 15323.872313480095, 13542.954719425335,
    14620.950511404119, 14870.895005651859, 415223.3859424236
]

# Element-wise arithmetic needs pandas Series rather than plain lists.
M_series = pd.Series(M_bandwidth)
W_series = pd.Series(W_bandwidth)

# Absolute M-vs-W gap per sample, reused for both metrics.
bandwidth_gap = (M_series - W_series).abs()
average_bandwidth = bandwidth_gap.mean()
average_bandwidth_rate = (bandwidth_gap / M_series).mean()

average_bandwidth, average_bandwidth_rate
