In [1]:
import ast
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn2

In [2]:
# Root folder that every other path in this notebook is built from.
tshark_directory = os.path.join('.', 'traffictracer')

# Sub-folders under the root. In both lists index 0 is the WLAN entry and
# index 1 is the Meta entry (see getPathByPortListening).
ori_directory = ['WLAN', 'Meta']
statistic_directory = ['WLAN_statistics', 'Meta_statistics']
port_directory, conn_directory, evaluation_directory = 'tshark_port', 'conn_in_out', 'evaluation'

# Only port files whose name starts with this prefix are processed.
data_date = '24-11-07'

# File-name prefixes placed in front of the begin time when building CSV paths.
meta_prefix, wlan_prefix, conn_prefix = 'Meta-', 'WLAN-', 'conn-in-out-'

## TrafficTracer

- 资源节省：conn-in-out与各个流程比较，比较的是stream的数量
  - meta
  - wlan
  - conn
  - 减少率


In [12]:
def readData(meta_path: str, wlan_path: str, conn_path: str):
    """Load the three CSV files as DataFrames.

    Args:
        meta_path: Path of the first CSV file to read.
        wlan_path: Path of the second CSV file to read.
        conn_path: Path of the third CSV file to read.

    Returns:
        A ``(df_meta, df_wlan, df_conn)`` tuple when all three files exist and
        can be parsed. Otherwise a message is printed and ``None`` is
        returned, so callers must check for ``None`` before unpacking.
    """
    requested = (meta_path, wlan_path, conn_path)

    # Check that every file exists; report all missing ones in one message.
    missing_files = [candidate for candidate in requested if not os.path.exists(candidate)]
    if missing_files:
        print(f"Following files not exist: {missing_files}")
        return None

    try:
        frames = tuple(pd.read_csv(candidate) for candidate in requested)
    except Exception as e:
        print(f"Reading error: {e}")
        return None
    return frames

In [13]:
def timeFromConn(conn_info: str):
    """Return the first ``dd-dd-dd--dd-dd-dd`` digit group found in ``conn_info``.

    Returns ``None`` when the text contains no such group.
    """
    found = re.search(r'\d{2}-\d{2}-\d{2}--\d{2}-\d{2}-\d{2}', conn_info)
    return found.group() if found else None

In [14]:
def getListeningPorts(process_port_path: str) -> set:
    """Read the TCP ports listed in a ``tcp.port in {...}`` expression.

    The file is scanned line by line. When several lines contain the
    expression, the ports of the last matching line are the ones returned.

    Args:
        process_port_path: Path of the text file to scan.

    Returns:
        The ports as a set of strings; an empty set when no line matches.
    """
    pattern_process = re.compile(r'tcp\.port in \{([0-9, ]+)\}')
    # Start empty so a file without any match yields set() instead of raising
    # UnboundLocalError at the return statement.
    process_port_tcp = []
    with open(process_port_path, 'r') as file:
        for line in file:
            tcp_ports = pattern_process.search(line)
            if tcp_ports:
                # Drop blank tokens (e.g. from "80, ") so they are not counted as ports.
                process_port_tcp = [
                    port.strip()
                    for port in tcp_ports.group(1).split(',')
                    if port.strip()
                ]
    return set(process_port_tcp)

In [15]:
def getConnPorts(df_conn: pd.DataFrame):
    """Return the distinct values of the ``inRemotePort`` column as a set."""
    return set(df_conn['inRemotePort'])

In [16]:
def timeListening(portListening: str):
    """Split a port-file name into its begin-time and end-time parts.

    The last four characters are dropped first (presumably a ``.txt``
    extension; confirm against the files in the port directory). The rest is
    split at the last ``--``.

    Returns:
        ``(begin_time, end_time)`` as strings.
    """
    pieces = portListening[:-4].rsplit('--', 1)
    return pieces[0], pieces[1]

In [17]:
def ttLength(df_meta: pd.DataFrame, df_wlan: pd.DataFrame, df_conn: pd.DataFrame):
    """Return the row counts of the three frames as ``(meta, wlan, conn)``."""
    rows_meta = df_meta.shape[0]
    rows_wlan = df_wlan.shape[0]
    rows_conn = df_conn.shape[0]
    return rows_meta, rows_wlan, rows_conn

In [18]:
def ttEfficiency(lengthTuple: tuple):
    """Compute how much smaller the conn count is relative to meta and to wlan.

    Args:
        lengthTuple: ``(len_meta, len_wlan, len_conn)``, e.g. from ``ttLength``.

    Returns:
        ``(meta_conn, wlan_conn)`` where each value is
        ``(total - len_conn) / total``, or ``0`` when ``total`` is zero.
    """
    len_meta, len_wlan, len_conn = lengthTuple

    def reduction(total):
        # Avoid dividing by zero when the reference table is empty.
        if total == 0:
            return 0
        return (total - len_conn) / total

    return reduction(len_meta), reduction(len_wlan)

In [19]:
def getPathByPortListening(portListening: str):  # Input is the name of port files
    """Build the CSV paths that belong to one port file.

    The begin time is taken from the port-file name and combined with the
    module-level directory and prefix constants.

    Returns:
        ``(meta_path, wlan_path, conn_path, meta_s_path, wlan_s_path)``; the
        ``_s`` paths point into the statistics directories.
    """
    begin_time = timeListening(portListening)[0]

    def csv_in(folder, prefix):
        # <root>/<folder>/<prefix><begin_time>.csv
        return os.path.join(tshark_directory, folder, prefix + begin_time + '.csv')

    return (
        csv_in(ori_directory[1], meta_prefix),
        csv_in(ori_directory[0], wlan_prefix),
        csv_in(conn_directory, conn_prefix),
        csv_in(statistic_directory[1], meta_prefix),
        csv_in(statistic_directory[0], wlan_prefix),
    )

In [20]:
# directory_conn = os.path.join('.', 'traffictracer', 'conn') 
# direcrory_meta = os.path.join('.', 'traffictracer', 'Meta_statistics') 
# directory_wlan = os.path.join('.', 'traffictracer', 'WLAN_statistics') 
# direcrory_conn_in_out = os.path.join('.', 'traffictracer', 'conn_in_out') 
# direcrory_meta_ori = os.path.join('.', 'traffictracer', 'Meta') 
# directory_wlan_ori = os.path.join('.', 'traffictracer', 'WLAN') 
# directory_listening = os.path.join('.', 'traffictracer','tshark_port') 

# begin_time = '24-10-27--15-49-13' 
# end_time = '15-53-27'

# conn_path = os.path.join(directory_conn, begin_time + '.csv')
# meta_path = os.path.join(direcrory_meta, 'Meta-'+ begin_time + '.csv')
# wlan_path = os.path.join(directory_wlan, 'WLAN-'+ begin_time + '.csv')
# conn_in_out_path = os.path.join(direcrory_conn_in_out, 'conn-in-out-'+ begin_time + '.csv')
# meta_ori_path = os.path.join(direcrory_meta_ori, 'Meta-'+ begin_time + '.csv') 
# wlan_ori_path = os.path.join(directory_wlan_ori, 'WLAN-'+ begin_time + '.csv') 
# listening_path = os.path.join(directory_listening, begin_time + '--' + end_time + '.txt')

In [21]:
# Column-oriented accumulator: each processed capture appends one value per key.
efficiency_dict = {
    'Name': [],
    'MetaLength': [],
    'WLANLength': [],
    'ConnLength': [],
    'Meta-Conn': [],
    'WLAN-Conn': []
}

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# row order of the resulting table reproducible across runs.
for port_file in sorted(os.listdir(os.path.join(tshark_directory, port_directory))):
    if not port_file.startswith(data_date):
        continue

    begin_time, _ = timeListening(port_file)
    _, _, conn_path, meta_s_path, wlan_s_path = getPathByPortListening(port_file)

    loaded = readData(meta_s_path, wlan_s_path, conn_path)
    if loaded is None:
        # readData has already printed the reason (missing file or read error).
        # Skip this capture instead of failing while unpacking None.
        continue
    df_meta_s, df_wlan_s, df_conn = loaded

    lengths = ttLength(df_meta_s, df_wlan_s, df_conn)
    meta_conn, wlan_conn = ttEfficiency(lengths)

    efficiency_dict['Name'].append(begin_time)
    efficiency_dict['MetaLength'].append(lengths[0])
    efficiency_dict['WLANLength'].append(lengths[1])
    efficiency_dict['ConnLength'].append(lengths[2])
    efficiency_dict['Meta-Conn'].append(round(meta_conn, 4))
    efficiency_dict['WLAN-Conn'].append(round(wlan_conn, 4))

tt_efficiency_csv = pd.DataFrame(efficiency_dict)


In [22]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before writing.
efficiency_out_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_filter_efficiency')
os.makedirs(efficiency_out_dir, exist_ok=True)

# NOTE: the row index is written as an unnamed first column (to_csv default);
# kept as-is so the file layout does not change for existing consumers.
tt_efficiency_csv.to_csv(os.path.join(efficiency_out_dir, data_date + '.csv'))

In [23]:
# Column-oriented accumulator: each processed capture appends one value per key.
# NOTE(review): 'improvementTT' is lower-cased unlike 'ImprovementNIC'; left
# unchanged because the key becomes a column name in the exported CSV.
port_dict = {
    'Name': [],
    'RawPorts': [],
    'vNICPorts': [],
    'ttPorts': [],
    'NumRaw': [],
    'NumNIC': [],
    'NumTT': [],
    'ImprovementNIC': [],
    'improvementTT': []
}

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# row order of the resulting table reproducible across runs.
for port_file in sorted(os.listdir(os.path.join(tshark_directory, port_directory))):
    if not port_file.startswith(data_date):
        continue

    begin_time, _ = timeListening(port_file)
    _, _, conn_path, _, _ = getPathByPortListening(port_file)

    # Only the connection table is needed here, so read it directly. The
    # previous version passed meta/wlan paths left over from an earlier cell's
    # last loop iteration, which made this cell depend on hidden kernel state.
    if not os.path.exists(conn_path):
        print(f"Following files not exist: {[conn_path]}")
        continue
    df_conn = pd.read_csv(conn_path)

    port_path = os.path.join(tshark_directory, port_directory, port_file)
    set_ports_listening = getListeningPorts(port_path)
    # Listening ports are strings, so convert the connection ports to match.
    set_ports_conn = set(map(str, getConnPorts(df_conn)))
    set_tt = set_ports_listening | set_ports_conn

    number_ports_listening = len(set_ports_listening)
    number_ports_conn = len(set_ports_conn)
    number_port_tt = len(set_tt)

    if number_ports_listening:
        improvement_nic = (number_ports_conn - number_ports_listening) / number_ports_listening
        improvement_tt = (number_port_tt - number_ports_listening) / number_ports_listening
    else:
        # No listening ports to compare against: report 0 rather than dividing
        # by zero, mirroring the zero guard in ttEfficiency.
        improvement_nic = improvement_tt = 0

    port_dict['Name'].append(begin_time)
    port_dict['RawPorts'].append(set_ports_listening)
    port_dict['vNICPorts'].append(set_ports_conn)
    port_dict['ttPorts'].append(set_tt)
    port_dict['NumRaw'].append(number_ports_listening)
    port_dict['NumNIC'].append(number_ports_conn)
    port_dict['NumTT'].append(number_port_tt)
    port_dict['ImprovementNIC'].append(round(improvement_nic, 4))
    port_dict['improvementTT'].append(round(improvement_tt, 4))

tt_ports_csv = pd.DataFrame(port_dict)

In [24]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before writing.
ports_out_dir = os.path.join(tshark_directory, evaluation_directory, 'tt_port_missing')
os.makedirs(ports_out_dir, exist_ok=True)

# NOTE: the row index is written as an unnamed first column (to_csv default);
# kept as-is so the file layout does not change for existing consumers.
tt_ports_csv.to_csv(os.path.join(ports_out_dir, data_date + '.csv'))

In [25]:
# set_listening = getListeningPorts(listening_path) 
# print("Length of listening ports: ", len(set_listening))
# set_conn = set(map(str, getConnPorts(df_conn))) 
# print("Length of Connection ports: ", len(set_conn)) 

# intersection = set_listening & set_conn 
# print("Length of intersection: ", len(intersection)) 

# other_listening = set_listening - intersection 
# other_conn = set_conn - intersection 
# print(other_listening) 
# print(set(map(int, other_conn))) 

## FlowReversals

强相关连接：与所浏览的业务直接相关的（内容、控制信息、账号）

弱相关连接：可能与所浏览的业务相关的（googleapis，负载均衡等）

- 各类业务情况：TOP5的准确率
- PFI：代理前后特征一致性
  - 衡量指标为某种距离、相似度
  - 五元组（协议、两方IP、端口号）
  - 统计信息（长度、时间窗口）

In [5]:
# Load the SNI table for `data_date` from the evaluation folder.
df_sni = pd.read_csv(
    os.path.join(
        tshark_directory,
        evaluation_directory,
        'tt_stream_distribution',
        'sni',
        data_date + '.csv',
    )
)


In [7]:
# df_sni = df_sni.drop(df_sni.columns[0], axis=1) 
# df_sni.to_csv(os.path.join(tshark_directory, evaluation_directory, 'tt_stream_distribution', 'sni', data_date + '.csv'), index=False) 
# Keep the two columns used below; the 'FilteredSNI' column is exposed as 'SNI'.
new_sni_dict = {
    'Name': df_sni['Name'],
    'SNI': df_sni['FilteredSNI'],
}

In [None]:
# TODO: unfinished cell. The statement below was cut off mid-expression and is
# a SyntaxError that stops "Restart & Run All", so it is kept only as a comment
# until the loop is actually written.
# for conn_info in os.

In [11]:
test = new_sni_dict['SNI'][0]
print(test)
# The cell holds the text form of a Python set that was read back from a CSV
# file. ast.literal_eval only accepts literals, whereas eval() would execute
# any code embedded in the data file.
test_set = ast.literal_eval(test)

{'rr2---sn-i3belnls.googlevideo.com', 'rr3---sn-q4flrnee.googlevideo.com', 'rr1---sn-i3belnll.googlevideo.com', 'rr5---sn-i3belne6.googlevideo.com', 'youtube.com', 'www.youtube.com', 'i.ytimg.com', 'yt3.ggpht.com', 'accounts.youtube.com'}
