# Description
process et-bert data for fine-tuning
label.pcap -> train/validation/test.tsv

In [1]:
import os
import logging
import scapy.all as scapy
import random
import binascii
import csv
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

os.chdir('/root/data')

logging.basicConfig(       
    level=logging.INFO,            
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
    handlers=[
        logging.FileHandler('logs/et-bert_process_flow.log', mode='w'),  
        logging.StreamHandler()          
    ],
    force=True
)

logger = logging.getLogger()

In [2]:
def cut(obj, sec):
    result = [obj[i:i+sec] for i in range(0,len(obj),sec)]
    try:
        remanent_count = len(result[0])%4
    except Exception as e:
        remanent_count = 0
        print("cut datagram error!")
    if remanent_count == 0:
        pass
    else:
        result = [obj[i:i+sec+remanent_count] for i in range(0,len(obj),sec+remanent_count)]
    return result

def bigram_generation(packet_datagram, packet_len = 64, flag=True):
    result = ''
    generated_datagram = cut(packet_datagram,1)
    token_count = 0
    for sub_string_index in range(len(generated_datagram)):
        if sub_string_index != (len(generated_datagram) - 1):
            token_count += 1
            if token_count > packet_len:
                break
            else:
                merge_word_bigram = generated_datagram[sub_string_index] + generated_datagram[sub_string_index + 1]
        else:
            break
        result += merge_word_bigram
        result += ' '
    
    return result

In [3]:
dataset = 'TLS120'
dataset_path = 'TLS120/filtered/sessions'
output_path = f'/root/Traffic/code/ET-BERT/datasets/{dataset}'
os.makedirs(output_path, exist_ok=True)

In [4]:
# just for packet-level
# type: train, test, val
# file: pcap file
# this is for generating dataset for ET-BERT with pcap and parquet files
# dataset_path/{train_val/test}/class_name/flow.pcap
# dataset_path/{train_val}/{train/val}/class_name/flow.pcap

def clean_packet(packet):
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    if packet.haslayer(scapy.IP):
        packet = packet[scapy.IP].payload
    elif packet.haslayer('IPv6'):
        packet = packet['IPv6'].payload

    if packet.haslayer(scapy.UDP):
        packet[scapy.UDP].sport = 0  
        packet[scapy.UDP].dport = 0  
    elif packet.haslayer(scapy.TCP):
        packet[scapy.TCP].sport = 0  
        packet[scapy.TCP].dport = 0  
    
    return packet

def get_feature_packet(packet, payload_length):
    packet_data_string = ''
    packet_data = packet.copy()
    packet_string = (binascii.hexlify(bytes(packet_data))).decode()[8:]  # remove eth header, ip header and port
    packet_data_string += bigram_generation(packet_string, packet_len=payload_length, flag=True)
    return packet_data_string

def save_to_tsv(dataset_file, output_path, type):
    with open(f"{output_path}/{type}.tsv", 'w', newline='') as f:
        tsv_w = csv.writer(f, delimiter='\t')
        tsv_w.writerows(dataset_file)

def process_file(path, class_name, payload_length):
    dataset_file = [["label", "text_a"]]
    dataset_numpy = []
    dataset_label = []

    pkts = scapy.PcapReader(f"{dataset_path}/{path}.pcap")

    feature_packet = ''
    for id, pkt in enumerate(pkts):
        if id < 5:
            pkt = clean_packet(pkt)
            feature_packet += get_feature_packet(pkt, payload_length)
        else:
            break
    dataset_file.append([int(class_name), feature_packet])
    dataset_numpy.append(feature_packet)
    dataset_label.append(class_name)
    # logger.info(f"Finish processing, the length of dataset is {len(dataset_numpy)}")
    return dataset_file, dataset_numpy, dataset_label

def generate_dataset(dataset_path, output_path, payload_length):
    for type in os.listdir(f'{dataset_path}'):
        logger.info(f"Start processing {dataset_path}/{type}")

        if type == 'test':
            dataset_file_list, dataset_numpy_list, dataset_label_list = [], [], []
            class_id = 0
            for class_name in os.listdir(f'{dataset_path}/{type}'):
                for flow_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}/{class_name}')):
                    logger.info(f"Start processing {type}/{class_name}/{file_name}")

                    dataset_file, dataset_numpy, dataset_label = process_file(f"{type}/{class_name}/{file_name[:-5]}", class_id, payload_length)

                    if class_id == 0 and flow_id == 0:
                        dataset_file_list.extend(dataset_file)
                    else:
                        dataset_file_list.extend(dataset_file[1:])

                    dataset_numpy_list.extend(dataset_numpy)
                    dataset_label_list.extend(dataset_label)

                class_id += 1
            print(class_id)

            save_to_tsv(dataset_file_list, output_path, type)
            np.save(f"{output_path}/x_payload_{type}.npy", dataset_numpy_list)
            np.save(f"{output_path}/y_label_{type}.npy", dataset_label_list)
        else:
            for folder in os.listdir(f'{dataset_path}/{type}'):
                logger.info(f"Start processing {dataset_path}/{type}/{folder}")
                
                dataset_file_list, dataset_numpy_list, dataset_label_list = [], [], []
                class_id = 0
                for class_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                    for flow_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}/{folder}/{class_name}')):
                        logger.info(f"Start processing {dataset_path}/{type}/{folder}/{class_name}/{file_name}")

                        dataset_file, dataset_numpy, dataset_label = process_file(f"{type}/{folder}/{class_name}/{file_name[:-5]}", class_id, payload_length)

                        if class_id == 0 and flow_id == 0:
                            dataset_file_list.extend(dataset_file)
                        else:
                            dataset_file_list.extend(dataset_file[1:])

                        dataset_numpy_list.extend(dataset_numpy)
                        dataset_label_list.extend(dataset_label)

                    class_id += 1
                
                print(class_id)
                save_to_tsv(dataset_file_list, f"{output_path}/{type}", folder)
                np.save(f"{output_path}/{type}/x_payload_{folder}.npy", dataset_numpy_list)
                np.save(f"{output_path}/{type}/y_label_{folder}.npy", dataset_label_list)
    

In [7]:

# dataset/type(train, test, validation)/.pcap
generate_dataset(dataset_path, output_path, payload_length = 128)

logger.info(f'Finish')
# main(dataset_path='your_dataset_path', type='your_type', output_path='your_output_path', payload_length=100)

2025-12-28 14:46:48,152 - root - INFO - Start processing TLS120/filtered/sessions/test
2025-12-28 14:46:48,153 - root - INFO - Start processing test/sina.com.cn/194.pcap
2025-12-28 14:46:48,157 - root - INFO - Start processing test/sina.com.cn/312.pcap
2025-12-28 14:46:48,160 - root - INFO - Start processing test/sina.com.cn/386.pcap
2025-12-28 14:46:48,165 - root - INFO - Start processing test/sina.com.cn/477.pcap
2025-12-28 14:46:48,169 - root - INFO - Start processing test/sina.com.cn/320.pcap
2025-12-28 14:46:48,173 - root - INFO - Start processing test/sina.com.cn/306.pcap
2025-12-28 14:46:48,176 - root - INFO - Start processing test/sina.com.cn/171.pcap
2025-12-28 14:46:48,179 - root - INFO - Start processing test/sina.com.cn/274.pcap
2025-12-28 14:46:48,184 - root - INFO - Start processing test/sina.com.cn/433.pcap
2025-12-28 14:46:48,188 - root - INFO - Start processing test/sina.com.cn/145.pcap
2025-12-28 14:46:48,191 - root - INFO - Start processing test/sina.com.cn/307.pcap


120


2025-12-28 14:47:28,268 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1
2025-12-28 14:47:28,269 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val
2025-12-28 14:47:28,272 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/425.pcap
2025-12-28 14:47:28,276 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/105.pcap
2025-12-28 14:47:28,279 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/10.pcap
2025-12-28 14:47:28,284 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/329.pcap
2025-12-28 14:47:28,287 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/228.pcap
2025-12-28 14:47:28,291 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/val/sina.com.cn/120.pcap
2025-12-28 14:47:28,295 - root - INFO - Start p

120


2025-12-28 14:47:40,138 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/48.pcap
2025-12-28 14:47:40,145 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/63.pcap
2025-12-28 14:47:40,149 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/179.pcap
2025-12-28 14:47:40,153 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/301.pcap
2025-12-28 14:47:40,157 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/462.pcap
2025-12-28 14:47:40,161 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/391.pcap
2025-12-28 14:47:40,165 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.com.cn/466.pcap
2025-12-28 14:47:40,169 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_1/train/sina.co

120


2025-12-28 14:48:12,684 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0
2025-12-28 14:48:12,685 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val
2025-12-28 14:48:12,687 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/232.pcap
2025-12-28 14:48:12,748 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/321.pcap
2025-12-28 14:48:12,753 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/338.pcap
2025-12-28 14:48:12,758 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/70.pcap
2025-12-28 14:48:12,762 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/63.pcap
2025-12-28 14:48:12,766 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/val/sina.com.cn/301.pcap
2025-12-28 14:48:12,771 - root - INFO - Start pr

120


2025-12-28 14:48:29,211 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/105.pcap
2025-12-28 14:48:29,244 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/10.pcap
2025-12-28 14:48:29,249 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/271.pcap
2025-12-28 14:48:29,309 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/329.pcap
2025-12-28 14:48:29,314 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/415.pcap
2025-12-28 14:48:29,319 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/228.pcap
2025-12-28 14:48:29,323 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.com.cn/120.pcap
2025-12-28 14:48:29,326 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_0/train/sina.c

120


2025-12-28 14:49:06,612 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2
2025-12-28 14:49:06,613 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val
2025-12-28 14:49:06,615 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/271.pcap
2025-12-28 14:49:06,712 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/415.pcap
2025-12-28 14:49:06,780 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/346.pcap
2025-12-28 14:49:06,785 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/303.pcap
2025-12-28 14:49:06,841 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/48.pcap
2025-12-28 14:49:06,844 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/val/sina.com.cn/179.pcap
2025-12-28 14:49:06,848 - root - INFO - Start p

120


2025-12-28 14:49:22,436 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/105.pcap
2025-12-28 14:49:22,440 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/10.pcap
2025-12-28 14:49:22,445 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/329.pcap
2025-12-28 14:49:22,448 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/232.pcap
2025-12-28 14:49:22,452 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/228.pcap
2025-12-28 14:49:22,457 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/120.pcap
2025-12-28 14:49:22,460 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.com.cn/184.pcap
2025-12-28 14:49:22,464 - root - INFO - Start processing TLS120/filtered/sessions/train_val_split_2/train/sina.c

120


2025-12-28 14:49:53,666 - root - INFO - Finish


In [None]:
def generate_kfold_dataset(root_path, output_root, k=3, payload_length=128):
    global dataset_path # Ensure we are updating the global variable used by process_file
    
    for i in range(k):
        split_name = f"train_val_split_{i}"
        dataset_path = os.path.join(root_path, split_name)
        split_output_path = os.path.join(output_root, f"split_{i}")
        os.makedirs(split_output_path, exist_ok=True)
        
        logger.info(f"Processing split {i}: {dataset_path}")
        
        if not os.path.exists(dataset_path):
            logger.warning(f"Split path does not exist: {dataset_path}")
            continue

        # Iterate over train, val (and test if it exists inside)
        # Assuming structure: split/train/class/file and split/val/class/file
        for type_name in os.listdir(dataset_path):
            if not os.path.isdir(os.path.join(dataset_path, type_name)):
                continue
            
            # Skip if it's not a directory we expect (e.g. .DS_Store)
            if type_name not in ['train', 'val', 'test']: 
                continue

            logger.info(f"Start processing {type_name} in split {i}")
            
            dataset_file_list, dataset_numpy_list, dataset_label_list = [], [], []
            class_id = 0
            
            # Sort classes to ensure consistent ID mapping
            classes = sorted(os.listdir(os.path.join(dataset_path, type_name)))
            
            for class_name in classes:
                class_dir = os.path.join(dataset_path, type_name, class_name)
                if not os.path.isdir(class_dir):
                    continue
                    
                files = sorted(os.listdir(class_dir))
                for flow_id, file_name in enumerate(files):
                    if not file_name.endswith('.pcap'):
                        continue
                        
                    # logger.info(f"Start processing {type_name}/{class_name}/{file_name}")
                    
                    # process_file expects path relative to dataset_path, without extension
                    # path = type/class/filename_no_ext
                    file_path_rel = f"{type_name}/{class_name}/{file_name[:-5]}"
                    
                    try:
                        dataset_file, dataset_numpy, dataset_label = process_file(file_path_rel, class_id, payload_length)

                        if class_id == 0 and flow_id == 0:
                            dataset_file_list.extend(dataset_file)
                        else:
                            dataset_file_list.extend(dataset_file[1:])

                        dataset_numpy_list.extend(dataset_numpy)
                        dataset_label_list.extend(dataset_label)
                    except Exception as e:
                        logger.error(f"Error processing {file_path_rel}: {e}")

                class_id += 1
            
            print(f"Split {i} - {type_name}: {class_id} classes processed")

            save_to_tsv(dataset_file_list, split_output_path, type_name)
            np.save(f"{split_output_path}/x_payload_{type_name}.npy", dataset_numpy_list)
            np.save(f"{split_output_path}/y_label_{type_name}.npy", dataset_label_list)

# Configure paths
root_path = '/root/data/TLS120/filtered/flow-level-classification/tls' # Or vpn-app
output_root = '/root/Traffic/code/ET-BERT/datasets/TLS120/'

# Run generation
# generate_kfold_dataset(root_path, output_root, k=3)