# Description
process et-bert data for fine-tuning
label.pcap -> train/validation/test.tsv

In [1]:
import os
import logging
import scapy.all as scapy
import random
import binascii
import csv
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

os.chdir('/root/data')

logging.basicConfig(       
    level=logging.INFO,            
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  
    handlers=[
        logging.FileHandler('logs/et-bert_process_flow.log', mode='w'),  
        logging.StreamHandler()          
    ],
    force=True
)

logger = logging.getLogger()

In [2]:
def cut(obj, sec):
    result = [obj[i:i+sec] for i in range(0,len(obj),sec)]
    try:
        remanent_count = len(result[0])%4
    except Exception as e:
        remanent_count = 0
        print("cut datagram error!")
    if remanent_count == 0:
        pass
    else:
        result = [obj[i:i+sec+remanent_count] for i in range(0,len(obj),sec+remanent_count)]
    return result

def bigram_generation(packet_datagram, packet_len = 64, flag=True):
    result = ''
    generated_datagram = cut(packet_datagram,1)
    token_count = 0
    for sub_string_index in range(len(generated_datagram)):
        if sub_string_index != (len(generated_datagram) - 1):
            token_count += 1
            if token_count > packet_len:
                break
            else:
                merge_word_bigram = generated_datagram[sub_string_index] + generated_datagram[sub_string_index + 1]
        else:
            break
        result += merge_word_bigram
        result += ' '
    
    return result

In [3]:
dataset = 'ISCX-VPN-2016'
dataset_path = 'ISCX-VPN-2016/filtered/flow'
output_path = f'/root/Traffic/code/ET-BERT/datasets/{dataset}'
os.makedirs(output_path, exist_ok=True)

In [6]:
# just for packet-level
# type: train, test, val
# file: pcap file
# this is for generating dataset for ET-BERT with pcap and parquet files
# dataset_path/{train_val/test}/class_name/flow.pcap
# dataset_path/{train_val}/{train/val}/class_name/flow.pcap

def clean_packet(packet):
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    if packet.haslayer(scapy.IP):
        packet = packet[scapy.IP].payload
    elif packet.haslayer('IPv6'):
        packet = packet['IPv6'].payload

    if packet.haslayer(scapy.UDP):
        packet[scapy.UDP].sport = 0  
        packet[scapy.UDP].dport = 0  
    elif packet.haslayer(scapy.TCP):
        packet[scapy.TCP].sport = 0  
        packet[scapy.TCP].dport = 0  
    
    return packet

def get_feature_packet(packet, payload_length):
    packet_data_string = ''
    packet_data = packet.copy()
    packet_string = (binascii.hexlify(bytes(packet_data))).decode()[8:]  # remove eth header, ip header and port
    packet_data_string += bigram_generation(packet_string, packet_len=payload_length, flag=True)
    return packet_data_string

def save_to_tsv(dataset_file, output_path, type):
    with open(f"{output_path}/{type}.tsv", 'w', newline='') as f:
        tsv_w = csv.writer(f, delimiter='\t')
        tsv_w.writerows(dataset_file)

def process_file(path, class_name, payload_length):
    dataset_file = [["label", "text_a"]]
    dataset_numpy = []
    dataset_label = []

    pkts = scapy.PcapReader(f"{dataset_path}/{path}.pcap")

    feature_packet = ''
    for id, pkt in enumerate(pkts):
        if id < 5:
            pkt = clean_packet(pkt)
            feature_packet += get_feature_packet(pkt, payload_length)
        else:
            break
    dataset_file.append([int(class_name), feature_packet])
    dataset_numpy.append(feature_packet)
    dataset_label.append(class_name)
    # logger.info(f"Finish processing, the length of dataset is {len(dataset_numpy)}")
    return dataset_file, dataset_numpy, dataset_label

def generate_dataset(dataset_path, output_path, payload_length):
    for type in os.listdir(f'{dataset_path}'):
        logger.info(f"Start processing {dataset_path}/{type}")

        if type == 'test':
            dataset_file_list, dataset_numpy_list, dataset_label_list = [], [], []
            class_id = 0
            for class_name in os.listdir(f'{dataset_path}/{type}'):
                for flow_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}/{class_name}')):
                    logger.info(f"Start processing {type}/{class_name}/{file_name}")

                    dataset_file, dataset_numpy, dataset_label = process_file(f"{type}/{class_name}/{file_name[:-5]}", class_id, payload_length)

                    if class_id == 0 and flow_id == 0:
                        dataset_file_list.extend(dataset_file)
                    else:
                        dataset_file_list.extend(dataset_file[1:])

                    dataset_numpy_list.extend(dataset_numpy)
                    dataset_label_list.extend(dataset_label)

                class_id += 1
            print(class_id)

            save_to_tsv(dataset_file_list, output_path, type)
            np.save(f"{output_path}/x_payload_{type}.npy", dataset_numpy_list)
            np.save(f"{output_path}/y_label_{type}.npy", dataset_label_list)
        else:
            for folder in os.listdir(f'{dataset_path}/{type}'):
                logger.info(f"Start processing {dataset_path}/{type}/{folder}")
                
                dataset_file_list, dataset_numpy_list, dataset_label_list = [], [], []
                class_id = 0
                for class_name in os.listdir(f'{dataset_path}/{type}/{folder}'):
                    for flow_id, file_name in enumerate(os.listdir(f'{dataset_path}/{type}/{folder}/{class_name}')):
                        logger.info(f"Start processing {dataset_path}/{type}/{folder}/{class_name}/{file_name}")

                        dataset_file, dataset_numpy, dataset_label = process_file(f"{type}/{folder}/{class_name}/{file_name[:-5]}", class_id, payload_length)

                        if class_id == 0 and flow_id == 0:
                            dataset_file_list.extend(dataset_file)
                        else:
                            dataset_file_list.extend(dataset_file[1:])

                        dataset_numpy_list.extend(dataset_numpy)
                        dataset_label_list.extend(dataset_label)

                    class_id += 1
                
                print(class_id)
                save_to_tsv(dataset_file_list, f"{output_path}/{type}", folder)
                np.save(f"{output_path}/{type}/x_payload_{folder}.npy", dataset_numpy_list)
                np.save(f"{output_path}/{type}/y_label_{folder}.npy", dataset_label_list)
    

In [7]:

# dataset/type(train, test, validation)/.pcap
generate_dataset(dataset_path, output_path, payload_length = 128)

logger.info(f'Finish')
# main(dataset_path='your_dataset_path', type='your_type', output_path='your_output_path', payload_length=100)

2025-12-15 15:01:19,237 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/test
2025-12-15 15:01:19,238 - root - INFO - Start processing test/netflix/session_46.pcap
2025-12-15 15:01:19,242 - root - INFO - Start processing test/netflix/session_38.pcap
2025-12-15 15:01:19,245 - root - INFO - Start processing test/netflix/session_222.pcap
2025-12-15 15:01:19,249 - root - INFO - Start processing test/netflix/session_228.pcap
2025-12-15 15:01:19,252 - root - INFO - Start processing test/netflix/session_39.pcap
2025-12-15 15:01:19,256 - root - INFO - Start processing test/netflix/session_19.pcap
2025-12-15 15:01:19,260 - root - INFO - Start processing test/spotify/session_75.pcap
2025-12-15 15:01:19,264 - root - INFO - Start processing test/spotify/session_53.pcap
2025-12-15 15:01:19,267 - root - INFO - Start processing test/spotify/session_130.pcap
2025-12-15 15:01:19,269 - root - INFO - Start processing test/spotify/session_50.pcap
2025-12-15 15:01:19,272 - root - INFO - Start p

16


2025-12-15 15:01:19,953 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/email/session_676.pcap
2025-12-15 15:01:19,956 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/email/session_242.pcap
2025-12-15 15:01:19,959 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/email/session_243.pcap
2025-12-15 15:01:19,961 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/email/session_74.pcap
2025-12-15 15:01:19,963 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/email/session_41.pcap
2025-12-15 15:01:19,966 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/facebook/session_5461.pcap
2025-12-15 15:01:19,968 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/val/facebook/session_820.pcap
2025-12-15 15:01:19,970 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/

16


2025-12-15 15:01:20,208 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_240.pcap
2025-12-15 15:01:20,210 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_88.pcap
2025-12-15 15:01:20,211 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_53.pcap
2025-12-15 15:01:20,214 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_148.pcap
2025-12-15 15:01:20,217 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_39.pcap
2025-12-15 15:01:20,219 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_41.pcap
2025-12-15 15:01:20,221 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_1/train/vimeo/session_65.pcap
2025-12-15 15:01:20,224 - root - INFO - Start processing ISCX-VPN-2016/filtered/f

16


2025-12-15 15:01:20,653 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/facebook/session_2.pcap
2025-12-15 15:01:20,656 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/facebook/session_4726.pcap
2025-12-15 15:01:20,658 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/facebook/session_2878.pcap
2025-12-15 15:01:20,661 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/facebook/session_5020.pcap
2025-12-15 15:01:20,663 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/gmail/session_185.pcap
2025-12-15 15:01:20,666 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/gmail/session_16.pcap
2025-12-15 15:01:20,668 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/val/gmail/session_184.pcap
2025-12-15 15:01:20,671 - root - INFO - Start processing ISCX-VPN-2016/filtere

16


2025-12-15 15:01:20,883 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_39.pcap
2025-12-15 15:01:20,885 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_1.pcap
2025-12-15 15:01:20,887 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_45.pcap
2025-12-15 15:01:20,888 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_40.pcap
2025-12-15 15:01:20,890 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_241.pcap
2025-12-15 15:01:20,892 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_41.pcap
2025-12-15 15:01:20,894 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_0/train/vimeo/session_65.pcap
2025-12-15 15:01:20,897 - root - INFO - Start processing ISCX-VPN-2016/filtered/flo

16


2025-12-15 15:01:21,407 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_828.pcap
2025-12-15 15:01:21,410 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_31.pcap
2025-12-15 15:01:21,412 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_4725.pcap
2025-12-15 15:01:21,414 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_5314.pcap
2025-12-15 15:01:21,416 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_822.pcap
2025-12-15 15:01:21,419 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/facebook/session_2871.pcap
2025-12-15 15:01:21,422 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/val/gmail/session_18.pcap
2025-12-15 15:01:21,424 - root - INFO - Start processing ISCX-VPN-2016/

16


2025-12-15 15:01:21,644 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/voipbuster/session_59.pcap
2025-12-15 15:01:21,646 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/voipbuster/session_92.pcap
2025-12-15 15:01:21,649 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/voipbuster/session_47.pcap
2025-12-15 15:01:21,652 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/vimeo/session_96.pcap
2025-12-15 15:01:21,655 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/vimeo/session_4.pcap
2025-12-15 15:01:21,658 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/vimeo/session_82.pcap
2025-12-15 15:01:21,660 - root - INFO - Start processing ISCX-VPN-2016/filtered/flow/train_val_split_2/train/vimeo/session_30.pcap
2025-12-15 15:01:21,663 - root - INFO - Start processing ISCX-VPN-201

16
