# Description
Process ET-BERT data for fine-tuning.
label.pcap (tsv format) -> train/validation/test.tsv (note: the cells shown below write `.parquet` files rather than `.tsv`).

In [1]:
import os
import logging
import scapy.all as scapy
import random
import binascii
import csv
import json
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

In [2]:
# Root folder of the raw flow-level vpn-app captures.
dataset = '/root/Traffic/flow-level-classification/vpn-app'
dataset_path = dataset

# Destination folders for the generated classification datasets; the
# "without_IP" variant receives the output of the conversion loop further down.
output_path = '/root/Traffic/code/PCAP_encoder/1.Datasets/Classification/vpn-app'
output_withoutIP_path = '/root/Traffic/code/PCAP_encoder/1.Datasets/Classification/without_IP/vpn-app'

# Create both destinations up front (no error if they already exist).
for _output_dir in (output_path, output_withoutIP_path):
    os.makedirs(_output_dir, exist_ok=True)

In [3]:
def clean_packet(packet):
    """Return *packet* with its addressing information blanked out.

    Steps, in order:
      1. If an Ethernet layer is present, keep only its payload.
      2. Overwrite the source/destination addresses of the IPv4 layer with
         "0.0.0.0", or, when there is no IPv4 layer, those of the IPv6 layer
         with "::".
      3. Set the source/destination ports of the UDP layer to 0, or, when
         there is no UDP layer, those of the TCP layer.

    The layers are modified in place; the (possibly unwrapped) packet is
    returned.
    """
    # Drop the link-layer header so only the network layer and above remain.
    if packet.haslayer(scapy.Ether):
        packet = packet[scapy.Ether].payload

    # Only the first matching network layer is rewritten (IPv4 takes priority).
    for network_layer, blank_address in ((scapy.IP, "0.0.0.0"), ('IPv6', "::")):
        if packet.haslayer(network_layer):
            header = packet[network_layer]
            header.src = blank_address
            header.dst = blank_address
            break

    # Only the first matching transport layer is rewritten (UDP takes priority).
    for transport_layer in (scapy.UDP, scapy.TCP):
        if packet.haslayer(transport_layer):
            segment = packet[transport_layer]
            segment.sport = 0
            segment.dport = 0
            break

    return packet

In [5]:


def group_string_by_n(pkt, n=4):
    """Hex-encode ``bytes(pkt)`` and split it into space-separated groups.

    Each group holds *n* hex characters (the last one may be shorter); an
    empty payload yields an empty string.
    """
    hex_text = bytes(pkt).hex()
    groups = []
    for start in range(0, len(hex_text), n):
        groups.append(hex_text[start:start + n])
    return ' '.join(groups)
# Application label -> integer class id. Ids are assigned by position in the
# tuple below, so the order of the names must not change.
class_indexs = {
    app_name: class_id
    for class_id, app_name in enumerate((
        'aim', 'facebook', 'gmail', 'icq',
        'scp', 'skype', 'torrent', 'voipbuster',
        'email', 'ftp', 'hangout', 'netflix',
        'sftp', 'spotify', 'vimeo', 'youtube',
    ))
}
# Column names of the generated table and the fixed prompt stored in the
# `question` column of every row.
DATASET_COLUMNS = ('question', 'class', 'type_q', 'context')
PACKET_QUESTION = 'What is the representation of this packet?'

# Restrict the run to specific split folders and, inside a non-test split, to
# specific subsets. NOTE: with the current selection the 'test' handling below
# is never reached; add 'test' to SELECTED_SPLITS (or e.g. another subset name
# to SELECTED_SUBSETS) to convert more data.
SELECTED_SPLITS = ('train_val_split_0',)
SELECTED_SUBSETS = ('train',)


def _iter_packet_rows(labels_root):
    """Yield one dataset row per packet found under ``labels_root/<label>/*.pcap``.

    Each row is ``[question, class id, label, hex context]`` where the context
    is produced by ``group_string_by_n(clean_packet(pkt))``.

    Entries of *labels_root* that are not directories are skipped, and
    directory listings are sorted so the row order is reproducible between
    runs (``os.listdir`` returns entries in arbitrary order).

    Raises:
        KeyError: when packets are found under a label folder whose name is
            not a key of ``class_indexs``.
    """
    for label in sorted(os.listdir(labels_root)):
        label_dir = os.path.join(labels_root, label)
        if not os.path.isdir(label_dir):
            # Stray files next to the label folders would make listdir fail.
            continue
        for file in sorted(os.listdir(label_dir)):
            if not file.endswith('.pcap'):
                continue
            print(f"Processing file: {file}")
            with scapy.PcapReader(os.path.join(label_dir, file)) as pkt_reader:
                for pkt in pkt_reader:
                    context = group_string_by_n(clean_packet(pkt))
                    yield [PACKET_QUESTION, class_indexs[label], label, context]


for split_folder in os.listdir(dataset_path):
    print(f"Processing file: {split_folder}")
    if split_folder not in SELECTED_SPLITS:
        continue

    split_dir = os.path.join(dataset_path, split_folder)
    # Each export is (folder holding the label directories, output folder,
    # parquet file stem).
    if split_folder == 'test':
        # Flat layout: test/<label>/*.pcap -> <output>/test.parquet
        exports = [(split_dir, output_withoutIP_path, split_folder)]
    else:
        # Nested layout: <split>/<subset>/<label>/*.pcap
        #   -> <output>/<split>/<subset>.parquet
        exports = [
            (
                os.path.join(split_dir, subset),
                os.path.join(output_withoutIP_path, split_folder),
                subset,
            )
            for subset in os.listdir(split_dir)
            if subset in SELECTED_SUBSETS
        ]

    for labels_root, output_dir, file_stem in exports:
        # Header row first, then one row per packet (same layout as before so
        # later cells can keep using `dataset_file` / `output_dataframe`).
        dataset_file = [list(DATASET_COLUMNS), *_iter_packet_rows(labels_root)]

        os.makedirs(output_dir, exist_ok=True)
        output_dataframe = pd.DataFrame(dataset_file[1:], columns=dataset_file[0])
        parquet_path = os.path.join(output_dir, f"{file_stem}.parquet")
        output_dataframe.to_parquet(parquet_path, index=False)
        print(f"Saved {parquet_path}")

Processing file: __MACOSX
Processing file: MRF
Processing file: train_val_split_0
Processing file: 249.pcap
Processing file: 00002.pcap
Processing file: 26.pcap
Processing file: 00017.pcap
Processing file: 311.pcap
Processing file: 310.pcap
Processing file: 00014.pcap
Processing file: 00005.pcap
Processing file: 210.pcap
Processing file: 303.pcap
Processing file: 00025.pcap
Processing file: 00032.pcap
Processing file: 286.pcap
Processing file: 00031.pcap
Processing file: 216.pcap
Processing file: 00030.pcap
Processing file: 313.pcap
Processing file: 00026.pcap
Processing file: 299.pcap
Processing file: 00006.pcap
Processing file: 00015.pcap
Processing file: 00007.pcap
Processing file: 00010.pcap
Processing file: 284.pcap
Processing file: 100.pcap
Processing file: 253.pcap
Processing file: 00001.pcap
Processing file: 00020.pcap
Processing file: 197.pcap
Processing file: 00023.pcap
Processing file: 00008.pcap
Processing file: 212.pcap
Processing file: 190.pcap
Processing file: 00024.pcap