# Split dataset

This notebook splits the data into three datasets (train, val, test).

The splits are flow-disjoint (no flow appears in more than one split), and the test set keeps the same class distribution as the original dataset.

The train and validation datasets are balanced, which means every class has a similar number of flows.

In [5]:
import json
import logging
import os
import random
import shutil
from collections import defaultdict
from multiprocessing import Pool, cpu_count

import scapy.all as scapy

# Every relative path used in this notebook (dataset, logs, statistics, outputs)
# is resolved against this working directory.
os.chdir('/root/data')

# logging.FileHandler does not create missing parent directories, so make sure
# the log folder exists before the handler opens its file.
os.makedirs('logs', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # mode='w' truncates the log file each time this cell runs.
        logging.FileHandler('logs/split_based_flow.log', mode='w'),
        logging.StreamHandler()
    ],
    # force=True replaces handlers already attached to the root logger, so
    # re-running the cell does not duplicate every log line.
    force=True
)

# Root logger, shared by all cells below.
logger = logging.getLogger()

In [6]:
# Tunable parameters of the split.
# NOTE: with k = 1 there is a single fold, which is used as the validation
# set, so the train directory of that split receives no files.
k = 1  # number of folds the train-val pool is divided into
seed = 43  # random seed
threshold = 5  # minimum number of packets a flow must contain to be kept
test_size = 1 / (k + 1)  # fraction of each class's flows held out for the test set
random.seed(seed)

dataset = 'ISCX-VPN-2016'
dataset_path = 'ISCX-VPN-2016/filtered/raw'
output_path = 'ISCX-VPN-2016/filtered/flow'

if dataset == 'ISCX-VPN-2016':
    # Folder names are matched against these names by substring and the first
    # hit wins, so a name that contains another one ('sftp' vs 'ftp') must be
    # listed before it.
    datasets_class_name = ['aim', 'email', 'facebook', 'sftp', 'gmail', 'hangout', 'icq', 'netflix', 'scp', 'ftp', 'skype', 'spotify', 'vimeo', 'torrent', 'voipbuster', 'youtube']
elif dataset in ('tls', 'vpn-app'):
    # These datasets use the folder name itself as the class name, so no
    # lookup list is needed.
    datasets_class_name = []
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(f"Unsupported dataset: {dataset!r}")

In [7]:
# Dataset-wide bookkeeping, reset every time this cell runs.
num_of_original_flow = num_of_filtered_flow = 0  # flows seen / flows that reached the threshold
num_of_flow_per_class = defaultdict(int)  # class name -> number of flows that reached the threshold
flow_file_of_class = defaultdict(list)  # class name -> paths of the flows that reached the threshold

def process_flow(args):
    """Decide whether one flow capture holds enough packets to be kept.

    Args:
        args: ``(flow, folder_path, class_name)`` tuple: the capture file
            name, the directory that contains it, and the class label
            assigned to that directory.

    Returns:
        ``(class_name, path)`` as soon as ``threshold`` packets have been
        read from the capture, otherwise ``None``.
    """
    flow, folder_path, class_name = args
    pcap_path = f"{folder_path}/{flow}"
    with scapy.PcapReader(pcap_path) as reader:
        # Stop at the threshold-th packet instead of reading the whole capture.
        for seen, _ in enumerate(reader, start=1):
            if seen >= threshold:
                return class_name, pcap_path
    return None
        
# Walk every capture folder, assign it a class, and keep the flows that reach
# the packet threshold. A single worker pool serves all folders instead of
# being re-created for each one.
with Pool(cpu_count()) as pool:
    # os.listdir returns entries in arbitrary order; sorting keeps the flow
    # lists, and therefore the seeded split built from them, reproducible.
    for folder in sorted(os.listdir(dataset_path)):  # e.g. the AIM_chat_1 directory
        folder_path = f"{dataset_path}/{folder}"
        if not os.path.isdir(folder_path):
            # Stray files next to the capture folders would make listdir fail.
            logger.warning(f"Skipping non-directory entry: {folder}")
            continue

        logger.info(f"Processing folder: {folder}")
        if dataset == 'tls' or dataset == 'vpn-app':
            class_name = folder
        else:
            # Case-insensitive substring match; the first name in
            # datasets_class_name that occurs in the folder name wins.
            folder_lower = folder.lower()
            class_name = next((name for name in datasets_class_name if name.lower() in folder_lower), None)
            if class_name is None:
                logger.warning(f"No matching class for folder: {folder}")
                continue
        logger.info(f"Class name: {class_name}")

        flows = sorted(os.listdir(folder_path))  # all flow files of this folder
        num_of_original_flow += len(flows)

        flow_results = pool.map(process_flow, [(flow, folder_path, class_name) for flow in flows])

        # process_flow returns None for flows below the packet threshold.
        for flow_result in flow_results:
            if flow_result is not None:
                kept_class, flow_path = flow_result
                num_of_filtered_flow += 1
                num_of_flow_per_class[kept_class] += 1
                flow_file_of_class[kept_class].append(flow_path)

results = {
    'num_of_original_flow': num_of_original_flow,
    'num_of_filtered_flow': num_of_filtered_flow,
    'num_of_flow_per_class': dict(num_of_flow_per_class),
    'flow_file_of_class': dict(flow_file_of_class)
}

# open(..., 'w') does not create missing directories.
os.makedirs('statistics', exist_ok=True)
statistics_file = f'statistics/{dataset}_{threshold}_flow.json'
with open(statistics_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

logger.info(f'Results saved to {statistics_file}')
logger.info('Data processing completed successfully.')


2025-12-09 12:24:18,226 - root - INFO - Processing folder: netflix
2025-12-09 12:24:18,226 - root - INFO - Class name: netflix
2025-12-09 12:24:18,284 - root - INFO - Processing folder: spotify
2025-12-09 12:24:18,285 - root - INFO - Class name: spotify
2025-12-09 12:24:18,341 - root - INFO - Processing folder: skype
2025-12-09 12:24:18,342 - root - INFO - Class name: skype
2025-12-09 12:24:18,401 - root - INFO - Processing folder: ftp
2025-12-09 12:24:18,402 - root - INFO - Class name: ftp
2025-12-09 12:24:18,455 - root - INFO - Processing folder: voipbuster
2025-12-09 12:24:18,456 - root - INFO - Class name: voipbuster
2025-12-09 12:24:18,508 - root - INFO - Processing folder: sftp
2025-12-09 12:24:18,509 - root - INFO - Class name: sftp
2025-12-09 12:24:18,564 - root - INFO - Processing folder: vimeo
2025-12-09 12:24:18,565 - root - INFO - Class name: vimeo
2025-12-09 12:24:18,615 - root - INFO - Processing folder: torrent
2025-12-09 12:24:18,616 - root - INFO - Class name: torrent


In [4]:
# Per class: how many flows go to the test set and how many stay in the
# train-val pool.
test_num_of_flow_per_class = {}
train_val_num_of_flow_per_class = {}
for cls, total in num_of_flow_per_class.items():
    held_out = int(test_size * total)  # fractional part is truncated
    test_num_of_flow_per_class[cls] = held_out
    train_val_num_of_flow_per_class[cls] = total - held_out

# Size of the smallest train-val pool; used below as the per-class cap.
if dataset == 'tls':  # because the smallest class is too small, we use the second smallest class
    sorted_values = sorted(train_val_num_of_flow_per_class.values())
    min_train_num_of_flow_per_class = sorted_values[1]
else:
    min_train_num_of_flow_per_class = min(train_val_num_of_flow_per_class.values())

# Cap every class at that size so the train-val pool is balanced across classes.
train_val_num_of_flow_per_class = {
    cls: min(min_train_num_of_flow_per_class, remaining)
    for cls, remaining in train_val_num_of_flow_per_class.items()
}

print(test_num_of_flow_per_class)
print(train_val_num_of_flow_per_class)

{'netflix': 1, 'spotify': 1, 'skype': 6, 'ftp': 1, 'voipbuster': 1, 'sftp': 2, 'vimeo': 1, 'torrent': 0, 'youtube': 2, 'hangout': 3, 'icq': 1, 'scp': 3, 'aim': 1, 'email': 1, 'facebook': 4, 'gmail': 0}
{'netflix': 1, 'spotify': 1, 'skype': 1, 'ftp': 1, 'voipbuster': 1, 'sftp': 1, 'vimeo': 1, 'torrent': 1, 'youtube': 1, 'hangout': 1, 'icq': 1, 'scp': 1, 'aim': 1, 'email': 1, 'facebook': 1, 'gmail': 1}


In [None]:
# get the test and train-val set
# Re-seed here so this cell produces the same split whenever it is re-run on
# its own, not only right after the configuration cell.
random.seed(seed)

results = {}
train_val_flow_file_of_class = defaultdict(list)
test_flow_file_of_class = defaultdict(list)

for class_name, flow_files in flow_file_of_class.items():
    test_flow_file_of_class[class_name] = random.sample(flow_files, test_num_of_flow_per_class[class_name])
    # Remove the test flows with an order-preserving filter. Building the
    # remainder via set difference would yield a hash-dependent order that
    # changes between interpreter runs, which would make the seeded shuffle
    # below irreproducible.
    held_out = set(test_flow_file_of_class[class_name])
    train_val_flow_file_of_class[class_name] = [
        flow_file for flow_file in flow_files if flow_file not in held_out
    ]

k_folds = defaultdict(list)
for class_name, flow_files in train_val_flow_file_of_class.items():
    random.shuffle(flow_files)
    # train_val_num_of_flow_per_class is the balanced per-class budget; flows
    # beyond k * fold_size are left out of every fold.
    fold_size = train_val_num_of_flow_per_class[class_name] // k
    k_folds[class_name] = [
        flow_files[fold * fold_size:(fold + 1) * fold_size] for fold in range(k)
    ]

for i in range(k):
    results[f'k_{i}'] = {class_name: k_folds[class_name][i] for class_name in k_folds}
results['test'] = test_flow_file_of_class

# open(..., 'w') does not create missing directories.
os.makedirs('outputs', exist_ok=True)
with open(f'outputs/{dataset}.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

In [16]:
# create the output directory of the test dataset
os.makedirs(f"{output_path}/test", exist_ok=True)

for class_name, flow_files in test_flow_file_of_class.items():
    logger.info(f"Copying test files of class {class_name}")
    class_dir = f"{output_path}/test/{class_name}"
    os.makedirs(class_dir, exist_ok=True)
    for flow_file in flow_files:
        # shutil.copy instead of a shell `cp`: paths containing spaces or shell
        # metacharacters are handled correctly, and a failed copy is reported
        # instead of being silently ignored.
        try:
            shutil.copy(flow_file, class_dir)
        except OSError as exc:
            logger.error(f"Failed to copy {flow_file} to {class_dir}: {exc}")

2025-12-08 16:58:56,986 - root - INFO - Copying test files of class netflix
2025-12-08 16:58:58,736 - root - INFO - Copying test files of class spotify
2025-12-08 16:58:58,758 - root - INFO - Copying test files of class skype
2025-12-08 16:58:59,584 - root - INFO - Copying test files of class ftp
2025-12-08 17:01:08,621 - root - INFO - Copying test files of class voipbuster
2025-12-08 17:01:09,089 - root - INFO - Copying test files of class sftp
2025-12-08 17:01:22,559 - root - INFO - Copying test files of class vimeo
2025-12-08 17:01:23,212 - root - INFO - Copying test files of class torrent
2025-12-08 17:01:23,213 - root - INFO - Copying test files of class youtube
2025-12-08 17:01:23,873 - root - INFO - Copying test files of class hangout
2025-12-08 17:01:48,284 - root - INFO - Copying test files of class icq
2025-12-08 17:01:48,384 - root - INFO - Copying test files of class scp
2025-12-08 17:02:47,143 - root - INFO - Copying test files of class aim
2025-12-08 17:02:47,287 - root -

In [40]:
# create the output directory of the train-val dataset
def _copy_flows(flow_files, destination):
    """Copy every capture in ``flow_files`` into ``destination`` (created if missing).

    A failed copy is logged and skipped so the remaining files are still copied.
    """
    os.makedirs(destination, exist_ok=True)
    for flow_file in flow_files:
        # shutil.copy instead of a shell `cp`: safe for paths with spaces or
        # shell metacharacters, and failures are no longer silently ignored.
        try:
            shutil.copy(flow_file, destination)
        except OSError as exc:
            logger.error(f"Failed to copy {flow_file} to {destination}: {exc}")


for i in range(k):
    split_dir = f"{output_path}/train_val_split_{i}"
    os.makedirs(f"{split_dir}/val", exist_ok=True)
    os.makedirs(f"{split_dir}/train", exist_ok=True)

    # Fold i is the validation set of split i.
    for class_name, flow_files in k_folds.items():
        logger.info(f"Copying val files {i} of class {class_name} in split {i}")
        _copy_flows(flow_files[i], f"{split_dir}/val/{class_name}")

    # All remaining folds together form the training set of split i.
    other_index = [j for j in range(k) if j != i]
    if not other_index:
        logger.warning(f"Split {i} has no training folds (k={k}); its train directory stays empty")
    for j in other_index:
        for class_name, flow_files in k_folds.items():
            logger.info(f"Copying train files {j} of class {class_name} in split {i}")
            _copy_flows(flow_files[j], f"{split_dir}/train/{class_name}")
