# For filter 

* Uses the ISCX-VPN-2016 dataset as a demo
* Collects per-protocol packet statistics for each pcap
* Filters out packets belonging to unrelated protocols
* VPN dataset layout - {VPN/NoVPN}/{pcap}
* TLS dataset layout - {Benign/Malware}/{pcap}

In [1]:
import os
import logging
import scapy.all as scapy
import scapy.contrib.igmp as igmp
import scapy.contrib.igmpv3 as igmpv3
from collections import defaultdict

# All relative paths used below (dataset folders, 'logs/', './results/') are
# resolved against this directory, so the chdir must happen first.
os.chdir('/root/data')

import logging

# Send INFO-and-above records to both a log file and the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # mode='w' truncates the log file each time this cell runs.
        logging.FileHandler('logs/filter_vpn.log', mode='w'),
        logging.StreamHandler()
    ],
    # force=True replaces any handlers already attached to the root logger,
    # so re-running the cell does not duplicate output.
    force=True
)

# Root logger; records show up with the name 'root' in the log format above.
logger = logging.getLogger()

In [2]:
# Input root: one sub-folder per application, each holding pcap files
# (read as f'{data_path}/{folder}/{file}' by the cells below).
data_path = 'ISCX-VPN-2016/pcap'
# Output root: filtered captures are written to f'{output_path}/Raw/{file}'.
output_path = 'ISCX-VPN-2016/filtered'

In [3]:
import pandas as pd
import os

def save_to_csv(data, filename, csv_path):
    """Append one row of per-file values to a CSV file, creating it if needed.

    Args:
        data: Mapping of column name to value for this row (the callers in
            this notebook pass protocol name -> packet count).
        filename: Value stored in the row's 'filename' column.
        csv_path: Path of the CSV file to read (if present) and rewrite.

    The whole CSV is re-read and rewritten on every call. Columns that are
    missing from either the stored rows or the new row are filled with 0.
    """
    # Start from whatever is already on disk, or from an empty frame.
    previous_rows = pd.read_csv(csv_path) if os.path.exists(csv_path) else pd.DataFrame()

    # One-row frame: 'filename' first, then one column per key of ``data``.
    current_row = pd.DataFrame([{'filename': filename}]).assign(**data)

    # Union of columns; gaps on either side become 0 before writing back.
    pd.concat([previous_rows, current_row]).fillna(0).to_csv(csv_path, index=False)


In [4]:
# Protocols to drop from the captures, grouped by category.
# The category keys are only labels; filter_pcap() flattens every list into a
# single tshark display filter of the form "not <name> and not <name> ...",
# so each entry is used verbatim as a display-filter protocol name.
protocols = {
    'network_management_protocols': ['icmp', 'icmpv6', 'dhcp', 'dhcpv6', 'igmp', 'snmp', 'arp', 'cops'],
    'nat_protocols': ['nat-pmp', 'rsip'],
    'route_management_protocols': ['db-lsp', 'db-lsp-disc', 'pathport', 'stp', 'bfd_echo', 'bgp', 'ecmp'],
    'service_management_protocols': ['ssdp', 'lldp', 'srvloc', 'ipxsap', 'opa', 'cbsp'],
    'link-local_protocols': ['llmnr', 'nbns', 'mdns', 'lsd'],
    'link_management_protocols': ['llc'],
    'distributed_protocols': ['thrift', 'dcerpc', 'rmi'],
    'real_time_protocols': ['rtcp', 'stun'],
    'remote_access_protocols': ['vnc', 'x11', 'msnms'],
    'network_time_protocols': ['ntp'],
    'security_protocols': ['ocsp', 'pkix-cert', 'egd', 'chargen', 'tpm', 'knet'],
    'industrial_protocols': ['r-goose', 'dcp-pft', 'dcp-af', 'nxp_802154_sniffer', 'enip', 'c1222', 'ax4000'],
    'file_protocols': ['lanman', 'bjnp', 'spoolss', 'ndps', 'laplink', 'bzr', 'cvspserver'],
    'quake_protocols': ['quake', 'quake2', 'quake3', 'quakeworld'],
    'iot_management_protocols': ['bat.vis', 'tplink-smarthome', 'coap','mqtt'],
    'mobile_protocols': ['gsm_ipa'],
    'database_protocols': ['tds']
}

In [5]:
import subprocess

# Per-protocol packet statistics for a pcap file
def statistics_pcap(pcap_path):
    """Count the packets of a capture per protocol, using tshark.

    tshark is asked for the "Protocol" column of every packet
    (``-T fields -e _ws.col.Protocol``) and the values are tallied here.

    Args:
        pcap_path: Path of the capture file to read.

    Returns:
        dict mapping protocol name to packet count, ordered by descending
        count (ties by descending name). Empty when tshark produced no
        output, e.g. for an empty or unreadable capture.

    Raises:
        FileNotFoundError: If the ``tshark`` executable is not on PATH.
    """
    # An argument list (no shell) passes the path verbatim, so spaces or
    # shell metacharacters in file names cannot break or alter the command.
    result = subprocess.run(
        ['tshark', '-r', str(pcap_path), '-T', 'fields', '-e', '_ws.col.Protocol'],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # tshark may still have printed usable rows (e.g. a capture that was
        # cut short), so report the problem and keep whatever was produced.
        logging.getLogger().warning(
            "tshark exited with code %s for %s: %s",
            result.returncode, pcap_path, result.stderr.strip(),
        )

    tally = {}
    for line in result.stdout.splitlines():
        protocol = line.strip()
        if not protocol:
            # Blank line: no output at all, or a packet without a protocol value.
            continue
        tally[protocol] = tally.get(protocol, 0) + 1

    # Most frequent protocol first, so column order in the CSVs and the logged
    # dict stay readable.
    ranked = sorted(tally.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return dict(ranked)

def filter_pcap(pcap_path, output_path, protocol_groups=None):
    """Write a copy of a capture with all unwanted protocols removed.

    A tshark display filter of the form ``not a and not b and ...`` is built
    from every protocol name in ``protocol_groups`` and applied with ``-Y``;
    the surviving packets are written with ``-w``.

    Args:
        pcap_path: Path of the capture file to read.
        output_path: Path of the filtered capture to write. Its parent
            directory is created if it does not exist yet.
        protocol_groups: Mapping of group label to a list of display-filter
            protocol names to exclude. Defaults to the module-level
            ``protocols`` table.

    Raises:
        FileNotFoundError: If the ``tshark`` executable is not on PATH.
    """
    if protocol_groups is None:
        protocol_groups = protocols

    # Flatten all groups into one exclusion rule.
    rule = " and ".join(
        f"not {name}"
        for names in protocol_groups.values()
        for name in names
    )

    # tshark does not create missing directories for -w.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # An argument list (no shell) keeps paths and the rule intact regardless
    # of spaces or quoting characters.
    command = ['tshark', '-r', str(pcap_path)]
    if rule:
        command += ['-Y', rule]
    command += ['-w', str(output_path)]

    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        # Surface failures (bad filter name, unreadable input, ...) instead of
        # silently leaving the output file missing or incomplete.
        logging.getLogger().warning(
            "tshark exited with code %s while filtering %s: %s",
            result.returncode, pcap_path, result.stderr.strip(),
        )


In [6]:
# Filter
from multiprocessing import Pool, cpu_count

def process_folder(folder):
    """Filter every file of one dataset folder into ``{output_path}/Raw``.

    Args:
        folder: Name of a sub-directory of ``data_path``. The folder named
            'Filtered' is ignored.

    Output files are named after the input file only, so the results of all
    folders end up side by side in the same ``Raw`` directory.
    """
    if folder != 'Filtered':
        logger.info(f"Processing folder: {folder}")
        source_dir = f'{data_path}/{folder}'
        for file in os.listdir(source_dir):
            logger.info(f"Processing file: {file}")
            filter_pcap(f'{source_dir}/{file}', f'{output_path}/Raw/{file}')

# Get the list of folders to process: every entry of data_path except 'Filtered'
folders_to_process = [folder for folder in os.listdir(data_path) if folder != 'Filtered']

# Use multiprocessing to process each folder in parallel.
# One worker process per CPU core; the unit of work is a whole folder, and
# pool.map blocks until every folder has been processed.
with Pool(cpu_count()) as pool:
    pool.map(process_folder, folders_to_process)
    # No further tasks are submitted after the map call has returned.
    pool.close()

2025-12-08 21:57:11,039 - root - INFO - Processing folder: ftp
2025-12-08 21:57:11,040 - root - INFO - Processing folder: sftp
2025-12-08 21:57:11,039 - root - INFO - Processing folder: voipbuster
2025-12-08 21:57:11,039 - root - INFO - Processing folder: spotify
2025-12-08 21:57:11,039 - root - INFO - Processing folder: netflix
2025-12-08 21:57:11,039 - root - INFO - Processing folder: skype
2025-12-08 21:57:11,040 - root - INFO - Processing folder: youtube
2025-12-08 21:57:11,040 - root - INFO - Processing folder: icq
2025-12-08 21:57:11,040 - root - INFO - Processing folder: torrent
2025-12-08 21:57:11,040 - root - INFO - Processing folder: aim
2025-12-08 21:57:11,040 - root - INFO - Processing folder: scp
2025-12-08 21:57:11,040 - root - INFO - Processing folder: email
2025-12-08 21:57:11,040 - root - INFO - Processing folder: hangout
2025-12-08 21:57:11,040 - root - INFO - Processing folder: vimeo
2025-12-08 21:57:11,042 - root - INFO - Processing file: vpn_ftps_B.pcap
2025-12-08 

In [7]:
# Collect the per-protocol packet counts of every capture, before and after
# filtering. This is kept separate from the filtering cell so that filtering
# can be run on its own (in parallel) for speed.

for folder in os.listdir(data_path):
    if folder != 'Filtered':
        logger.info(f"Processing folder: {folder}")
        for file in os.listdir(f'{data_path}/{folder}'):
            logger.info(f"Processing file: {file}")

            raw_pcap = f'{data_path}/{folder}/{file}'
            filtered_pcap = f'{output_path}/Raw/{file}'

            # Counts for the untouched capture
            origin_protocol_counts = statistics_pcap(raw_pcap)
            save_to_csv(origin_protocol_counts, file, './results/raw_protocol_counts.csv')
            logger.info(f"Original packets {folder}/{file}: {origin_protocol_counts}")

            # Counts for the filtered copy of the same capture
            filtered_protocol_counts = statistics_pcap(filtered_pcap)
            save_to_csv(filtered_protocol_counts, file, './results/filtered_protocol_counts.csv')
            logger.info(f"Filtered packets {folder}/{file}: {filtered_protocol_counts}")



2025-12-08 23:15:42,291 - root - INFO - Processing folder: netflix
2025-12-08 23:15:42,292 - root - INFO - Processing file: netflix3.pcap
2025-12-08 23:15:46,052 - root - INFO - Original packets netflix/netflix3.pcap: {'TCP': 121786, 'HTTP': 322, 'TLSv1.2': 187, 'ICMP': 39, 'DNS': 21, 'ARP': 8, 'BROWSER': 7, 'TLSv1': 6, 'NBNS': 4}
2025-12-08 23:15:50,506 - root - INFO - Filtered packets netflix/netflix3.pcap: {'TCP': 121786, 'HTTP': 322, 'TLSv1.2': 187, 'DNS': 21, 'BROWSER': 7, 'TLSv1': 6}
2025-12-08 23:15:50,507 - root - INFO - Processing file: netflix2.pcap
2025-12-08 23:15:52,807 - root - INFO - Original packets netflix/netflix2.pcap: {'TCP': 51451, 'HTTP': 269, 'TLSv1.2': 72, 'DNS': 12, 'ICMP': 11, 'ARP': 4, 'BROWSER': 3, 'TLSv1': 2, 'NBNS': 2}
2025-12-08 23:15:55,099 - root - INFO - Filtered packets netflix/netflix2.pcap: {'TCP': 51451, 'HTTP': 269, 'TLSv1.2': 72, 'DNS': 12, 'BROWSER': 3, 'TLSv1': 2}
2025-12-08 23:15:55,101 - root - INFO - Processing file: vpn_netflix_A.pcap
2025-