In [1]:
import os
import wandb

# Send both HTTP and HTTPS traffic through the same proxy before logging in.
os.environ.update({
    'HTTP_PROXY': 'http://proxy.uninsubria.it:3128/',
    'HTTPS_PROXY': 'http://proxy.uninsubria.it:3128/',
})

# Authenticate this session with Weights & Biases.
wandb.login()

# NOTE(review): presumably a switch enabling W&B logging; it is not read
# anywhere in the visible part of this notebook.
wb = True

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjfcevallos[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# !pip install scapy -qU
# !pip install pcap-splitter

# Lib:

In [4]:
from scapy.all import *
from tqdm.notebook import tqdm, trange
import binascii
import numpy as np
import os
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
from torchvision import transforms

In [5]:
def get_flow_details_dataset(flow_pcap_dir, max_records=10000):
    """Describe every flow capture in a directory by its first IP packet.

    Args:
        flow_pcap_dir: Directory containing one ``.pcap`` file per flow.
        max_records: Maximum number of directory entries to inspect. The
            limit is applied to the raw listing, before the ``.pcap``
            extension filter.

    Returns:
        A pandas DataFrame with one row per capture that contains at least
        one IP packet, with columns ``filename``, ``stime``, ``saddr``,
        ``sport``, ``daddr``, ``dport`` and ``proto``. ``sport``, ``dport``
        and ``proto`` are ``None`` when the packet is neither TCP nor UDP.
        ``stime`` is formatted as ``YYYY-MM-DD HH:MM:SS`` in UTC.
    """
    # Local import: do not rely on whatever `datetime` name the scapy star
    # import happens to expose.
    from datetime import datetime, timezone

    flow_info = []

    for pcap_file in tqdm(os.listdir(flow_pcap_dir)[:max_records]):
        if not pcap_file.endswith(".pcap"):
            continue

        pcap_file_path = os.path.join(flow_pcap_dir, pcap_file)

        # Stream the capture instead of loading it whole: only the first IP
        # packet is needed, so reading stops as soon as it is found.
        with PcapReader(pcap_file_path) as reader:
            for packet in reader:
                if IP not in packet:
                    continue

                ip_layer = packet[IP]

                if TCP in packet:
                    transport_layer = packet[TCP]
                    protocol = "TCP"
                elif UDP in packet:
                    transport_layer = packet[UDP]
                    protocol = "UDP"
                else:
                    transport_layer = None
                    protocol = None

                # Render the timestamp in UTC. get_flow_labels_df builds its
                # `stime` strings with pd.to_datetime(..., unit='s'), which is
                # UTC as well; using local time here would make the merge on
                # `stime` fail on any host whose timezone is not UTC.
                time_stamp = datetime.fromtimestamp(
                    int(packet.time), tz=timezone.utc
                ).strftime("%Y-%m-%d %H:%M:%S")

                flow_info.append({
                    "filename": pcap_file,
                    "stime": time_stamp,
                    "saddr": ip_layer.src,
                    "sport": transport_layer.sport if transport_layer else None,
                    "daddr": ip_layer.dst,
                    "dport": transport_layer.dport if transport_layer else None,
                    "proto": protocol
                })
                # One row per capture: stop at the first IP packet.
                break

    # Create a Pandas DataFrame from the flow information
    return pd.DataFrame(flow_info)


def to_fixed_length(binary_data, fixed_length=512):
    """Truncate or right-pad a 1-D byte array to exactly ``fixed_length`` items.

    Args:
        binary_data: Sequence of byte values (indexable, with a length).
        fixed_length: Number of elements in the result.

    Returns:
        The first ``fixed_length`` elements of ``binary_data`` when it is long
        enough, otherwise ``binary_data`` followed by uint8 zeros.
    """
    shortfall = fixed_length - len(binary_data)

    # Already long enough: keep only the leading part.
    if shortfall <= 0:
        return binary_data[:fixed_length]

    # Too short: append as many zero bytes as are missing.
    zero_tail = np.zeros(shortfall, dtype=np.uint8)
    return np.concatenate((binary_data, zero_tail))


def process_pcap_file(file_path, flow_len=512, packet_len=512):
    """Turn a flow capture into a fixed-size 2-D byte matrix.

    Each row is one IP packet (from layer 3 onwards) with addresses and ports
    overwritten by zeros, truncated or zero-padded to ``packet_len`` bytes.
    The matrix is zero-padded to ``flow_len`` rows.

    Args:
        file_path: Path of the ``.pcap`` file to read.
        flow_len: Number of rows of the result; also the number of leading
            packets of the capture that are considered.
        packet_len: Number of bytes kept per packet (row width).

    Returns:
        A ``(flow_len, packet_len)`` uint8 numpy array, or ``None`` when none
        of the considered packets has an IP layer.
    """
    # Only the first `flow_len` packets can contribute, so do not parse more.
    packets = rdpcap(file_path, count=flow_len)

    rows = []
    for packet in packets:
        if not packet.haslayer(IP):
            continue

        # we take from layer 3 on:
        ip_packet = packet[IP]
        # And we mask the ip addresses
        ip_packet.src = "0.0.0.0"
        ip_packet.dst = "0.0.0.0"
        # We also mask the ports. The original code misspelled `sport` as
        # `soprt`, so the source port was never assigned.
        ip_packet.sport = 0
        ip_packet.dport = 0

        # Serialize the packet and view it as an array of byte values.
        raw = np.frombuffer(bytes(ip_packet), dtype=np.uint8)
        rows.append(to_fixed_length(raw, packet_len))

    if not rows:
        return None

    flow = np.stack(rows)

    missing_rows = flow_len - flow.shape[0]
    if missing_rows > 0:
        # Pad with uint8 zeros. A float64 pad (the numpy default) would
        # promote the whole flow to float64, making short flows 8x larger
        # and giving them a different dtype than full-length flows.
        pad = np.zeros((missing_rows, packet_len), dtype=np.uint8)
        flow = np.concatenate([flow, pad], 0)

    return flow


def get_flow_labels_df(pf_path):
    """Load a ``;``-separated labels file and normalise its join columns.

    Args:
        pf_path: Path of the CSV file. It must contain the columns ``stime``
            (epoch seconds), ``sport`` and ``dport``.

    Returns:
        A DataFrame in which ``stime`` is a ``YYYY-MM-DD HH:MM:SS`` string,
        rows whose ``sport`` or ``dport`` is not numeric are removed, and both
        port columns are integers.
    """
    labels = pd.read_csv(pf_path, sep=';', low_memory=False)

    # Epoch seconds -> formatted timestamp text.
    labels.stime = pd.to_datetime(labels.stime, unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

    # Non-numeric ports become NaN so that they can be dropped below.
    port_columns = ['sport', 'dport']
    for column in port_columns:
        labels[column] = pd.to_numeric(labels[column], errors='coerce')

    labels = labels.dropna(subset=port_columns)
    return labels.astype({column: "int" for column in port_columns})


def get_data(flow_path, max_flows, normal_flow_filenames):
    """Load the flows of a directory, split into attack and normal traffic.

    Args:
        flow_path: Directory containing one ``.pcap`` file per flow.
        max_flows: Maximum number of directory entries to visit (counted on
            the raw listing, before the ``.pcap`` filter).
        normal_flow_filenames: Iterable of file names that hold normal
            traffic (list, set, pandas Series, ...). Every other capture is
            treated as attack traffic.

    Returns:
        A tuple ``(attack_flow_df, attack_flows_tensor, normal_flow_df,
        normal_flows_tensor)``. The DataFrames have the columns
        ``tensor_index`` and ``filename``; the other two items are lists of
        tensors of shape ``(1, 512, 512)``.
    """
    # `x in series` tests the *index* of a pandas Series, not its values, so
    # membership must be checked against a plain set of the file names.
    normal_names = set(normal_flow_filenames)

    attack_records, attack_flows_tensor = [], []
    normal_records, normal_flows_tensor = [], []

    # Slice up front so that exactly `max_flows` entries are visited (the
    # previous post-iteration check let one extra entry through).
    entries = os.listdir(flow_path)[:max_flows]

    for index, filename in enumerate(tqdm(entries)):
        if not filename.endswith(".pcap"):
            continue

        flow_array = process_pcap_file(
            os.path.join(flow_path, filename),
            flow_len=512,
            packet_len=512)
        if flow_array is None:
            continue

        # Add a leading axis of size 1 in front of the 2-D flow matrix.
        flow_tensor = torch.from_numpy(flow_array[np.newaxis, :])

        # NOTE(review): `tensor_index` is the position of the file in the
        # directory listing, not its position in the returned tensor list;
        # confirm this is what downstream code expects.
        record = {'tensor_index': index, 'filename': filename}

        if filename in normal_names:
            normal_records.append(record)
            normal_flows_tensor.append(flow_tensor)
        else:
            attack_records.append(record)
            attack_flows_tensor.append(flow_tensor)

    attack_flow_df = pd.DataFrame(attack_records)
    normal_flow_df = pd.DataFrame(normal_records)

    return attack_flow_df, attack_flows_tensor, normal_flow_df, normal_flows_tensor


def process_and_save_data(
        flow_path,
        max_flows,
        normal_flow_filenames,
        imgs_path,
        attack_name
        ):
    """Convert every flow capture to a tensor and save each one to its own file.

    Attack flows are saved as ``<imgs_path>/<attack_name>_<index>`` and normal
    flows as ``<imgs_path>/Normal_from_<attack_name>_<index>``, where
    ``index`` is the position of the capture in the directory listing.

    Args:
        flow_path: Directory containing one ``.pcap`` file per flow.
        max_flows: Maximum number of directory entries to visit (counted on
            the raw listing, before the ``.pcap`` filter).
        normal_flow_filenames: Iterable of file names that hold normal
            traffic (list, set, pandas Series, ...).
        imgs_path: Directory in which the tensors are written.
        attack_name: Label used in the output file names.
    """
    # `x in series` tests the *index* of a pandas Series, not its values, so
    # membership must be checked against a plain set of the file names.
    normal_names = set(normal_flow_filenames)

    # Slice up front so that exactly `max_flows` entries are visited (the
    # previous post-iteration check let one extra entry through).
    entries = os.listdir(flow_path)[:max_flows]

    for index, filename in enumerate(tqdm(entries)):
        if not filename.endswith(".pcap"):
            continue

        flow_array = process_pcap_file(
            os.path.join(flow_path, filename),
            flow_len=512,
            packet_len=512)
        if flow_array is None:
            continue

        if filename in normal_names:
            # Normal flows previously went to the current working directory;
            # they now land in `imgs_path` like the attack flows.
            target = f'{imgs_path}/Normal_from_{attack_name}_{index}'
        else:
            target = f'{imgs_path}/{attack_name}_{index}'

        # Add a leading axis of size 1 in front of the 2-D flow matrix.
        torch.save(torch.from_numpy(flow_array[np.newaxis, :]), target)


def process_and_save_data_opt(
        flow_path,
        max_flows,
        imgs_path,
        attack_name
        ):
    """Convert every flow capture to a tensor and save it, without label lookup.

    Every capture is saved as ``<imgs_path>/<attack_name>_<index>``, where
    ``index`` is the position of the capture in the directory listing.

    Args:
        flow_path: Directory containing one ``.pcap`` file per flow.
        max_flows: Maximum number of directory entries to visit (counted on
            the raw listing, before the ``.pcap`` filter).
        imgs_path: Directory in which the tensors are written.
        attack_name: Label used in the output file names.
    """
    # Slice up front so that exactly `max_flows` entries are visited (the
    # previous post-iteration check let one extra entry through).
    entries = os.listdir(flow_path)[:max_flows]

    for index, filename in enumerate(tqdm(entries)):
        if not filename.endswith(".pcap"):
            continue

        flow_array = process_pcap_file(
            os.path.join(flow_path, filename),
            flow_len=512,
            packet_len=512)
        if flow_array is None:
            continue

        # The flow is stored as a 2-D tensor (no extra leading axis here).
        torch.save(
            torch.from_numpy(flow_array),
            f'{imgs_path}/{attack_name}_{index}')


def process_and_save_image_opt(
        flow_path,
        max_flows,
        imgs_path,
        attack_name
        ):
    """Convert every flow capture to an 8-bit grayscale PNG image.

    Every capture is saved as ``<imgs_path>/<attack_name>_<index>.png``, where
    ``index`` is the position of the capture in the directory listing.

    Args:
        flow_path: Directory containing one ``.pcap`` file per flow.
        max_flows: Maximum number of directory entries to visit (counted on
            the raw listing, before the ``.pcap`` filter).
        imgs_path: Directory in which the images are written.
        attack_name: Label used in the output file names.
    """
    # Slice up front so that exactly `max_flows` entries are visited (the
    # previous post-iteration check let one extra entry through).
    entries = os.listdir(flow_path)[:max_flows]

    for index, filename in enumerate(tqdm(entries)):
        if not filename.endswith(".pcap"):
            continue

        flow_array = process_pcap_file(
            os.path.join(flow_path, filename),
            flow_len=512,
            packet_len=512)
        if flow_array is None:
            continue

        # The values are byte values, so the cast to uint8 is lossless; it
        # makes the image mode independent of the dtype of the incoming array.
        image = Image.fromarray(flow_array.astype(np.uint8))
        image = image.convert("L")
        image.save(f'{imgs_path}/{attack_name}_{index}.png')

# Data (bulk):
______

In this section we gather the two-dimensional flow tensors of each attack into one large collection and save it to a single file. These files are written to the NFS directory.

In [None]:
# Locations of the raw dataset and of the folder collecting normal traffic.
WORKSPACE_DIR = '/home/jovyan/nfs/jcevallos/datasets/'
# !tar -xzvf {datafile} -C {WORKSPACE_DIR}
datadir = WORKSPACE_DIR + 'BoT_IoT_raw/'
normal_traffic_dir = datadir + 'Normal/'

# CHANGE THIS FOR EACH ATTACK:
macro_attack_str = 'Theft'
attack_str = 'Data_Exfiltration'
max_flows = 10000

attack_dir = datadir + f'{macro_attack_str}/{attack_str}/'
flow_path = attack_dir + 'flows'

# first, read flow details
flow_info_ds = get_flow_details_dataset(
    flow_path,
    max_records=max_flows)

# Then, read the labels
print('reading labels file...')
flow_labels_df = get_flow_labels_df(
    attack_dir+f'{attack_str}.csv')

# merge details and labels
print('merging metadata 1...')
merged_flows = pd.merge(flow_info_ds,
                        flow_labels_df,
                        on=['stime',
                            'saddr',
                            'sport',
                            'daddr',
                            'dport'])

# get normal_flow_filenames:
print('querying normal flows...')
# Build a set of names: `in` on a pandas Series tests its index, not its
# values, so a Series must not be used for membership checks.
normal_flow_filenames = set(
    merged_flows.filename[merged_flows['category'] == 'Normal'])

# get data:
print('processing pcaps...')
a_flow_df, a_flows_tensor, n_flow_df, n_flows_tensor = get_data(
                                                        flow_path,
                                                        max_flows,
                                                        normal_flow_filenames)

# save attack flow tensors (stored as a list of tensors)
print('saving tensor file...')
torch.save(a_flows_tensor,
           attack_dir + 'flows_tensor.pt')

# add metadata:
print('merging metadata 2...')
a_flow_df = pd.merge(
         a_flow_df,
         merged_flows,
         on=['filename'])

# save attack metadata:
print('saving attack metadata file...')
a_flow_df.to_csv(
    attack_dir+f'{attack_str}_flow_metadata.csv',
    index=False)

# check for normal data:
if len(n_flow_df) > 0:
    print('processing normal data...')
    # save normal flow tensors
    torch.save(
        n_flows_tensor,
        normal_traffic_dir + f'Normal_from_{attack_str}.pt')

    # add metadata
    n_flow_df = pd.merge(
             n_flow_df,
             merged_flows,
             on=['filename'])

    # save normal metadata (without the index column, like the attack file):
    n_flow_df.to_csv(
        normal_traffic_dir+f'normal_from_{attack_str}_flow_metadata.csv',
        index=False)

print('preprocessing completed!')

  0%|          | 0/344 [00:00<?, ?it/s]

reading labels file...
merging metadata 1...
querying normal flows...
processing pcaps...


0it [00:00, ?it/s]

# Data (file-wise):
______

In this section we save each flow image to its own file, whose name encodes the label. All the images of every attack are stored in the same folder of the shared directory, for fast access during future training.

In [None]:
# Locations of the raw dataset and of the output folder.
WORKSPACE_DIR = '/home/jovyan/nfs/jcevallos/datasets/'
# !tar -xzvf {datafile} -C {WORKSPACE_DIR}
datadir = WORKSPACE_DIR + 'BoT_IoT_raw/'
normal_traffic_dir = datadir + 'Normal/'
images_path = '/home/jovyan/shared/jesus/datasets/BoT_IoT_imgs/'
# CHANGE THIS FOR EACH ATTACK:
macro_attack_str = 'DDoS'
attack_str = 'DDoS_HTTP'
max_flows = 10000

attack_dir = datadir + f'{macro_attack_str}/{attack_str}/'
flow_path = attack_dir + 'flows'

# first, read flow details
print('getting flow details...')
flow_info_ds = get_flow_details_dataset(
    flow_path,
    max_records=max_flows)

# Then, read the labels
print('reading labels file...')
flow_labels_df = get_flow_labels_df(
    attack_dir+f'{attack_str}.csv')

# merge details and labels
print('merging metadata 1...')
merged_flows = pd.merge(flow_info_ds,
                        flow_labels_df,
                        on=['stime',
                            'saddr',
                            'sport',
                            'daddr',
                            'dport'])

# get normal_flow_filenames:
print('querying normal flows...')
# Build a set of names: `in` on a pandas Series tests its index, not its
# values, so a Series must not be used for membership checks.
normal_flow_filenames = set(
    merged_flows.filename[merged_flows['category'] == 'Normal'])

# get data:
print('processing pcaps...')
process_and_save_data(
    flow_path,
    max_flows,
    normal_flow_filenames,
    images_path,
    attack_str
    )

print('preprocessing completed!')

# Optimized data preprocessing:
_______________
In the following code we rely on the fact that the normal flows have already been removed from the directories we are exploring, so we can skip reading most of the metadata.

In [12]:
# Root folder of all datasets on the NFS share.
WORKSPACE_DIR = '/home/jovyan/nfs/jcevallos/datasets/'
# !tar -xzvf {datafile} -C {WORKSPACE_DIR}
# Folder holding the raw captures processed by the next cell.
datadir = f'{WORKSPACE_DIR}IIoT_Ferrag_raw/'

In [21]:
# Folder in which the per-flow tensor files are written.
# NOTE(review): this is the BoT_IoT image folder although `datadir` points at
# the IIoT_Ferrag dataset in the previous cell; confirm this is intended.
images_path = '/home/jovyan/shared/jesus/datasets/BoT_IoT_imgs/'
# CHANGE THIS FOR EACH ATTACK:
attack_str = 'Ransomware'
max_flows = 10000

# The captures of an attack live in "<datadir><attack>_flows/".
attack_dir = f'{datadir}{attack_str}'
flow_path = f'{attack_dir}_flows/'


# get data:
print('processing pcaps...')
process_and_save_data_opt(
    flow_path=flow_path,
    max_flows=max_flows,
    imgs_path=images_path,
    attack_name=attack_str)

print('preprocessing completed!')

processing pcaps...


0it [00:00, ?it/s]

preprocessing completed!


# Optimized data processing 2.0 (images):
________
It turns out PyTorch tensors are heavy :/ so we will save and load our data as images and convert them to tensors during data loading.

In [55]:
# Locations of the raw dataset and of the output image folder.
WORKSPACE_DIR = '/home/jovyan/nfs/jcevallos/datasets/'
# !tar -xzvf {datafile} -C {WORKSPACE_DIR}
datadir = f'{WORKSPACE_DIR}BoT_IoT_raw/'
normal_traffic_dir = f'{datadir}Normal/'
images_path = '/home/jovyan/shared/jesus/datasets/optimized_BoT_IoT_imgs'
# CHANGE THIS FOR EACH ATTACK:
macro_attack_str = 'DDoS'
attack_str = 'DDoS_UDP'
max_flows = 10000

# The captures of an attack live in "<datadir><macro attack>/<attack>/flows".
attack_dir = f'{datadir}{macro_attack_str}/{attack_str}/'
flow_path = f'{attack_dir}flows'


# get data:
print('processing pcaps...')
process_and_save_image_opt(
    flow_path=flow_path,
    max_flows=max_flows,
    imgs_path=images_path,
    attack_name=attack_str)

print('preprocessing completed!')

processing pcaps...


0it [00:00, ?it/s]

preprocessing completed!
