In [6]:
import os
from pennylane import numpy as np
import pandas as pd

from utils.flow_windows import Flows
from utils.probabilities import Probabilities

In [7]:
# Rate setting, expressed in seconds. No cell visible in this part of the
# notebook reads it — presumably a window length consumed further down or by
# the imported helpers; TODO confirm before changing it.
RATE = 1  # in seconds

In [None]:
# Read the label file, rename its '0' column to 'label', and append one extra
# row labelled 1 at the end (ignore_index renumbers the rows 0..n-1).
# NOTE(review): the extra row presumably pads the labels to the packet count of
# the capture they are later joined to — confirm against the dataset.
df_pcap_labels = pd.concat(
    [
        pd.read_csv('datasets/mirai/mirai_labels.csv').rename(columns={'0': 'label'}),
        pd.DataFrame([{'label': 1}]),
    ],
    ignore_index=True,
)

df_pcap_labels

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
764132,1
764133,1
764134,1
764135,1


In [9]:
def expand_hex_ip_columns(df, original_column, delim=":"):
    """Expand a delimited address column into one float column per part.

    The new columns are named ``<original_column>_<i>`` and the original
    column is dropped. ``df`` is modified in place and is also returned.

    Args:
        df: DataFrame expected to contain ``original_column``.
        original_column: Name of the column holding the delimited strings.
        delim: ``":"`` parses every part as a hexadecimal integer, with empty
            or missing parts counted as 0. ``"."`` converts every part to
            float as-is (missing parts are not filled). With any other
            delimiter nothing is expanded and ``df`` is returned unchanged.

    Returns:
        The same ``df`` object. When ``original_column`` is absent a message
        is printed and ``df`` is returned untouched.
    """
    if original_column not in df.columns:
        print(
            "Failed expand_hex_ip_columns(): original_column not in df.columns. Returning input df."
        )
        return df

    addresses = df[original_column]
    # The first entry that is not equal to 0 decides how many parts to expect.
    # argmax() on the raw boolean array yields a *position*, which is what
    # .iloc requires; the label returned by idxmax() only coincides with the
    # position on a default 0..n-1 index and breaks on any other index.
    first_non_zero_pos = (addresses != 0).to_numpy().argmax()
    first_non_zero_value = addresses.iloc[first_non_zero_pos]
    num_hex_columns = len(first_non_zero_value.split(delim))
    hex_columns = [original_column + "_" + str(i) for i in range(num_hex_columns)]

    if delim == ":":
        # Entries that cannot be split (and short rows) come back as NaN -> "0".
        df_split = addresses.str.split(":", expand=True).fillna("0")
        # An empty part (e.g. between two adjacent colons) is read as 0.
        df[hex_columns] = df_split.map(lambda x: int(x if x != "" else "0", 16))
        df[hex_columns] = df[hex_columns].astype(float)
        df.drop(original_column, axis=1, inplace=True)
    elif delim == ".":
        df[hex_columns] = addresses.str.split(".", expand=True)
        df[hex_columns] = df[hex_columns].astype(float)
        df.drop(original_column, axis=1, inplace=True)

    return df

def one_hot_encoding(df, column):
    """Return a copy of ``df`` where ``column`` is replaced by indicator columns.

    One ``<column>_<value>`` column is appended per distinct value found in
    ``column``; the source column itself is removed. The frame passed in is
    left unmodified.
    """
    indicator_frame = pd.get_dummies(df[column], prefix=column)
    widened = pd.concat([df, indicator_frame], axis=1)
    return widened.drop(columns=column, errors='ignore')

# Raw Packets

In [None]:
# Load the tab-separated packet capture and attach the labels column-wise
# (pd.concat with axis=1 lines the two frames up on their row index).
df_raw = pd.concat(
    [pd.read_csv("datasets/mirai/mirai_pcap.tsv", sep="\t"), df_pcap_labels],
    axis=1,
)

# Discard every column whose name contains 'Unnamed'.
df_raw = df_raw.drop(
    columns=[name for name in df_raw.columns if 'Unnamed' in name],
    errors='ignore',
)

# Columns removed before encoding; names absent from the frame are skipped.
time_columns = ["frame.time_epoch", "frame.date_time", "flow_window"]
probability_columns = ["dm_prob", "dm_prob_softmax"]
other_columns = [
    'ip.version', 'ip.src', 'ip.dst', 'ip.id', 'ipv6.src', 'ipv6.dst',
    'arp.src.hw_mac', 'arp.src.proto_ipv4', 'arp.dst.hw_mac', 'arp.dst.proto_ipv4',
    'eth.src', 'eth.dst'
]
non_feature_columns = time_columns + probability_columns + other_columns
df_raw = df_raw.drop(columns=non_feature_columns, errors='ignore')

# One-hot encode the categorical packet fields, one after another in this order.
for categorical_column in (
    'ip.proto',
    'ip.flags',
    'eth.type',
    'tcp.flags.str',
    'icmp.type',
    'icmp.code',
    'arp.opcode',
):
    df_raw = one_hot_encoding(df_raw, categorical_column)

# Whatever is still missing after encoding becomes 0.
df_raw = df_raw.fillna(0)

In [None]:
# Persist the preprocessed raw-packet frame; the row index is not written.
df_raw.to_csv(path_or_buf="datasets/mirai/mirai_pcap_preprocessed.csv", index=False)

## Prep DM and Stats Data

In [None]:
# Load the tab-separated packet capture and attach the labels column-wise
# (pd.concat with axis=1 lines the two frames up on their row index).
df_stats_and_dm = pd.concat(
    [pd.read_csv("datasets/mirai/mirai_pcap.tsv", sep="\t"), df_pcap_labels],
    axis=1,
)

# Discard every column whose name contains 'Unnamed'.
df_stats_and_dm = df_stats_and_dm.drop(
    columns=[name for name in df_stats_and_dm.columns if 'Unnamed' in name],
    errors='ignore',
)

# Address/identifier columns removed before encoding; absent names are skipped.
other_columns = [
    'ip.version', 'ip.src', 'ip.dst', 'ip.id', 'ipv6.src', 'ipv6.dst',
    'arp.src.hw_mac', 'arp.src.proto_ipv4', 'arp.dst.hw_mac', 'arp.dst.proto_ipv4',
    'eth.src', 'eth.dst'
]
df_stats_and_dm = df_stats_and_dm.drop(columns=other_columns, errors='ignore')

# One-hot encode the categorical packet fields, one after another in this order.
for categorical_column in (
    'ip.flags',
    'eth.type',
    'tcp.flags.str',
    'icmp.type',
    'icmp.code',
    'arp.opcode',
):
    df_stats_and_dm = one_hot_encoding(df_stats_and_dm, categorical_column)

# Whatever is still missing after encoding becomes 0.
df_stats_and_dm = df_stats_and_dm.fillna(0)

In [None]:
# Persist the preprocessed stats/DM frame; the row index is not written.
df_stats_and_dm.to_csv(path_or_buf="datasets/mirai/mirai_pcap_preprocessed_sdm.csv", index=False)