In [4]:
import os, sys, glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import flows_df_to_np

def prepare_single_class_data(flows_path, label):
    """Load one class of flows from parquet and convert it to arrays.

    Every row read from ``flows_path`` is tagged with ``label``; rows whose
    ``packets_count`` is below 3 are discarded. The remaining rows, minus the
    ``label`` column, are handed to ``flows_df_to_np``.

    Args:
        flows_path: Path passed to ``pd.read_parquet``.
        label: Value written into the ``label`` column of every row.

    Returns:
        A ``(features, labels, metas)`` tuple, where ``features`` and
        ``metas`` are whatever ``flows_df_to_np`` returns and ``labels`` holds
        the ``label`` column values of the rows that were kept.
    """
    flows = pd.read_parquet(flows_path)
    flows['label'] = label

    # Keep only flows with at least 3 packets.
    enough_packets = flows['packets_count'] >= 3
    kept = flows[enough_packets]

    # The label column is split off so it is not passed on as a feature.
    features, metas = flows_df_to_np(kept.drop(columns=['label']))
    return features, kept['label'].values, metas


# Dataset splits to summarise. Each directory is read for `benign` and
# `malicious` parquet data, plus an optional `malicious_extra` directory.
data_paths = [
    './../flows_udp/train/icsx-botnet-2014',
    './../flows_udp/test/icsx-botnet-2014',
]


def _percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    # Guard against a split where no flow survives the packet filter.
    return count / total * 100 if total else 0.0


def _drop_short_flows(flows):
    """Return ``flows`` restricted to rows with ``packets_count`` >= 3."""
    # An empty frame is returned untouched so the column lookup is skipped
    # (presumably an empty read may lack the column -- TODO confirm).
    if flows.empty:
        return flows
    return flows[flows['packets_count'] >= 3]


def _subclass_table(root, total_samples):
    """Build a Subclass/Count/Percentage table for each sub-directory of ``root``."""
    # glob returns entries in arbitrary order; sort for reproducible tables.
    names = sorted(
        os.path.basename(path)
        for path in glob.glob(f'{root}/*')
        if os.path.isdir(path)
    )
    rows = []
    for name in names:
        features, _, _ = prepare_single_class_data(f'{root}/{name}', label=1)
        count = len(features)
        rows.append((name, count, _percentage(count, total_samples)))
    return pd.DataFrame(rows, columns=["Subclass", "Count", "Percentage"])


for data_path in data_paths:
    # Same minimum-packet threshold that prepare_single_class_data applies.
    benign = _drop_short_flows(pd.read_parquet(f'{data_path}/benign'))
    malicious = _drop_short_flows(pd.read_parquet(f'{data_path}/malicious'))

    print(f"Data path: {data_path}")

    total_samples = len(benign) + len(malicious)

    print(f"Benign size: {len(benign)}")
    print(f"Malicious size: {len(malicious)}")
    print(f"Benign percentage: {_percentage(len(benign), total_samples):.2f}%")
    print(f"Malicious percentage: {_percentage(len(malicious), total_samples):.2f}%")

    # Per-subclass breakdown; percentages are relative to benign + malicious.
    malicious_df = _subclass_table(f'{data_path}/malicious', total_samples)

    print("Malicious Subclasses")
    display(malicious_df.round(2))

    # The extra malicious set is optional and only reported when present.
    malicious_extra_path = f'{data_path}/malicious_extra'
    if os.path.exists(malicious_extra_path):
        malicious_extra_df = _subclass_table(malicious_extra_path, total_samples)

        print("\nMalicious Extra Subclasses")
        display(malicious_extra_df.round(2))

Data path: ./../flows_udp/train/icsx-botnet-2014
Benign size: 16973
Malicious size: 5653
Benign percentage: 75.02%
Malicious percentage: 24.98%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,IRC,488,2.16
1,Neris,4880,21.57
2,RBot,277,1.22
3,Virut,8,0.04


Data path: ./../flows_udp/test/icsx-botnet-2014
Benign size: 20073
Malicious size: 6984
Benign percentage: 74.19%
Malicious percentage: 25.81%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,Black_hole_2,28,0.1
1,Black_hole_3,10,0.04
2,IRC,39,0.14
3,IRCbot_and_black_hole1,24,0.09
4,Menti,34,0.13
5,Murlo,406,1.5
6,Neris,59,0.22
7,Osx_trojan,3,0.01
8,RBot,1405,5.19
9,Smoke_bot,3,0.01
