In [2]:
import os, sys, glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import flows_df_to_np

def prepare_single_class_data(flows_path, label, min_packets=3):
    """Load the flows of one class and convert them with ``flows_df_to_np``.

    Reads the parquet data at ``flows_path``, keeps only the flows whose
    ``packets_count`` is at least ``min_packets`` and hands the remaining
    rows (without the label column) to ``flows_df_to_np``.

    Args:
        flows_path: Path passed to ``pd.read_parquet``.
        label: Value assigned to every flow read from ``flows_path``.
        min_packets: Minimum ``packets_count`` a flow needs in order to be
            kept. Defaults to 3, the threshold that used to be hard-coded.

    Returns:
        A ``(features, labels, metas)`` tuple. ``features`` and ``metas`` are
        the two values returned by ``flows_df_to_np``; ``labels`` holds
        ``label`` once per kept flow.
    """
    df = pd.read_parquet(flows_path)
    # The label travels as a column so that it is filtered together with the
    # rows; an existing 'label' column in the file is overwritten.
    df['label'] = label
    df = df[df['packets_count'] >= min_packets]
    labels = df['label'].values
    features_df = df.drop(columns=['label'])
    features, metas = flows_df_to_np(features_df)
    return features, labels, metas


# Dataset roots to summarise. The reporting loop reads `<root>/benign` and
# `<root>/malicious` for each entry and, when present, `<root>/malicious_extra`.
# Plain string literals: the previous f-prefix had no placeholders to fill.
data_paths = [
    './../flows/train/icsx-botnet-2014',
    './../flows/test/icsx-botnet-2014',
    './../flows/test/ctu-13',
    './../flows/test/ctu-custom',
    './../flows/train/icsx-ctu-extended',
    './../flows/test/icsx-ctu-extended',
]

# Flows with fewer packets than this are left out of the benign/malicious
# totals. prepare_single_class_data applies the same default threshold to the
# per-subclass counts.
MIN_PACKETS = 3


def _filter_short_flows(flows):
    """Drop flows with fewer than MIN_PACKETS packets; empty frames pass through."""
    # An empty frame is returned untouched so the 'packets_count' lookup is
    # skipped for datasets that have no flows of this class.
    if flows.empty:
        return flows
    return flows[flows['packets_count'] >= MIN_PACKETS]


def _percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


def _list_subclasses(class_dir):
    """Return the names of the sub-directories of ``class_dir``, sorted case-insensitively."""
    # glob gives no ordering guarantee; sorting keeps the tables stable across
    # machines and runs.
    return sorted(
        (os.path.basename(x) for x in glob.glob(f'{class_dir}/*') if os.path.isdir(x)),
        key=str.lower,
    )


def _size_table(sizes, total):
    """Build the Subclass / Count / Percentage table from a name -> count mapping."""
    return pd.DataFrame(
        [(subclass, count, _percentage(count, total)) for subclass, count in sizes.items()],
        columns=["Subclass", "Count", "Percentage"]
    )


# For every dataset: print the benign/malicious split, then show one table per
# malicious subclass directory (and per `malicious_extra` subclass, if any).
for data_path in data_paths:
    benign = _filter_short_flows(pd.read_parquet(f'{data_path}/benign'))
    malicious = _filter_short_flows(pd.read_parquet(f'{data_path}/malicious'))

    print(f"Data path: {data_path}")

    total_samples = len(benign) + len(malicious)

    print(f"Benign size: {len(benign)}")
    print(f"Malicious size: {len(malicious)}")
    print(f"Benign percentage: {_percentage(len(benign), total_samples):.2f}%")
    print(f"Malicious percentage: {_percentage(len(malicious), total_samples):.2f}%")

    subclasses = _list_subclasses(f'{data_path}/malicious')

    subclass_size = {}
    extra_subclass_size = {}

    for subclass in subclasses:
        features, labels, meta = prepare_single_class_data(f'{data_path}/malicious/{subclass}', label=1)
        subclass_size[subclass] = len(features)

    # Percentages are relative to the benign + malicious total of this dataset
    malicious_df = _size_table(subclass_size, total_samples)

    # Display tables with a title
    print("Malicious Subclasses")
    display(malicious_df.round(2))

    malicious_extra_path = f'{data_path}/malicious_extra'
    if os.path.exists(malicious_extra_path):
        extra_subclasses = _list_subclasses(malicious_extra_path)
        for subclass in extra_subclasses:
            features, labels, meta = prepare_single_class_data(f'{malicious_extra_path}/{subclass}', label=1)
            extra_subclass_size[subclass] = len(features)

        # NOTE(review): total_samples is computed from `benign` and `malicious`
        # only, so these percentages use a denominator that was not built from
        # the malicious_extra directory - confirm this is intended.
        malicious_extra_df = _size_table(extra_subclass_size, total_samples)

        print("\nMalicious Extra Subclasses")
        display(malicious_extra_df.round(2))

Data path: ./../flows/train/icsx-botnet-2014
Benign size: 93110
Malicious size: 47633
Benign percentage: 66.16%
Malicious percentage: 33.84%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,IRC,2439,1.73
1,Neris,12020,8.54
2,RBot,32324,22.97
3,Virut,850,0.6


Data path: ./../flows/test/icsx-botnet-2014
Benign size: 27488
Malicious size: 57304
Benign percentage: 32.42%
Malicious percentage: 67.58%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,IRC,250,0.29
1,Menti,2747,3.24
2,Murlo,2593,3.06
3,Neris,18780,22.15
4,RBot,154,0.18
5,Sogou,40,0.05
6,Virut,32740,38.61



Malicious Extra Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,Black_hole,113,0.13
1,TBot,487,0.57
2,Weasel,23041,27.17
3,Zero_access,447,0.53
4,Zeus,213,0.25


Data path: ./../flows/test/ctu-13
Benign size: 0
Malicious size: 156965
Benign percentage: 0.00%
Malicious percentage: 100.00%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,DonBot,2747,1.75
1,Murlo,2592,1.65
2,Neris,85137,54.24
3,NSIS.ay,370,0.24
4,RBot,32489,20.7
5,Sogou,40,0.03
6,Virut,33590,21.4


Data path: ./../flows/test/ctu-custom
Benign size: 18989
Malicious size: 339513
Benign percentage: 5.30%
Malicious percentage: 94.70%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,Emotet,148905,41.54
1,Kazy,11916,3.32
2,TrickBot,153768,42.89
3,WannaCry,4531,1.26
4,Zeus,20393,5.69


Data path: ./../flows/train/icsx-ctu-extended
Benign size: 139178
Malicious size: 150386
Benign percentage: 48.06%
Malicious percentage: 51.94%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,Emotet,32019,11.06
1,IRC,2439,0.84
2,Kazy,4991,1.72
3,Neris,12020,4.15
4,RBot,32324,11.16
5,TrickBot,19999,6.91
6,Virut,850,0.29
7,WannaCry,5744,1.98
8,Zeus,40000,13.81


Data path: ./../flows/test/icsx-ctu-extended
Benign size: 46477
Malicious size: 553782
Benign percentage: 7.74%
Malicious percentage: 92.26%
Malicious Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,DonBot,2747,0.46
1,Emotet,148905,24.81
2,IRC,250,0.04
3,Kazy,11916,1.99
4,Menti,2747,0.46
5,Murlo,5185,0.86
6,Neris,103917,17.31
7,NSIS.ay,370,0.06
8,RBot,32643,5.44
9,Sogou,80,0.01



Malicious Extra Subclasses


Unnamed: 0,Subclass,Count,Percentage
0,Black_hole,113,0.02
1,TBot,487,0.08
2,Weasel,23041,3.84
3,Zero_access,447,0.07
4,Zeus,213,0.04
