In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import os

# Mon - Normal traffic
# Tue - Brute Force, FTP-Patator, SSH-Patator
# Wed - Web attack DoS/DDoS
# Thu - Web attack, Infiltration
# Fri - Botnet, Port Scan, DDoS

# Per-day capture CSVs, keyed by weekday abbreviation.
datasets = dict(
    Mon="../data/Mon-Expanded.csv",
    Tue="../data/Tue-Expanded.csv",
    Wed="../data/Wed-Expanded.csv",
    Thu="../data/Thu-Expanded.csv",
    Fri="../data/Fri-Expanded.csv",
)

# Combined extract read by load_cherrypicked_dataset().
filtered_dataset = "../data/Combined-Filtered.csv"

In [2]:
def load_dataset(day, encoding="latin1"):
    """Read one day's CSV and keep only its fully populated rows.

    Args:
        day: Key into the module-level ``datasets`` mapping (e.g. ``"Mon"``).
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        DataFrame with whitespace stripped from the column names and every
        row containing a missing value removed.

    Raises:
        KeyError: If ``day`` is not a key of ``datasets``.
    """
    source_path = datasets[day]
    frame = pd.read_csv(source_path, encoding=encoding)
    # Header cells may carry leading/trailing blanks; normalise them.
    frame.columns = frame.columns.str.strip()
    return frame.dropna()

def read_timestamp(timestamp_str) -> pd.Timestamp:
    """Parse ``timestamp_str`` via ``pd.to_datetime``.

    ``errors='coerce'`` is used, so input that cannot be parsed yields
    ``NaT`` rather than raising.
    """
    parsed = pd.to_datetime(timestamp_str, errors='coerce')
    return parsed

def convert_to_datetime(df, column='Timestamp'):
    """Convert ``df[column]`` to datetimes in place and return ``df``.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Name of the column to convert.

    Returns:
        The same ``df`` object, with unparseable entries coerced to ``NaT``.
    """
    parsed_column = pd.to_datetime(df[column], errors='coerce')
    df[column] = parsed_column
    return df

def load_cherrypicked_dataset(encoding="latin1"):
    """Read the CSV at ``filtered_dataset`` and drop rows lacking key fields.

    Args:
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        DataFrame with whitespace stripped from the column names; rows with a
        missing 'Label' or 'Flow Duration' are removed, other gaps are kept.
    """
    frame = pd.read_csv(filtered_dataset, encoding=encoding)
    frame.columns = frame.columns.str.strip()
    required_columns = ['Label', 'Flow Duration']
    return frame.dropna(subset=required_columns)

def load_dataset_as_single_df(encoding="latin1"):
    """Load every CSV listed in ``datasets`` and stack them into one frame.

    Column names are stripped of surrounding whitespace and rows with a
    missing 'Label' or 'Flow Duration' are dropped for each file. Files that
    do not exist are reported and skipped.

    Args:
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        The concatenation of all loaded files with a fresh 0..n-1 index, or
        an empty DataFrame when none of the files could be read.
    """
    loaded_frames = []
    for path in datasets.values():
        try:
            day_frame = pd.read_csv(path, encoding=encoding)
        except FileNotFoundError:
            print(f" -> WARNING: '{path}' file not found, skipping")
        else:
            day_frame.columns = day_frame.columns.str.strip()
            loaded_frames.append(day_frame.dropna(subset=['Label', 'Flow Duration']))
            print(f" -> File '{path}' loaded successfully")

    if len(loaded_frames) == 0:
        print("ERROR: No data loaded, check file paths")
        return pd.DataFrame()

    combined = pd.concat(loaded_frames, ignore_index=True)
    print("Finished loading all datasets")
    return combined


In [3]:
# Load every CSV listed in `datasets` into one concatenated DataFrame;
# the cells below all read from `full_df`.
full_df = load_dataset_as_single_df()

 -> File '../data/Mon-Expanded.csv' loaded successfully
 -> File '../data/Tue-Expanded.csv' loaded successfully
 -> File '../data/Wed-Expanded.csv' loaded successfully
 -> File '../data/Thu-Expanded.csv' loaded successfully
 -> File '../data/Fri-Expanded.csv' loaded successfully
Finished loading all datasets


### 1. Overview

In [None]:
# Every distinct label in the data, then the same list without the benign class.
all_labels = sorted(set(full_df['Label']))
attack_labels = list(filter(lambda name: name != 'BENIGN', all_labels))

# Individual DoS variants that are rolled up under the 'DoS' sunburst branch.
dos_attacks = [
    'DoS Hulk',
    'DoS GoldenEye',
    'DoS slowloris',
    'DoS Slowhttptest',
]
# Branches shown directly under the root; everything else lands in 'Other'.
main_attacks = [
    'DDoS',
    'PortScan',
    'DoS',
]

def compute_sunburst_data(selected_labels):
    """Build the labels/parents/values lists for a three-level sunburst.

    Rows of the module-level ``full_df`` whose 'Label' is in
    ``selected_labels`` are counted and arranged as:

    * root: 'All Attacks' (total count),
    * second level: 'DDoS', 'PortScan', 'DoS' (sum of ``dos_attacks``) and
      'Other' (everything not in ``main_attacks`` or ``dos_attacks``),
    * third level: each present DoS variant under 'DoS' and each remaining
      label under 'Other'.

    Args:
        selected_labels: Iterable of label values to include.

    Returns:
        Dict with parallel lists under the keys 'labels', 'parents', 'values'.
    """
    selection = full_df['Label'].isin(selected_labels)
    attack_counts = full_df.loc[selection, 'Label'].value_counts().to_dict()

    # Labels that belong to neither a main branch nor the DoS family.
    rest_counts = {
        name: count
        for name, count in attack_counts.items()
        if not (name in main_attacks or name in dos_attacks)
    }

    second_level = [
        ('DDoS', attack_counts.get('DDoS', 0)),
        ('PortScan', attack_counts.get('PortScan', 0)),
        ('DoS', sum(attack_counts.get(name, 0) for name in dos_attacks)),
        ('Other', sum(rest_counts.values())),
    ]

    # Each row is (label, parent, value); the root has an empty parent.
    rows = [('All Attacks', '', sum(attack_counts.values()))]
    rows += [(name, 'All Attacks', count) for name, count in second_level]
    rows += [(name, 'DoS', attack_counts[name]) for name in dos_attacks if name in attack_counts]
    rows += [(name, 'Other', count) for name, count in rest_counts.items()]

    labels, parents, values = (list(column) for column in zip(*rows))
    return {'labels': labels, 'parents': parents, 'values': values}

# Hierarchical counts for every non-benign label.
sunburst_data = compute_sunburst_data(attack_labels)

# branchvalues="total": each parent's value already includes its children.
fig = go.Figure(go.Sunburst(
    labels=sunburst_data['labels'],
    parents=sunburst_data['parents'],
    values=sunburst_data['values'],
    branchvalues="total",
    hovertemplate='<b>%{label}</b><br>Count: %{value:,}<br>Percentage of Parent: %{percentParent:.2%}',
    insidetextorientation='radial'
))
fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    title_text="Distribution of Attack Types (Interactive)",
    title_font_size=20
)

# This is the first cell that writes into saved_plots/ and no earlier cell
# creates the directory, so make sure it exists before exporting the figure.
sunburst_html_path = "saved_plots/attack_sunburst_interactive.html"
os.makedirs(os.path.dirname(sunburst_html_path), exist_ok=True)
fig.write_html(sunburst_html_path)
fig.show()

In [4]:
output_path = 'saved_plots/label_counts.csv'
output_dir = os.path.dirname(output_path)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Number of rows per traffic label.
label_counts = full_df['Label'].value_counts()
# Name the index and the value column explicitly so the exported table always
# has exactly the columns 'Label' and 'count'.
label_counts_df = label_counts.rename_axis('Label').reset_index(name='count')

label_counts_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/label_counts.csv'


In [5]:
# Re-read the cached counts so this cell can be run without recomputing them.
output_path = 'saved_plots/label_counts.csv'
label_counts_df = pd.read_csv(output_path)

# One single-bar trace per label, so each label becomes its own trace.
fig = go.Figure()
for traffic_label, occurrences in zip(label_counts_df['Label'], label_counts_df['count']):
    single_bar = go.Bar(x=[traffic_label], y=[occurrences], name=traffic_label)
    fig.add_trace(single_bar)

layout_options = dict(
    title_text="Distribution of Traffic Types (Full Dataset)",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Number of Occurrences",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

In [6]:
output_path = 'saved_plots/flow_duration_stats.csv'
output_dir = os.path.dirname(output_path)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Five-number summary of 'Flow Duration' per label: one row per label,
# one column per quantile, renamed to the names the box plot cell reads.
quantile_names = {0.0: 'min', 0.25: 'q1', 0.5: 'median', 0.75: 'q3', 1.0: 'max'}
duration_by_label = full_df.groupby('Label')['Flow Duration']
stats_df = (
    duration_by_label
    .quantile(list(quantile_names))
    .unstack()
    .rename(columns=quantile_names)
)

stats_df.to_csv(output_path)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/flow_duration_stats.csv'


In [7]:
# Read the precomputed quantiles; 'Label' becomes the index.
output_path = 'saved_plots/flow_duration_stats.csv'
stats_df = pd.read_csv(output_path, index_col='Label')

# Boxes are drawn from the stored quantiles rather than raw samples:
# the whiskers are the min/max columns and no individual points are plotted.
fig = go.Figure()
for summary in stats_df.itertuples():
    fig.add_trace(go.Box(
        x=[summary.Index],
        name=summary.Index,
        q1=[summary.q1],
        median=[summary.median],
        q3=[summary.q3],
        lowerfence=[summary.min],
        upperfence=[summary.max],
        boxpoints=False,
    ))

layout_options = dict(
    title_text="Distribution of Attack Durations (Full Dataset)",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Flow Duration (Log Scale)",
    yaxis_type="log",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

In [8]:
output_path = 'saved_plots/violin_samples.csv'

# Up to this many randomly chosen rows are kept for each label.
sample_size_per_category = 2000

# Shuffle once with a fixed seed, then keep the first N rows of every label.
# Taking the head of a uniformly shuffled frame is a uniform sample without
# replacement per group, and labels with fewer than N rows are kept whole.
# Compared with groupby(...).apply(lambda x: x.sample(frac=1).head(N)) this
#   * is reproducible (random_state is fixed, matching the scatter sample cell),
#   * only shuffles the two columns that are exported, and
#   * does not rely on apply() handing the grouping column to the callback,
#     which pandas has deprecated (since 2.2).
violin_sample_df = (
    full_df[['Average Packet Size', 'Label']]
    .sample(frac=1, random_state=1)
    .groupby('Label')
    .head(sample_size_per_category)
    .sort_values('Label', kind='stable')  # keep rows grouped by label in the CSV
    .reset_index(drop=True)
)

violin_sample_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/violin_samples.csv'






In [9]:
# Re-read the cached per-label sample.
output_path = 'saved_plots/violin_samples.csv'
violin_sample_df = pd.read_csv(output_path)

# One violin per label, added in alphabetical label order.
fig = go.Figure()
labels_in_order = sorted(violin_sample_df['Label'].unique())
for label in labels_in_order:
    is_current_label = violin_sample_df['Label'].eq(label)
    data = violin_sample_df.loc[is_current_label, 'Average Packet Size']
    violin = go.Violin(y=data, name=label, box_visible=True, meanline_visible=True)
    fig.add_trace(violin)

layout_options = dict(
    title_text="Distribution of Average Packet Sizes (Full Dataset)",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Average Packet Size",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
output_path = 'saved_plots/protocol_distribution.csv'

# Protocol numbers with a readable name; anything else is reported as 'Other'.
protocol_map = {6: 'TCP', 17: 'UDP'}

# Row count for every (label, protocol number) pair ...
protocol_counts = full_df.groupby(['Label', 'Protocol']).size().reset_index(name='Count')
# ... and, aligned row by row, the total count of the pair's label.
label_totals = protocol_counts.groupby('Label')['Count'].transform('sum')

protocol_counts = protocol_counts.assign(
    Percentage=(protocol_counts['Count'] / label_totals) * 100,
    Protocol_Name=protocol_counts['Protocol'].map(protocol_map).fillna('Other'),
)

# Several protocol numbers can share the name 'Other', so their shares are summed.
final_df = protocol_counts.groupby(['Label', 'Protocol_Name'], as_index=False)['Percentage'].sum()

final_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/protocol_distribution.csv'


In [9]:
# Re-read the cached protocol shares.
output_path = 'saved_plots/protocol_distribution.csv'
protocol_df = pd.read_csv(output_path)

print(protocol_df)

fig = go.Figure()

# One stacked trace per protocol name; names absent from the data are skipped.
for protocol_name in ('TCP', 'UDP', 'Other'):
    plot_data = protocol_df.loc[protocol_df['Protocol_Name'] == protocol_name]
    if plot_data.empty:
        continue
    fig.add_trace(go.Bar(
        x=plot_data['Label'],
        y=plot_data['Percentage'],
        name=protocol_name,
    ))

fig.update_layout(
    barmode='stack',
    title_text="Percentage Distribution of Protocols by Attack Type",
    xaxis_title="Attack Type (Label)",
    yaxis_title="Percentage (%)",
    height=600,
    legend_title_text='Protocol',
    xaxis=dict(categoryorder='total descending'),
)

fig.show()

                         Label Protocol_Name  Percentage
0                       BENIGN         Other    0.074361
1                       BENIGN           TCP   55.947231
2                       BENIGN           UDP   43.978408
3                          Bot           TCP  100.000000
4                         DDoS           TCP  100.000000
5                DoS GoldenEye           TCP  100.000000
6                     DoS Hulk           TCP  100.000000
7             DoS Slowhttptest           TCP  100.000000
8                DoS slowloris           TCP  100.000000
9                  FTP-Patator           TCP  100.000000
10                  Heartbleed           TCP  100.000000
11                Infiltration           TCP  100.000000
12                    PortScan         Other    0.003775
13                    PortScan           TCP   99.995596
14                    PortScan           UDP    0.000629
15                 SSH-Patator           TCP  100.000000
16    Web Attack - Brute Force 

In [10]:
output_path = 'saved_plots/scatter_samples.csv'

# Up to this many randomly chosen rows are kept for each label.
sample_size_per_category = 2500

# Shuffle once with a fixed seed, then keep the first N rows of every label;
# labels with at most N rows are kept in full, as before.
# This replaces groupby('Label').apply(...) followed by selecting 'Label':
# that pattern depends on apply() passing the grouping column to the callback,
# which pandas has deprecated (since 2.2), after which 'Label' would be missing
# from the result. Selecting the three exported columns first also avoids
# sampling the full-width frame.
scatter_sample_df = (
    full_df[['Total Fwd Packets', 'Total Backward Packets', 'Label']]
    .sample(frac=1, random_state=1)
    .groupby('Label')
    .head(sample_size_per_category)
    .sort_values('Label', kind='stable')  # keep rows grouped by label in the CSV
    .reset_index(drop=True)
)

scatter_sample_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/scatter_samples.csv'






In [11]:
# Re-read the cached per-label sample.
output_path = 'saved_plots/scatter_samples.csv'
scatter_sample_df = pd.read_csv(output_path)

fig = go.Figure()

# One marker trace per label, added in alphabetical label order.
labels_in_order = sorted(scatter_sample_df['Label'].unique())
for label in labels_in_order:
    data = scatter_sample_df.loc[scatter_sample_df['Label'].eq(label)]
    points = go.Scatter(
        x=data['Total Fwd Packets'],
        y=data['Total Backward Packets'],
        name=label,
        mode='markers',
        marker={'size': 5, 'opacity': 0.7},
    )
    fig.add_trace(points)

layout_options = dict(
    title_text="Relationship between Forward and Backward Packets (Log Scale)",
    xaxis_title="Total Fwd Packets (Log Scale)",
    yaxis_title="Total Backward Packets (Log Scale)",
    height=700,
    legend_title_text='Attack Type',
    xaxis_type="log",
    yaxis_type="log",
)
fig.update_layout(**layout_options)

fig.show()

### 2. Massive attacks

In [10]:
# Labels compared in this section (benign traffic is included as a baseline).
dos_ddos_labels = ['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris']

dos_ddos_df = full_df.loc[full_df['Label'].isin(dos_ddos_labels)]

# Five-number summary of 'Flow Packets/s' per label: one row per label,
# one column per quantile, renamed to the names the box plot cell reads.
quantile_names = {0.0: 'min', 0.25: 'q1', 0.5: 'median', 0.75: 'q3', 1.0: 'max'}
packets_by_label = dos_ddos_df.groupby('Label')['Flow Packets/s']
packets_stats_df = (
    packets_by_label
    .quantile(list(quantile_names))
    .unstack()
    .rename(columns=quantile_names)
)

packets_stats_df.to_csv('saved_plots/flow_packets_stats.csv')
print("Saved in: 'saved_plots/flow_packets_stats.csv'")

Saved in: 'saved_plots/flow_packets_stats.csv'


In [11]:
# Read the precomputed quantiles; 'Label' becomes the index.
output_path = 'saved_plots/flow_packets_stats.csv'
packets_stats_df = pd.read_csv(output_path, index_col='Label')

# Boxes are drawn from the stored quantiles; whiskers are the min/max columns.
fig = go.Figure()
for position in range(len(packets_stats_df)):
    summary = packets_stats_df.iloc[position]  # Series named after the label
    fig.add_trace(go.Box(
        x=[summary.name],
        name=summary.name,
        q1=[summary['q1']],
        median=[summary['median']],
        q3=[summary['q3']],
        lowerfence=[summary['min']],
        upperfence=[summary['max']],
        boxpoints=False,
    ))

layout_options = dict(
    title_text="Comparison of Flow Packets/s for DoS and DDoS Attacks",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Flow Packets/s (Log Scale)",
    yaxis_type="log",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

In [12]:
output_path = 'saved_plots/flow_duration_bruteforce_stats.csv'

# Labels treated as brute-force attacks in this section.
bruteforce_labels = ['FTP-Patator', 'SSH-Patator', 'Web Attack - Brute Force']

bruteforce_df = full_df.loc[full_df['Label'].isin(bruteforce_labels)]

# Five-number summary of 'Flow Duration' per brute-force label.
quantile_names = {0.0: 'min', 0.25: 'q1', 0.5: 'median', 0.75: 'q3', 1.0: 'max'}
bruteforce_duration = bruteforce_df.groupby('Label')['Flow Duration']
bruteforce_stats_df = (
    bruteforce_duration
    .quantile(list(quantile_names))
    .unstack()
    .rename(columns=quantile_names)
)

bruteforce_stats_df.to_csv(output_path)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/flow_duration_bruteforce_stats.csv'


### 3. Brute Force

In [13]:
# Read the precomputed quantiles; 'Label' becomes the index.
output_path = 'saved_plots/flow_duration_bruteforce_stats.csv'
stats_df = pd.read_csv(output_path, index_col='Label')

# Boxes are drawn from the stored quantiles; whiskers are the min/max columns.
fig = go.Figure()
for summary in stats_df.itertuples():
    fig.add_trace(go.Box(
        x=[summary.Index],
        name=summary.Index,
        q1=[summary.q1],
        median=[summary.median],
        q3=[summary.q3],
        lowerfence=[summary.min],
        upperfence=[summary.max],
        boxpoints=False,
    ))

layout_options = dict(
    title_text="Time Duration of Brute Force Attacks",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Flow Duration (Log Scale)",
    yaxis_type="log",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

In [14]:
output_path = 'saved_plots/bruteforce_flag_means.csv'
output_dir = os.path.dirname(output_path)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Brute-force labels plus benign traffic as a baseline.
bruteforce_labels = ['BENIGN', 'FTP-Patator', 'SSH-Patator', 'Web Attack - Brute Force']

# Mean SYN / FIN flag count per label, with 'Label' kept as a regular column.
is_selected = full_df['Label'].isin(bruteforce_labels)
flag_columns = ['SYN Flag Count', 'FIN Flag Count']
means_df = full_df.loc[is_selected].groupby('Label', as_index=False)[flag_columns].mean()

means_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/bruteforce_flag_means.csv'


In [15]:
# Re-read the cached per-label flag means.
output_path = 'saved_plots/bruteforce_flag_means.csv'
means_df = pd.read_csv(output_path)

# One bar series per flag column, drawn side by side for every label.
fig = go.Figure()
flag_series = (
    ('SYN Flag Count', 'royalblue'),
    ('FIN Flag Count', 'orange'),
)
for flag_column, bar_color in flag_series:
    fig.add_trace(go.Bar(
        x=means_df['Label'],
        y=means_df[flag_column],
        name=flag_column,
        marker_color=bar_color,
    ))

layout_options = dict(
    barmode='group',
    title_text="Average SYN/FIN Flags in Brute Force Attacks",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Average Flag Count",
    height=600,
    showlegend=True,
)
fig.update_layout(**layout_options)

fig.show()

### 4. Web attacks

In [16]:
output_path = 'saved_plots/violin_webattack_samples.csv'
output_dir = os.path.dirname(output_path)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

selected_labels = [
    'BENIGN',
    'PortScan',
    'Infiltration',
    'Heartbleed',
    'Web Attack - Sql Injection',
    'Web Attack - XSS'
]

# Author's note: this port filter removes Infiltration and Heartbleed,
# because they operate on different ports.
filtered_df = full_df[
    (full_df['Label'].isin(selected_labels))
    & (full_df['Destination Port'].isin([80, 443]))
]

# Up to this many randomly chosen rows are kept for each label.
sample_size_per_category = 2000

# Shuffle once with a fixed seed, then keep the first N rows of every label.
# Compared with groupby(...).apply(lambda x: x.sample(frac=1).head(N)) this
#   * is reproducible (random_state is fixed),
#   * only shuffles the two columns that are exported, and
#   * does not rely on apply() handing the grouping column to the callback,
#     which pandas has deprecated (since 2.2).
violin_sample_df = (
    filtered_df[['Fwd Packet Length Max', 'Label']]
    .sample(frac=1, random_state=1)
    .groupby('Label')
    .head(sample_size_per_category)
    .sort_values('Label', kind='stable')  # keep rows grouped by label in the CSV
    .reset_index(drop=True)
)

violin_sample_df.to_csv(output_path, index=False)
print(f"Saved in: '{output_path}'")

Saved in: 'saved_plots/violin_webattack_samples.csv'






In [17]:
# Re-read the cached per-label sample.
output_path = 'saved_plots/violin_webattack_samples.csv'
violin_sample_df = pd.read_csv(output_path)

# One violin per label, added in alphabetical label order.
fig = go.Figure()
labels_in_order = sorted(violin_sample_df['Label'].unique())
for label in labels_in_order:
    is_current_label = violin_sample_df['Label'].eq(label)
    data = violin_sample_df.loc[is_current_label, 'Fwd Packet Length Max']
    violin = go.Violin(y=data, name=label, box_visible=True, meanline_visible=True)
    fig.add_trace(violin)

layout_options = dict(
    title_text="Distribution of Fwd Packet Length Max for Selected Web Attacks and BENIGN (Ports 80/443)",
    xaxis_title="Traffic/Attack Type (Label)",
    yaxis_title="Fwd Packet Length Max",
    height=600,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()