In [19]:
from torch_geometric.utils import from_networkx, add_self_loops, degree
from torch_geometric.nn import MessagePassing
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader
from torch_geometric.loader import NeighborSampler
import torch.nn as nn
import torch as th
import torch.nn.functional as F
# import dgl.function as fn
import networkx as nx
import pandas as pd
import socket
import struct
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
# import seaborn as sns
# import matplotlib.pyplot as plt
import numpy as np
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)

from Datasets.CIC_IDS_2017.CIC_IDS_2017_config import CIC_IDS_2017_Config

In [20]:
train_df = pd.read_csv(os.path.join(project_root, "Datasets", f"CIC_IDS_2017/Train/train_scaled.csv"))
test_df = pd.read_csv(os.path.join(project_root, "Datasets", f"CIC_IDS_2017/Eval/eval_scaled.csv"))

DATASET_NAME = "CIC_IDS_2017"

SOURCE_IP_COL_NAME = CIC_IDS_2017_Config.SOURCE_IP_COL_NAME
DESTINATION_IP_COL_NAME = CIC_IDS_2017_Config.DESTINATION_IP_COL_NAME
SOURCE_PORT_COL_NAME = CIC_IDS_2017_Config.SOURCE_PORT_COL_NAME
DESTINATION_PORT_COL_NAME = CIC_IDS_2017_Config.DESTINATION_PORT_COL_NAME

ATTACK_CLASS_COL_NAME = CIC_IDS_2017_Config.ATTACK_CLASS_COL_NAME

BENIGN_CLASS_NAME = CIC_IDS_2017_Config.BENIGN_CLASS_NAME

TIME_COLS = CIC_IDS_2017_Config.TIME_COL_NAMES

COLS_TO_NORM = CIC_IDS_2017_Config.COLS_TO_NORM
CATEGORICAL_COLS = CIC_IDS_2017_Config.CATEGORICAL_COLS

label_col = ATTACK_CLASS_COL_NAME


print(train_df[ATTACK_CLASS_COL_NAME].value_counts())

checkpoint_path = os.path.join(project_root, "Models/E_GraphSAGE/logs", DATASET_NAME, f"checkpoints.pth")
best_model_path = os.path.join(project_root, "Models/E_GraphSAGE/logs", DATASET_NAME, f"best_model.pth")

os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

Label
DoS Hulk                      196412
BENIGN                        193213
PortScan                      135090
DDoS                          108823
DoS GoldenEye                   8749
FTP-Patator                     6747
SSH-Patator                     5013
DoS slowloris                   4927
DoS Slowhttptest                4674
Bot                             1671
Web Attack - Brute Force        1281
Web Attack - XSS                 554
Infiltration                      31
Web Attack - Sql Injection        18
Heartbleed                         9
Name: count, dtype: int64


In [21]:
train_df[SOURCE_IP_COL_NAME] = train_df[SOURCE_IP_COL_NAME].apply(str)
train_df[DESTINATION_IP_COL_NAME] = train_df[DESTINATION_IP_COL_NAME].apply(str)
train_df[SOURCE_PORT_COL_NAME] = train_df[SOURCE_PORT_COL_NAME].apply(str)
train_df[DESTINATION_PORT_COL_NAME] = train_df[DESTINATION_PORT_COL_NAME].apply(str)

train_df[SOURCE_IP_COL_NAME] = train_df[SOURCE_IP_COL_NAME] + ':' + train_df[SOURCE_PORT_COL_NAME]
train_df[DESTINATION_IP_COL_NAME] = train_df[DESTINATION_IP_COL_NAME] + ':' + train_df[DESTINATION_PORT_COL_NAME]
train_df.drop(columns=[SOURCE_PORT_COL_NAME,DESTINATION_PORT_COL_NAME],inplace=True)

test_df[SOURCE_IP_COL_NAME] = test_df[SOURCE_IP_COL_NAME].apply(str)
test_df[DESTINATION_IP_COL_NAME] = test_df[DESTINATION_IP_COL_NAME].apply(str)
test_df[SOURCE_PORT_COL_NAME] = test_df[SOURCE_PORT_COL_NAME].apply(str)
test_df[DESTINATION_PORT_COL_NAME] = test_df[DESTINATION_PORT_COL_NAME].apply(str)

test_df[SOURCE_IP_COL_NAME] = test_df[SOURCE_IP_COL_NAME] + ':' + test_df[SOURCE_PORT_COL_NAME]
test_df[DESTINATION_IP_COL_NAME] = test_df[DESTINATION_IP_COL_NAME] + ':' + test_df[DESTINATION_PORT_COL_NAME]
test_df.drop(columns=[SOURCE_PORT_COL_NAME,DESTINATION_PORT_COL_NAME],inplace=True)

In [22]:
print(train_df.head)

<bound method NDFrame.head of          index            Source IP       Destination IP            Timestamp  \
0       422713  192.168.10.17:46076    54.192.37.109:443        5/7/2017 2:31   
1       259815  192.168.10.14:62176     52.16.178.12:443        4/7/2017 4:34   
2       271728     172.16.0.1:58056   192.168.10.50:8087        7/7/2017 3:22   
3       122292     172.16.0.1:52212     192.168.10.50:80        7/7/2017 4:08   
4        96161  192.168.10.12:58038    52.84.141.138:443  03/07/2017 11:03:33   
...        ...                  ...                  ...                  ...   
667207  337204  192.168.10.12:54662    192.82.242.50:443  03/07/2017 09:07:08   
667208  193134     172.16.0.1:44704   192.168.10.50:1123        7/7/2017 2:55   
667209  261086   192.168.10.3:60951      192.168.10.1:53  03/07/2017 02:57:10   
667210    4218   34.227.133.130:443  192.168.10.16:47036  03/07/2017 09:16:16   
667211  140962    192.243.232.58:80    192.168.10.9:4008       6/7/2017 11:51  

In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
attack_labels_train = le.fit_transform(train_df[ATTACK_CLASS_COL_NAME])
attack_labels_test = le.fit_transform(test_df[ATTACK_CLASS_COL_NAME])
class_map = le.classes_
print(class_map)
print("Attack label mapping:", dict(zip(class_map, range(len(class_map)))))
train_df[ATTACK_CLASS_COL_NAME] = attack_labels_train
test_df[ATTACK_CLASS_COL_NAME] = attack_labels_test
num_classes = len(class_map)
class_dict = {le.inverse_transform([i])[0]: i for i in range(len(le.classes_))}


['BENIGN' 'Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web Attack - Brute Force' 'Web Attack - Sql Injection'
 'Web Attack - XSS']
Attack label mapping: {'BENIGN': 0, 'Bot': 1, 'DDoS': 2, 'DoS GoldenEye': 3, 'DoS Hulk': 4, 'DoS Slowhttptest': 5, 'DoS slowloris': 6, 'FTP-Patator': 7, 'Heartbleed': 8, 'Infiltration': 9, 'PortScan': 10, 'SSH-Patator': 11, 'Web Attack - Brute Force': 12, 'Web Attack - Sql Injection': 13, 'Web Attack - XSS': 14}


In [24]:
train_df.drop(columns=TIME_COLS, inplace=True)
test_df.drop(columns=TIME_COLS, inplace=True)

categorical_cols = converted_categorical_cols = [col for col in train_df.columns if col.startswith(tuple(CATEGORICAL_COLS))]
feature_cols = categorical_cols + COLS_TO_NORM

print('Feature Columns:', feature_cols)

train_df['h'] = train_df[ feature_cols ].values.tolist()
test_df['h'] = test_df[ feature_cols ].values.tolist()

# X_train = train_df.drop(columns=[label_col])
# X_val = val_df.drop(columns=[label_col])
# X_test = test_df.drop(columns=[label_col])

y_train = train_df[label_col]
y_test = test_df[label_col]

pd.set_option('display.max_columns', None)

print("Number of training samples:", len(train_df))
print(y_train.value_counts())
print("Number of test samples:", len(test_df))
print(y_test.value_counts())

print(train_df.head(5))

Feature Columns: ['Protocol_0', 'Protocol_6', 'Protocol_17', 'Bwd Packet Length Min', 'Subflow Fwd Packets', 'Total Length of Fwd Packets', 'Fwd Packet Length Mean', 'Total Length of Fwd Packets', 'Fwd Packet Length Std', 'Fwd IAT Min', 'Flow IAT Min', 'Flow IAT Mean', 'Bwd Packet Length Std', 'Subflow Fwd Bytes', 'Flow Duration', 'Flow IAT Std', 'Active Min', 'Active Mean', 'Bwd IAT Mean', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'ACK Flag Count', 'Fwd PSH Flags', 'SYN Flag Count', 'Flow Packets/s', 'PSH Flag Count', 'Average Packet Size']


Number of training samples: 667212
Label
4     196412
0     193213
10    135090
2     108823
3       8749
7       6747
11      5013
6       4927
5       4674
1       1671
12      1281
14       554
9         31
13        18
8          9
Name: count, dtype: int64
Number of test samples: 117744
Label
4     34661
0     34097
10    23840
2     19204
3      1544
7      1191
11      884
6       869
5       825
1       295
12      226
14       98
9         5
13        3
8         2
Name: count, dtype: int64
    index            Source IP      Destination IP  Flow Duration  \
0  422713  192.168.10.17:46076   54.192.37.109:443       2.340240   
1  259815  192.168.10.14:62176    52.16.178.12:443      -0.601462   
2  271728     172.16.0.1:58056  192.168.10.50:8087      -0.601461   
3  122292     172.16.0.1:52212    192.168.10.50:80      -0.400280   
4   96161  192.168.10.12:58038   52.84.141.138:443      -0.493254   

   Total Fwd Packets  Total Backward Packets  Total Length of Fwd Packets  \
0  

In [25]:
def create_graph(df, source_ip_col, destination_ip_col, edge_attr, create_using=nx.MultiDiGraph(), **kwargs):
    G_nx = nx.from_pandas_edgelist(df, source_ip_col, destination_ip_col, edge_attr, create_using=create_using, **kwargs)
    G_pyg = from_networkx(G_nx)

    num_nodes = G_pyg.num_nodes
    num_edges = G_pyg.num_edges

    G_pyg.x = th.ones(num_nodes, len(df['h'].iloc[0])) 

    edge_attr_list = []
    edge_label_list = []

    for u, v, key, data in G_nx.edges(keys=True, data=True):
        edge_attr_list.append(data['h']) 
        edge_label_list.append(data[label_col]) 

    G_pyg.edge_attr = th.tensor(edge_attr_list, dtype=th.float32)
    G_pyg.edge_label = th.tensor(edge_label_list, dtype=th.long)

    print("Number of edges in G_pyg:", num_edges)
    print("Number of node in G_pyg:", num_nodes)
    print("Shape of node in G_pyg:", G_pyg.x.shape)
    print("Shape of edge attr in G_pyg:", G_pyg.edge_attr.shape)
    print("Shape of edge label in G_pyg:", G_pyg.edge_label.shape)

    return G_nx, G_pyg

In [26]:
G_nx_train, G_pyg_train = create_graph(train_df, SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ['h', label_col], create_using=nx.MultiDiGraph())
G_nx_test, G_pyg_test = create_graph(test_df, SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ['h', label_col], create_using=nx.MultiDiGraph())

Number of edges in G_pyg: 667212
Number of node in G_pyg: 183659
Shape of node in G_pyg: torch.Size([183659, 27])
Shape of edge attr in G_pyg: torch.Size([667212, 27])
Shape of edge label in G_pyg: torch.Size([667212])
Number of edges in G_pyg: 117744
Number of node in G_pyg: 61964
Shape of node in G_pyg: torch.Size([61964, 27])
Shape of edge attr in G_pyg: torch.Size([117744, 27])
Shape of edge label in G_pyg: torch.Size([117744])


In [27]:
class EGraphSAGEConv(MessagePassing):
    def __init__(self, node_in_channels, edge_in_channels, out_channels):
        super(EGraphSAGEConv, self).__init__(aggr='mean')  # mean aggregation
        self.lin_node = nn.Linear(node_in_channels, out_channels)
        self.lin_edge = nn.Linear(edge_in_channels, out_channels)
        self.lin_update = nn.Linear(node_in_channels + out_channels, out_channels) # out_channels * 2

    def forward(self, x, edge_index, edge_attr):
        # x: Node features, edge_attr: Edge features, edge_index: Connectivity
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        if edge_attr is not None:
            if edge_attr.size(0) != edge_index.size(1):
                loop_attr = th.zeros((edge_index.size(1) - edge_attr.size(0), edge_attr.size(1))).to(edge_attr.device)
                edge_attr = th.cat([edge_attr, loop_attr], dim=0)
        else:
            print("edge_attr is unexist")
        
        # Propagate and aggregate neighbor information
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        # x_j represents the adjacent nodes of x
        # Compute messages by combining node and edge features
        return self.lin_node(x_j) + self.lin_edge(edge_attr)

    def update(self, aggr_out, x):
        # Update node features after message passing
        return self.lin_update(th.cat([x, aggr_out], dim=1))

class MLPPredictor(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MLPPredictor, self).__init__()
        self.lin = nn.Linear(in_channels * 2, out_channels)

    def forward(self, data, z):
        row, col = data.edge_index
        # Concatenate the features of source and target nodes for each edge
        edge_feat = th.cat([z[row], z[col]], dim=1)
        return self.lin(edge_feat)

class EGraphSAGE(nn.Module):
    def __init__(self, node_in_channels, edge_in_channels, hidden_channels, out_channels):
        super(EGraphSAGE, self).__init__()
        self.conv1 = EGraphSAGEConv(node_in_channels, edge_in_channels, hidden_channels)
        self.conv2 = EGraphSAGEConv(hidden_channels, edge_in_channels, hidden_channels)
        self.mlp_predictor = MLPPredictor(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv2(x, edge_index, edge_attr)
        return self.mlp_predictor(data, x)

In [28]:
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
print(device)

cuda:0


In [29]:
th.cuda.empty_cache()

In [30]:


def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0)



In [None]:
# Extract the best parameters from the grid search
best_hidden_dim = 256  # Replace with the best hidden_dim found
best_learning_rate = 0.005  # Replace with the best learning_rate found

# Create the graph for the entire training dataset
G_nx_train_full, G_pyg_train_full = create_graph(train_df, SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ['h', label_col], create_using=nx.MultiDiGraph())

# Initialize the model with the best parameters
model = EGraphSAGE(node_in_channels=G_pyg_train_full.num_node_features,
                   edge_in_channels=G_pyg_train_full.num_edge_features,
                   hidden_channels=best_hidden_dim,
                   out_channels=num_classes).to(device)

model.apply(init_weights)

# Compute class weights for the training dataset
labels = G_pyg_train_full.edge_label.cpu().numpy()
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(labels),
                                                  y=labels)

# Normalize class weights
class_weights = class_weights / np.mean(class_weights)
class_weights = th.FloatTensor(class_weights).to(device)
print("Class weights:", class_weights)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = th.optim.Adam(model.parameters(), lr=best_learning_rate)

# Move the graph data to the device
G_pyg_train_full.to(device)
G_pyg_train_full.edge_label = G_pyg_train_full.edge_label.to(device)
G_pyg_train_full.edge_attr = G_pyg_train_full.edge_attr.to(device)

best_f1 = 0
best_model_state = None

# Load checkpoint if exists
start_epoch = 0
epochs = 5000

if os.path.exists(checkpoint_path):
    checkpoint = th.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    lowest_loss = checkpoint['lowest_loss']
    print(f"Resumed training from epoch {start_epoch}")

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    out = model(G_pyg_train_full)
    loss = criterion(out, G_pyg_train_full.edge_label)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

    # Save the best model based on the lowest loss
    if loss.item() < lowest_loss:  # Here, lowest_loss is used to track the lowest loss
        lowest_loss = loss.item()
        best_model_state = model.state_dict()
        th.save(best_model_state, best_model_path)
        print("Saved best model. Lowest Loss:", lowest_loss)

    # Save checkpoint
    th.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'lowest_loss': lowest_loss
    }, checkpoint_path)

# Save the trained model
print("Model training completed and saved.")

Number of edges in G_pyg: 667212
Number of node in G_pyg: 183659
Shape of node in G_pyg: torch.Size([183659, 27])
Shape of edge attr in G_pyg: torch.Size([667212, 27])
Shape of edge label in G_pyg: torch.Size([667212])
Class weights: tensor([3.8241e-04, 4.4217e-02, 6.7896e-04, 8.4451e-03, 3.7618e-04, 1.5808e-02,
        1.4996e-02, 1.0951e-02, 8.2096e+00, 2.3834e+00, 5.4694e-04, 1.4739e-02,
        5.7679e-02, 4.1048e+00, 1.3337e-01], device='cuda:0')
Resumed training from epoch 5000
Epoch 0, Loss: 1.2288
Epoch 100, Loss: 1.2132


In [None]:
th.save(model.state_dict(), best_model_path)

In [None]:
from torch_geometric.utils import subgraph
from torch_geometric.data import Data
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

def eval(model, dataframe, adversarial=False):
    G_nx_test = nx.from_pandas_edgelist(dataframe, SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ['h', label_col], create_using=nx.MultiDiGraph())

    G_pyg_test = from_networkx(G_nx_test)

    test_num_nodes = G_pyg_test.num_nodes
    test_num_edges = G_pyg_test.num_edges

    G_pyg_test.x = th.ones(test_num_nodes, len(test_df['h'].iloc[0]))

    test_edge_attr_list = []
    test_edge_label_list = []

    for u, v, key, data in G_nx_test.edges(keys=True, data=True):
        test_edge_attr_list.append(data['h']) 
        test_edge_label_list.append(data[label_col]) 

    G_pyg_test.edge_attr = th.tensor(test_edge_attr_list, dtype=th.float32)
    G_pyg_test.edge_label = th.tensor(test_edge_label_list, dtype=th.long)

    G_pyg_test = G_pyg_test.to(device)
    G_pyg_test.edge_label = G_pyg_test.edge_label.to(device)
    G_pyg_test.edge_attr = G_pyg_test.edge_attr.to(device)

    print("Number of edges in G_pyg_test:", G_pyg_test.num_edges)
    print("Number of node in G_pyg_test:", G_pyg_test.num_nodes)
    print("Shape of node in G_pyg_test:", G_pyg_test.x.shape)
    print("Shape of edge attr in G_pyg_test:", G_pyg_test.edge_attr.shape)
    print("Shape of edge label in G_pyg_test:", G_pyg_test.edge_label.shape)

    model.eval()

    print("inference start")
    with th.no_grad():
            
        try:
            out = model(G_pyg_test)
            
        except Exception as forward_error:
            print(f"Error during forward/backward pass at {forward_error}")

    print("inference done")

    test_accuracy = compute_accuracy(out, G_pyg_test.edge_label)
    print(f'Test Accuracy: {test_accuracy:.4f}')
    
    pred_labels = out.argmax(dim=1).cpu()
    all_test_labels = G_pyg_test.edge_label.cpu()

    
    global class_map
    class_map_2 = class_map
    if adversarial:
        class_map_2 = np.append(class_map, "Adversarial")

    # Generate a report
    cm = confusion_matrix(all_test_labels, pred_labels, labels=range(len(class_map_2)))
    print(cm)
    report = classification_report(all_test_labels, pred_labels, target_names=class_map_2, digits=4)
    print(report)

eval(model, test_df)


Number of edges in G_pyg_test: 117744
Number of node in G_pyg_test: 61964
Shape of node in G_pyg_test: torch.Size([61964, 27])
Shape of edge attr in G_pyg_test: torch.Size([117744, 27])
Shape of edge label in G_pyg_test: torch.Size([117744])
inference start
inference done
Test Accuracy: 0.5015
[[33603    46     0     0     2     4     1    87     0    28    65   255
      1     5     0]
 [    1   294     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [    0     0     0     0     2     4    94     0     0     0     0     0
     48 18950   106]
 [    0     0     0     0     0     2     7     0     0     0     0     0
      1  1524    10]
 [    5     0     0     0     7    41   327     0     0     0     0     0
    133 33831   317]
 [    0     0     0     0     1     4     7     0     0     0     0     0
      3   804     6]
 [    2     0     0     0     0     0    17     0     0     0     0     0
      7   830    13]
 [    1     0     0     0     0     0   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def attack_attacker(dataframe, ratio, num_injected_nodes):
    attack_eval = dataframe[dataframe[label_col] != class_dict[BENIGN_CLASS_NAME]]
    num_injected = int(ratio * len(attack_eval))

    # Sample attack rows
    sampled_attack_flows = attack_eval.sample(n=num_injected, random_state=42).copy().reset_index(drop=True)
    
    injected_rows = sampled_attack_flows.copy()
    print("Sampled attack flows:", len(sampled_attack_flows))
    print("Labels of sampled attack flows:", sampled_attack_flows[label_col].value_counts())

    node_ips = [f"192.168.1.{i+1}" for i in range(num_injected_nodes)]
    injected_rows[DESTINATION_IP_COL_NAME] = injected_rows[SOURCE_IP_COL_NAME] # Target the Real Attacker Nodes
    injected_rows[SOURCE_IP_COL_NAME] = [f"{node_ips[i % len(node_ips)]}:{random.randint(1024, 65535)}" for i in range(num_injected)]
    # injected_rows['pkSeqID'] = [f'Injected-{i}' for i in range(num_injected)]
    injected_rows[label_col] = len(class_map) # Assign a new class for injected samples
    print(injected_rows[0:5])

    # Append and reorder
    combined_df = pd.concat([dataframe, injected_rows], ignore_index=True)

    return combined_df

# Inject adversarial samples
attack_attacker_df = attack_attacker(test_df, 0.1, num_injected_nodes=1)
eval(model, attack_attacker_df, adversarial=True)

Sampled attack flows: 8364
Labels of sampled attack flows: Label
4     3529
10    2355
2     1908
3      171
7       96
11      94
6       85
5       73
1       26
12      21
14       5
9        1
Name: count, dtype: int64
    index          Source IP    Destination IP  Flow Duration  \
0   82325  192.168.1.1:36187  172.16.0.1:60552      -0.400526   
1   28126  192.168.1.1:56693  172.16.0.1:52256      -0.592535   
2   92025  192.168.1.1:50101  172.16.0.1:42784      -0.601461   
3  122453  192.168.1.1:59213  172.16.0.1:40022      -0.601462   
4  117569  192.168.1.1:63231  172.16.0.1:60826       1.925325   

   Total Fwd Packets  Total Backward Packets  Total Length of Fwd Packets  \
0                  8                       4                    -0.040966   
1                  3                       6                    -0.045662   
2                  1                       1                    -0.049419   
3                  2                       0                    -0.049732   
4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def normalise_attacker(dataframe, ratio, num_injected_nodes):

    normal_eval = dataframe[dataframe[label_col] == class_dict[BENIGN_CLASS_NAME]]
    attack_eval = dataframe[dataframe[label_col] != class_dict[BENIGN_CLASS_NAME]]
    print("Normal Flows:", len(normal_eval))
    print("Attack Flows:", len(attack_eval))
    num_injected = int(ratio * len(attack_eval))


    sampled_normal_flows = normal_eval.sample(n=num_injected, random_state=42).copy().reset_index(drop=True)
    sampled_attack_flows = attack_eval.sample(n=num_injected, random_state=42).copy().reset_index(drop=True)

    print("Sampled attack flows:", len(sampled_attack_flows))
    print("Labels of sampled attack flows:", sampled_attack_flows[label_col].value_counts())

    injected_rows = sampled_normal_flows.copy()
    node_ips = [f"192.168.1.{i+1}" for i in range(num_injected_nodes)]
    injected_rows[DESTINATION_IP_COL_NAME] = sampled_attack_flows[SOURCE_IP_COL_NAME] # Direct BENGIN Traffic to the Real Attacker Nodes
    injected_rows[SOURCE_IP_COL_NAME] = [f"{node_ips[i % len(node_ips)]}:{random.randint(1024, 65535)}" for i in range(num_injected)]
    injected_rows[label_col] = len(class_map)
    print(injected_rows[0:5])

    combined_df = pd.concat([dataframe, injected_rows], ignore_index=True)

    return combined_df

# Inject adversarial samples
normalise_attacker_df = normalise_attacker(test_df, 0.1, 1)
eval(model, normalise_attacker_df, adversarial=True)

Normal Flows: 34097
Attack Flows: 83647
Sampled attack flows: 8364
Labels of sampled attack flows: Label
4     3529
10    2355
2     1908
3      171
7       96
11      94
6       85
5       73
1       26
12      21
14       5
9        1
Name: count, dtype: int64
    index          Source IP    Destination IP  Flow Duration  \
0   71957  192.168.1.1:30871  172.16.0.1:60552       0.959856   
1  432598   192.168.1.1:8141  172.16.0.1:52256      -0.600359   
2   65183  192.168.1.1:32893  172.16.0.1:42784      -0.601457   
3   77228  192.168.1.1:39870  172.16.0.1:40022      -0.601457   
4  497915  192.168.1.1:38547  172.16.0.1:60826      -0.600100   

   Total Fwd Packets  Total Backward Packets  Total Length of Fwd Packets  \
0                 16                      16                     0.588506   
1                  2                       2                    -0.037835   
2                  2                       2                    -0.040027   
3                  2                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def random_connection(dataframe, ratio, num_injected_nodes):

    normal_eval = dataframe[dataframe[label_col] == class_dict[BENIGN_CLASS_NAME]]

    num_injected = int(ratio * len(dataframe))
    print("injected rows:", num_injected)

    sampled_normal_flows = normal_eval.sample(n=num_injected, random_state=42).copy().reset_index(drop=True)

    injected_rows = sampled_normal_flows.copy()
    node_ips = [f"192.168.1.{i+1}" for i in range(num_injected_nodes)]
    injected_rows[DESTINATION_IP_COL_NAME] = [node_ips[i % len(node_ips)] for i in range(num_injected)]
    injected_rows[SOURCE_IP_COL_NAME] = [node_ips[(i + 1) % len(node_ips)] for i in range(num_injected)]
    injected_rows[label_col] = len(class_map)

    combined_df = pd.concat([dataframe, injected_rows], ignore_index=True)

    return combined_df

# Inject adversarial samples
random_connection_df = random_connection(test_df, 0.1, 1)
eval(model, random_connection_df, adversarial=True)

injected rows: 11774
Number of edges in G_pyg_test: 129518
Number of node in G_pyg_test: 61965
Shape of node in G_pyg_test: torch.Size([61965, 27])
Shape of edge attr in G_pyg_test: torch.Size([129518, 27])
Shape of edge label in G_pyg_test: torch.Size([129518])
inference start
inference done
Test Accuracy: 0.4560
[[33603    46     0     0     2     4     1    87     0    28    65   255
      1     5     0     0]
 [    1   294     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [    0     0     0     0     2     4    94     0     0     0     0     0
     48 18950   106     0]
 [    0     0     0     0     0     2     7     0     0     0     0     0
      1  1524    10     0]
 [    5     0     0     0     7    41   327     0     0     0     0     0
    133 33831   317     0]
 [    0     0     0     0     1     4     7     0     0     0     0     0
      3   804     6     0]
 [    2     0     0     0     0     0    17     0     0     0     0     0
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
