In [5]:
%py

UsageError: Line magic function `%py` not found.


In [17]:
# Импорт необходимых библиотек
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch.nn import BatchNorm1d
import seaborn as sns
import networkx as nx
from matplotlib.backends.backend_pdf import PdfPages

# -----------------------------
# Загрузка и предварительная обработка данных для конкретного типа атаки
# -----------------------------
def load_attack_type(data_dir, attack_type, nrows=10000, n_attacks=20, amount_of_noise=0.3):
    path = os.path.join(data_dir, f"{attack_type}.csv")
    df = pd.read_csv(path, nrows=nrows, low_memory=False)
    df.columns = df.columns.str.strip()
    df['attack_type'] = attack_type
    df['Label'] = df['Label'].map(lambda x: 0 if 'BENIGN' in str(x).upper() else 1)

    # Разделяем на атакующие и нормальные строки
    attack_df = df[df['Label'] == 1].copy()
    benign_df = df[df['Label'] == 0].copy()

    if not attack_df.empty:
        generated_rows = []
        for i in range(1, n_attacks + 1):
            fake_ip = f"10.10.10.{i}"
            new_rows = attack_df.sample(frac=0.3, replace=True, random_state=i).copy()
            new_rows['Source IP'] = fake_ip

            # Добавление шума в числовые признаки
            num_cols = new_rows.select_dtypes(include=[np.number]).columns.difference(['Label'])
            for col in num_cols:
                std_dev = new_rows[col].std()
                noise = np.random.normal(0, amount_of_noise * std_dev if not np.isnan(std_dev) and std_dev > 0 else 1, size=new_rows.shape[0])
                new_rows[col] += noise

            if i % 5 == 0:
                new_rows['Label'] = 0

            generated_rows.append(new_rows)

        attack_df = pd.concat([attack_df] + generated_rows, ignore_index=True)

    df = pd.concat([benign_df, attack_df], ignore_index=True)
    return df.sample(frac=1, random_state=42).reset_index(drop=True)


# Удаление строк с отсутствующими критическими значениями
def preprocess_for_graph(df):
    return df

# -----------------------------
# Построение графа на основе сетевых потоков
# -----------------------------
def build_graph(flow_df):
    flow_df = flow_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Создаем отображения IP в уникальные индексы узлов
    ip_set = pd.unique(flow_df[['Source IP', 'Destination IP']].values.ravel())
    ip_map = {ip: idx for idx, ip in enumerate(ip_set)}
    reverse_ip_map = {idx: ip for ip, idx in ip_map.items()}

    flow_df['src'] = flow_df['Source IP'].map(ip_map)
    flow_df['dst'] = flow_df['Destination IP'].map(ip_map)

    # Получаем числовые признаки для построения векторов узлов
    numeric_cols = flow_df.select_dtypes(include=[np.number]).columns.difference(['src', 'dst', 'Label'])
    node_features = []
    for node in range(len(ip_set)):
        rows = flow_df[flow_df['src'] == node]
        features = rows[numeric_cols].mean().values if not rows.empty else np.zeros(len(numeric_cols))
        node_features.append(features)

    # Собираем DataFrame с признаками узлов
    node_df = pd.DataFrame(node_features, columns=numeric_cols).replace([np.inf, -np.inf], np.nan).fillna(0)

    # Графовые тензоры
    edge_index = torch.tensor(flow_df[['src', 'dst']].values.T, dtype=torch.long)
    edge_attr = torch.tensor(StandardScaler().fit_transform(flow_df['Flow Bytes/s'].values.reshape(-1, 1)), dtype=torch.float)
    x = torch.tensor(StandardScaler().fit_transform(node_df), dtype=torch.float)

    # Выделяем атакующие узлы по IP
    attack_nodes = np.union1d(flow_df[flow_df['Label'] == 1]['src'].unique(), flow_df[flow_df['Label'] == 1]['dst'].unique())
    y = torch.tensor([1 if i in attack_nodes else 0 for i in node_df.index], dtype=torch.float)

    # Сохраняем имена признаков узлов для анализа важности признаков
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    data.node_ip_map = reverse_ip_map
    data.raw_flows = flow_df
    data.feature_names = list(numeric_cols)  # <--- добавили сюда имена признаков
    data.time_series = flow_df[['Timestamp', 'Flow Bytes/s']] if 'Timestamp' in flow_df.columns else None
    return data


# -----------------------------
# Модель на основе сверточной графовой нейросети (GCN)
# -----------------------------
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.conv2(x, edge_index)
        return torch.sigmoid(x).squeeze()


# Обучение модели GCN
def train(data, epochs, hidden_channels=32):
    model = GCN(data.x.shape[1], hidden_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = torch.nn.BCELoss()
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
    return model

# -----------------------------
# Оценка модели
# -----------------------------
def evaluate_model(model, data, threshold = 0.5):
    model.eval()
    with torch.no_grad():
        y_scores = model(data).numpy()
        y_true = data.y.numpy()
        y_pred = (y_scores > threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
    return y_true, y_scores, y_pred, acc, prec, rec, f1

# Построение графиков метрик и матрицы ошибок
def plot_feature_importance(model, data, pdf):
    model.eval()
    x = data.x.clone().detach().requires_grad_(True)  # включаем градиенты по входу
    with torch.enable_grad():
        output = model.forward(Data(x=x, edge_index=data.edge_index))
        output.mean().backward()

    grads = x.grad.abs().mean(dim=0).numpy()
    names = data.feature_names

    # Отбираем топ-10 признаков
    importance = list(zip(names, grads))
    importance.sort(key=lambda x: x[1], reverse=True)
    top_features = importance[:10]
    top_names, top_grads = zip(*top_features)

    # Визуализация
    plt.figure(figsize=(10, 4))
    plt.barh(top_names[::-1], top_grads[::-1])  # отобразим от наибольшего сверху
    plt.title("Top 10 Feature Importances (Gradient-based)")
    plt.xlabel("Average Gradient Magnitude")
    plt.tight_layout()
    pdf.savefig(); plt.close()
# ------------------------------
# График изменения объема трафика во времени
# -----------------------------
def show_time_series(data, pdf):
    if data.time_series is not None:
        df = data.time_series.copy()
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df = df.dropna().sort_values('Timestamp')
        df.set_index('Timestamp')['Flow Bytes/s'].plot(figsize=(10, 4), title="DDoS Behavior Over Time")
        plt.ylabel("Flow Bytes/s")
        plt.grid()
        pdf.savefig(); plt.close()

# -----------------------------
# Отображение топологии графа и выделение подозрительных узлов
# -----------------------------
def show_graph_topology(data, pdf, scores=None, threshold=0.5, title="Graph Topology"):
    G = nx.Graph()
    flows = data.raw_flows
    edges = list(zip(flows['src'], flows['dst']))
    G.add_edges_from(edges)

    if scores is not None:
        color_map = []
        for node in G:
            idx = node
            score = scores[idx] if idx < len(scores) else 0
            if score > threshold:
                color_map.append('red')      # высокая вероятность атаки
            elif score > 0.3:
                color_map.append('orange')   # подозрительный
            else:
                color_map.append('green')    # нормальный
        plt.figure(figsize=(8, 6))
        nx.draw(G, node_color=color_map, with_labels=True, font_size=6, node_size=100)
    else:
        plt.figure(figsize=(8, 6))
        nx.draw(G, node_color='skyblue', with_labels=True, font_size=6, node_size=100)
    plt.title(title)
    pdf.savefig(); plt.close()

# -----------------------------
# Стратегии смягчения последствий DDoS-атак
# -----------------------------
def get_mitigation_actions(data, scores, threshold_blacklist=0.5, threshold_limit=(0.3, 0.5)):
    actions = []
    for node_id, score in enumerate(scores):
        ip = data.node_ip_map.get(node_id, f'Node{node_id}')
        if score > threshold_blacklist:
            actions.append((ip, 'BLACKLIST'))  # заблокировать IP
        elif threshold_limit[0] <= score <= threshold_limit[1]:
            actions.append((ip, 'RATE_LIMIT_10MBPS'))  # ограничить пропускную способность
        elif score > 0.8:
            actions.append((ip, 'SINKHOLE'))  # перенаправить в пустоту
    return actions

def check_unique_ips(df):
    source_ips = df['Source IP'].unique()
    destination_ips = df['Destination IP'].unique()
    all_ips = pd.unique(df[['Source IP', 'Destination IP']].values.ravel())

    print(f"Уникальные IP в Source IP       : {len(source_ips)}")
    print(f"Уникальные IP в Destination IP : {len(destination_ips)}")
    print(f"Уникальные IP всего            : {len(all_ips)}")
    return all_ips

def plot_feature_importance(model, data, pdf):
    model.eval()
    x = data.x.clone().detach().requires_grad_(True)  # включаем градиенты по входу
    with torch.enable_grad():
        output = model.forward(Data(x=x, edge_index=data.edge_index))
        output.mean().backward()

    grads = x.grad.abs().mean(dim=0).numpy()
    names = data.feature_names

    # Отбираем топ-10 признаков
    importance = list(zip(names, grads))
    importance.sort(key=lambda x: x[1], reverse=True)
    top_features = importance[:10]
    top_names, top_grads = zip(*top_features)

    # Визуализация
    plt.figure(figsize=(10, 4))
    plt.barh(top_names[::-1], top_grads[::-1])  # отобразим от наибольшего сверху
    plt.title("Top 10 Feature Importances (Gradient-based)")
    plt.xlabel("Average Gradient Magnitude")
    plt.tight_layout()
    pdf.savefig(); plt.close()

def inject_label_noise(df, flip_fraction=0.1, seed=42):
    np.random.seed(seed)
    attack_indices = df[df['Label'] == 1].index
    n_flip = int(len(attack_indices) * flip_fraction)
    flip_indices = np.random.choice(attack_indices, n_flip, replace=False)
    df.loc[flip_indices, 'Label'] = 0
    return df

# -----------------------------
# Основной цикл: обучение и сохранение отчета для каждого типа атаки
# -----------------------------
data_dir = "01-12"
all_attacks = [
    "DrDoS_DNS", "DrDoS_LDAP", "DrDoS_MSSQL", "DrDoS_NetBIOS",
    "DrDoS_NTP", "DrDoS_SNMP", "DrDoS_SSDP", "DrDoS_UDP",
    "Syn", "TFTP", "UDPLag"
]

with PdfPages("ddos_gnn_report.pdf") as pdf:
    for attack_type in all_attacks:
        print(f"\n=== {attack_type} ===")
        df = load_attack_type(data_dir, attack_type, nrows=100000, n_attacks=20, amount_of_noise=0.3)
        check_unique_ips(df)
        df = preprocess_for_graph(df)

        df = inject_label_noise(df, flip_fraction=0.1)
        
        # Разделяем данные до построения графа
        train_flows, test_flows = train_test_split(df, test_size=0.3, stratify=df['Label'], random_state=42)

        # Строим графы
        train_data = build_graph(train_flows)
        test_data = build_graph(test_flows)

        # Визуализация до атаки
        show_graph_topology(train_data, pdf, title=f"{attack_type}: Pre-Attack Topology")
        show_time_series(train_data, pdf)

        # Обучение и тестирование
        model = train(train_data, epochs=50, hidden_channels=32)
        y_true, y_scores, y_pred, acc, prec, rec, f1 = evaluate_model(model, test_data, threshold=0.5)
        plot_results(y_true, y_scores, y_pred, pdf)

        plot_feature_importance(model, train_data, pdf)

        # Визуализация после атаки
        show_graph_topology(test_data, pdf, scores=y_scores, title=f"{attack_type}: Post-Attack Topology")

        # Страница с итоговыми метриками
        plt.figure()
        plt.axis("off")
        text = f"""
        Attack Type: {attack_type}

        Accuracy : {acc:.4f}
        Precision: {prec:.4f}
        Recall   : {rec:.4f}
        F1 Score : {f1:.4f}
        """
        plt.text(0, 0.5, text, fontsize=12, fontfamily="monospace")
        pdf.savefig(); plt.close()

        # Митигирующие действия
        actions = get_mitigation_actions(test_data, y_scores)
        print("\nMitigation Actions:")
        for ip, action in actions:
            print(f"{ip}: {action}")



=== DrDoS_DNS ===
Уникальные IP в Source IP       : 120
Уникальные IP в Destination IP : 113
Уникальные IP всего            : 139
Accuracy: 0.9669 | Precision: 0.8500 | Recall: 0.9444 | F1: 0.8947

Mitigation Actions:
172.16.0.5: BLACKLIST
192.168.50.1: BLACKLIST
10.10.10.10: RATE_LIMIT_10MBPS
10.10.10.1: BLACKLIST
10.10.10.9: BLACKLIST
10.10.10.14: BLACKLIST
10.10.10.4: BLACKLIST
10.10.10.13: BLACKLIST
10.10.10.5: BLACKLIST
10.10.10.15: BLACKLIST
10.10.10.20: BLACKLIST
10.10.10.19: BLACKLIST
10.10.10.8: BLACKLIST
10.10.10.6: BLACKLIST
10.10.10.12: BLACKLIST
10.10.10.7: BLACKLIST
10.10.10.17: BLACKLIST
10.10.10.16: BLACKLIST
10.10.10.18: BLACKLIST
10.10.10.11: BLACKLIST
10.10.10.3: BLACKLIST

=== DrDoS_LDAP ===
Уникальные IP в Source IP       : 30
Уникальные IP в Destination IP : 9
Уникальные IP всего            : 33
Accuracy: 0.8065 | Precision: 0.8000 | Recall: 0.8889 | F1: 0.8421

Mitigation Actions:
10.10.10.8: BLACKLIST
192.168.50.1: BLACKLIST
10.10.10.14: RATE_LIMIT_10MBPS
10.10

In [10]:
import os
import pandas as pd
import numpy as np
import torch
import sklearn
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, confusion_matrix, precision_recall_curve
)
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch.nn import BatchNorm1d
import seaborn as sns
import networkx as nx
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
#first we import all the necessary libraries
import os
import pandas as pd
import numpy as np
import torch
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, confusion_matrix, precision_recall_curve
)
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch.nn import BatchNorm1d
import seaborn as sns
import networkx as nx
from matplotlib.backends.backend_pdf import PdfPages

#proving brownish/beige palette for non topology plots in order to match with our paper colours
PALETTE = {
    'dark':   '#8b4513',  
    'med':    '#deb887',  
    'light':  '#f5deb3',  
    'accent': '#cd853f'   
}

#label encoding
def load_attack_type(data_dir, attack_type, nrows=10000, n_attacks=20, amount_of_noise=0.3):
    path = os.path.join(data_dir, f"{attack_type}.csv")
    df = pd.read_csv(path, nrows=nrows, low_memory=False)
    df.columns = df.columns.str.strip()  #removing extra spaces in headers
    df['attack_type'] = attack_type      
    df['Label'] = df['Label'].map(lambda x: 0 if 'benign' in str(x).lower() else 1)

    attack_df = df[df['Label'] == 1].copy()  #attack flows
    benign_df = df[df['Label'] == 0].copy()  #benign flows

    if not attack_df.empty:
        generated = []
        for i in range(1, n_attacks + 1):
            fake_ip = f"10.10.10.{i}"
            rows = attack_df.sample(frac=0.3, replace=True, random_state=i).copy()
            rows['Source IP'] = fake_ip

            #gaussian noise to numeric features 
            num_cols = rows.select_dtypes(include=[np.number]).columns.difference(['Label'])
            for col in num_cols:
                vals = rows[col].dropna()
                std_dev = vals.std(ddof=0) if len(vals) > 0 else 0
                sigma = amount_of_noise * std_dev if std_dev > 0 else 1
                rows[col] += np.random.normal(0, sigma, size=rows.shape[0])

            #flipping some labels back to benign
            if i % 5 == 0:
                rows['Label'] = 0

            generated.append(rows)

        attack_df = pd.concat([attack_df] + generated, ignore_index=True)

    df = pd.concat([benign_df, attack_df], ignore_index=True)
    return df.sample(frac=1, random_state=42).reset_index(drop=True)

#placeholder for any df cleaning steps
def preprocess_for_graph(df):
    return df

#convertinh flow dataframe into a torch_geometric data object
def build_graph(flow_df):
    #replacing inf and nans with 0s
    flow_df = flow_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    #mapping each ip to a unique node index
    ip_set = pd.unique(flow_df[['Source IP', 'Destination IP']].values.ravel())
    ip_map = {ip: idx for idx, ip in enumerate(ip_set)}
    rev_map = {idx: ip for ip, idx in ip_map.items()}

    flow_df['src'] = flow_df['Source IP'].map(ip_map)
    flow_df['dst'] = flow_df['Destination IP'].map(ip_map)

    #building node feature matrix by avg numeric cols per node
    num_cols = flow_df.select_dtypes(include=[np.number]).columns.difference(['src', 'dst', 'Label'])
    node_features = []
    for node in range(len(ip_set)):
        sub = flow_df[flow_df['src'] == node]
        feat = sub[num_cols].mean().values if not sub.empty else np.zeros(len(num_cols))
        node_features.append(feat)
    node_df = pd.DataFrame(node_features, columns=num_cols).fillna(0)

    #edge_index/edge_attr tensors
    edge_index = torch.tensor(flow_df[['src', 'dst']].values.T, dtype=torch.long)
    edge_attr = torch.tensor(
        StandardScaler().fit_transform(flow_df['Flow Bytes/s'].values.reshape(-1, 1)),
        dtype=torch.float
    )
    x = torch.tensor(StandardScaler().fit_transform(node_df), dtype=torch.float)

    #labeling nodes as attack if any incident flow has Label=1
    attack_nodes = np.union1d(
        flow_df[flow_df['Label'] == 1]['src'].unique(),
        flow_df[flow_df['Label'] == 1]['dst'].unique()
    )
    y = torch.tensor([1 if i in attack_nodes else 0 for i in range(len(node_df))], dtype=torch.float)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    data.node_ip_map = rev_map             #mapping back to IP
    data.raw_flows = flow_df               #keeping raw flow df for plotting
    data.feature_names = list(num_cols)    #we save feature names for importance
    data.time_series = flow_df[['Timestamp', 'Flow Bytes/s']] \
        if 'Timestamp' in flow_df.columns else None
    return data

#defining gnn model
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.conv2(x, edge_index)
        return torch.sigmoid(x).squeeze()

#training
def train(data, epochs, hidden_channels=32):
    model = GCN(data.x.shape[1], hidden_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = torch.nn.BCELoss()
    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
    return model

#metrics evaluation
def evaluate_model(model, data, threshold=0.5):
    model.eval()
    with torch.no_grad():
        scores = model(data).numpy()
        true = data.y.numpy()
        pred = (scores > threshold).astype(int)

    acc = accuracy_score(true, pred)
    prec = precision_score(true, pred, zero_division=0)
    rec = recall_score(true, pred, zero_division=0)
    f1 = f1_score(true, pred, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(true, pred).ravel()
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"tp: {tp}, fp: {fp}, tn: {tn}, fn: {fn}")
    print(f"accuracy: {acc:.4f}, precision: {prec:.4f}, recall: {rec:.4f}, f1: {f1:.4f}")
    print(f"tpr: {tpr:.4f}, fpr: {fpr:.4f}")

    return true, scores, pred, acc, prec, rec, f1, tp, fp, tn, fn

#plot feature importances 
def plot_feature_importance(model, data, pdf):
    model.eval()
    x = data.x.clone().detach().requires_grad_(True)
    with torch.enable_grad():
        out = model(Data(x=x, edge_index=data.edge_index))
        out.mean().backward()
    grads = x.grad.abs().mean(dim=0).numpy()
    names = data.feature_names
    imp = sorted(zip(names, grads), key=lambda z: z[1], reverse=True)[:10]
    top_names, top_grads = zip(*imp)

    plt.figure(figsize=(10, 4))
    plt.barh(top_names[::-1], top_grads[::-1], color=PALETTE['med'])
    plt.title("top10 feature importances")
    plt.xlabel("avg gradient magnitude")
    plt.tight_layout()
    pdf.savefig(); plt.close()

#roc and precision recall curves
def plot_roc_pr(y_true, y_scores, pdf):
    fpr_vals, tpr_vals, _ = roc_curve(y_true, y_scores)
    auc_val = auc(fpr_vals, tpr_vals)
    plt.figure()
    plt.plot(fpr_vals, tpr_vals, color=PALETTE['dark'], label=f"auc={auc_val:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color=PALETTE['light'])
    plt.xlabel("fpr"); plt.ylabel("tpr"); plt.title("roc curve"); plt.legend()
    pdf.savefig(); plt.close()

    prec_vals, rec_vals, _ = precision_recall_curve(y_true, y_scores)
    plt.figure()
    plt.plot(rec_vals, prec_vals, color=PALETTE['accent'])
    plt.xlabel("recall"); plt.ylabel("precision"); plt.title("precision recall curve")
    pdf.savefig(); plt.close()

#showing how tpr/fpr/precision vary with threshold
def plot_threshold_metrics(y_true, y_scores, pdf):
    thresholds = np.linspace(0, 1, 100)
    tprs, fprs, precs = [], [], []
    for t in thresholds:
        p = (y_scores >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, p).ravel()
        tprs.append(tp / (tp + fn) if tp + fn > 0 else 0)
        fprs.append(fp / (fp + tn) if fp + tn > 0 else 0)
        precs.append(precision_score(y_true, p, zero_division=0))
    plt.figure()
    plt.plot(thresholds, tprs, label="tpr", color=PALETTE['dark'])
    plt.plot(thresholds, fprs, label="fpr", color=PALETTE['med'])
    plt.plot(thresholds, precs, label="precision", color=PALETTE['accent'])
    plt.xlabel("threshold"); plt.ylabel("rate"); plt.title("threshold vs metrics"); plt.legend()
    pdf.savefig(); plt.close()

#heatmap
def plot_confusion_heatmap(y_true, y_pred, pdf):
    cm = confusion_matrix(y_true, y_pred)
    cmap = sns.light_palette(PALETTE['dark'], as_cmap=True)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap=cmap,
                xticklabels=["benign", "attack"],
                yticklabels=["benign", "attack"])
    plt.title("confusion matrix"); plt.xlabel("predicted"); plt.ylabel("actual")
    pdf.savefig(); plt.close()

#plot flow bytes/s over time
def show_time_series(data, pdf):
    if data.time_series is not None:
        df = data.time_series.copy()
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df = df.dropna().sort_values('Timestamp')
        plt.figure(figsize=(10, 4))
        plt.plot(df['Timestamp'], df['Flow Bytes/s'], color=PALETTE['accent'])
        plt.title("ddos behavior over time"); plt.ylabel("flow bytes/s"); plt.grid()
        pdf.savefig(); plt.close()

#drawing the netraph n highlighting suspicious nodes
def show_graph_topology(data, pdf, scores=None, threshold=0.5, title="graph topology"):
    G = nx.Graph()
    flows = data.raw_flows
    G.add_edges_from(zip(flows['src'], flows['dst']))

    if scores is not None:
        color_map = []
        for node in G:
            s = scores[node] if node < len(scores) else 0
            if s > threshold:
                color_map.append('red')
            elif s > 0.3:
                color_map.append('orange')
            else:
                color_map.append('green')
        plt.figure(figsize=(8, 6))
        nx.draw(G, node_color=color_map, with_labels=True, font_size=6, node_size=100)
    else:
        plt.figure(figsize=(8, 6))
        nx.draw(G, node_color='brown', with_labels=True, font_size=6, node_size=100)

    plt.title(title)
    pdf.savefig(); plt.close()

#getting mitigation actions where we decide to block/limit/sinkhole based on scores obtained
def get_mitigation_actions(data, scores, threshold_blacklist=0.5, threshold_limit=(0.3, 0.5)):
    actions = []
    for nid, sc in enumerate(scores):
        ip = data.node_ip_map.get(nid, f'node{nid}')
        if sc > threshold_blacklist:
            actions.append((ip, 'blacklist'))
        elif threshold_limit[0] <= sc <= threshold_limit[1]:
            actions.append((ip, 'rate limit 10mbps'))
        elif sc > 0.8:
            actions.append((ip, 'sinkhole'))
    return actions

#printing counts of unique source/dest IPs
def check_unique_ips(df):
    s = len(df['Source IP'].unique())
    d = len(df['Destination IP'].unique())
    a = len(pd.unique(df[['Source IP', 'Destination IP']].values.ravel()))
    print(f"unique ips in source ip       : {s}")
    print(f"unique ips in dest ip         : {d}")
    print(f"total unique ips              : {a}")
    return a

#inject label noise method 

def inject_label_noise(df, flip_fraction=0.1, seed=42):
    np.random.seed(seed)
    idxs = df[df['Label'] == 1].index
    n_flip = int(len(idxs) * flip_fraction)
    flips = np.random.choice(idxs, n_flip, replace=False)
    df.loc[flips, 'Label'] = 0
    return df
    
#datapath to file, one of the main parts 

data_dir = "01-12"
all_attacks = [
    "DrDoS_DNS", "DrDoS_LDAP", "DrDoS_MSSQL", "DrDoS_NetBIOS",
    "DrDoS_NTP", "DrDoS_SNMP", "DrDoS_SSDP", "DrDoS_UDP",
    "Syn", "TFTP", "UDPLag"
]

with PdfPages("visualizationrep.pdf") as pdf:
    for atk in all_attacks:
        print(f"\n{atk}\n")
        df = load_attack_type(data_dir, atk, nrows=200000, n_attacks=50, amount_of_noise=0.15)
        check_unique_ips(df)
        df = preprocess_for_graph(df)
        df = inject_label_noise(df, flip_fraction=0.05)

        #spliting into train/test flows
        train_flows, test_flows = train_test_split(
            df, test_size=0.3, stratify=df['Label'], random_state=42
        )

        train_data = build_graph(train_flows)
        test_data = build_graph(test_flows)

        #visualization before attack pre
        show_graph_topology(train_data, pdf, title=f"{atk}: pre atk topology")
        show_time_series(train_data, pdf)

        #training and evaluating
        model = train(train_data, epochs=50, hidden_channels=32)
        y_true, y_scores, y_pred, acc, prec, rec, f1, tp, fp, tn, fn = evaluate_model(
            model, test_data
        )

        #some performance plots
        plot_roc_pr(y_true, y_scores, pdf)
        plot_threshold_metrics(y_true, y_scores, pdf)
        plot_confusion_heatmap(y_true, y_pred, pdf)
        plot_feature_importance(model, train_data, pdf)

        #post attck visualization
        show_graph_topology(test_data, pdf, scores=y_scores, title=f"{atk}: post atk topology")

        #summary
        plt.figure()
        plt.axis('off')
        summary = (
            f"attack: {atk}\n\n"
            f"accuracy: {acc:.4f}  precision: {prec:.4f}\n"
            f"recall: {rec:.4f}  f1: {f1:.4f}\n"
            f"tp: {tp}  fp: {fp}  tn: {tn}  fn: {fn}"
        )
        plt.text(0, 0.5, summary, fontsize=12, fontfamily="monospace")
        pdf.savefig(); plt.close()

        #mitigation actions
        actions = get_mitigation_actions(test_data, y_scores)
        print("\nmitigation actions:")
        for ip, act in actions:
            print(f"{ip}: {act}")



DrDoS_DNS

unique ips in source ip       : 152
unique ips in dest ip         : 113
total unique ips              : 169
tp: 38, fp: 11, tn: 97, fn: 4
accuracy: 0.9000, precision: 0.7755, recall: 0.9048, f1: 0.8352
tpr: 0.9048, fpr: 0.1019

mitigation actions:
10.10.10.23: rate limit 10mbps
192.168.50.1: blacklist
10.10.10.41: blacklist
10.10.10.11: blacklist
10.10.10.17: blacklist
10.10.10.39: blacklist
172.16.0.5: blacklist
10.10.10.9: blacklist
10.10.10.15: blacklist
10.10.10.42: blacklist
10.10.10.28: blacklist
10.10.10.7: blacklist
10.10.10.8: blacklist
10.10.10.1: blacklist
10.10.10.19: blacklist
10.10.10.24: blacklist
10.10.10.16: blacklist
10.10.10.3: blacklist
10.10.10.38: blacklist
10.10.10.6: blacklist
10.10.10.18: blacklist
10.10.10.31: blacklist
10.10.10.45: blacklist
10.10.10.46: blacklist
10.10.10.37: blacklist
10.10.10.27: rate limit 10mbps
10.10.10.36: blacklist
10.10.10.14: blacklist
10.10.10.2: blacklist
10.10.10.4: blacklist
10.10.10.44: blacklist
10.10.10.12: blackl

In [12]:
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"torch version: {torch.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
print(f"seaborn version: {sns.__version__}")
print(f"networkx version: {nx.__version__}")


pandas version: 2.2.2
numpy version: 1.26.4
torch version: 2.2.2+cpu
scikit-learn version: 1.5.2
matplotlib version: 3.9.2
seaborn version: 0.13.2
networkx version: 3.4.2
