In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset

# ============================
# Step 1: Load CICDDoS-2019 Parquet Files
# ============================

folder_path = "D:/federated learning/FL-IDS-Intrusion-Detection/ciciddos-2019/"  # <-- update path

# Collect all parquet files
all_files = [f for f in os.listdir(folder_path) if f.endswith(".parquet")]

train_files = [f for f in all_files if "training" in f.lower()]
test_files = [f for f in all_files if "testing" in f.lower()]

print("Training files:", train_files)
print("Testing files:", test_files)

# Load training files
df_train_list = []
for file in train_files:
    file_path = os.path.join(folder_path, file)
    print(f"Loading training file: {file}")
    df_train_list.append(pd.read_parquet(file_path))

df_train = pd.concat(df_train_list, axis=0, ignore_index=True)

# Load testing files
df_test_list = []
for file in test_files:
    file_path = os.path.join(folder_path, file)
    print(f"Loading testing file: {file}")
    df_test_list.append(pd.read_parquet(file_path))

df_test = pd.concat(df_test_list, axis=0, ignore_index=True)

print(f"Training shape: {df_train.shape}")
print(f"Testing shape: {df_test.shape}")

# ============================
# Step 2: Preprocessing
# ============================

# Drop NA values
df_train = df_train.dropna()
df_test = df_test.dropna()

# Map labels: BENIGN = 0, others = 1 (binary classification)
df_train['Label'] = df_train['Label'].apply(lambda x: 0 if x.upper() == 'BENIGN' else 1)
df_test['Label'] = df_test['Label'].apply(lambda x: 0 if x.upper() == 'BENIGN' else 1)

# Split features & labels
X_train = df_train.drop(columns=['Label']).values
y_train = df_train['Label'].values

X_test = df_test.drop(columns=['Label']).values
y_test = df_test['Label'].values

# Replace inf/nan
X_train = np.where(np.isinf(X_train), np.nan, X_train)
X_test = np.where(np.isinf(X_test), np.nan, X_test)

train_means = np.nanmean(X_train, axis=0)
test_means = np.nanmean(X_test, axis=0)

inds_train = np.where(np.isnan(X_train))
inds_test = np.where(np.isnan(X_test))

X_train[inds_train] = np.take(train_means, inds_train[1])
X_test[inds_test] = np.take(test_means, inds_test[1])

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# ============================
# Step 3: Non-IID Split (5 clients)
# ============================

def split_noniid_data(X, y, num_clients):
    non_iid_data = []
    unique_labels = np.unique(y)
    label_indices = {label: np.where(y == label)[0] for label in unique_labels}

    for client_id in range(num_clients):
        client_data_indices = []
        for label in unique_labels:
            num_samples = int(len(label_indices[label]) / num_clients)
            if num_samples > 0:
                selected_indices = np.random.choice(label_indices[label], num_samples, replace=False)
                client_data_indices.extend(selected_indices)
                label_indices[label] = np.setdiff1d(label_indices[label], selected_indices)

        client_data_X = X[client_data_indices]
        client_data_y = y[client_data_indices]
        non_iid_data.append((client_data_X, client_data_y))

    return non_iid_data

num_clients = 5
client_data_splits = split_noniid_data(X_train, y_train, num_clients)

# Convert each client's data to TensorDataset
client_datasets = []
for client_data_X, client_data_y in client_data_splits:
    client_X_tensor = torch.tensor(client_data_X, dtype=torch.float32)
    client_y_tensor = torch.tensor(client_data_y, dtype=torch.long)
    client_datasets.append(TensorDataset(client_X_tensor, client_y_tensor))

# ============================
# Step 4: Verification
# ============================

for i, dataset in enumerate(client_datasets):
    print(f"Client {i+1} data size: {len(dataset)} samples")

print(f"Test dataset size: {len(test_dataset)} samples")


Training files: ['LDAP-training.parquet', 'MSSQL-training.parquet', 'NetBIOS-training.parquet', 'Portmap-training.parquet', 'Syn-training.parquet', 'UDP-training.parquet', 'UDPLag-training.parquet']
Testing files: ['LDAP-testing.parquet', 'MSSQL-testing.parquet', 'NetBIOS-testing.parquet', 'NTP-testing.parquet', 'SNMP-testing.parquet', 'Syn-testing.parquet', 'TFTP-testing.parquet', 'UDP-testing.parquet', 'UDPLag-testing.parquet']
Loading training file: LDAP-training.parquet
Loading training file: MSSQL-training.parquet
Loading training file: NetBIOS-training.parquet
Loading training file: Portmap-training.parquet
Loading training file: Syn-training.parquet
Loading training file: UDP-training.parquet
Loading training file: UDPLag-training.parquet
Loading testing file: LDAP-testing.parquet
Loading testing file: MSSQL-testing.parquet
Loading testing file: NetBIOS-testing.parquet
Loading testing file: NTP-testing.parquet
Loading testing file: SNMP-testing.parquet
Loading testing file: Syn-

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import copy
from torch.utils.data import DataLoader
import torch.nn.functional as F
from opacus import PrivacyEngine
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
from transformers import DistilBertModel, DistilBertConfig

class TabularFeatureEmbedder(nn.Module):
    def __init__(self, input_dim=41, seq_len=41, embed_dim=768):
        super(TabularFeatureEmbedder, self).__init__()
        self.embedding = nn.Linear(input_dim, seq_len * embed_dim)
        self.seq_len = seq_len
        self.embed_dim = embed_dim

    def forward(self, x):
        x = self.embedding(x)
        return x.view(-1, self.seq_len, self.embed_dim)

class DistilBERTIntrusionClassifier(nn.Module):
    def __init__(self, hidden_size=768, output_size=2):
        super(DistilBERTIntrusionClassifier, self).__init__()
        config = DistilBertConfig(
            dim=hidden_size,
            hidden_dim=1024,
            n_layers=4,
            n_heads=4,
            dropout=0.1,
            attention_dropout=0.1
        )
        self.bert = DistilBertModel(config)
        self.classifier = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        attention_mask = torch.ones(x.size(0), x.size(1)).to(x.device)
        x = self.bert(inputs_embeds=x, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        return self.classifier(x)

class CombinedDistilBERTModel(nn.Module):
    def __init__(self, embedder, classifier):
        super(CombinedDistilBERTModel, self).__init__()
        self.embedder = embedder
        self.classifier = classifier

    def forward(self, x):
        x = self.embedder(x)
        return self.classifier(x)


In [4]:
def adversarial_attack(model, data, target, epsilon=0.1):
    data.requires_grad = True
    output = model(data)
    loss = nn.CrossEntropyLoss()(output, target)
    model.zero_grad()
    loss.backward()
    perturbed_data = data + epsilon * data.grad.sign()
    return perturbed_data.detach()

In [5]:
class Client:
    def __init__(self, client_id, model, dataset, lr=0.001, mu=0.1, epsilon=0.2, delta=1e-5):
        self.client_id = client_id
        self.model = copy.deepcopy(model)
        self.dataset = dataset
        self.dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()
        self.mu = mu
        self.epsilon = epsilon
        self.delta = delta
        self.privacy_engine = PrivacyEngine()
        self.model, self.optimizer, self.dataloader = self.privacy_engine.make_private(
            module=self.model,
            optimizer=self.optimizer,
            data_loader=self.dataloader,
            noise_multiplier=0.3,
            max_grad_norm=1.5
        )
        self.grad_tracker = [torch.zeros_like(param) for param in self.model.parameters()]

    def train_local(self, global_model, epochs=1, adv_training=True, fkd=True):
        self.model.train()
        global_params = list(global_model.parameters())

        for epoch in range(epochs):
            for data, target in self.dataloader:
                data, target = data.to(torch.float32), target.to(torch.long)
                if adv_training:
                    data = adversarial_attack(self.model, data, target)
                self.optimizer.zero_grad()
                output = self.model(data)
                loss = self.criterion(output, target)

                # FedDyn regularization
                fed_dyn_reg = 0.0
                for param, g_param, z in zip(self.model.parameters(), global_params, self.grad_tracker):
                    fed_dyn_reg += torch.sum(param * (self.mu * (param - g_param.detach()) - z))
                loss += fed_dyn_reg

                if fkd:
                    with torch.no_grad():
                        global_output = global_model(data)
                    distill_loss = nn.KLDivLoss(reduction='batchmean')(
                        F.log_softmax(output, dim=1), F.softmax(global_output, dim=1)
                    )
                    loss += 0.4 * distill_loss

                loss.backward()
                self.optimizer.step()

        # Update the client’s historical gradient (FedDyn)
        with torch.no_grad():
            for i, (param, g_param) in enumerate(zip(self.model.parameters(), global_params)):
                self.grad_tracker[i] -= self.mu * (param.detach() - g_param.detach())

        return self.model.state_dict()


In [6]:
class Server:
    def __init__(self, model, num_clients, mu=0.1):
        self.global_model = model
        self.num_clients = num_clients
        self.clients = []
        self.mu = mu
        self.h_dict = {}  # Drift term for FedDyn

    def register_client(self, client):
        self.clients.append(client)
        self.h_dict[client.client_id] = {k: torch.zeros_like(v) for k, v in self.global_model.state_dict().items()}

    def aggregate_weights_feddyn(self, client_updates):
        new_global_weights = copy.deepcopy(self.global_model.state_dict())

        for key in new_global_weights.keys():
            avg_update = torch.stack([update[0][key] - (1 / self.mu) * update[1][key] for update in client_updates])
            new_global_weights[key] = avg_update.mean(dim=0)

        return new_global_weights

    def federated_training(self, rounds=10, epochs=1, adv_training=True, fkd=True):
        for r in range(rounds):
            # Adaptive client selection based on drift norm (FedDyn)
            drift_norms = {client.client_id: torch.norm(torch.cat([v.view(-1) for v in self.h_dict[client.client_id].values()]))
                           for client in self.clients}
            sorted_clients = sorted(self.clients, key=lambda x: drift_norms[x.client_id], reverse=True)
            selected_clients = sorted_clients[:max(1, len(self.clients) // 2)]

            print(f"Round {r+1} | Selected Clients: {[c.client_id for c in selected_clients]}")

            client_updates = []
            for client in selected_clients:
                state_dict, h_new = client.train_local(self.global_model, self.h_dict[client.client_id], epochs, adv_training, fkd)
                client_updates.append((state_dict, self.h_dict[client.client_id]))
                self.h_dict[client.client_id] = h_new

            # FedDyn Aggregation
            new_weights = self.aggregate_weights_feddyn(client_updates)
            self.global_model.load_state_dict(new_weights)

    def evaluate_model(self, test_loader):
        self.global_model.eval()
        y_true, y_pred = [], []
        total_loss = 0.0
        criterion = nn.CrossEntropyLoss()

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(torch.float32), target.to(torch.long)
                output = self.global_model(data)
                loss = criterion(output, target)
                total_loss += loss.item()

                predictions = torch.argmax(output, dim=1)
                y_true.extend(target.numpy())
                y_pred.extend(predictions.numpy())

        # Evaluation Metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='macro')
        recall = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')

        print("Evaluation Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Confusion Matrix
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()

        return accuracy, precision, recall, f1


In [None]:
embedder = TabularFeatureEmbedder(input_dim=41, seq_len=41, embed_dim=768)
classifier = DistilBERTIntrusionClassifier(hidden_size=768, output_size=2)
server_model = CombinedDistilBERTModel(embedder, classifier)

In [None]:
for i in range(num_clients):
    client = Client(i, server_model, client_datasets[i])
    server.register_client(client)

In [7]:
server.federated_training(rounds=35, epochs=5, adv_training=True, dynamic_fed=True)

Round 1 completed.
Round 2 completed.
Round 3 completed.
Round 4 completed.
Round 5 completed.
Round 6 completed.
Round 7 completed.
Round 8 completed.
Round 9 completed.
Round 10 completed.
Round 11 completed.
Round 12 completed.
Round 13 completed.
Round 14 completed.
Round 15 completed.
Round 16 completed.
Round 17 completed.
Round 18 completed.
Round 19 completed.
Round 20 completed.
Round 21 completed.
Round 22 completed.
Round 23 completed.
Round 24 completed.
Round 25 completed.
Round 26 completed.
Round 27 completed.
Round 28 completed.
Round 29 completed.
Round 30 completed.
Round 31 completed.
Round 32 completed.
Round 33 completed.
Round 34 completed.
Round 35 completed.


In [1]:
server.evaluate_model(DataLoader(test_dataset, batch_size=32, shuffle=False)

Evaluation Metrics:
Accuracy:0.9742
Recall:0.9665
Precision:0.9671
