In [1]:
# ================================
# 1. Imports
# ================================
import os, glob, random
import numpy as np
import pandas as pd
from tqdm import tqdm

# ML/Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Torch (for later models)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ================================
# 2. Mount Google Drive
# ================================
from google.colab import drive
drive.mount('/content/drive')

# Dataset path
DATA_PATH = "/content/drive/MyDrive/datasets/CICIDS2017/"

# ================================
# 3. Load all CSV files
# ================================
csv_files = glob.glob(DATA_PATH + "*.csv")
print("Found files:", len(csv_files))

dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

full_df = pd.concat(dfs, ignore_index=True)
print("Full dataset shape:", full_df.shape)

# ================================
# Encode labels (BENIGN = 0, ATTACK = 1)
# ================================
def encode_labels(df):
    le = LabelEncoder()
    df[' Label'] = le.fit_transform(df[' Label'])
    df[' Label'] = df[' Label'].apply(
        lambda x: 0 if le.classes_[x] == 'BENIGN' else 1
    )
    return df

full_df = encode_labels(full_df)

# Keep only numeric columns
full_df = full_df.select_dtypes(include=[np.number])
print("Dataset after label encoding + numeric filter:", full_df.shape)

# ================================
# Features & Labels + Cleaning
# ================================
X = full_df.drop(' Label', axis=1).values
y = full_df[' Label'].values

# Replace inf → nan
X[np.isinf(X)] = np.nan

# Drop rows with NaN
mask = ~np.isnan(X).any(axis=1)
X, y = X[mask], y[mask]

print("Final dataset shape:", X.shape, "Labels shape:", y.shape)

# ================================
# Standardize
# ================================
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train:", X_train.shape, "Test:", y_test.shape)


# ================================
# 4. Config for Federated run
# ================================
NUM_CLIENTS = 5
NUM_EPOCHS = 5           # federated rounds
LOCAL_EPOCHS = 1         # local epochs per round
BATCH_SIZE = 64
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

FED_NONIID = True        # False = IID split, True = Non-IID split
USE_BOOTSTRAP = True     # Toggle bootstrapping for client sampling
BOOTSTRAP_RATIO = 0.8    # Resample % of each client dataset


# ================================
# 5. Partitioning Functions
# ================================
def iid_partition(X, y, num_clients=NUM_CLIENTS, seed=42):
    rng = np.random.default_rng(seed)
    idx = np.arange(len(X))
    rng.shuffle(idx)
    splits = np.array_split(idx, num_clients)
    return [(X[s], y[s]) for s in splits]

def noniid_partition(X, y, num_clients=NUM_CLIENTS, seed=42):
    np.random.seed(seed)
    idx_0 = np.where(y == 0)[0]   # BENIGN
    idx_1 = np.where(y == 1)[0]   # ATTACK
    np.random.shuffle(idx_0); np.random.shuffle(idx_1)

    size_0 = len(idx_0) // num_clients
    size_1 = len(idx_1) // num_clients

    clients_data = []
    for i in range(num_clients):
        if i % 2 == 0:  # even clients → mostly BENIGN
            main_idx = idx_0[size_0*i : size_0*(i+1)]
            other_idx_end = min(len(idx_1), size_1*(i//3 + 1))
            other_idx = idx_1[size_1*i//3 : other_idx_end]
        else:           # odd clients → mostly ATTACK
            main_idx = idx_1[size_1*i : size_1*(i+1)]
            other_idx_end = min(len(idx_0), size_0*(i//3 + 1))
            other_idx = idx_0[size_0*i//3 : other_idx_end]

        indices = np.concatenate((main_idx, other_idx))
        np.random.shuffle(indices)
        clients_data.append((X[indices], y[indices]))
    return clients_data


# ================================
# 6. Bootstrapping Function
# ================================
def bootstrap_client_data(clients_data, ratio=BOOTSTRAP_RATIO, seed=42):
    rng = np.random.default_rng(seed)
    boot_clients = []
    for Xc, yc in clients_data:
        n_samples = int(len(Xc) * ratio)
        idx = rng.choice(len(Xc), n_samples, replace=True)
        boot_clients.append((Xc[idx], yc[idx]))
    return boot_clients


# ================================
# 7. Get Clients Data
# ================================
clients_data = (
    noniid_partition(X_train, y_train, NUM_CLIENTS)
    if FED_NONIID else
    iid_partition(X_train, y_train, NUM_CLIENTS)
)

if USE_BOOTSTRAP:
    clients_data = bootstrap_client_data(clients_data)

# Quick check: print label distribution
for i, (Xc, yc) in enumerate(clients_data):
    u, c = np.unique(yc, return_counts=True)
    print(f"Client {i} label counts:", dict(zip(u, c)))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found files: 8
Full dataset shape: (2830743, 79)
Dataset after label encoding + numeric filter: (2830743, 79)
Final dataset shape: (2827876, 78) Labels shape: (2827876,)
Train: (2262300, 78) Test: (565576,)
Client 0 label counts: {np.int64(0): np.int64(290522), np.int64(1): np.int64(71446)}
Client 1 label counts: {np.int64(0): np.int64(193795), np.int64(1): np.int64(71263)}
Client 2 label counts: {np.int64(0): np.int64(290619), np.int64(1): np.int64(23856)}
Client 3 label counts: {np.int64(0): np.int64(290885), np.int64(1): np.int64(71083)}
Client 4 label counts: {np.int64(0): np.int64(290797), np.int64(1): np.int64(47424)}


In [5]:
def get_clients_data(
    X, y,
    num_clients=5,
    noniid=True,
    use_bootstrap=True,
    bootstrap_ratio=0.8,
    seed=42
):
    """Prepare client datasets for federated training."""
    # Partitioning
    if noniid:
        clients_data = noniid_partition(X, y, num_clients, seed)
    else:
        clients_data = iid_partition(X, y, num_clients, seed)

    print("\n--- Before Bootstrapping ---")
    for i, (Xc, yc) in enumerate(clients_data):
        print(f" Client {i}: {Xc.shape[0]} samples, labels={dict(zip(*np.unique(yc, return_counts=True)))}")

    # Optional Bootstrapping
    if use_bootstrap:
        clients_data = bootstrap_client_data(clients_data, bootstrap_ratio, seed)
        print("\n--- After Bootstrapping ---")
        for i, (Xc, yc) in enumerate(clients_data):
            print(f" Client {i}: {Xc.shape[0]} samples, labels={dict(zip(*np.unique(yc, return_counts=True)))}")

    return clients_data

In [None]:
# ================================
# XGBoost Federated Training
# ================================
import xgboost as xgb
from sklearn.metrics import accuracy_score

def federated_xgboost(clients_data, X_test, y_test, rounds=3, params=None):
    """
    Simulated federated XGBoost training.
    """
    if params is None:
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "learning_rate": 0.1,
            "max_depth": 6,
            "n_estimators": 50,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "tree_method": "hist"
        }

    global_model = None

    for rnd in range(rounds):
        print(f"\n===== Federated Round {rnd+1} =====")
        client_models = []

        # Train local models
        for i, (Xc, yc) in enumerate(clients_data):
            clf = xgb.XGBClassifier(**params, use_label_encoder=False, verbosity=0)
            clf.fit(Xc, yc)
            client_models.append(clf)
            print(f" Client {i} trained.")

        # Aggregate (simple averaging of trees/boosters is tricky → we’ll average predictions)
        # Collect predictions from all client models and average logits
        test_preds = np.zeros(len(y_test))
        for clf in client_models:
            test_preds += clf.predict_proba(X_test)[:,1]
        test_preds /= len(client_models)

        # Threshold for classification
        y_pred = (test_preds >= 0.5).astype(int)
        acc = accuracy_score(y_test, y_pred)
        print(f" Global Model Accuracy (Round {rnd+1}): {acc:.4f}")

        # Pick best performing client model as proxy for global (optional simplification)
        global_model = client_models[0]

    return global_model

In [7]:
clients_data = get_clients_data(
    X_train, y_train,
    num_clients=5,
    noniid=True,
    use_bootstrap=True,
    bootstrap_ratio=0.2
)
global_model = federated_xgboost(clients_data, X_test, y_test, rounds=3)


--- Before Bootstrapping ---
 Client 0: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 1: 331323 samples, labels={np.int64(0): np.int64(242274), np.int64(1): np.int64(89049)}
 Client 2: 393094 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(29683)}
 Client 3: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 4: 422777 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(59366)}

--- After Bootstrapping ---
 Client 0: 90492 samples, labels={np.int64(0): np.int64(72811), np.int64(1): np.int64(17681)}
 Client 1: 66264 samples, labels={np.int64(0): np.int64(48442), np.int64(1): np.int64(17822)}
 Client 2: 78618 samples, labels={np.int64(0): np.int64(72663), np.int64(1): np.int64(5955)}
 Client 3: 90492 samples, labels={np.int64(0): np.int64(72714), np.int64(1): np.int64(17778)}
 Client 4: 84555 samples, labels={np.int64(0): np.int64(72749), np.int64(1): np.int64(

In [6]:
clients_data = get_clients_data(
    X_train, y_train,
    num_clients=5,
    noniid=True,
    use_bootstrap=True,
    bootstrap_ratio=0.8
)
global_model = federated_xgboost(clients_data, X_test, y_test, rounds=3)


--- Before Bootstrapping ---
 Client 0: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 1: 331323 samples, labels={np.int64(0): np.int64(242274), np.int64(1): np.int64(89049)}
 Client 2: 393094 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(29683)}
 Client 3: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 4: 422777 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(59366)}

--- After Bootstrapping ---
 Client 0: 361968 samples, labels={np.int64(0): np.int64(290522), np.int64(1): np.int64(71446)}
 Client 1: 265058 samples, labels={np.int64(0): np.int64(193795), np.int64(1): np.int64(71263)}
 Client 2: 314475 samples, labels={np.int64(0): np.int64(290619), np.int64(1): np.int64(23856)}
 Client 3: 361968 samples, labels={np.int64(0): np.int64(290885), np.int64(1): np.int64(71083)}
 Client 4: 338221 samples, labels={np.int64(0): np.int64(290797), np.int64(1)

In [8]:
# ========= Ensemble wrapper for the "global" model =========
class XGBEnsemble:
    """Wraps a list of trained XGBClassifier models and exposes predict/predict_proba."""
    def __init__(self, models):
        self.models = models

    def predict_proba(self, X):
        # Average the positive-class probabilities across clients
        import numpy as np
        probs = np.zeros(X.shape[0], dtype=float)
        for m in self.models:
            probs += m.predict_proba(X)[:, 1]
        probs /= len(self.models)  # <-- correct averaging (outside the loop)
        # Return 2-column proba to mimic sklearn API
        return np.vstack([1.0 - probs, probs]).T

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)

# ========= Metrics helper =========
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support

def eval_metrics(y_true, y_proba, threshold=0.5):
    y_pred = (y_proba >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    # Guard AUC if one class missing in y_true
    try:
        auc = roc_auc_score(y_true, y_proba)
    except ValueError:
        auc = float("nan")
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    return {"acc": acc, "auc": auc, "precision": prec, "recall": rec, "f1": f1}

# ========= Federated XGBoost (simulation) =========
import numpy as np
import xgboost as xgb

def federated_xgboost(
    clients_data,
    X_test, y_test,
    rounds=3,
    params=None,
    client_fraction=1.0,        # e.g., 0.6 for partial participation
    rebootstrap=False,          # if True, re-sample (bootstrap) client data every round
    bootstrap_ratio=0.8,        # ignored unless rebootstrap=True
    bootstrap_seed=42,
    verbose=True
):
    """
    Simulated federated XGBoost with non-IID clients.
    - Aggregation: probability averaging (ensemble).
    - Optional partial participation (client_fraction).
    - Optional re-bootstrap per round (rebootstrap=True).
    Returns:
        global_ensemble (XGBEnsemble), history (list of metrics per round)
    """
    if params is None:
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "learning_rate": 0.1,
            "max_depth": 6,
            "n_estimators": 50,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "tree_method": "hist",
        }

    rng = np.random.default_rng(bootstrap_seed)
    original_clients = clients_data  # keep a copy for per-round bootstrapping
    history = []

    for rnd in range(1, rounds + 1):
        if verbose:
            print(f"\n===== Federated Round {rnd} =====")

        # Optional: re-bootstrap client data every round to simulate drift/variability
        if rebootstrap:
            # expects your bootstrap_client_data(clients_data, ratio, seed) to exist
            clients_data = bootstrap_client_data(original_clients, ratio=bootstrap_ratio, seed=rng.integers(0, 1_000_000))

        # Partial client participation
        num_clients = len(clients_data)
        m = max(1, int(np.ceil(client_fraction * num_clients)))
        selected = rng.choice(num_clients, size=m, replace=False)

        client_models = []
        for i in selected:
            Xc, yc = clients_data[i]
            clf = xgb.XGBClassifier(**params, use_label_encoder=False, verbosity=0)
            clf.fit(Xc, yc)
            client_models.append(clf)
            if verbose:
                print(f"  Client {i} trained on {len(yc)} samples.")

        # Build global ensemble and evaluate
        global_ensemble = XGBEnsemble(client_models)
        test_proba = global_ensemble.predict_proba(X_test)[:, 1]
        metrics = eval_metrics(y_test, test_proba, threshold=0.5)
        history.append(metrics)

        if verbose:
            print(f"  Global (ensemble) — "
                  f"ACC: {metrics['acc']:.4f} | AUC: {metrics['auc']:.4f} | "
                  f"P: {metrics['precision']:.4f} | R: {metrics['recall']:.4f} | F1: {metrics['f1']:.4f}")

    # Return the final ensemble and the metrics history
    return global_ensemble, history

In [9]:
clients_data = get_clients_data(
    X_train, y_train,
    num_clients=5,
    noniid=True,
    use_bootstrap=True,
    bootstrap_ratio=0.5
)
global_model, hist = federated_xgboost(
    clients_data,
    X_test, y_test,
    rounds=3,
    client_fraction=1.0,    # or < 1.0 for partial participation
    rebootstrap=True,       # set False to keep fixed client data across rounds
    bootstrap_ratio=0.8,
    bootstrap_seed=42,
    verbose=True
)

# Use the ensemble like a normal model:
y_proba = global_model.predict_proba(X_test)[:,1]
y_pred = global_model.predict(X_test)


--- Before Bootstrapping ---
 Client 0: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 1: 331323 samples, labels={np.int64(0): np.int64(242274), np.int64(1): np.int64(89049)}
 Client 2: 393094 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(29683)}
 Client 3: 452460 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(89049)}
 Client 4: 422777 samples, labels={np.int64(0): np.int64(363411), np.int64(1): np.int64(59366)}

--- After Bootstrapping ---
 Client 0: 226230 samples, labels={np.int64(0): np.int64(181712), np.int64(1): np.int64(44518)}
 Client 1: 165661 samples, labels={np.int64(0): np.int64(120967), np.int64(1): np.int64(44694)}
 Client 2: 196547 samples, labels={np.int64(0): np.int64(181714), np.int64(1): np.int64(14833)}
 Client 3: 226230 samples, labels={np.int64(0): np.int64(181702), np.int64(1): np.int64(44528)}
 Client 4: 211388 samples, labels={np.int64(0): np.int64(181552), np.int64(1)