In [2]:
import numpy as np
import pandas as pd
import copy
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn ConvergenceWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Seed NumPy's global RNG; every later np.random.* call draws from this stream,
# so the order in which cells are executed affects the random results.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
DATA_PATH = "../data/processed/cicids_10pct_pruned.csv"

# Capture files used for training (Monday through Thursday).
train_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
]

# Capture files held out for testing (Friday).
test_files = [
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

df = pd.read_csv(DATA_PATH)

# Binary target: 1 for every row whose Label is not "BENIGN", 0 otherwise.
df["target"] = df["Label"].ne("BENIGN").astype(int)

# Split rows by the capture file they came from; copies keep the splits
# independent of the parent frame.
df_train = df.loc[df["source_file"].isin(train_files)].copy()
df_test  = df.loc[df["source_file"].isin(test_files)].copy()

print(f"Train Shape: {df_train.shape} | Test Shape: {df_test.shape}")

Train Shape: (212883, 65) | Test Shape: (70191, 65)


In [4]:
# Label / provenance columns that must not be fed to the model as features.
META_COLS = [
    "Label", "label_bin", "label_binary", "source_file",
    "day", "attack_group", "target"
]

y_train = df_train["target"]
y_test = df_test["target"]

# Keep numeric feature columns only and turn +/-inf into NaN so the imputer
# below can fill them.
X_train_raw = (
    df_train
    .drop(columns=META_COLS)
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
)
X_test_raw = (
    df_test
    .drop(columns=META_COLS)
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
)

# Median imputation followed by standardisation.
pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Statistics are learned from the training split only and then reused as-is
# on the test split.
X_train = pipeline.fit_transform(X_train_raw)
X_test  = pipeline.transform(X_test_raw)

In [5]:
def split_clients_dirichlet(X, y, num_clients, alpha=0.1, rng=None):
    """
    Non-IID split of (X, y) across clients using a Dirichlet distribution.

    For every class, the class's sample indices are shuffled and then cut
    into ``num_clients`` contiguous chunks whose relative sizes are drawn
    from ``Dirichlet(alpha * 1)``. Small ``alpha`` gives highly skewed
    per-client label distributions.

    Parameters
    ----------
    X : array-like, pandas.DataFrame or pandas.Series
        Feature rows; pandas inputs are converted to a NumPy array, anything
        else is indexed positionally as given.
    y : array-like or pandas.Series
        Labels aligned with ``X``. Any label values are accepted (they do
        not have to be ``0..K-1``).
    num_clients : int
        Number of clients to split across (must be >= 1).
    alpha : float, default 0.1
        Dirichlet concentration parameter (must be > 0).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global RNG is used, so
        results follow ``np.random.seed``.

    Returns
    -------
    list of (X_client, y_client) tuples
        One entry per client that received at least one sample; clients
        that received nothing are omitted, so the list may be shorter than
        ``num_clients``.

    Raises
    ------
    ValueError
        If ``num_clients`` < 1 or ``alpha`` <= 0.
    """
    if num_clients < 1:
        raise ValueError("num_clients must be >= 1")
    if alpha <= 0:
        raise ValueError("alpha must be > 0")

    # The np.random module and Generator objects both expose shuffle/dirichlet.
    if rng is None:
        rng = np.random

    # Positional indexing below requires plain arrays, not pandas objects.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    y = np.asarray(y)

    per_client_parts = [[] for _ in range(num_clients)]

    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..K-1; otherwise other label sets are silently dropped.
    for label in np.unique(y):
        label_idx = np.where(y == label)[0]
        rng.shuffle(label_idx)

        shares = rng.dirichlet(alpha * np.ones(num_clients))
        # Cumulative shares -> interior cut points within this class.
        cut_points = (np.cumsum(shares) * len(label_idx)).astype(int)[:-1]

        for client_id, chunk in enumerate(np.split(label_idx, cut_points)):
            per_client_parts[client_id].append(chunk)

    clients = []
    for parts in per_client_parts:
        if not parts:
            continue
        idxs = np.concatenate(parts)
        if len(idxs) > 0:
            clients.append((X[idxs], y[idxs]))

    return clients

In [6]:
NUM_CLIENTS = 10
ALPHA = 0.1   # Strong Non-IID

clients = split_clients_dirichlet(
    X_train,
    y_train,
    NUM_CLIENTS,
    alpha=ALPHA
)

# Per-client label histogram. Clients that received no samples are not in
# `clients`, so fewer than NUM_CLIENTS lines may be printed.
print(f"\nNon-IID Dirichlet split (alpha={ALPHA})")
for i, (_, y_c) in enumerate(clients):
    unique, counts = np.unique(y_c, return_counts=True)
    # .tolist() yields native Python values, so the dict prints as
    # {0: 9761, 1: 425} rather than {np.int64(0): np.int64(9761), ...}.
    label_counts = dict(zip(unique.tolist(), counts.tolist()))
    print(f"Client {i:02d}: {label_counts}")


Non-IID Dirichlet split (alpha=0.1)
Client 00: {np.int64(0): np.int64(9761)}
Client 01: {np.int64(0): np.int64(76311), np.int64(1): np.int64(1)}
Client 02: {np.int64(0): np.int64(1438), np.int64(1): np.int64(425)}
Client 03: {np.int64(0): np.int64(2070), np.int64(1): np.int64(2)}
Client 04: {np.int64(0): np.int64(97)}
Client 05: {np.int64(1): np.int64(110)}
Client 06: {np.int64(0): np.int64(36846), np.int64(1): np.int64(1042)}
Client 07: {np.int64(0): np.int64(128), np.int64(1): np.int64(29)}
Client 08: {np.int64(0): np.int64(3)}
Client 09: {np.int64(0): np.int64(59386), np.int64(1): np.int64(25234)}


In [7]:
def init_model():
    """
    Build a fresh, unfitted logistic-regression classifier.

    Settings:
    - lbfgs solver capped at 10 iterations per ``fit`` call.
    - ``warm_start=True`` so a later ``fit`` reuses existing coefficients.
    - ``class_weight='balanced'`` to counter the benign/attack imbalance.
    - ``random_state`` taken from the notebook-level RANDOM_STATE.
    """
    hyperparams = dict(
        solver="lbfgs",
        max_iter=10,
        warm_start=True,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )
    return LogisticRegression(**hyperparams)

def local_train(global_model, X_local, y_local):
    """
    Train a copy of the global model on one client's data.

    Parameters
    ----------
    global_model : estimator
        Current global model; it is deep-copied and never modified here.
    X_local, y_local : array-like
        The client's features and labels.

    Returns
    -------
    estimator or None
        The locally fitted copy, or ``None`` when the client is skipped
        (fewer than two distinct labels, or ``fit`` raised).
    """
    # A binary classifier cannot be fitted on a single class (or on no data),
    # so such clients are skipped explicitly rather than relying on fit()
    # raising and the exception being swallowed.
    if len(np.unique(y_local)) < 2:
        return None

    model = copy.deepcopy(global_model)
    try:
        model.fit(X_local, y_local)
    except Exception as exc:
        # Best-effort: one bad client must not abort the round, but the
        # failure should be visible instead of silently ignored.
        warnings.warn(
            f"Local training failed; client skipped: {exc!r}",
            RuntimeWarning,
        )
        return None
    return model

def fedavg_m_update(global_model, client_models, eta=0.8, client_weights=None):
    """
    Blend the global model towards the average of the client models.

    The update is a server-side interpolation with step size ``eta``::

        global_new = (1 - eta) * global_old + eta * average(clients)

    There is no velocity term, so despite the function name this is a damped
    FedAvg step, not classical momentum. ``eta=1`` replaces the global
    parameters with the client average.

    Parameters
    ----------
    global_model : estimator with ``coef_`` and ``intercept_``
        Updated in place and also returned.
    client_models : sequence of estimators with ``coef_`` and ``intercept_``
        Locally trained models; all must share the same parameter shapes.
    eta : float, default 0.8
        Server step size.
    client_weights : sequence of float, optional
        Non-negative weight per client (e.g. local sample counts) for a
        weighted average. When omitted, all clients count equally.

    Returns
    -------
    estimator
        ``global_model`` (the same object). Returned unchanged when
        ``client_models`` is empty.

    Raises
    ------
    ValueError
        If ``client_weights`` has the wrong length, contains negative
        values, or sums to zero.
    """
    if not client_models:
        return global_model

    weights = None
    if client_weights is not None:
        weights = np.asarray(client_weights, dtype=float)
        if weights.shape != (len(client_models),):
            raise ValueError("client_weights must have one entry per client model")
        if np.any(weights < 0) or weights.sum() <= 0:
            raise ValueError("client_weights must be non-negative with a positive sum")

    # With weights=None np.average reduces to a plain mean over clients.
    avg_coef = np.average(
        np.stack([m.coef_ for m in client_models]), axis=0, weights=weights
    )
    avg_intercept = np.average(
        np.stack([m.intercept_ for m in client_models]), axis=0, weights=weights
    )

    # Interpolate between the previous global parameters and the average.
    global_model.coef_ = (1 - eta) * global_model.coef_ + eta * avg_coef
    global_model.intercept_ = (1 - eta) * global_model.intercept_ + eta * avg_intercept

    return global_model

In [8]:
# Number of federated rounds and the server step size passed as `eta` to the
# aggregation function.
ROUNDS = 20
SERVER_LR = 0.8 # Slightly conservative update rate for Non-IID

global_model = init_model()

# Warm start: fit the global model once on 2000 training rows drawn without
# replacement from the pooled training set, so it has coefficients before the
# first aggregation.
# NOTE(review): this uses centrally pooled training data — confirm that is
# acceptable for the federated setting being simulated.
init_idx = np.random.choice(len(X_train), size=2000, replace=False)
global_model.fit(X_train[init_idx], y_train.iloc[init_idx])

print(f"\nStarting Enhanced Non-IID FL | Alpha: {ALPHA} | Server LR: {SERVER_LR}")

for rnd in range(ROUNDS):
    client_models = []

    # Every client trains starting from the current global model; clients
    # for which local_train returns None are left out of this round.
    for X_c, y_c in clients:
        local_model = local_train(global_model, X_c, y_c)
        if local_model is not None:
            client_models.append(local_model)

    # Blend the global parameters towards the average of the client models.
    global_model = fedavg_m_update(global_model, client_models, eta=SERVER_LR)

    # ---- evaluation ----
    # Score the aggregated global model on the held-out test split.
    y_pred = global_model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    y_prob = global_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(
        f"Round {rnd+1:02d} | "
        f"Acc: {acc:.4f} | "
        f"F1: {f1:.4f} | "
        f"AUC: {auc:.4f}"
    )


Starting Enhanced Non-IID FL | Alpha: 0.1 | Server LR: 0.8
Round 01 | Acc: 0.6951 | F1: 0.4140 | AUC: 0.8439
Round 02 | Acc: 0.6940 | F1: 0.4105 | AUC: 0.8453
Round 03 | Acc: 0.6951 | F1: 0.4135 | AUC: 0.8463
Round 04 | Acc: 0.6949 | F1: 0.4130 | AUC: 0.8494
Round 05 | Acc: 0.6950 | F1: 0.4133 | AUC: 0.8496
Round 06 | Acc: 0.6951 | F1: 0.4135 | AUC: 0.8510
Round 07 | Acc: 0.6950 | F1: 0.4134 | AUC: 0.8513
Round 08 | Acc: 0.6950 | F1: 0.4133 | AUC: 0.8515
Round 09 | Acc: 0.6949 | F1: 0.4129 | AUC: 0.8515
Round 10 | Acc: 0.6949 | F1: 0.4129 | AUC: 0.8517
Round 11 | Acc: 0.6949 | F1: 0.4129 | AUC: 0.8523
Round 12 | Acc: 0.6949 | F1: 0.4129 | AUC: 0.8518
Round 13 | Acc: 0.6949 | F1: 0.4130 | AUC: 0.8523
Round 14 | Acc: 0.6950 | F1: 0.4134 | AUC: 0.8515
Round 15 | Acc: 0.6951 | F1: 0.4134 | AUC: 0.8510
Round 16 | Acc: 0.6951 | F1: 0.4134 | AUC: 0.8522
Round 17 | Acc: 0.6949 | F1: 0.4130 | AUC: 0.8529
Round 18 | Acc: 0.6950 | F1: 0.4132 | AUC: 0.8530
Round 19 | Acc: 0.6950 | F1: 0.4131 | AU