In [11]:
import numpy as np
import pandas as pd
import copy
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# --- CONFIGURATION ---
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_PATH = "../data/processed/cicids_10pct_pruned.csv"
NUM_CLIENTS = 10
ALPHA = 0.1             # Dirichlet non-IID strength
ROUNDS = 20
SERVER_LR = 0.8         # For FedAvgM
MALICIOUS_FRACTION = 0.2 # 20% Byzantine clients



In [12]:
# %% [1] DATA LOADING & PREPROCESSING
df = pd.read_csv(DATA_PATH)

train_files = [
    "Monday-WorkingHours.pcap_ISCX.csv", "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv", "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
]
test_files = [
    "Friday-WorkingHours-Morning.pcap_ISCX.csv", "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

df["target"] = (df["Label"] != "BENIGN").astype(int)
df_train = df[df["source_file"].isin(train_files)].copy()
df_test  = df[df["source_file"].isin(test_files)].copy()

META_COLS = ["Label", "label_bin", "label_binary", "source_file", "day", "attack_group", "target"]
X_train_raw = df_train.drop(columns=META_COLS).select_dtypes(include=[np.number]).replace([np.inf, -np.inf], np.nan)
y_train = df_train["target"]
X_test_raw = df_test.drop(columns=META_COLS).select_dtypes(include=[np.number]).replace([np.inf, -np.inf], np.nan)
y_test = df_test["target"]

pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

X_train = pipeline.fit_transform(X_train_raw)
X_test  = pipeline.transform(X_test_raw)



In [13]:
# %% [2] FEDERATED DATA SPLITTING (DIRICHLET)
def split_clients_dirichlet(X, y, num_clients, alpha=0.1):
    if isinstance(y, pd.Series): y = y.to_numpy()
    num_classes = len(np.unique(y))
    class_indices = {c: np.where(y == c)[0] for c in range(num_classes)}
    client_indices = [[] for _ in range(num_clients)]

    for c in range(num_classes):
        np.random.shuffle(class_indices[c])
        proportions = np.random.dirichlet(alpha * np.ones(num_clients))
        proportions = (np.cumsum(proportions) * len(class_indices[c])).astype(int)[:-1]
        splits = np.split(class_indices[c], proportions)
        for i in range(num_clients):
            client_indices[i].extend(splits[i])

    return [(X[idxs], y[idxs]) for idxs in client_indices if len(idxs) > 0]



In [14]:
# %% [3] BYZANTINE SIMULATION (LABEL FLIPPING)
def apply_label_flipping(clients, malicious_indices):
    """
    Byzantine Attack: Malicious clients flip their binary labels (0->1, 1->0).
    This simulates a coordinated effort to poison the global model.
    """
    attacked_clients = []
    for i, (X, y) in enumerate(clients):
        if i in malicious_indices:
            y_flipped = 1 - y 
            attacked_clients.append((X, y_flipped))
        else:
            attacked_clients.append((X, y))
    return attacked_clients

# %% [4] MODEL & AGGREGATION STRATEGIES
def init_model():
    return LogisticRegression(
        max_iter=10, solver="lbfgs", warm_start=True,
        class_weight='balanced', random_state=RANDOM_STATE
    )

def local_train(global_model, X_local, y_local):
    if len(np.unique(y_local)) < 2: return None # Robustness for non-IID
    model = copy.deepcopy(global_model)
    try:
        model.fit(X_local, y_local)
        return model
    except: return None

# Aggregator A: FedAvg
def aggregate_fedavg(client_models):
    avg_coef = np.mean(np.stack([m.coef_ for m in client_models]), axis=0)
    avg_int = np.mean(np.stack([m.intercept_ for m in client_models]), axis=0)
    return avg_coef, avg_int

# Aggregator B: FedAvgM (Momentum)
def aggregate_fedavg_m(global_model, client_models, eta=0.8):
    avg_coef, avg_int = aggregate_fedavg(client_models)
    new_coef = (1 - eta) * global_model.coef_ + eta * avg_coef
    new_int = (1 - eta) * global_model.intercept_ + eta * avg_int
    return new_coef, new_int

# Aggregator C: Coordinate-wise Median (Robust)
def aggregate_median(client_models):
    """
    Median is robust because it ignores extreme weight outliers 
    generated by the label-flipping Byzantine clients.
    """
    all_coefs = np.stack([m.coef_ for m in client_models])
    all_ints = np.stack([m.intercept_ for m in client_models])
    return np.median(all_coefs, axis=0), np.median(all_ints, axis=0)

# %% [5] EXPERIMENT ENGINE
def run_experiment(clients, agg_type='fedavg', rounds=ROUNDS):
    global_model = init_model()
    # Initial warm start on small clean sample
    init_idx = np.random.choice(len(X_train), size=1000, replace=False)
    global_model.fit(X_train[init_idx], y_train.iloc[init_idx])

    for rnd in range(rounds):
        client_models = []
        for X_c, y_c in clients:
            m = local_train(global_model, X_c, y_c)
            if m: client_models.append(m)

        if agg_type == 'fedavg':
            global_model.coef_, global_model.intercept_ = aggregate_fedavg(client_models)
        elif agg_type == 'fedavg_m':
            global_model.coef_, global_model.intercept_ = aggregate_fedavg_m(global_model, client_models, SERVER_LR)
        elif agg_type == 'median':
            global_model.coef_, global_model.intercept_ = aggregate_median(client_models)

    # Final Evaluation
    y_pred = global_model.predict(X_test)
    y_prob = global_model.predict_proba(X_test)[:, 1]
    return {
        "acc": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_prob)
    }



In [15]:
# %% [6] EXECUTION
# 1. Setup Data & Malicious Indices
clients_list = split_clients_dirichlet(X_train, y_train, NUM_CLIENTS, alpha=ALPHA)
num_malicious = int(MALICIOUS_FRACTION * len(clients_list))
malicious_idx = np.random.choice(len(clients_list), num_malicious, replace=False)
poisoned_clients = apply_label_flipping(clients_list, malicious_idx)

print(f"Malicious Clients: {malicious_idx}")
print("-" * 30)

# 2. Run Comparisons
results = {}
for tech in ['fedavg', 'fedavg_m', 'median']:
    print(f"Running {tech}...")
    results[tech] = run_experiment(poisoned_clients, agg_type=tech)

# 3. Display Final Table
res_df = pd.DataFrame(results).T
print("\nFinal Results (20% Byzantine Attack - Label Flipping):")
print(res_df)

Malicious Clients: [3 2]
------------------------------
Running fedavg...
Running fedavg_m...
Running median...

Final Results (20% Byzantine Attack - Label Flipping):
               acc        f1       auc
fedavg    0.647889  0.458135  0.660840
fedavg_m  0.653289  0.413901  0.684982
median    0.672608  0.394945  0.767208
