In [1]:
from phe import paillier
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ------------------------------
# Data Preprocessing for UNSW-NB15
# ------------------------------



def load_and_preprocess_data(path):
    """Load the dataset CSV and return balanced, standardized features and labels.

    Steps, in order: read the CSV, drop a fixed list of columns, encode the
    ``label`` column to integers when it is not numeric, undersample the
    larger of the two classes (labels 0 and 1) so both have the same number
    of rows, shuffle, one-hot encode the remaining categorical columns and
    standardize every feature.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A tuple ``(X_scaled, y)`` where ``X_scaled`` is the standardized
        feature matrix produced by ``StandardScaler`` and ``y`` is a 1-D
        integer array of labels aligned with its rows.

    Raises:
        ValueError: If the data contains no rows for label 0 or for label 1.
        KeyError: If one of the columns to drop, or ``label``, is missing.
    """
    df = pd.read_csv(path)

    # Drop unnecessary columns
    df.drop(columns=['ackdat', 'dloss', 'tcprtt', 'dbytes', 'djit', 'synack', 'spkts', 'dur', 'dinpkt',
                     'response_body_len', 'sbytes', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
                     'trans_depth', 'smean', 'sjit', 'sloss'], inplace=True)

    # Encode the label column whenever it is not numeric. Checking for
    # "not numeric" (rather than dtype == 'object') also covers pandas
    # string and categorical dtypes.
    if not pd.api.types.is_numeric_dtype(df['label']):
        df['label'] = LabelEncoder().fit_transform(df['label'])

    df_class0 = df[df['label'] == 0]
    df_class1 = df[df['label'] == 1]
    if df_class0.empty or df_class1.empty:
        raise ValueError(
            "Both label 0 and label 1 must be present to balance the dataset "
            f"(found {len(df_class0)} rows of class 0 and {len(df_class1)} of class 1)."
        )

    # Balance the dataset by undersampling whichever class is larger.
    # Sampling more rows than a class contains (without replacement) raises,
    # so the direction must follow the actual class sizes.
    if len(df_class1) >= len(df_class0):
        df_class1 = df_class1.sample(n=len(df_class0), random_state=42)
    else:
        df_class0 = df_class0.sample(n=len(df_class1), random_state=42)
    df_balanced = pd.concat([df_class0, df_class1]).sample(frac=1, random_state=42)  # shuffle

    y = df_balanced['label'].astype(int).values
    X = df_balanced.drop(columns=['label'])

    # One-hot encode categorical features
    X = pd.get_dummies(X)

    # Normalize features.
    # NOTE: the scaler is fitted on every balanced row, i.e. before any
    # train/test split performed by the caller, so test rows influence the
    # scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print(df.shape)
    print(X.shape)
    print(y.shape)
    print("✅ Class balance after undersampling:", pd.Series(y).value_counts())

    return X_scaled, y



# ------------------------------
# Client Class
# ------------------------------

class Client:
    """A federated-learning participant that trains locally and exchanges encrypted weights.

    Each client keeps its own train/test split of the data it is given,
    trains a logistic-regression model with SGD, and can encrypt its weight
    vector with the public key or decrypt a weight vector with the private key.
    """

    def __init__(self, id, public_key, private_key, data, labels):
        """Store the keys and split this client's data 70/30 into train and test.

        Args:
            id: Identifier stored on the client as ``self.id``.
            public_key: Key object whose ``encrypt`` method is used on each weight.
            private_key: Key object whose ``decrypt`` method is used on each
                received ciphertext.
            data: Feature rows owned by this client.
            labels: Labels aligned with ``data``.
        """
        self.id = id
        self.public_key = public_key
        self.private_key = private_key

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data, labels, test_size=0.3, random_state=42
        )

        # Coefficient vector of the latest local or decrypted global model;
        # None until the first training round.
        self.model_weights = None

    def train_local_model(self):
        """Train on the local training split and return the coefficient vector.

        When ``self.model_weights`` already holds a model (for example the
        decrypted global average from the previous round), training starts
        from those coefficients instead of from scratch, so each round
        builds on the aggregated result.

        Returns:
            The 1-D coefficient array, also stored in ``self.model_weights``.
        """
        # Only the coefficients are shared and the evaluation code uses a zero
        # intercept, so no intercept is fitted: the exchanged vector is then
        # the complete model.
        clf = SGDClassifier(loss='log_loss', max_iter=1000, learning_rate='constant',
                            eta0=0.01, fit_intercept=False)

        coef_init = None
        if self.model_weights is not None:
            # Copy so the optimizer never updates the stored array in place.
            coef_init = np.array(self.model_weights, dtype=float)

        clf.fit(self.X_train, self.y_train, coef_init=coef_init)
        self.model_weights = clf.coef_[0]
        return self.model_weights

    def encrypt_weights(self):
        """Encrypt every weight with the public key.

        Returns:
            A list with one ciphertext per weight, in weight order.

        Raises:
            RuntimeError: If no model weights are available yet.
        """
        if self.model_weights is None:
            raise RuntimeError(
                "No model weights to encrypt; call train_local_model() first."
            )
        # Plain Python floats keep the encryption input type uniform
        # regardless of the array's NumPy dtype.
        return [self.public_key.encrypt(float(x)) for x in self.model_weights]

    def decrypt_weights(self, encrypted_weights):
        """Decrypt a sequence of ciphertexts with the private key.

        Args:
            encrypted_weights: Iterable of ciphertexts.

        Returns:
            A NumPy array holding the decrypted values in the same order.
        """
        return np.array([self.private_key.decrypt(x) for x in encrypted_weights])


# ------------------------------
# Server Class
# ------------------------------

class Server:
    """Collects encrypted weight vectors from clients and averages them."""

    def __init__(self, public_key):
        """Remember the public key and start with no received models."""
        self.public_key = public_key
        self.encrypted_models = []

    def receive_encrypted_model(self, encrypted_model):
        """Append one client's encrypted weight vector to the pending list."""
        self.encrypted_models.append(encrypted_model)

    def aggregate_encrypted_models(self, num_clients):
        """Average the received weight vectors position by position.

        For every weight position (taken from the length of the first
        received vector) the values of all received vectors are added and
        the total is multiplied by ``1 / num_clients``. Only addition and
        multiplication by a plain number are applied to the values.

        Args:
            num_clients: Divisor used for the average.

        Returns:
            A list with one averaged value per weight position.
        """
        width = len(self.encrypted_models[0])
        return [
            sum(model[position] for model in self.encrypted_models) * (1 / num_clients)
            for position in range(width)
        ]


# ------------------------------
# Federated Training Function
# ------------------------------

def federated_training(data_path, num_clients=3, num_rounds=3):
    """Run the federated training simulation and return the clients plus a global test set.

    The preprocessed data is split once into a training part and a held-out
    test part. The training part is divided among the clients with a
    stratified K-fold (each fold's second index set becomes one client's
    data). In every round each client trains locally and sends encrypted
    weights to the server, the server averages them, and every client
    decrypts the average and stores it as its model weights.

    Args:
        data_path: CSV path forwarded to ``load_and_preprocess_data``.
        num_clients: Number of clients (also the number of folds).
        num_rounds: Number of training/aggregation rounds.

    Returns:
        A tuple ``(clients, X_test, y_test)``.
    """
    features, targets = load_and_preprocess_data(data_path)

    # Hold out a single global test set before distributing anything to clients.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.3, stratify=targets, random_state=42
    )

    # One stratified fold per client.
    folds = StratifiedKFold(n_splits=num_clients, shuffle=True, random_state=42)
    shards = [
        (X_train[part], y_train[part])
        for _, part in folds.split(X_train, y_train)
    ]

    # A single key pair is generated and handed to every client;
    # the server only receives the public key.
    public_key, private_key = paillier.generate_paillier_keypair()

    clients = [
        Client(i, public_key, private_key, shards[i][0], shards[i][1])
        for i in range(num_clients)
    ]
    server = Server(public_key)

    for round_idx in range(num_rounds):
        print(f"\n🔁 Round {round_idx + 1} - Local training and encryption")

        # Discard the ciphertexts left over from the previous round.
        server.encrypted_models.clear()
        for member in clients:
            member.train_local_model()
            server.receive_encrypted_model(member.encrypt_weights())
            print(f"Client {member.id} encrypted and sent weights.")

        print("🔐 Server aggregating encrypted models...")
        averaged_ciphertexts = server.aggregate_encrypted_models(num_clients)

        for member in clients:
            member.model_weights = member.decrypt_weights(averaged_ciphertexts)
            print(f"Client {member.id} decrypted global model.")

    return clients, X_test, y_test



# ------------------------------
# Evaluation
# ------------------------------

def _predict_with_weights(weights, X):
    """Return 0/1 predictions of the zero-intercept linear model ``weights`` on ``X``."""
    scores = np.asarray(X, dtype=float) @ np.asarray(weights, dtype=float).ravel()
    classes = np.array([0, 1])
    # A strictly positive score selects class 1, anything else class 0.
    return classes[(scores > 0).astype(int)]


def evaluate_global_model(clients, X_test, y_test):
    """Evaluate the shared model on the global test set and print the result.

    The weights are read from the first client. Predictions come from the
    linear decision rule with a zero intercept, computed directly from the
    weight vector instead of assigning fitted attributes onto an estimator
    that was never fitted.

    Args:
        clients: Sequence of clients; only ``clients[0].model_weights`` is read.
        X_test: Feature rows of the global test set.
        y_test: True labels aligned with ``X_test``.

    Returns:
        The accuracy on the test set.
    """
    # Use model from the first client (the training loop stores the same
    # decrypted average on every client).
    client = clients[0]

    preds = _predict_with_weights(client.model_weights, X_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n🧪 Global Test Set Predictions: {np.unique(preds, return_counts=True)}")
    print(f"✅ Global Model Test Accuracy: {acc:.4f}")
    return acc


def evaluate_each_client(clients):
    """Evaluate every client's weights on that client's own test split.

    Args:
        clients: Iterable of clients exposing ``id``, ``model_weights``,
            ``X_test`` and ``y_test``.

    Returns:
        A list with one accuracy per client, in iteration order.
    """
    accuracies = []
    for client in clients:
        preds = _predict_with_weights(client.model_weights, client.X_test)
        acc = accuracy_score(client.y_test, preds)
        print(f"\n📍 Client {client.id} Test Set Predictions: {np.unique(preds, return_counts=True)}")
        print(f"Client {client.id} Test Accuracy: {acc:.4f}")
        accuracies.append(acc)
    return accuracies



# ------------------------------
# Run
# ------------------------------

# Point this at your local copy of the dataset CSV.
DATASET_CSV = "C:\\MAFL\\MAFL\\unsw-nb15-training-set.csv"

# Train, then report accuracy on the global test set and on each client's own test split.
clients, X_test, y_test = federated_training(DATASET_CSV)
evaluate_global_model(clients, X_test, y_test)

evaluate_each_client(clients)

(175341, 18)
(112000, 178)
(112000,)
✅ Class balance after undersampling: 1    56000
0    56000
Name: count, dtype: int64

🔁 Round 1 - Local training and encryption
Client 0 encrypted and sent weights.
Client 1 encrypted and sent weights.
Client 2 encrypted and sent weights.
🔐 Server aggregating encrypted models...
Client 0 decrypted global model.
Client 1 decrypted global model.
Client 2 decrypted global model.

🔁 Round 2 - Local training and encryption
Client 0 encrypted and sent weights.
Client 1 encrypted and sent weights.
Client 2 encrypted and sent weights.
🔐 Server aggregating encrypted models...
Client 0 decrypted global model.
Client 1 decrypted global model.
Client 2 decrypted global model.

🔁 Round 3 - Local training and encryption
Client 0 encrypted and sent weights.
Client 1 encrypted and sent weights.
Client 2 encrypted and sent weights.
🔐 Server aggregating encrypted models...
Client 0 decrypted global model.
Client 1 decrypted global model.
Client 2 decrypted global mod