# Thư viện

In [1]:
!pip install -q flwr[simulation]  pandas matplotlib scikit-learn torch

In [2]:
from collections import OrderedDict
from typing import List, Tuple
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

import flwr
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Metrics, Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents
from flwr.server.strategy import FedAvg
from flwr.simulation import run_simulation
from flwr.server.client_proxy import ClientProxy
from flwr.common import Parameters, Scalar, FitRes, parameters_to_ndarrays
from typing import Optional, Union

DEVICE = torch.device("cpu")  # Try "cuda" to train on GPU
print(f"Training on {DEVICE}")
print(f"Flower {flwr.__version__} / PyTorch {torch.__version__}")


Training on cpu
Flower 1.13.0 / PyTorch 2.5.1


# Dataset

In [17]:
# Cấu hình thiết bị (CUDA hoặc CPU)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
# Đường dẫn đến tập dữ liệu
DATA_PATH = r"..\..\data\SMSSpamCollection"

# Đọc dữ liệu
data = pd.read_csv(DATA_PATH, sep='\t', header=None, names=["label", "text"])

# Chuyển đổi nhãn 'ham' và 'spam' thành 0 và 1
data["label"] = data["label"].map({"ham": 0, "spam": 1})


In [19]:
BATCH_SIZE = 16
NUM_CLIENTS = 10

## Xử lý dữ liệu

In [20]:
class SMSDataset(Dataset):
    def __init__(self, texts, labels, vectorizer):
        self.texts = texts
        self.labels = labels
        self.vectorizer = vectorizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text_vector = self.vectorizer.transform([self.texts[idx]]).toarray().squeeze()
        label = self.labels[idx]
        return torch.tensor(text_vector, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

def load_datasets(partition_id: int):
    print(f"Loading dataset for partition ID: {partition_id}")

    # Load raw data
    data = pd.read_csv(DATA_PATH, sep="\t", header=None, names=["label", "text"])
    data["label"] = data["label"].map({"ham": 0, "spam": 1})

    # Vectorize text data
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    vectorizer.fit(data["text"])

    # Split into NUM_CLIENTS partitions
    indices = np.arange(len(data))
    np.random.seed(42)
    np.random.shuffle(indices)
    split_indices = np.array_split(indices, NUM_CLIENTS)

    # Select partition
    partition_indices = split_indices[partition_id]
    partition_data = data.iloc[partition_indices]

    # Train/val split
    train_data, val_data = train_test_split(partition_data, test_size=0.1, random_state=42)
    train_dataset = SMSDataset(train_data["text"].tolist(), train_data["label"].tolist(), vectorizer)
    val_dataset = SMSDataset(val_data["text"].tolist(), val_data["label"].tolist(), vectorizer)

    # Dataloaders
    trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Test set
    test_data = data.iloc[split_indices[-1]]
    test_dataset = SMSDataset(test_data["text"].tolist(), test_data["label"].tolist(), vectorizer)
    testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    print(f"Partition {partition_id}: Train {len(train_data)}, Val {len(val_data)}, Test {len(test_data)}")
    return trainloader, valloader, testloader


In [21]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Train model

In [22]:
class Net(nn.Module):
    def __init__(self, input_dim: int):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # Fully connected layer
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)  # 2 output classes (ham, spam)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [23]:
# Lấy trainloader từ partition đầu tiên
trainloader, valloader, testloader = load_datasets(partition_id=0)

# Số chiều của đầu vào từ vectorizer
input_dim = 5000  # (đã được đặt trong load_datasets max_features=5000)

# Khởi tạo mô hình
net = Net(input_dim).to(DEVICE)
print(net)


Loading dataset for partition ID: 0
Partition 0: Train 502, Val 56, Test 557
Net(
  (fc1): Linear(in_features=5000, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=2, bias=True)
)


In [24]:
def train(net, trainloader, epochs: int, verbose=False):
    """Train the network on the training set."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        correct, total = 0, 0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Metrics
            epoch_loss += loss.item()
            total += labels.size(0)
            correct += (torch.max(outputs, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader)
        epoch_acc = correct / total
        if verbose:
            print(f"Train loss {epoch_loss}, Accuracy {epoch_acc}")

def test(net, testloader):
    """Evaluate the network on the entire test set."""
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = net(inputs)
            loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader)
    accuracy = correct / total
    return loss, accuracy


# Federated learning

## Chiến thuật 1: Nhiễu Gaussian

### Mô tả chiến thuật:
Thêm nhiễu Gaussian trực tiếp vào các tham số hoặc gradient trước khi gửi về server.

### Mục tiêu:
- Làm giảm hiệu suất của mô hình toàn cục.
- Gây nhiễu trong quá trình tổng hợp (aggregation), khiến mô hình toàn cục không hội tụ hoặc hội tụ chậm.

### Chiến thuật cụ thể:
1. **Thêm nhiễu nhỏ (𝜎 thấp):**
    - Mục tiêu: Làm giảm nhẹ hiệu suất của mô hình mà không gây sự chú ý.

2. **Thêm nhiễu lớn (𝜎 cao):**
    - Mục tiêu: Gây rối loạn nghiêm trọng trong aggregation.

3. **Thêm nhiễu chỉ vào một phần của tham số:**
    - Mục tiêu: Tấn công tinh vi, chỉ gây ảnh hưởng đến một lớp cụ thể hoặc một nhóm tham số.

In [25]:
def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)


def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]

In [26]:
class FlowerClient(NumPyClient):
    def __init__(self, net, trainloader, valloader, is_bad_client=False):
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader
        self.is_bad_client = is_bad_client  # Đánh dấu client có phải là bad client không

    def get_parameters(self, config):
        return get_parameters(self.net)

    def fit(self, parameters, config):
        set_parameters(self.net, parameters)

        # Nếu là bad client, thực hiện thao tác độc hại
        if self.is_bad_client:
            print("Bad client performing model poisoning!")
            # Tạo mô hình độc hại ( thêm nhiễu Gaussian)
            poisoned_parameters = [
                # param + np.random.normal(0, 0.1, param.shape)  # Thêm nhiễu Gaussian nhỏ (𝜎 thấp)
                param + np.random.normal(0, 1, param.shape)  # Nhiễu lớn
                for param in parameters
            ]
            return poisoned_parameters, len(self.trainloader), {}
        

        # Nếu là client thông thường, huấn luyện bình thường
        train(self.net, self.trainloader, epochs=1)
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}


In [27]:
def client_fn(context: Context) -> Client:
    """Create a Flower client representing a single organization."""
    input_dim = 5000  # Đã được đặt trong load_datasets max_features=5000
    net = Net(input_dim).to(DEVICE)
    partition_id = context.node_config["partition-id"]
    trainloader, valloader, _ = load_datasets(partition_id=partition_id)

    # Định nghĩa các client độc hại (ví dụ: chỉ định partition ID là bad client)
    bad_clients = [1, 3, 5]  # Danh sách các partition ID của client độc hại
    is_bad_client = partition_id in bad_clients

    trainloader, valloader, _ = load_datasets(partition_id=partition_id)

    return FlowerClient(net, trainloader, valloader, is_bad_client).to_client()

# Create the ClientApp
client = ClientApp(client_fn=client_fn)



In [28]:
def weighted_average(metrics: List[Tuple[int, dict]]) -> dict:
    # Ensure there are metrics to aggregate
    if not metrics:
        return {}

    # Initialize storage for weighted sums
    weighted_sums = {}
    total_examples = 0

    for num_examples, metric_dict in metrics:
        total_examples += num_examples
        for key, value in metric_dict.items():
            if key not in weighted_sums:
                weighted_sums[key] = 0
            weighted_sums[key] += num_examples * value

    # Compute weighted averages
    aggregated_metrics = {
        key: weighted_sums[key] / total_examples for key in weighted_sums
    }
    return aggregated_metrics


In [29]:
# Custom SaveModelStrategy implementation
class SaveModelStrategy(FedAvg):
    def aggregate_fit(
        self,
        server_round: int,
        results: list[tuple[ClientProxy, FitRes]],
        failures: list[Union[tuple[ClientProxy, FitRes], BaseException]],
    ) -> tuple[Optional[Parameters], dict[str, Scalar]]:

        # Call aggregate_fit from the base class (FedAvg)
        aggregated_parameters, aggregated_metrics = super().aggregate_fit(
            server_round, results, failures
        )

        if aggregated_parameters is not None:
            # Convert `Parameters` to `list[np.ndarray]`
            aggregated_ndarrays = parameters_to_ndarrays(aggregated_parameters)

            # Save aggregated weights for each round
            print(f"Saving round {server_round} aggregated weights...")
            np.savez(f"round-{server_round}-weights.npz", *aggregated_ndarrays)

            # Save the MPA model at the end of training
            if server_round == 5:
                with open("MPA_model.pkl", "wb") as f:
                    pickle.dump(aggregated_ndarrays, f)
                print("MPA model saved as 'MPA_model.pkl'")
                # Lưu dưới dạng PyTorch
                torch.save(aggregated_ndarrays, "MPA_model.pth")
                print("MPA model saved as 'MPA_model.pth'")

        return aggregated_parameters, aggregated_metrics

In [30]:

# Define the server function
def server_fn(context: Context) -> ServerAppComponents:
    # Use the custom SaveModelStrategy
    strategy = SaveModelStrategy(
        fraction_fit=1.0,
        fraction_evaluate=0.5,
        min_fit_clients=10,
        min_evaluate_clients=5,
        min_available_clients=10,
        fit_metrics_aggregation_fn=weighted_average,
        evaluate_metrics_aggregation_fn=weighted_average,
    )

    # Configure the server for 5 rounds of training
    config = ServerConfig(num_rounds=5)

    return ServerAppComponents(strategy=strategy, config=config)


# Create a new server instance with the SaveModelStrategy
server = ServerApp(server_fn=server_fn)

In [31]:
# Specify the resources each of your clients need
# By default, each client will be allocated 1x CPU and 0x GPUs
backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 0.0}}

# When running on GPU, assign an entire GPU for each client
if DEVICE.type == "cuda":
    backend_config = {"client_resources": {"num_cpus": 1, "num_gpus": 1.0}}
    # Refer to our Flower framework documentation for more details about Flower simulations
    # and how to set up the `backend_config`

In [32]:
import os

# Disable oneDNN custom operations to avoid floating-point round-off errors
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Import TensorFlow and other required libraries
import tensorflow as tf

# Initialize TensorFlow
print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.18.0


In [33]:
# Run simulation
history = run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=NUM_CLIENTS,
    backend_config=backend_config,
    verbose_logging=True
)



[94mDEBUG 2024-12-01 01:13:39,053[0m:     Asyncio event loop already running.
[94mDEBUG 2024-12-01 01:13:39,054[0m:     Logger propagate set to False
[94mDEBUG 2024-12-01 01:13:39,055[0m:     Pre-registering run with id 8019549350306750640
[94mDEBUG 2024-12-01 01:13:39,055[0m:     Using InMemoryState
[94mDEBUG 2024-12-01 01:13:39,056[0m:     Using InMemoryState
[92mINFO 2024-12-01 01:13:39,073[0m:      Starting Flower ServerApp, config: num_rounds=5, no round_timeout
[94mDEBUG 2024-12-01 01:13:39,091[0m:     Buffer time delay: 5s
[92mINFO 2024-12-01 01:13:39,324[0m:      
[92mINFO 2024-12-01 01:13:39,328[0m:      [INIT]
[92mINFO 2024-12-01 01:13:39,329[0m:      Requesting initial parameters from one random client
[94mDEBUG 2024-12-01 01:13:44,331[0m:     Using InMemoryState
[94mDEBUG 2024-12-01 01:13:44,332[0m:     Registered 10 nodes
[94mDEBUG 2024-12-01 01:13:44,333[0m:     Supported backends: ['ray']
[94mDEBUG 2024-12-01 01:13:44,333[0m:     Initialising:

[36m(ClientAppActor pid=30364)[0m Loading dataset for partition ID: 7
[36m(ClientAppActor pid=30364)[0m Partition 7: Train 501, Val 56, Test 557
[36m(ClientAppActor pid=30364)[0m Loading dataset for partition ID: 7


[92mINFO 2024-12-01 01:14:11,474[0m:      Received initial parameters from one random client
[92mINFO 2024-12-01 01:14:11,474[0m:      Starting evaluation of initial global parameters
[92mINFO 2024-12-01 01:14:11,475[0m:      Evaluation returned no results (`None`)
[92mINFO 2024-12-01 01:14:11,475[0m:      
[92mINFO 2024-12-01 01:14:11,476[0m:      [ROUND 1]
[92mINFO 2024-12-01 01:14:11,476[0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=30364)[0m Partition 7: Train 501, Val 56, Test 557
[36m(ClientAppActor pid=30364)[0m Loading dataset for partition ID: 0
[36m(ClientAppActor pid=30364)[0m Partition 0: Train 502, Val 56, Test 557
[36m(ClientAppActor pid=30364)[0m Loading dataset for partition ID: 0
[36m(ClientAppActor pid=30364)[0m Partition 0: Train 502, Val 56, Test 557
[36m(ClientAppActor pid=24808)[0m Loading dataset for partition ID: 5
[36m(ClientAppActor pid=1956)[0m Loading dataset for partition ID: 4
[36m(ClientAppActor pid=1956)[0m Partition 4: Train 501, Val 56, Test 557
[36m(ClientAppActor pid=24808)[0m Bad client performing model poisoning!


[92mINFO 2024-12-01 01:14:24,104[0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO 2024-12-01 01:14:24,165[0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


Saving round 1 aggregated weights...
[36m(ClientAppActor pid=8544)[0m Loading dataset for partition ID: 1[32m [repeated 17x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[92mINFO 2024-12-01 01:14:24,661[0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO 2024-12-01 01:14:24,661[0m:      
[92mINFO 2024-12-01 01:14:24,662[0m:      [ROUND 2]
[92mINFO 2024-12-01 01:14:24,662[0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=8544)[0m Partition 1: Train 502, Val 56, Test 557[32m [repeated 23x across cluster][0m
[36m(ClientAppActor pid=21160)[0m Bad client performing model poisoning![32m [repeated 2x across cluster][0m


[92mINFO 2024-12-01 01:14:28,985[0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO 2024-12-01 01:14:29,042[0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


Saving round 2 aggregated weights...


[92mINFO 2024-12-01 01:14:29,575[0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO 2024-12-01 01:14:29,575[0m:      
[92mINFO 2024-12-01 01:14:29,577[0m:      [ROUND 3]
[92mINFO 2024-12-01 01:14:29,578[0m:      configure_fit: strategy sampled 10 clients (out of 10)


[36m(ClientAppActor pid=30492)[0m Loading dataset for partition ID: 9[32m [repeated 35x across cluster][0m
[36m(ClientAppActor pid=30364)[0m Partition 8: Train 501, Val 56, Test 557[32m [repeated 34x across cluster][0m
[36m(ClientAppActor pid=23904)[0m Bad client performing model poisoning![32m [repeated 3x across cluster][0m


[92mINFO 2024-12-01 01:14:32,149[0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO 2024-12-01 01:14:32,204[0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


Saving round 3 aggregated weights...


[92mINFO 2024-12-01 01:14:32,901[0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO 2024-12-01 01:14:32,903[0m:      
[92mINFO 2024-12-01 01:14:32,903[0m:      [ROUND 4]
[92mINFO 2024-12-01 01:14:32,904[0m:      configure_fit: strategy sampled 10 clients (out of 10)
[92mINFO 2024-12-01 01:14:35,263[0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO 2024-12-01 01:14:35,304[0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


Saving round 4 aggregated weights...
[36m(ClientAppActor pid=20704)[0m Loading dataset for partition ID: 2[32m [repeated 55x across cluster][0m
[36m(ClientAppActor pid=24808)[0m Partition 8: Train 501, Val 56, Test 557[32m [repeated 50x across cluster][0m
[36m(ClientAppActor pid=30492)[0m Bad client performing model poisoning![32m [repeated 6x across cluster][0m


[92mINFO 2024-12-01 01:14:35,973[0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO 2024-12-01 01:14:35,974[0m:      
[92mINFO 2024-12-01 01:14:35,974[0m:      [ROUND 5]
[92mINFO 2024-12-01 01:14:35,975[0m:      configure_fit: strategy sampled 10 clients (out of 10)
[92mINFO 2024-12-01 01:14:38,536[0m:      aggregate_fit: received 10 results and 0 failures
[92mINFO 2024-12-01 01:14:38,640[0m:      configure_evaluate: strategy sampled 5 clients (out of 10)


Saving round 5 aggregated weights...
MPA model saved as 'MPA_model.pkl'
MPA model saved as 'MPA_model.pth'


[92mINFO 2024-12-01 01:14:39,500[0m:      aggregate_evaluate: received 5 results and 0 failures
[92mINFO 2024-12-01 01:14:39,504[0m:      
[92mINFO 2024-12-01 01:14:39,506[0m:      [SUMMARY]
[92mINFO 2024-12-01 01:14:39,506[0m:      Run finished 5 round(s) in 28.03s
[92mINFO 2024-12-01 01:14:39,519[0m:      	History (loss, distributed):
[92mINFO 2024-12-01 01:14:39,521[0m:      		round 1: 0.7337283283472061
[92mINFO 2024-12-01 01:14:39,522[0m:      		round 2: 3.314071536064148
[92mINFO 2024-12-01 01:14:39,523[0m:      		round 3: 8.764304435253143
[92mINFO 2024-12-01 01:14:39,525[0m:      		round 4: 15.419864845275878
[92mINFO 2024-12-01 01:14:39,526[0m:      		round 5: 33.48049692530185
[92mINFO 2024-12-01 01:14:39,527[0m:      	History (metrics, distributed, evaluate):
[92mINFO 2024-12-01 01:14:39,528[0m:      	{'accuracy': [(1, 0.6142857142857142),
[92mINFO 2024-12-01 01:14:39,528[0m:      	              (2, 0.5785714285714285),
[92mINFO 2024-12-01 01:14:39

[36m(ClientAppActor pid=21160)[0m Loading dataset for partition ID: 8[32m [repeated 39x across cluster][0m
[36m(ClientAppActor pid=21160)[0m Partition 8: Train 501, Val 56, Test 557[32m [repeated 40x across cluster][0m
[36m(ClientAppActor pid=30492)[0m Bad client performing model poisoning![32m [repeated 3x across cluster][0m


[94mDEBUG 2024-12-01 01:14:41,525[0m:     Terminated RayBackend
[94mDEBUG 2024-12-01 01:14:41,526[0m:     Stopping Simulation Engine now.
