<a href="https://colab.research.google.com/github/ThisumiWijesinghe/Fraud-Detection-with-Federated-Learning/blob/main/federated_le.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import kagglehub

# ================================
# Step 1: Load PaySim Dataset
# ================================

path = kagglehub.dataset_download("ealaxi/paysim1")
file_path = path + "/PS_20174392719_1491204439457_log.csv"

df = pd.read_csv(file_path)
print("Dataset shape:", df.shape)

# ================================
# Step 2: Basic Preprocessing
# ================================

# Encode transaction type
le = LabelEncoder()
df["type"] = le.fit_transform(df["type"])

# Target column
target_col = "isFraud"

# Features
feature_cols = [
    "step", "type", "amount",
    "oldbalanceOrg", "newbalanceOrig",
    "oldbalanceDest", "newbalanceDest"
]

X = df[feature_cols]
y = df[target_col]

# ================================
# Step 3: Dirichlet Non-IID Split
# ================================

NUM_CLIENTS = 12
ALPHA = 0.5   # smaller = more non-IID

np.random.seed(42)

# Separate fraud and non-fraud indices
fraud_idx = np.where(y == 1)[0]
nonfraud_idx = np.where(y == 0)[0]

# Dirichlet distribution
fraud_dist = np.random.dirichlet([ALPHA] * NUM_CLIENTS)
nonfraud_dist = np.random.dirichlet([ALPHA] * NUM_CLIENTS)

# Split indices
fraud_splits = np.split(
    fraud_idx,
    (np.cumsum(fraud_dist)[:-1] * len(fraud_idx)).astype(int)
)

nonfraud_splits = np.split(
    nonfraud_idx,
    (np.cumsum(nonfraud_dist)[:-1] * len(nonfraud_idx)).astype(int)
)

# ================================
# Step 4: Create Client Datasets
# ================================

clients_data = {}

for i in range(NUM_CLIENTS):
    client_indices = np.concatenate(
        [fraud_splits[i], nonfraud_splits[i]]
    )
    np.random.shuffle(client_indices)

    clients_data[i] = {
        "X": X.iloc[client_indices],
        "y": y.iloc[client_indices]
    }

# ================================
# Step 5: Print Fraud Samples per Client
# ================================

print("\nFraud samples per client:\n")

for i in range(NUM_CLIENTS):
    fraud_count = np.sum(clients_data[i]["y"] == 1)
    total_samples = len(clients_data[i]["y"])
    print(f"Client {i+1}: Fraud samples = {fraud_count}, Total samples = {total_samples}")

Using Colab cache for faster access to the 'paysim1' dataset.
Dataset shape: (6362620, 11)

Fraud samples per client:

Client 1: Fraud samples = 372, Total samples = 258223
Client 2: Fraud samples = 1753, Total samples = 51178
Client 3: Fraud samples = 65, Total samples = 478277
Client 4: Fraud samples = 8, Total samples = 5254
Client 5: Fraud samples = 1000, Total samples = 4192294
Client 6: Fraud samples = 1, Total samples = 115029
Client 7: Fraud samples = 2912, Total samples = 663175
Client 8: Fraud samples = 88, Total samples = 18551
Client 9: Fraud samples = 246, Total samples = 1712
Client 10: Fraud samples = 497, Total samples = 83514
Client 11: Fraud samples = 1044, Total samples = 121495
Client 12: Fraud samples = 227, Total samples = 373918


##Define DNN Model (Server & Clients)

In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

INPUT_DIM = len(feature_cols)

def create_dnn_model():
    model = models.Sequential([
        layers.Input(shape=(INPUT_DIM,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model


##Server Initializes Global Model (EMPTY)

In [4]:
global_model = create_dnn_model()


##FedAvg Aggregation Function (Server Side)

In [5]:
def fedavg(client_weights):
    avg_weights = []
    for weights in zip(*client_weights):
        avg_weights.append(np.mean(weights, axis=0))
    return avg_weights


##Federated Training Parameters

In [6]:
NUM_ROUNDS = 5       # Global rounds
LOCAL_EPOCHS = 1     # Client training
BATCH_SIZE = 1024


##FedAvg Training Loop

In [7]:
print("\nStarting Federated Training using FedAvg\n")

for round_num in range(NUM_ROUNDS):
    print(f"--- Global Round {round_num+1} ---")

    client_weights = []

    # Server sends global model to clients
    for client_id in range(NUM_CLIENTS):

        # Client receives global model
        local_model = create_dnn_model()
        local_model.set_weights(global_model.get_weights())

        # Client trains on its own data
        local_model.fit(
            clients_data[client_id]["X"],
            clients_data[client_id]["y"],
            epochs=LOCAL_EPOCHS,
            batch_size=BATCH_SIZE,
            verbose=0
        )

        # Client sends updated model to server
        client_weights.append(local_model.get_weights())

    # Server aggregates models (FedAvg)
    new_global_weights = fedavg(client_weights)
    global_model.set_weights(new_global_weights)

    print(f"Global Round {round_num+1} completed")



Starting Federated Training using FedAvg

--- Global Round 1 ---
Global Round 1 completed
--- Global Round 2 ---
Global Round 2 completed
--- Global Round 3 ---
Global Round 3 completed
--- Global Round 4 ---
Global Round 4 completed
--- Global Round 5 ---
Global Round 5 completed


In [8]:
def evaluate_global_model(global_model, clients_data):
    client_accuracies = []

    for i in range(NUM_CLIENTS):
        loss, acc = global_model.evaluate(
            clients_data[i]["X"],
            clients_data[i]["y"],
            verbose=0
        )
        client_accuracies.append(acc)

        print(f"Client {i+1} Accuracy: {acc:.4f}")

    avg_acc = np.mean(client_accuracies)
    print(f"\nAverage Client Accuracy: {avg_acc:.4f}\n")

    return client_accuracies, avg_acc


In [None]:
print("\nStarting Federated Training using FedAvg\n")

for round_num in range(NUM_ROUNDS):
    print(f"\n--- Global Round {round_num+1} ---")

    client_weights = []

    # -------- CLIENT TRAINING --------
    for client_id in range(NUM_CLIENTS):

        local_model = create_dnn_model()
        local_model.set_weights(global_model.get_weights())

        local_model.fit(
            clients_data[client_id]["X"],
            clients_data[client_id]["y"],
            epochs=LOCAL_EPOCHS,
            batch_size=BATCH_SIZE,
            verbose=0
        )

        client_weights.append(local_model.get_weights())

    # -------- SERVER AGGREGATION --------
    new_global_weights = fedavg(client_weights)
    global_model.set_weights(new_global_weights)

    print("Evaluating global model on each client:")

    # -------- EVALUATION --------
    evaluate_global_model(global_model, clients_data)



Starting Federated Training using FedAvg


--- Global Round 1 ---
Evaluating global model on each client:
Client 1 Accuracy: 0.9909
Client 2 Accuracy: 0.9832
Client 3 Accuracy: 0.9934
Client 4 Accuracy: 0.9952
Client 5 Accuracy: 0.9987
Client 6 Accuracy: 0.9996
Client 7 Accuracy: 0.9979
Client 8 Accuracy: 0.9976
Client 9 Accuracy: 0.9422
Client 10 Accuracy: 0.9971
Client 11 Accuracy: 0.9959
Client 12 Accuracy: 0.9994

Average Client Accuracy: 0.9909


--- Global Round 2 ---
Evaluating global model on each client:
Client 1 Accuracy: 0.9907
Client 2 Accuracy: 0.9817
Client 3 Accuracy: 0.9936
Client 4 Accuracy: 0.9947
Client 5 Accuracy: 0.9992
Client 6 Accuracy: 1.0000
Client 7 Accuracy: 0.9980
Client 8 Accuracy: 0.9977
Client 9 Accuracy: 0.9311
Client 10 Accuracy: 0.9970
Client 11 Accuracy: 0.9955
Client 12 Accuracy: 0.9997

Average Client Accuracy: 0.9899


--- Global Round 3 ---
Evaluating global model on each client:
Client 1 Accuracy: 0.9923
Client 2 Accuracy: 0.9830
Client 3 Accura