<a href="https://colab.research.google.com/github/ThisumiWijesinghe/Fraud-Detection-with-Federated-Learning/blob/main/Fraud_detection_system_FL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load dataset  and preprocessing part

In [None]:

# Step 1 - Data Preparation for 12 Clients (Banks)


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


# Load dataset

df = pd.read_csv("fraud.csv")

print("Original Shape:", df.shape)
print(df.head())


# Drop irrelevant columns

df = df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1)

# Encode 'type' column
df['type'] = LabelEncoder().fit_transform(df['type'])

# Drop rows with missing values (if any)
df = df.dropna()


# Features & Target

X = df.drop(['isFraud'], axis=1)
y = df['isFraud']

print("Fraud distribution:", np.bincount(y))


# Handle class imbalance with SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print("After SMOTE:", np.bincount(y_res))


# Split into 12 Clients (Banks)

clients_X = []
clients_y = []

X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# Divide into 12 roughly equal client datasets
split_size = len(X_train_global) // 12

for i in range(12):
    start = i * split_size
    end = (i + 1) * split_size if i < 11 else len(X_train_global)

    X_client = X_train_global[start:end]
    y_client = y_train_global[start:end]

    # Scale per-client
    scaler = StandardScaler()
    X_client_scaled = scaler.fit_transform(X_client)

    clients_X.append(X_client_scaled)
    clients_y.append(y_client.values)

    print(f"Client {i+1}: {X_client_scaled.shape}, Fraud Cases: {sum(y_client)}")


# Save datasets per client

for i in range(12):
    client_df = pd.DataFrame(clients_X[i], columns=X.columns)
    client_df['isFraud'] = clients_y[i]
    client_df.to_csv(f"client_{i+1}_dataset.csv", index=False)

print(" 12 client datasets saved successfully!")


Original Shape: (233392, 11)
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0      0.0             0.0  
1  M2044282225             0.0             0.0      0.0             0.0  
2   C553264065             0.0             0.0      1.0             0.0  
3    C38997010         21182.0             0.0      1.0             0.0  
4  M1230701703             0.0             0.0      0.0             0.0  
Fraud distribution: [233234    157]
After SMOTE: [233234 233234]
Client 1: (

#ML models training

In [None]:
pip install pandas numpy scikit-learn imbalanced-learn xgboost joblib matplotlib




In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)
import joblib
import matplotlib.pyplot as plt


In [None]:

# Config
DATA_PATH = "fraud.csv"
OUTPUT_DIR = "baseline_output"
RANDOM_STATE = 42
TEST_SIZE = 0.20

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Load & basic preprocessing
df = pd.read_csv(DATA_PATH)
print("Original shape:", df.shape)

# follow thisumi's preprocessing
df = df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, errors='ignore')
df['type'] = LabelEncoder().fit_transform(df['type'])
df = df.dropna()

X = df.drop(['isFraud'], axis=1)
y = df['isFraud'].astype(int)

print("Global fraud distribution (before SMOTE):", np.bincount(y))

Original shape: (655045, 11)
Global fraud distribution (before SMOTE): [654646    398]


In [None]:
smote = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = smote.fit_resample(X, y)
print("After SMOTE distribution:", np.bincount(y_res))

After SMOTE distribution: [654646 654646]


In [None]:

# Train/test split (stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_res
)

In [None]:
# Scale features (fit on train, apply to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

joblib.dump(scaler, os.path.join(OUTPUT_DIR, "scaler.joblib"))

['baseline_output/scaler.joblib']

In [None]:
# models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE)
}


In [None]:
results = []

for name, model in models.items():
    print(f"\nTraining {name} ...")
    model.fit(X_train_scaled, y_train)
    # Save model
    joblib.dump(model, os.path.join(OUTPUT_DIR, f"{name}.joblib"))


Training LogisticRegression ...

Training RandomForest ...

Training XGBoost ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Predictions & scores
y_pred = model.predict(X_test_scaled)
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
else:
    # fallback: decision_function to probabilities (not expected here)
    try:
        y_proba = model.decision_function(X_test_scaled)
    except:
        y_proba = y_pred

precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_proba)

# store results
results.append({
    "Model": name,
    "Precision": round(precision, 4),
    "Recall": round(recall, 4),
    "F1": round(f1, 4),
    "ROC_AUC": round(roc_auc, 4)
})

print(f"{name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC_AUC: {roc_auc:.4f}")
print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))

XGBoost - Precision: 0.9976, Recall: 0.9998, F1: 0.9987, ROC_AUC: 1.0000
Classification report:
              precision    recall  f1-score   support

           0     0.9998    0.9976    0.9987    130930
           1     0.9976    0.9998    0.9987    130929

    accuracy                         0.9987    261859
   macro avg     0.9987    0.9987    0.9987    261859
weighted avg     0.9987    0.9987    0.9987    261859



In [None]:
# Confusion matrix plot (save)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4,4))
plt.imshow(cm, interpolation='nearest')
plt.title(f"Confusion Matrix - {name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.colorbar()
for (i, j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, f"confusion_{name}.png"))
plt.close()

results.append({
      "model": name,
      "precision": precision,
      "recall": recall,
      "f1": f1,
      "roc_auc": roc_auc
  })

In [None]:
# Save results table
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(OUTPUT_DIR, "baseline_results.csv"), index=False)
print("\nSaved baseline results to", os.path.join(OUTPUT_DIR, "baseline_results.csv"))
print("Saved models and confusion matrices to", OUTPUT_DIR)


Saved baseline results to baseline_output/baseline_results.csv
Saved models and confusion matrices to baseline_output


#Visualization part