In [6]:
import warnings

# Deactivate all warnings
warnings.filterwarnings("ignore")

In [8]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import json
from google.cloud import storage
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

load_dotenv()

# Custom weighted loss function
def weighted_loss(y_true, y_pred):
    loss = K.binary_crossentropy(y_true, y_pred)
    fraud_weight = tf.ones_like(y_true) * 1.0  # Shape: (batch_size, 1)
    non_fraud_weight = tf.ones_like(y_true) * 0.01
    weight = tf.where(tf.equal(y_true, 1), fraud_weight, non_fraud_weight)
    return K.mean(loss * weight)

# Suppress TensorFlow logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Initialize Google Cloud Storage client
def init_gcp_client():
    client = storage.Client()
    return client

gcp_client = init_gcp_client()

# Load transactions from Google Cloud Storage
def load_transactions(bucket_name, file_name):
    try:
        bucket = gcp_client.get_bucket(bucket_name)
        blob = bucket.blob(file_name)
        transactions = json.loads(blob.download_as_text())
        return transactions
    except Exception as e:
        return []

In [9]:
# Define model
def create_model(input_dim):
    inputs = tf.keras.layers.Input(shape=(input_dim,))
    hidden = tf.keras.layers.Dense(32, activation="relu")(inputs)
    dropout = tf.keras.layers.Dropout(0.3)(hidden)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)
    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer="adam", loss=weighted_loss, metrics=["AUC"])
    return model

In [10]:
GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME", "federated-learning")
NUM_ROUNDS = 1

In [11]:
# Load data
TRANSACTIONS_FILE = f"Bank_{BANK_ID}_transactions.json"
transactions = load_transactions(GCS_BUCKET_NAME, TRANSACTIONS_FILE)
#transactions[:1]

In [12]:
def evaluate_model(bank_id):
    TRANSACTIONS_FILE = f"Bank_{bank_id}_transactions.json"
    transactions = load_transactions(GCS_BUCKET_NAME, TRANSACTIONS_FILE)
    df = pd.json_normalize(transactions, sep="_")

    # Define possible party type-role combinations
    POSSIBLE_PARTY_COMBINATIONS = [
        ("individual", "UBO"),
        ("entity", "UBO"),
    ]
    party_columns = [f"party_{ptype}_{prole}" for ptype, prole in POSSIBLE_PARTY_COMBINATIONS]

    # Function to count occurrences based on predefined values
    def count_party_combinations(parties):
        counts = {col: 0 for col in party_columns}
        for party in parties:
            col_name = f"party_{party.get('party_type')}_{party.get('party_role')}"
            if col_name in counts:
                counts[col_name] += 1
        return counts

    # Apply function to transactions
    party_data = [count_party_combinations(tx.get("Transaction", {}).get("account", {}).get("parties", [])) for tx in transactions]
    df_parties = pd.DataFrame(party_data).reindex(columns=party_columns, fill_value=0)
    df = pd.concat([df, df_parties], axis=1)

    # Define predefined transaction beneficiary values
    POSSIBLE_BENEFICIARIES = [f"P{i}" for i in range(1, 11)]

    # One-hot encode categorical features
    encoder = OneHotEncoder(categories=[POSSIBLE_BENEFICIARIES], drop="first", sparse_output=False, handle_unknown="ignore")

    # Prepare features and labels
    X = df[[
        "Transaction_transaction_type",
        "Transaction_currency_amount",
        "Transaction_account_country_code",
        "Transaction_transaction_beneficiary_country_code",
        "Transaction_transaction_beneficiary"
    ] + party_columns]  # Include new party count features

    y = df["Transaction_local_label"]

    # Create a ColumnTransformer to apply OHE to categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ("transaction_type", encoder, ["Transaction_transaction_type"]),
            ("account_country", encoder, ["Transaction_account_country_code"]),
            ("beneficiary_country", encoder, ["Transaction_transaction_beneficiary_country_code"]),
            ("beneficiary", encoder, ["Transaction_transaction_beneficiary"]),
            ("currency_amount", "passthrough", ["Transaction_currency_amount"]),
            ("party_counts", "passthrough", party_columns),
        ]
    )

    # Apply the transformer
    X_processed = preprocessor.fit_transform(X)


    # Split the data
    X_train, X_test, y_train_local, y_test_local, train_indices, test_indices = train_test_split(
        X_processed, y, df.index, test_size=0.2, random_state=42, stratify=y
    )

    # Extract global labels for the test set
    y_test_global = df.loc[test_indices, "Transaction_global_label"].values

    # Scale currency amount (assuming min-max scaling)
    X_train[:, -len(party_columns)-1] = (X_train[:, -len(party_columns)-1] - 10) / (50000 - 10)
    X_test[:, -len(party_columns)-1] = (X_test[:, -len(party_columns)-1] - 10) / (50000 - 10)

    # Convert to NumPy arrays
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train, y_test_local = y_train_local.values, y_test_local.values

    model = create_model(X_train.shape[1])
    history = model.fit(X_train, y_train, epochs=30*NUM_ROUNDS, batch_size=64, verbose=0)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    local_train_auc = roc_auc_score(y_train, y_train_pred)
    local_auc = roc_auc_score(y_test_local, y_test_pred)
    global_auc = roc_auc_score(y_test_global, y_test_pred)
    print(f"""
            AUC of detecting 1 scenario on the training set: {local_train_auc:0.4f},
            AUC of detecting 1 scenario on the test set: {local_auc:0.4f},
            AUC of detecting 4 scenari on the test set : {global_auc:0.4f}
    """)

In [13]:
evaluate_model(1)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step

            AUC of detecting 1 scenario on the training set: 0.7415,
            AUC of detecting 1 scenario on the test set: 0.7128,
            AUC of detecting 4 scenari on the test set : 0.5553
    


In [14]:
evaluate_model(2)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 644us/step

            AUC of detecting 1 scenario on the training set: 0.5372,
            AUC of detecting 1 scenario on the test set: 0.5218,
            AUC of detecting 4 scenari on the test set : 0.5444
    


In [15]:
evaluate_model(3)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 649us/step

            AUC of detecting 1 scenario on the training set: 0.9996,
            AUC of detecting 1 scenario on the test set: 0.9998,
            AUC of detecting 4 scenari on the test set : 0.6601
    


In [16]:
evaluate_model(4)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

            AUC of detecting 1 scenario on the training set: 1.0000,
            AUC of detecting 1 scenario on the test set: 1.0000,
            AUC of detecting 4 scenari on the test set : 0.7055
    


## Results from Federated Learning after 1 epoch:

### Training Results:
- (4000, {'local_train_auc': 0.5619037130440171})
- (4000, {'local_train_auc': 0.9998835463760644})
- (4000, {'local_train_auc': 0.7388128731215261})
- (4000, {'local_train_auc': 1.0})

**Aggregated Local Training AUC**: 0.8251500331354019

### Testing Results:
- (1000, {'global_auc': 0.8739103111555133, 'local_auc': 0.9191768029882196})
- (1000, {'global_auc': 0.8814535884198805, 'local_auc': 0.4971177981437161})
- (1000, {'global_auc': 0.8660674121200437, 'local_auc': 0.8510115388920523})
- (1000, {'global_auc': 0.8820727895957254, 'local_auc': 0.4941996480082418})

**Aggregated local test AUC**: 0.6904
**Aggregated global test AUC**: 0.8759

## Summary

| Bank - Detection Focus        | Local Model for 1 Scenario | Local Model for 4 Scenari | Federated Learning for 1 Scenario | Federated Learning for 4 Scenari |
|------------------------------|-------------------------------|----------------------------------------|--------------------------------------|-------|
| Bank 1 - Large Cash Deposits  | AUC = 0.7072                  | AUC = 0.5735                           | AUC = 0.9191                            | AUC = **0.8739** |
| Bank 2 – High-Risk Transactions| AUC = 0.5229                  | AUC = 0.5738                           | AUC = 0.4971                            | AUC = **0.8814** |
| Bank 3 - Many UBOs            | AUC = 0.9999                  | AUC = 0.6689                           | AUC =  0.8510                            | AUC = **0.8660** |
| Bank 4 - Watchlist Entities   | AUC = 1.0000                  | AUC = 0.6977                           | AUC = 0.4941                            | AUC = **0.8820** |
