In [1]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import json
import boto3
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

load_dotenv()

# Custom weighted loss function
def weighted_loss(y_true, y_pred):
    loss = K.binary_crossentropy(y_true, y_pred)  
    fraud_weight = tf.ones_like(y_true) * 1.0  # Shape: (batch_size, 1)
    non_fraud_weight = tf.ones_like(y_true) * 0.01    
    weight = tf.where(tf.equal(y_true, 1), fraud_weight, non_fraud_weight)
    return K.mean(loss * weight)

# Suppress TensorFlow logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Environment variables
BANK_ID = os.getenv("BANK_ID", "1")
SERVER_ADDRESS = os.getenv("SERVER_ADDRESS", "server:8080")
SPACES_ACCESS_KEY = os.getenv("SPACES_ACCESS_KEY")
SPACES_SECRET_KEY = os.getenv("SPACES_SECRET_KEY")
SPACE_NAME = os.getenv("SPACE_NAME", "federated-learning")
TRANSACTIONS_FILE = f"Bank_{BANK_ID}_transactions.json"

# Initialize DigitalOcean Spaces client
def init_s3_client():
    s3 = boto3.client(
        's3',
        endpoint_url='https://fra1.digitaloceanspaces.com',
        aws_access_key_id=SPACES_ACCESS_KEY,
        aws_secret_access_key=SPACES_SECRET_KEY,
    )
    return s3

s3_client = init_s3_client()

# Load transactions from DigitalOcean Spaces
def load_transactions(space_name, file_name):
    try:
        obj = s3_client.get_object(Bucket=space_name, Key=file_name)
        transactions = json.loads(obj["Body"].read().decode("utf-8"))
        return transactions
    except Exception as e:
        return []

In [2]:
# Load data
transactions = load_transactions(SPACE_NAME, TRANSACTIONS_FILE)

transactions[:1]

[]

In [3]:
df = pd.json_normalize(transactions, sep="_")

# Define possible party type-role combinations
POSSIBLE_PARTY_COMBINATIONS = [
    ("individual", "UBO"),
    ("entity", "UBO"),
]
party_columns = [f"party_{ptype}_{prole}" for ptype, prole in POSSIBLE_PARTY_COMBINATIONS]

# Function to count occurrences based on predefined values
def count_party_combinations(parties):
    counts = {col: 0 for col in party_columns}
    for party in parties:
        col_name = f"party_{party.get('party_type')}_{party.get('party_role')}"
        if col_name in counts:
            counts[col_name] += 1
    return counts

# Apply function to transactions
party_data = [count_party_combinations(tx.get("Transaction", {}).get("account", {}).get("parties", [])) for tx in transactions]
df_parties = pd.DataFrame(party_data).reindex(columns=party_columns, fill_value=0)
df = pd.concat([df, df_parties], axis=1)

# Define predefined transaction beneficiary values
POSSIBLE_BENEFICIARIES = [f"P{i}" for i in range(1, 11)]

# One-hot encode categorical features
encoder = OneHotEncoder(categories=[POSSIBLE_BENEFICIARIES], drop="first", sparse_output=False, handle_unknown="ignore")

# Prepare features and labels
X = df[[
    "Transaction_transaction_type", 
    "Transaction_currency_amount", 
    "Transaction_account_country_code", 
    "Transaction_transaction_beneficiary_country_code",
    "Transaction_transaction_beneficiary"
] + party_columns]  # Include new party count features

y = df["Transaction_local_label"]

# Create a ColumnTransformer to apply OHE to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("transaction_type", encoder, ["Transaction_transaction_type"]),
        ("account_country", encoder, ["Transaction_account_country_code"]),
        ("beneficiary_country", encoder, ["Transaction_transaction_beneficiary_country_code"]),
        ("beneficiary", encoder, ["Transaction_transaction_beneficiary"]),
        ("currency_amount", "passthrough", ["Transaction_currency_amount"]),
        ("party_counts", "passthrough", party_columns),
    ]
)

# Apply the transformer
X_processed = preprocessor.fit_transform(X)

KeyError: "['Transaction_transaction_type', 'Transaction_currency_amount', 'Transaction_account_country_code', 'Transaction_transaction_beneficiary_country_code', 'Transaction_transaction_beneficiary'] not in index"

In [4]:
# Split the data
X_train, X_test, y_train_local, y_test_local, train_indices, test_indices = train_test_split(
    X_processed, y, df.index, test_size=0.2, random_state=42, stratify=y
)

# Extract global labels for the test set
y_test_global = df.loc[test_indices, "Transaction_global_label"].values

# Scale currency amount (assuming min-max scaling)
X_train[:, -len(party_columns)-1] = (X_train[:, -len(party_columns)-1] - 10) / (50000 - 10)
X_test[:, -len(party_columns)-1] = (X_test[:, -len(party_columns)-1] - 10) / (50000 - 10)

# Convert to NumPy arrays
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train, y_test_local = y_train_local.values, y_test_local.values

In [5]:
# Define model
def create_model(input_dim):
    inputs = tf.keras.layers.Input(shape=(input_dim,))
    hidden = tf.keras.layers.Dense(32, activation="relu")(inputs)
    dropout = tf.keras.layers.Dropout(0.3)(hidden)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)
    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer="adam", loss=weighted_loss, metrics=["AUC"])
    return model

model = create_model(X_train.shape[1])

2025-03-29 15:47:55.831055: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [6]:
NUM_ROUNDS = 1
history = model.fit(X_train, y_train, epochs=30*NUM_ROUNDS, batch_size=64, verbose=0)

In [7]:
y_train_pred = model.predict(X_train)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 792us/step


In [8]:
local_train_auc = roc_auc_score(y_train, y_train_pred)
local_train_auc * 100

73.73710095827299

In [9]:
y_local_pred = model.predict(X_test)
local_auc = roc_auc_score(y_test_local, y_local_pred)
local_auc * 100

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


73.13058198587356

In [10]:
y_global_pred = model.predict(X_test)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [11]:
global_auc = roc_auc_score(y_test_global, y_global_pred)
global_auc * 100

52.15425360118855

## Conclusion

The AUC of detecting 1 scenario on the training set is: 0.7374.  
The AUC of detecting 1 scenario on the test set is: 0.7313.   
The AUC of detecting 4 scenari on the test set is: 0.5215 (Performance close to random). 

## Results from Federated Learning:

| Epoch | Train AUC (1 scenario) | Test AUC (1 scenario) | Test AUC (4 scenari) |
|-------|---------------------|--------------------|---------------------|
| 1     | 0.8230              | 0.7289             | 0.9083              |
