In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import json
import os

## Load Data

In [2]:
transactions = pd.read_csv("raw_datasets/LI-Small_Trans.csv")
accounts = pd.read_csv("raw_datasets/LI-Small_accounts.csv")

In [3]:
transactions.columns

Index(['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering'],
      dtype='object')

In [4]:
transactions.columns.tolist()

['Timestamp',
 'From Bank',
 'Account',
 'To Bank',
 'Account.1',
 'Amount Received',
 'Receiving Currency',
 'Amount Paid',
 'Payment Currency',
 'Payment Format',
 'Is Laundering']

In [6]:
transactions.shape

(6924049, 11)

In [7]:
accounts.shape

(712688, 5)

## Clean & Rename

In [10]:
transactions = transactions.rename(columns={
    "Account": "From_Account",
    "Account.1": "To_Account"
})

In [11]:
sender_accounts = accounts.rename(columns=lambda x: "Sender_" + x if x not in ["Account Number"] else "Sender_Account_Number")
transactions = transactions.merge(sender_accounts, left_on="From_Account", right_on="Sender_Account_Number", how="left")

In [12]:
receiver_accounts = accounts.rename(columns=lambda x: "Receiver_" + x if x not in ["Account Number"] else "Receiver_Account_Number")
transactions = transactions.merge(receiver_accounts, left_on="To_Account", right_on="Receiver_Account_Number", how="left")

## Stratified Split

In [13]:
def split_stratified(df, label_col="Is Laundering", n_splits=10):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    partitions = []
    for i, (_, idx) in enumerate(skf.split(df, df[label_col])):
        subset = df.iloc[idx].copy()
        subset["Bank_Partition"] = i
        partitions.append(subset)
    return partitions
  
partitions = split_stratified(transactions, label_col="Is Laundering", n_splits=10)

## Export JSON Feeds

In [27]:
output_folder = "cleaned_data_json"
os.makedirs(output_folder, exist_ok=True)

# Export each partition as JSON
for i, subset in enumerate(partitions):
    bank_data = {
        "bank_id": int(i),
        "records": subset.to_dict(orient="records")
    }
    output_file = os.path.join(output_folder, f"bank_{i}.json")
    with open(output_file, "w") as f:
        json.dump(bank_data, f, indent=2)

## Validate Class Balance

In [14]:
print("Class Balance per Partition:")
for i, subset in enumerate(partitions):
    total = len(subset)
    pos = subset["Is Laundering"].sum()
    print(f"Bank {i}: Total={total}, Laundering={pos}, Ratio={pos/total:.4f}")

Class Balance per Partition:
Bank 0: Total=692417, Laundering=357, Ratio=0.0005
Bank 1: Total=692416, Laundering=356, Ratio=0.0005
Bank 2: Total=692416, Laundering=356, Ratio=0.0005
Bank 3: Total=692416, Laundering=356, Ratio=0.0005
Bank 4: Total=692416, Laundering=356, Ratio=0.0005
Bank 5: Total=692416, Laundering=356, Ratio=0.0005
Bank 6: Total=692416, Laundering=357, Ratio=0.0005
Bank 7: Total=692416, Laundering=357, Ratio=0.0005
Bank 8: Total=692416, Laundering=357, Ratio=0.0005
Bank 9: Total=692416, Laundering=357, Ratio=0.0005
