### Necessary Imports

In [1]:
import numpy as np
import sys
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import json

from sklearn.metrics import (accuracy_score,classification_report,confusion_matrix)
from sklearn import metrics

import pickle
from importlib.resources import path
from pathlib import Path
import os
import importlib
import time

import numpy as np
import pandas as pd
import sklearn.utils
from pandas import Int64Index, MultiIndex
from sklearn import metrics

import submission_src.fincrime.solution_centralized as funcs
importlib.reload(funcs) 

import tensorflow as tf

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split


#### Merging swift Traindata with bankdata

In [2]:
dirname = os.path.dirname('D:/fincrime-federated/')
os.chdir(dirname)
print(os.getcwd())

start_time = time.time()

model_dir = 'D:/fincrime-federated/model/fincrime'
preds_format_path = 'D:/fincrime-federated/prediction/fincrime/prediction_format'
preds_dest_path = 'D:/fincrime-federated/prediction/fincrime/prediction'


## train on data
datapathjsonString = 'data/fincrime/centralized/train/data.json'
swift_data_path = funcs.json_to_dict(datapathjsonString)['swift_data_path']
bank_data_path = funcs.json_to_dict(datapathjsonString)['bank_data_path']

def json_to_dict(datapathjsonString):
    datapathJson = open(datapathjsonString)
    datapathDict = json.load(datapathJson)
    return datapathDict


def load_data(swift_data_path, bank_data_path):
    swift_data = pd.read_csv(swift_data_path, index_col="MessageId")
    swift_data["Timestamp"] = swift_data["Timestamp"].astype("datetime64[ns]")
    bank_data = pd.read_csv(bank_data_path)
    return swift_data, bank_data

train_data, bank_data = load_data(swift_data_path=swift_data_path, bank_data_path=bank_data_path)

# Merging with bank details
train_data = pd.merge(train_data,bank_data,left_on="OrderingAccount",right_on="Account",how="left",)
train_data = pd.merge(train_data,bank_data,left_on="BeneficiaryAccount",right_on="Account",how="left",
                      suffixes=["_order", "_ben"],)

trainset = train_data
print("Total time taken ", (time.time() - start_time)," seconds")
print(trainset.shape)

D:\fincrime-federated
Total time taken  101.5761559009552  seconds


#### Merging swift Testdata with bankdata

In [3]:
start_time = time.time()

datapathjsonString = 'data/fincrime/centralized/test/data.json'
swift_data_path = funcs.json_to_dict(datapathjsonString)['swift_data_path']
bank_data_path = funcs.json_to_dict(datapathjsonString)['bank_data_path']

test_data, bank_data = load_data(swift_data_path=swift_data_path, bank_data_path=bank_data_path)

# Merging with bank details
test_data = pd.merge(test_data,bank_data,left_on="OrderingAccount",right_on="Account",how="left",)
test_data = pd.merge(test_data,bank_data,left_on="BeneficiaryAccount",right_on="Account",how="left",
                     suffixes=["_order", "_ben"],)

testset = test_data
print("Total time taken ", (time.time() - start_time)," seconds")
print(testset.shape)

Total time taken  150.54939603805542  seconds


### Dropping Null Values

In [4]:
# trainset.dropna(axis=0,how='any',inplace=True)
# print("trainset shape is:"trainset.shape)
# testset.dropna(axis=0,how='any',inplace=True)
# print("testset shape is:"testset.shape)

### Normalization / Scaling

In [5]:
# def stats(train_dataset):
#     train_stats = train_dataset.describe()
#     train_stats = train_stats.transpose()
#     return train_stats

# def norm(x):
#     train_stats=stats(x)
#     return (x - train_stats['mean']) / train_stats['std']

In [6]:
scaler = MinMaxScaler()

# scaler = RobustScaler(quantile_range=(25, 75))

def norm(x):
    scaler.fit(x)
    return scaler.transform(x)

# Data Prep Functions (Call these functions)

In [7]:
def data_prep(train_data):
    # Feature engineering
#     train_data = train_data.dropna(axis=0)
    
    train_data = create_features(
        df=train_data, model_dir=model_dir, map_from_train_set=False
    )

    # Keep below columns for training and testing
    cols_to_keep = [
        "SettlementAmount",
        "InstructedAmount",
        "Label",
        "hour",
        "Flags_ben",
        "MissingBenAccount",
        "MissingOrdAccount",
        "Sender_hour_frequency",
        # 'sender_currency_amount_average',
        "Sender_Receiver_frequency",
        "Sender_InstructedCurrency_frequency",
        "seq",
        # 'receiver_transactions',
        "Receiver_SettlementCurrency_frequency",
        "Receiver_hour_frequency",
        "DifferentOrderNum",
        "DifferentBenNum",
        "DifferentOrderName",
        "DifferentBenName",
        "DifferentOrderStreet",
        "DifferentBenStreet",
        "DifferentOrderZip",
        "DifferentBenZip",
    ]

    train_data_2 = train_data[cols_to_keep]
    return train_data_2

In [8]:
def create_features(df, model_dir, map_from_train_set=False):

    ## Feature Engineering

    # Hour column
    df["hour"] = df["Timestamp"].dt.hour

    # Hour frequency for each sender
    freq_by_features(df=df,level_1="Sender",level_2="hour",model_dir=model_dir,save_as_object=True,
                     map_from_train_set=map_from_train_set,)
    # Hour frequency for each receiver
    freq_by_features(df=df,level_1="Receiver",level_2="hour",model_dir=model_dir,save_as_object=True,
                     map_from_train_set=map_from_train_set,)
    # Sender-Currency Frequency
    freq_by_features(df=df,level_1="Sender",level_2="InstructedCurrency",model_dir=model_dir,save_as_object=True,
                     map_from_train_set=map_from_train_set,)
    # Receiver-SettledCurrency Frequency
    freq_by_features(df=df,level_1="Receiver",level_2="SettlementCurrency",model_dir=model_dir,save_as_object=True,
                     map_from_train_set=map_from_train_set,)
    # Sender-Receiver Frequency
    freq_by_features(df=df,level_1="Sender",level_2="Receiver",model_dir=model_dir,save_as_object=True,
                     map_from_train_set=map_from_train_set,)

    # # Average Amount per Sender-Currency - not working
    # Sender_ICurrency_mean = mean_by_features(df = df, level_1="Sender" , level_2="InstructedCurrency")
    # df.loc[:,"Sender_ICurrency_mean"] = \
    #     df.loc[:,"Sender_InstructedCurrency"].map(Sender_ICurrency_mean)

    # Numbering the transactions within a account order - ben - date combination
    df = df.sort_values(
        by=[
            "SettlementDate",
            "Sender",
            "Receiver",
            "Account_order",
            "Account_ben",
            "Timestamp",
        ],
        ascending=True,
    )
    df["seq"] = (
        df.groupby(
            [
                "SettlementDate",
                "Sender",
                "Receiver",
                "Account_order",
                "Account_ben",
            ]
        ).cumcount()
        + 1
    )
    df["seq"] = df["seq"].replace(np.NAN, 1)

    # Flag columns for transactions with missing bank details
    df[["MissingBenAccount"]] = 0
    df.loc[df["Flags_ben"].isnull(), "MissingBenAccount"] = 1
    df[["MissingOrdAccount"]] = 0
    df.loc[df["Flags_order"].isnull(), "MissingOrdAccount"] = 1

    # Different sender account number from bank details
    df["DifferentOrderNum"] = np.where(
        df["Account_order"] == df["OrderingAccount"], 0, 1
    )
    # Different receiver account number from bank details
    df["DifferentBenNum"] = np.where(
        df["Account_ben"] == df["BeneficiaryAccount"], 0, 1
    )

    # Different sender account name from bank details
    df["DifferentOrderName"] = np.where(
        df["Name_order"] == df["OrderingName"], 0, 1
    )
    # Different receiver account name from bank details
    df["DifferentBenName"] = np.where(
        df["Name_ben"] == df["BeneficiaryName"], 0, 1
    )

    # Different sender account ordering street from bank details
    df["DifferentOrderStreet"] = np.where(
        df["Street_order"] == df["OrderingStreet"], 0, 1
    )
    # Different receiver account ordering street from bank details
    df["DifferentBenStreet"] = np.where(
        df["Street_ben"] == df["BeneficiaryStreet"], 0, 1
    )

    # Different sender account country code/zip from bank details
    df["DifferentOrderZip"] = np.where(
        df["CountryCityZip_order"] == df["OrderingCountryCityZip"], 0, 1
    )
    # Different receiver account country code/zip from bank details
    df["DifferentBenZip"] = np.where(
        df["CountryCityZip_ben"] == df["BeneficiaryCountryCityZip"], 0, 1
    )

    # Some missing value treatment
    df.loc[df["Flags_ben"].isna(), "Flags_ben"] = 99

    return df

In [9]:
def freq_by_features(df,level_1,level_2,model_dir,save_as_object=False,map_from_train_set=False,) -> None:
    if map_from_train_set == True:
        level_1_level_2_frequency = pickle.load(
            open(model_dir+"/"+ str(level_1 + "_" + level_2 + "_frequency")+ ".sav","rb",)
        )
        _level_1 = df.loc[:, level_1].unique()
        _level_2 = df.loc[:, level_2].unique()
        df[str(level_1 + "_" + level_2)] = df.loc[:, level_1] + df.loc[
            :, level_2
        ].astype(str)
        df.loc[:, str(level_1 + "_" + level_2 + "_frequency")] = df.loc[
            :, str(level_1 + "_" + level_2)
        ].map(level_1_level_2_frequency)
    else:
        _level_1 = df.loc[:, level_1].unique()
        _level_2 = df.loc[:, level_2].unique()
        df[str(level_1 + "_" + level_2)] = df.loc[:, level_1] + df.loc[
            :, level_2
        ].astype(str)
        level_1_level_2_frequency = {}
        for s in _level_1:
            level_1_rows = df[df[level_1] == s]
            for h in _level_2:
                level_1_level_2_frequency[s + str(h)] = len(
                    level_1_rows[level_1_rows.loc[:, level_2] == h]
                )

        df.loc[:, str(level_1 + "_" + level_2 + "_frequency")] = df.loc[
            :, str(level_1 + "_" + level_2)
        ].map(level_1_level_2_frequency)

        if save_as_object == True:
            pickle.dump(
                level_1_level_2_frequency,
                open(model_dir + "/" + str(level_1 + "_" + level_2 + "_frequency") + ".sav","wb",),
            )

    ...

## Centralized result using Keras (Executing Main Script)

In [10]:
# Train/Test/validation split

trainloadset,valloadset=train_test_split(trainset,test_size=0.3)
testloader=testset

In [11]:
(trainloadset.shape)

(3284207, 31)

In [12]:
testloader.shape

(705108, 31)

In [15]:
# Data Preparation
tl=data_prep(trainloadset)
vl=data_prep(valloadset)
testl=data_prep(testloader)

Y_train=tl.pop('Label')
Y_val=vl.pop('Label')
Y_test=testl.pop('Label')

X_train=norm(tl)
X_val=norm(vl)
X_test=norm(testl)

In [16]:
from tqdm.keras import TqdmCallback
total_num = Y_train.shape[0]
num1 = np.count_nonzero(Y_train)
num0 = total_num - num1

# import NN layers and other componenets.
tf.random.set_seed(13)

# Keras model 

def build_model(input_shape):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(32, input_shape = (input_shape,)))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model

input_shape = X_train.shape[1]
model = build_model(input_shape)
learning_rate = 0.001
optimizer = 'adam'
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy',tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

model.save("my_model")

early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
logger = TqdmCallback(verbose=2)
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_val,Y_val),
    class_weight={0: num1 / total_num, 1: num0 / total_num},
    callbacks=[early_stopping, logger],
    verbose=0
)


print('Test Split: ')
results =  model.evaluate(X_test, Y_test, verbose=2)
print(model.metrics_names)
print(results)


# Prediction and AUPRC score calcualtion
y_pred = model.predict(X_test)
ynew = np.round(y_pred).astype(int)
print(ynew)
print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=ynew))



INFO:tensorflow:Assets written to: my_model\assets


INFO:tensorflow:Assets written to: my_model\assets


0epoch [00:00, ?epoch/s]

  0%|          | 0.00/25.7k [00:00<?, ?batch/s]

  0%|          | 0.00/25.7k [00:00<?, ?batch/s]

  0%|          | 0.00/25.7k [00:00<?, ?batch/s]

  0%|          | 0.00/25.7k [00:00<?, ?batch/s]

  0%|          | 0.00/25.7k [00:00<?, ?batch/s]

Test Split: 
22035/22035 - 46s - loss: 693.2631 - accuracy: 0.1202 - precision: 0.0012 - recall: 0.9961 - 46s/epoch - 2ms/step
['loss', 'accuracy', 'precision', 'recall']
[693.2630615234375, 0.12016740441322327, 0.0012203524820506573, 0.9960578083992004]
[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]
AUPRC: 0.0012197963341076439
