In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler

In [29]:
df = pd.read_csv('/content/drive/MyDrive/NUS/CS5340/HI-Medium_Trans.csv')
df.head(2)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0


In [31]:
df['FromBankAcc'] = df.iloc[:,1].astype(str) + '_' + df.iloc[:,2]
df['ToBankAcc'] = df.iloc[:,3].astype(str) + '_' + df.iloc[:,4]

In [32]:
clean_df = pd.DataFrame()

clean_df['FromAccount'] = encode_acct.transform(df['FromBankAcc'])
clean_df['ToAccount'] = encode_acct.transform(df['ToBankAcc'])
clean_df['FromBank'] = encode_bank.transform(df['From Bank'])
clean_df['ToBank'] = encode_bank.transform(df['To Bank'])
clean_df['ReceivingCurrency'] = encode_curr.transform(df['Receiving Currency'])
clean_df['PaymentCurrency'] = encode_curr.transform(df['Payment Currency'])
#clean_df['PaymentFormat'] = encode_paym_format.transform(df['Payment Format'])
#clean_df['Timestamp'] = pd.to_datetime(df['Timestamp'])
clean_df['AmountPaid'] = df['Amount Paid']
clean_df['AmountReceived'] = df['Amount Received']
clean_df['IsLaundering'] = df['Is Laundering']

clean_df.head()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,AmountPaid,AmountReceived,IsLaundering
0,6530,6530,8,8,12,12,3697.34,3697.34,0
1,358174,176809,109,0,12,12,0.01,0.01,0
2,358476,358476,110,110,12,12,14675.57,14675.57,0
3,74640,74640,10,10,12,12,2806.97,2806.97,0
4,6538,6538,8,8,12,12,36682.97,36682.97,0


In [33]:
def mapValueToCategories(df, processed_df, columns):
    value_to_category_mapping = {'Cash': 1, 'Cheque': 2, 'ACH': 3, 'Credit Card': 4, 'Wire': 5, 'Bitcoin': 6, 'Reinvestment': 7}
    for column in columns:
        processed_df[column] = df[column].map(value_to_category_mapping)

mapValueToCategories(df, clean_df, ['Payment Format'])

clean_df.head()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,AmountPaid,AmountReceived,IsLaundering,Payment Format
0,6530,6530,8,8,12,12,3697.34,3697.34,0,7
1,358174,176809,109,0,12,12,0.01,0.01,0,2
2,358476,358476,110,110,12,12,14675.57,14675.57,0,7
3,74640,74640,10,10,12,12,2806.97,2806.97,0,7
4,6538,6538,8,8,12,12,36682.97,36682.97,0,7


In [34]:
# Normalisation Step (sample)
scaler = StandardScaler()

feature_df = clean_df.drop(columns=['IsLaundering'])

# scale only feature columns
output_df = scaler.set_output(transform='pandas').fit_transform(feature_df)
output_df.head(10)

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,AmountPaid,AmountReceived,Payment Format
0,-1.418831,-1.42599,-0.544312,-0.867952,0.877713,0.870396,-0.00518,-0.00577,2.223591
1,0.734133,-0.246515,-0.515976,-0.870592,0.877713,0.870396,-0.005184,-0.005774,-0.727843
2,0.735982,1.011842,-0.515696,-0.834289,0.877713,0.870396,-0.005168,-0.00576,2.223591
3,-1.001823,-0.954211,-0.543751,-0.867292,0.877713,0.870396,-0.005181,-0.005771,2.223591
4,-1.418782,-1.425934,-0.544312,-0.867952,0.877713,0.870396,-0.005142,-0.005739,2.223591
5,-0.376281,-0.246508,-0.546557,-0.870592,0.877713,0.870396,-0.005177,-0.005768,2.223591
6,-0.376397,-0.246639,-0.546557,-0.870592,0.877713,0.870396,-0.005184,-0.005774,2.223591
7,-0.376385,-0.246626,-0.546557,-0.870592,0.877713,0.870396,-0.005184,-0.005774,2.223591
8,-1.001854,0.398391,-0.543751,-0.84155,0.877713,0.870396,-0.005184,-0.005774,0.452731
9,-0.37636,-0.174401,-0.546557,1.26238,0.877713,0.870396,-0.005184,-0.005774,0.452731


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

x_train, x_test, y_train, y_test = train_test_split(output_df, df.iloc[:, [-3]], test_size=0.2, random_state=42)

train_df = pd.concat([pd.DataFrame(x_train), pd.DataFrame(y_train)],axis=1)
class_counts = train_df['Is Laundering'].value_counts()
df_majority = train_df[train_df['Is Laundering'] == 0]
df_minority = train_df[train_df['Is Laundering'] == 1]
df_majority_downsampled = resample(df_majority, replace=False, n_samples=class_counts[1])
df_balanced = pd.concat([df_majority_downsampled, df_minority])
#print(df_balanced)
x_train, y_train = df_balanced.iloc[:, :-1], df_balanced.iloc[:, [-1]]

In [36]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

model = GaussianNB()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuray = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test)
f1_unweighted = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuray)
print("F1 Score:", f1)
print("F1 Score unweighted:", f1_unweighted)


  y = column_or_1d(y, warn=True)


Accuracy: 0.9967243265276384
F1 Score: 0.0023988005997001498
F1 Score unweighted: 0.996155090715067


In [41]:
from sklearn.metrics import roc_curve, auc
y_pred_proba = model.predict_proba(x_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
oc_auc = auc(fpr, tpr)
print('auc: ', oc_auc)

auc:  0.5007340522597006


In [44]:
CM = confusion_matrix(y_test, y_pred)
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print(TP / (TP + FP))
print(TP / (TP + FN))


0.0017793594306049821
0.0036798528058877645
