In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from hmmlearn import hmm
import random
import warnings
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the kernel session, not just a specific noisy one. Consider
# narrowing it if a particular warning is the actual target.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw transaction CSV into `df` and preview the first rows.
data_path = '../../data/HI-Small_Trans.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [3]:
# Necessary preprocessing: prefix each account number with its bank code so that
# both sides of a transaction get a '<bank>_<account>' key.
for key_col, bank_col, acct_col in (
    ('FromAccount', 'From Bank', 'Account'),
    ('ToAccount', 'To Bank', 'Account.1'),
):
    df[key_col] = df[bank_col].astype(str) + '_' + df[acct_col]
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,FromAccount,ToAccount
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0,10_8000EBD30,10_8000EBD30
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0,3208_8000F4580,1_8000F5340
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0,3209_8000F4670,3209_8000F4670
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0,12_8000F5030,12_8000F5030
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0,10_8000F5200,10_8000F5200


In [4]:
# Encoding: one LabelEncoder per categorical domain. Column pairs that share a
# vocabulary are stacked before fitting so both columns of the pair are later
# mapped through the same set of integer codes.
def _fit_on_columns(*column_names):
    """Return a LabelEncoder fitted on the stacked values of the given `df` columns."""
    stacked = pd.concat([df[name] for name in column_names], ignore_index=True)
    return LabelEncoder().fit(stacked)


encode_curr = _fit_on_columns('Receiving Currency', 'Payment Currency')  # all currencies
encode_paym_format = LabelEncoder().fit(df['Payment Format'])  # payment formats
encode_acct = _fit_on_columns('FromAccount', 'ToAccount')  # all unique accounts
encode_bank = _fit_on_columns('From Bank', 'To Bank')  # all unique bank codes


In [5]:
%%time

# Assemble the processed frame in a single constructor call. Column order below
# is the column order of the resulting frame:
#   - parsed timestamps,
#   - integer codes from the fitted encoders (accounts, banks, currencies, format),
#   - the two amount columns copied as-is,
#   - the raw identifier/category columns kept under *Original names,
#   - the laundering label.
clean_df = pd.DataFrame({
    'Timestamp': pd.to_datetime(df['Timestamp']),

    'FromAccount': encode_acct.transform(df['FromAccount']),
    'ToAccount': encode_acct.transform(df['ToAccount']),
    'FromBank': encode_bank.transform(df['From Bank']),
    'ToBank': encode_bank.transform(df['To Bank']),

    'ReceivingCurrency': encode_curr.transform(df['Receiving Currency']),
    'PaymentCurrency': encode_curr.transform(df['Payment Currency']),
    'PaymentFormat': encode_paym_format.transform(df['Payment Format']),

    'AmountPaid': df['Amount Paid'],
    'AmountReceived': df['Amount Received'],

    'FromBankOriginal': df['From Bank'],
    'ToBankOriginal': df['To Bank'],
    'FromAccountOriginal': df['Account'],
    'ToAccountOriginal': df['Account.1'],
    'ReceivingCurrencyOriginal': df['Receiving Currency'],
    'PaymentCurrencyOriginal': df['Payment Currency'],
    'PaymentFormatOriginal': df['Payment Format'],

    'IsLaundering': df['Is Laundering'],
})

clean_df.head()


CPU times: total: 8.11 s
Wall time: 8.11 s


Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,FromBankOriginal,ToBankOriginal,FromAccountOriginal,ToAccountOriginal,ReceivingCurrencyOriginal,PaymentCurrencyOriginal,PaymentFormatOriginal,IsLaundering
0,2022-09-01 00:20:00,6530,6530,8,8,12,12,5,3697.34,3697.34,10,10,8000EBD30,8000EBD30,US Dollar,US Dollar,Reinvestment,0
1,2022-09-01 00:20:00,358174,176809,109,0,12,12,3,0.01,0.01,3208,1,8000F4580,8000F5340,US Dollar,US Dollar,Cheque,0
2,2022-09-01 00:00:00,358476,358476,110,110,12,12,5,14675.57,14675.57,3209,3209,8000F4670,8000F4670,US Dollar,US Dollar,Reinvestment,0
3,2022-09-01 00:02:00,74640,74640,10,10,12,12,5,2806.97,2806.97,12,12,8000F5030,8000F5030,US Dollar,US Dollar,Reinvestment,0
4,2022-09-01 00:06:00,6538,6538,8,8,12,12,5,36682.97,36682.97,10,10,8000F5200,8000F5200,US Dollar,US Dollar,Reinvestment,0


In [7]:
# Summary statistics for clean_df using DataFrame.describe() with default arguments.
# NOTE(review): this cell's execution count (7) is higher than that of the save
# cell below it (6), so the two were run out of order; re-run the notebook top
# to bottom to confirm the displayed output matches the saved file.
clean_df.describe()

Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,IsLaundering
count,5078345,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0
mean,2022-09-05 07:16:08.194274816,238268.0,212398.0,1948.13,2637.933,8.382732,8.413146,3.042442,4509273.0,5988726.0,0.001019427
min,2022-09-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,1e-06,0.0
25%,2022-09-02 04:32:00,104485.0,96034.0,33.0,540.0,4.0,4.0,3.0,184.48,183.37,0.0
50%,2022-09-05 12:16:00,204657.0,192834.0,596.0,836.0,10.0,10.0,3.0,1414.54,1411.01,0.0
75%,2022-09-08 03:13:00,362585.0,289931.0,971.0,6111.0,12.0,12.0,4.0,12297.84,12346.27,0.0
max,2022-09-18 16:18:00,515087.0,515087.0,30469.0,30464.0,14.0,14.0,6.0,1046302000000.0,1046302000000.0,1.0
std,,163330.2,144368.5,3564.369,3030.044,4.121243,4.120945,1.489543,869772800.0,1037183000.0,0.03191219


In [6]:
# Write the processed frame to a CSV in the current working directory.
# index=False keeps the DataFrame's row index out of the file.
# The path is a plain string literal: the original used an f-string with no
# placeholders, which adds nothing and is flagged by linters.
output_path = 'HI-Small_Trans_processed_w_original.csv'
clean_df.to_csv(output_path, index=False)