In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder, LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Read the raw transaction table into memory and preview its first two rows
# to confirm the column layout before any feature engineering.
df = pd.read_csv(filepath_or_buffer='data/HI-Small_Trans.csv')
df.head(n=2)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0


In [3]:
# Build a composite "<bank>_<account>" key for each side of a transaction.
# Columns are selected by label instead of by position so that an extra or
# reordered column in the CSV cannot silently pair the wrong fields.
# 'Account' follows 'From Bank' and 'Account.1' follows 'To Bank' in the
# loaded frame, so they are taken as the sending / receiving account columns.
# The bank code is cast to str because it is numeric; the account columns are
# concatenated as-is, exactly like the positional version did.
df['FromBankAcc'] = df['From Bank'].astype(str) + '_' + df['Account']
df['ToBankAcc'] = df['To Bank'].astype(str) + '_' + df['Account.1']

In [4]:
# # Ordinal Encoding
# encode_in_curr = OrdinalEncoder().fit(df['Receiving Currency'].to_numpy().reshape((-1,1))) # Receiving Currency
# encode_out_curr = OrdinalEncoder().fit(df['Payment Currency'].to_numpy().reshape((-1,1))) # Payment Currency
# encode_paym_format = OrdinalEncoder().fit(df['Payment Format'].to_numpy().reshape((-1,1))) # Payment Format

# # High-cardinality features (accounts, banks) - NOTE: the lines below use OrdinalEncoder, not TargetEncoder
# encode_from_acct = OrdinalEncoder().fit(df['FromBankAcc'].to_numpy().reshape((-1,1)))
# encode_to_acct = OrdinalEncoder().fit(df['ToBankAcc'].to_numpy().reshape((-1,1)))
# encode_from_bank = OrdinalEncoder().fit(df['From Bank'].to_numpy().reshape((-1,1)))
# encode_to_bank = OrdinalEncoder().fit(df['To Bank'].to_numpy().reshape((-1,1)))

In [5]:
# Nominal (label) encoding: each distinct value is mapped to an integer code.
def _fit_pooled_encoder(*column_names):
    """Fit one LabelEncoder on the values of several `df` columns stacked together.

    Pooling the columns gives both sides of a transaction (sender / receiver,
    paid / received) the same code for the same underlying value.
    """
    pooled_values = pd.concat([df[name] for name in column_names], ignore_index=True)
    return LabelEncoder().fit(pooled_values)


# One shared vocabulary per concept, covering both directions of the transfer.
encode_curr = _fit_pooled_encoder('Receiving Currency', 'Payment Currency')  # all currencies
encode_acct = _fit_pooled_encoder('FromBankAcc', 'ToBankAcc')  # all unique accounts
encode_bank = _fit_pooled_encoder('From Bank', 'To Bank')  # all unique bank codes

# Payment format lives in a single column, so it is fitted directly.
encode_paym_format = LabelEncoder().fit(df['Payment Format'])

In [6]:
# Assemble the cleaned table in a single constructor call; the dict's insertion
# order determines the column order. Accounts, banks, currencies and payment
# format are replaced by their integer codes, while the original currency
# strings are kept alongside the encoded versions.
clean_df = pd.DataFrame({
    # Encoded identifiers (shared encoder for sender and receiver sides)
    'FromAccount': encode_acct.transform(df['FromBankAcc']),
    'ToAccount': encode_acct.transform(df['ToBankAcc']),
    'FromBank': encode_bank.transform(df['From Bank']),
    'ToBank': encode_bank.transform(df['To Bank']),
    # Raw currency names, carried through unchanged
    'Receiving Currency': df['Receiving Currency'],
    'Payment Currency': df['Payment Currency'],
    # Encoded categoricals
    'ReceivingCurrency': encode_curr.transform(df['Receiving Currency']),
    'PaymentCurrency': encode_curr.transform(df['Payment Currency']),
    'PaymentFormat': encode_paym_format.transform(df['Payment Format']),
    # Timestamp strings parsed into datetime64 values
    'Timestamp': pd.to_datetime(df['Timestamp']),
    # Amounts and the target label, copied as-is
    'AmountPaid': df['Amount Paid'],
    'AmountReceived': df['Amount Received'],
    'IsLaundering': df['Is Laundering'],
})

clean_df.head()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,Receiving Currency,Payment Currency,ReceivingCurrency,PaymentCurrency,PaymentFormat,Timestamp,AmountPaid,AmountReceived,IsLaundering
0,6530,6530,8,8,US Dollar,US Dollar,12,12,5,2022-09-01 00:20:00,3697.34,3697.34,0
1,358174,176809,109,0,US Dollar,US Dollar,12,12,3,2022-09-01 00:20:00,0.01,0.01,0
2,358476,358476,110,110,US Dollar,US Dollar,12,12,5,2022-09-01 00:00:00,14675.57,14675.57,0
3,74640,74640,10,10,US Dollar,US Dollar,12,12,5,2022-09-01 00:02:00,2806.97,2806.97,0
4,6538,6538,8,8,US Dollar,US Dollar,12,12,5,2022-09-01 00:06:00,36682.97,36682.97,0


In [7]:
# Summary statistics for clean_df. With describe()'s default column selection
# the two raw string currency columns are left out of the result.
clean_df.describe()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,Timestamp,AmountPaid,AmountReceived,IsLaundering
count,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345,5078345.0,5078345.0,5078345.0
mean,238268.0,212398.0,1948.13,2637.933,8.382732,8.413146,3.042442,2022-09-05 07:16:08.194274816,4509273.0,5988726.0,0.001019427
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-09-01 00:00:00,1e-06,1e-06,0.0
25%,104485.0,96034.0,33.0,540.0,4.0,4.0,3.0,2022-09-02 04:32:00,184.48,183.37,0.0
50%,204657.0,192834.0,596.0,836.0,10.0,10.0,3.0,2022-09-05 12:16:00,1414.54,1411.01,0.0
75%,362585.0,289931.0,971.0,6111.0,12.0,12.0,4.0,2022-09-08 03:13:00,12297.84,12346.27,0.0
max,515087.0,515087.0,30469.0,30464.0,14.0,14.0,6.0,2022-09-18 16:18:00,1046302000000.0,1046302000000.0,1.0
std,163330.2,144368.5,3564.369,3030.044,4.121243,4.120945,1.489543,,869772800.0,1037183000.0,0.03191219


In [8]:
# # Normalisation Step (sample)
# scaler = StandardScaler()

# feature_df = clean_df.drop(columns=['Timestamp', 'IsLaundering'])

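# # scale only feature columns (NOTE: feature_df still contains the raw string columns 'Receiving Currency' / 'Payment Currency', which StandardScaler cannot scale - drop them before enabling this)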
# output_df = scaler.set_output(transform='pandas').fit_transform(feature_df)
# output_df.head(10)

In [9]:
# Persist the processed table for the model-exploration notebooks.
# The row index is not written. No scaling has been applied to this file in the
# active cells; the scaled variant below stays disabled along with the
# normalisation cell that would produce output_df.
clean_df.to_csv(path_or_buf='data/HI_Small_Trans_Standardised.csv', index=False)  # Standardised
# output_df.to_csv('data/HI_Small_Trans_Normalised.csv', index=False) # Standardised + Normalisation