In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder, LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Load the raw transactions CSV into `df` and preview the first two rows.
df = pd.read_csv('data/HI-Small_Trans.csv')
df.head(2)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0


In [3]:
# Build composite account keys of the form '<bank>_<account>' for the sending
# and receiving side of each transaction.
# Columns are selected by name rather than by position so the keys stay
# correct if the CSV column order changes or an index column is added.
df['FromBankAcc'] = df['From Bank'].astype(str) + '_' + df['Account']
df['ToBankAcc'] = df['To Bank'].astype(str) + '_' + df['Account.1']

In [4]:
# Summarise column dtypes and memory usage after adding the composite keys.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Timestamp           object 
 1   From Bank           int64  
 2   Account             object 
 3   To Bank             int64  
 4   Account.1           object 
 5   Amount Received     float64
 6   Receiving Currency  object 
 7   Amount Paid         float64
 8   Payment Currency    object 
 9   Payment Format      object 
 10  Is Laundering       int64  
 11  FromBankAcc         object 
 12  ToBankAcc           object 
dtypes: float64(2), int64(3), object(8)
memory usage: 503.7+ MB


In [5]:
# Remove the raw timestamp and the per-bank account ids, then summarise the
# remaining columns.
# NOTE(review): `df` is rebound here, and a later cell reads df['Timestamp'];
# that cell needs a frame that still contains the column.
df = df.drop(['Timestamp', 'Account', 'Account.1'], axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   From Bank           int64  
 1   To Bank             int64  
 2   Amount Received     float64
 3   Receiving Currency  object 
 4   Amount Paid         float64
 5   Payment Currency    object 
 6   Payment Format      object 
 7   Is Laundering       int64  
 8   FromBankAcc         object 
 9   ToBankAcc           object 
dtypes: float64(2), int64(3), object(5)
memory usage: 387.4+ MB


In [6]:
# Store every identifier / label column (including the target) with pandas
# 'category' dtype, then report the resulting dtypes and memory usage.
categorical_columns = [
    'From Bank', 'To Bank',
    'Receiving Currency', 'Payment Currency', 'Payment Format',
    'Is Laundering',
    'FromBankAcc', 'ToBankAcc',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 10 columns):
 #   Column              Dtype   
---  ------              -----   
 0   From Bank           category
 1   To Bank             category
 2   Amount Received     float64 
 3   Receiving Currency  category
 4   Amount Paid         float64 
 5   Payment Currency    category
 6   Payment Format      category
 7   Is Laundering       category
 8   FromBankAcc         category
 9   ToBankAcc           category
dtypes: category(8), float64(2)
memory usage: 196.1 MB


In [19]:
# Count missing values per column.
df.isna().sum()

From Bank             0
To Bank               0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
FromBankAcc           0
ToBankAcc             0
dtype: int64

In [47]:
# Display only the categorical columns that are candidates for encoding.
df.loc[:, ['From Bank', 'To Bank', 'Receiving Currency', 'Payment Currency', 'Payment Format', 'FromBankAcc', 'ToBankAcc']]

Unnamed: 0,From Bank,To Bank,Receiving Currency,Payment Currency,Payment Format,FromBankAcc,ToBankAcc
0,10,10,US Dollar,US Dollar,Reinvestment,10_8000EBD30,10_8000EBD30
1,3208,1,US Dollar,US Dollar,Cheque,3208_8000F4580,1_8000F5340
2,3209,3209,US Dollar,US Dollar,Reinvestment,3209_8000F4670,3209_8000F4670
3,12,12,US Dollar,US Dollar,Reinvestment,12_8000F5030,12_8000F5030
4,10,10,US Dollar,US Dollar,Reinvestment,10_8000F5200,10_8000F5200
...,...,...,...,...,...,...,...
5078340,54219,256398,Bitcoin,Bitcoin,Bitcoin,54219_8148A6631,256398_8148A8711
5078341,15,256398,Bitcoin,Bitcoin,Bitcoin,15_8148A8671,256398_8148A8711
5078342,154365,256398,Bitcoin,Bitcoin,Bitcoin,154365_8148A6771,256398_8148A8711
5078343,256398,256398,Bitcoin,Bitcoin,Bitcoin,256398_8148A6311,256398_8148A8711


In [7]:
# One-hot encode the categorical columns of the first 10,000 rows only.
onehot_columns = ['From Bank', 'To Bank', 'Receiving Currency', 'Payment Currency', 'Payment Format', 'FromBankAcc', 'ToBankAcc']
onehot_sample = df[onehot_columns][:10000]

ohe = OneHotEncoder()
encoded_cols = ohe.fit_transform(onehot_sample)

# Carry the sample's own index over to the encoded frame so that a later
# column-wise concat with other slices of `df` aligns rows by label instead of
# relying on both frames happening to share a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_cols.toarray(),  # densify the encoder output for the DataFrame constructor
    index=onehot_sample.index,
    columns=ohe.get_feature_names_out(onehot_columns),
)
encoded_df

Unnamed: 0,From Bank_1,From Bank_10,From Bank_12,From Bank_70,From Bank_220,From Bank_513,From Bank_701,From Bank_795,From Bank_908,From Bank_1047,...,ToBankAcc_908_8011171D0,ToBankAcc_908_80111D510,ToBankAcc_908_801168AF0,ToBankAcc_908_812F2F8A0,ToBankAcc_9129_805FF9500,ToBankAcc_9482_8067C8F80,ToBankAcc_9482_8070BB570,ToBankAcc_9482_807638B20,ToBankAcc_9482_80845BF30,ToBankAcc_9482_80FEA8C70
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Append the numeric amounts and the label of the same first 10,000 rows to
# the one-hot features, column-wise. Note that this rebinds `df`.
passthrough = df[['Amount Paid', 'Amount Received', 'Is Laundering']].iloc[:10000]
df = pd.concat([encoded_df, passthrough], axis=1)

In [2]:
# NOTE(review): pandas is imported again here; the execution count suggests
# this cell was run in a separate kernel session — confirm before removing.
import pandas as pd
# Load 'HI_Small_Trans_ordinal.csv' and summarise its columns.
dataset = pd.read_csv('data/HI_Small_Trans_ordinal.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   0                float64
 1   1                float64
 2   2                float64
 3   3                float64
 4   4                float64
 5   5                float64
 6   6                float64
 7   Amount Paid      float64
 8   Amount Received  float64
 9   Is Laundering    int64  
dtypes: float64(9), int64(1)
memory usage: 387.4 MB


In [3]:
# Number of rows flagged as laundering (label == 1).
int((dataset['Is Laundering'] == 1).sum())

5177

In [18]:
# Write the current `df` to CSV without the index column.
df.to_csv('data/HI_Small_Trans_onehot10000.csv', index=False)

In [21]:
# Read the saved one-hot CSV back into `df` and preview its first five rows.
df = pd.read_csv('data/HI_Small_Trans_onehot10000.csv')
df.head()

Unnamed: 0,From Bank_1,From Bank_10,From Bank_12,From Bank_70,From Bank_220,From Bank_513,From Bank_701,From Bank_795,From Bank_908,From Bank_1047,...,ToBankAcc_908_812F2F8A0,ToBankAcc_9129_805FF9500,ToBankAcc_9482_8067C8F80,ToBankAcc_9482_8070BB570,ToBankAcc_9482_807638B20,ToBankAcc_9482_80845BF30,ToBankAcc_9482_80FEA8C70,Amount Paid,Amount Received,Is Laundering
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3697.34,3697.34,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14675.57,14675.57,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2806.97,2806.97,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36682.97,36682.97,0


In [1]:
# Preview the first five rows of `df`.
# NOTE(review): the recorded output is a NameError, i.e. this cell was run in
# a kernel where `df` had not been defined yet; a cell that loads `df` must run first.
df[:5]

NameError: name 'df' is not defined

In [19]:
# Inspect the first row as a plain NumPy array.
df.iloc[0].to_numpy()

array([0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 3.69734e+03,
       3.69734e+03, 0.00000e+00])

In [4]:
# # Ordinal Encoding
# encode_in_curr = OrdinalEncoder().fit(df['Receiving Currency'].to_numpy().reshape((-1,1))) # Receiving Currency
# encode_out_curr = OrdinalEncoder().fit(df['Payment Currency'].to_numpy().reshape((-1,1))) # Payment Currency
# encode_paym_format = OrdinalEncoder().fit(df['Payment Format'].to_numpy().reshape((-1,1))) # Payment Format

# # High-cardinality features (NOTE: the lines below use OrdinalEncoder, not TargetEncoder)
# encode_from_acct = OrdinalEncoder().fit(df['FromBankAcc'].to_numpy().reshape((-1,1)))
# encode_to_acct = OrdinalEncoder().fit(df['ToBankAcc'].to_numpy().reshape((-1,1)))
# encode_from_bank = OrdinalEncoder().fit(df['From Bank'].to_numpy().reshape((-1,1)))
# encode_to_bank = OrdinalEncoder().fit(df['To Bank'].to_numpy().reshape((-1,1)))

In [5]:
# Fit integer label encoders. Columns that draw from the same set of values
# (both currency columns, both account columns, both bank columns) are stacked
# first so that each pair shares a single encoder and a single code space.
all_currencies = pd.concat([df['Receiving Currency'], df['Payment Currency']], ignore_index=True)
all_accounts = pd.concat([df['FromBankAcc'], df['ToBankAcc']], ignore_index=True)
all_banks = pd.concat([df['From Bank'], df['To Bank']], ignore_index=True)

encode_curr = LabelEncoder().fit(all_currencies)            # every currency
encode_paym_format = LabelEncoder().fit(df['Payment Format'])  # payment format
encode_acct = LabelEncoder().fit(all_accounts)              # every unique account key
encode_bank = LabelEncoder().fit(all_banks)                 # every unique bank code

In [6]:
# Assemble the cleaned frame: label-encoded categorical columns first, then the
# parsed timestamp, then the amounts and the target copied through unchanged.
clean_df = pd.DataFrame()

# target column -> (fitted encoder, source column in df)
encoded_sources = {
    'FromAccount': (encode_acct, 'FromBankAcc'),
    'ToAccount': (encode_acct, 'ToBankAcc'),
    'FromBank': (encode_bank, 'From Bank'),
    'ToBank': (encode_bank, 'To Bank'),
    'ReceivingCurrency': (encode_curr, 'Receiving Currency'),
    'PaymentCurrency': (encode_curr, 'Payment Currency'),
    'PaymentFormat': (encode_paym_format, 'Payment Format'),
}
for target, (encoder, source) in encoded_sources.items():
    clean_df[target] = encoder.transform(df[source])

clean_df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# target column -> source column copied as-is
copied_sources = {
    'AmountPaid': 'Amount Paid',
    'AmountReceived': 'Amount Received',
    'IsLaundering': 'Is Laundering',
}
for target, source in copied_sources.items():
    clean_df[target] = df[source]

clean_df.head()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,Timestamp,AmountPaid,AmountReceived,IsLaundering
0,6530,6530,8,8,12,12,5,2022-09-01 00:20:00,3697.34,3697.34,0
1,358174,176809,109,0,12,12,3,2022-09-01 00:20:00,0.01,0.01,0
2,358476,358476,110,110,12,12,5,2022-09-01 00:00:00,14675.57,14675.57,0
3,74640,74640,10,10,12,12,5,2022-09-01 00:02:00,2806.97,2806.97,0
4,6538,6538,8,8,12,12,5,2022-09-01 00:06:00,36682.97,36682.97,0


In [7]:
# Summary statistics for every column of the cleaned frame.
clean_df.describe()

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,Timestamp,AmountPaid,AmountReceived,IsLaundering
count,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0,5078345,5078345.0,5078345.0,5078345.0
mean,238268.0,212398.0,1948.13,2637.933,8.382732,8.413146,3.042442,2022-09-05 07:16:08.194274816,4509273.0,5988726.0,0.001019427
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-09-01 00:00:00,1e-06,1e-06,0.0
25%,104485.0,96034.0,33.0,540.0,4.0,4.0,3.0,2022-09-02 04:32:00,184.48,183.37,0.0
50%,204657.0,192834.0,596.0,836.0,10.0,10.0,3.0,2022-09-05 12:16:00,1414.54,1411.01,0.0
75%,362585.0,289931.0,971.0,6111.0,12.0,12.0,4.0,2022-09-08 03:13:00,12297.84,12346.27,0.0
max,515087.0,515087.0,30469.0,30464.0,14.0,14.0,6.0,2022-09-18 16:18:00,1046302000000.0,1046302000000.0,1.0
std,163330.2,144368.5,3564.369,3030.044,4.121243,4.120945,1.489543,,869772800.0,1037183000.0,0.03191219


In [None]:
# Unexecuted cell that creates a StandardScaler.
# NOTE(review): the following cell creates its own `scaler`, so this cell looks redundant.
scaler = StandardScaler()

In [8]:
# Scaling step (sample): apply StandardScaler to the feature columns only.
# The timestamp and the target are excluded from the scaled frame.
feature_df = clean_df.drop(columns=['Timestamp', 'IsLaundering'])

# set_output(transform='pandas') makes fit_transform return a DataFrame
scaler = StandardScaler().set_output(transform='pandas')
output_df = scaler.fit_transform(feature_df)
output_df.head(10)

Unnamed: 0,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived
0,-1.418831,-1.42599,-0.544312,-0.867952,0.877713,0.870396,1.3142,-0.00518,-0.00577
1,0.734133,-0.246515,-0.515976,-0.870592,0.877713,0.870396,-0.028494,-0.005184,-0.005774
2,0.735982,1.011842,-0.515696,-0.834289,0.877713,0.870396,1.3142,-0.005168,-0.00576
3,-1.001823,-0.954211,-0.543751,-0.867292,0.877713,0.870396,1.3142,-0.005181,-0.005771
4,-1.418782,-1.425934,-0.544312,-0.867952,0.877713,0.870396,1.3142,-0.005142,-0.005739
5,-0.376281,-0.246508,-0.546557,-0.870592,0.877713,0.870396,1.3142,-0.005177,-0.005768
6,-0.376397,-0.246639,-0.546557,-0.870592,0.877713,0.870396,1.3142,-0.005184,-0.005774
7,-0.376385,-0.246626,-0.546557,-0.870592,0.877713,0.870396,1.3142,-0.005184,-0.005774
8,-1.001854,0.398391,-0.543751,-0.84155,0.877713,0.870396,0.642853,-0.005184,-0.005774
9,-0.37636,-0.174401,-0.546557,1.26238,0.877713,0.870396,0.642853,-0.005184,-0.005774


In [9]:
# Save processed data for model exploration.
# NOTE: the file names do not match the contents. clean_df holds the
# label-encoded, unscaled columns (plus timestamp and target); output_df holds
# the StandardScaler output and excludes 'Timestamp' and 'IsLaundering'.
clean_df.to_csv('data/HI_Small_Trans_Standardised.csv', index=False) # label-encoded, not scaled
output_df.to_csv('data/HI_Small_Trans_Normalised.csv', index=False) # label-encoded + StandardScaler; no target column

In [3]:
# Load 'HI-Small_Trans_processed_w_original_test.csv' into `df_test`.
df_test = pd.read_csv("./data/HI-Small_Trans_processed_w_original_test.csv")

In [4]:
# Summarise columns, non-null counts and dtypes of the test file.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069155 entries, 0 to 1069154
Data columns (total 18 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Timestamp           1069155 non-null  object 
 1   FromAccount         1069155 non-null  int64  
 2   ToAccount           1069155 non-null  int64  
 3   FromBank            1069155 non-null  int64  
 4   ToBank              1069155 non-null  int64  
 5   ReceivingCurrency   1069155 non-null  int64  
 6   PaymentCurrency     1069155 non-null  int64  
 7   PaymentFormat       1069155 non-null  int64  
 8   AmountPaid          1069155 non-null  float64
 9   AmountReceived      1069155 non-null  float64
 10  From Bank           1069155 non-null  int64  
 11  To Bank             1069155 non-null  int64  
 12  Account             1069155 non-null  object 
 13  Account.1           1069155 non-null  object 
 14  Receiving Currency  1069155 non-null  object 
 15  Payment Currenc

In [5]:
# Preview the first three rows of the test file.
df_test.head(3)

Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,From Bank,To Bank,Account,Account.1,Receiving Currency,Payment Currency,Payment Format,Is Laundering
0,2022-09-06 00:45:00,281020,2,19,598,13,13,3,1391409.78,1391409.78,24,10057,803A6D150,803AA8E90,Yen,Yen,Cheque,0
1,2022-09-06 12:22:00,152746,2,81,598,13,13,3,1983640.26,1983640.26,1686,10057,800F2FE90,803AA8E90,Yen,Yen,Cheque,0
2,2022-09-06 12:09:00,152746,2,81,598,13,13,4,1481597.58,1481597.58,1686,10057,800F2FE90,803AA8E90,Yen,Yen,Credit Card,0


In [None]:
# Load the training file and keep only the columns used below.
# .copy() makes the selection an independent frame so the column assignments
# that follow do not write into a view of the loaded data.
df_train = pd.read_csv("./data/HI-Small_Trans_processed_w_original_train.csv")
df_train = df_train[[
    "FromBank", "Account", "ToBank", "Account.1", "AmountReceived",
    "ReceivingCurrency", "AmountPaid", "PaymentCurrency", "PaymentFormat",
    "Is Laundering",
]].copy()

# Composite '<bank>_<account>' keys for sender and receiver.
# NOTE(review): the test file contains both 'FromBank' and 'From Bank';
# confirm that 'FromBank'/'ToBank' are the intended bank columns for these keys.
df_train["FromBankAcc"] = df_train["FromBank"].astype(str) + '_' + df_train["Account"]
df_train["ToBankAcc"] = df_train["ToBank"].astype(str) + '_' + df_train["Account.1"]

# drop() returns a new frame, so the result has to be assigned back
# (the original call discarded it and left the columns in place).
df_train = df_train.drop(columns=["Account", "Account.1"])

# Cast the categorical columns of df_train, using df_train's own column names.
# (The original casts were applied to `df` with column names that df_train
# does not have.)
train_categorical_columns = [
    "FromBank", "ToBank",
    "ReceivingCurrency", "PaymentCurrency", "PaymentFormat",
    "Is Laundering",
    "FromBankAcc", "ToBankAcc",
]
for column in train_categorical_columns:
    df_train[column] = df_train[column].astype('category')

In [8]:
# Load 'HI_Small_Trans_ordinal_train_vae.csv' and show the element-wise null mask.
# NOTE(review): the variable is called df_test although the file name says
# "train" — confirm which split this is meant to be.
df_test = pd.read_csv("data/HI_Small_Trans_ordinal_train_vae.csv")
df_test.isna()

Unnamed: 0,0,1,2,3,4,5,6,AmountPaid,AmountReceived,Is Laundering
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
4009176,False,False,False,False,False,False,False,False,False,False
4009177,False,False,False,False,False,False,False,False,False,False
4009178,False,False,False,False,False,False,False,False,False,False
4009179,False,False,False,False,False,False,False,False,False,False


In [9]:
# Count missing values per column of the loaded file.
df_test.isna().sum()

0                 0
1                 0
2                 0
3                 0
4                 0
5                 0
6                 0
AmountPaid        0
AmountReceived    0
Is Laundering     0
dtype: int64

In [12]:
# List the column labels of the loaded file.
df_test.columns

Index(['0', '1', '2', '3', '4', '5', '6', 'AmountPaid', 'AmountReceived',
       'Is Laundering'],
      dtype='object')

In [11]:
# Summarise the training frame.
# NOTE(review): the recorded output is a NameError, i.e. `df_train` had not
# been created in that kernel session; the cell that builds df_train must run first.
df_train.info()

NameError: name 'df_train' is not defined