In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import joblib

In [28]:
# Load the raw HI-Small transaction table into memory.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# unchanged on a machine with the same directory layout.
df1 = pd.read_csv(
    filepath_or_buffer='/home/saif/Desktop/Anti-Money-Laundering-AML-/data/raw/HI-Small_Trans.csv',
)

In [29]:
# (rows, columns) of the freshly loaded frame
df1.shape

(5078345, 11)

In [30]:
# Peek at five random rows; the fixed random_state makes the same rows
# appear on every Restart & Run All instead of a different draw each time.
df1.sample(5, random_state=42)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
1252096,2022/09/02 03:41,137218,80DD91EA0,238845,80E841A00,22.43,Swiss Franc,22.43,Swiss Franc,Credit Card,0
394595,2022/09/01 02:40,12,800156670,1068,800837E10,342.61,US Dollar,342.61,US Dollar,Credit Card,0
3752875,2022/09/08 00:04,127593,80ADF43E0,27755,80B238850,107.51,Australian Dollar,107.51,Australian Dollar,Credit Card,0
780847,2022/09/01 14:14,11405,801D60CD0,11405,801D60CD0,16.3,US Dollar,16.3,US Dollar,Reinvestment,0
4995331,2022/09/10 13:59,1411,80227E700,17907,80C564EF0,410.18,US Dollar,410.18,US Dollar,Credit Card,0


In [31]:
# Column dtypes, index range and memory footprint of the raw frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Timestamp           object 
 1   From Bank           int64  
 2   Account             object 
 3   To Bank             int64  
 4   Account.1           object 
 5   Amount Received     float64
 6   Receiving Currency  object 
 7   Amount Paid         float64
 8   Payment Currency    object 
 9   Payment Format      object 
 10  Is Laundering       int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 426.2+ MB


In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df1.describe()

Unnamed: 0,From Bank,To Bank,Amount Received,Amount Paid,Is Laundering
count,5078345.0,5078345.0,5078345.0,5078345.0,5078345.0
mean,45730.57,65744.56,5988726.0,4509273.0,0.001019427
std,81765.62,84092.99,1037183000.0,869772800.0,0.03191219
min,1.0,1.0,1e-06,1e-06,0.0
25%,119.0,4259.0,183.37,184.48,0.0
50%,9679.0,21568.0,1411.01,1414.54,0.0
75%,28628.0,122332.0,12346.27,12297.84,0.0
max,356303.0,356294.0,1046302000000.0,1046302000000.0,1.0


In [33]:
# Missing-value count for every column
df1.isna().sum()

Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64

In [34]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded run reports 9 such rows and no visible cell removes
# them before the frame is saved; confirm whether they are genuine repeated
# transactions or should be dropped.
df1.duplicated().sum()

9

## Preprocessing

In [35]:
# Convert Timestamp from object (string) to datetime64.
# The raw values look like '2022/09/02 03:41', so the layout is passed explicitly:
# pandas then skips format inference, and any row that does not match the layout
# raises instead of being guessed at.
df1['Timestamp'] = pd.to_datetime(df1['Timestamp'], format='%Y/%m/%d %H:%M')
df1.dtypes

Timestamp             datetime64[ns]
From Bank                      int64
Account                       object
To Bank                        int64
Account.1                     object
Amount Received              float64
Receiving Currency            object
Amount Paid                  float64
Payment Currency              object
Payment Format                object
Is Laundering                  int64
dtype: object

In [36]:
# Account identifiers are hexadecimal strings; parse the sender ('Account') and
# receiver ('Account.1') columns as base-16 integers.
for account_col in ('Account', 'Account.1'):
    df1[account_col] = df1[account_col].apply(lambda hex_id: int(hex_id, 16))
df1.dtypes

Timestamp             datetime64[ns]
From Bank                      int64
Account                        int64
To Bank                        int64
Account.1                      int64
Amount Received              float64
Receiving Currency            object
Amount Paid                  float64
Payment Currency              object
Payment Format                object
Is Laundering                  int64
dtype: object

In [37]:
# Preview the first rows: both account columns should now display as decimal integers
df1.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022-09-01 00:20:00,10,34360704304,10,34360704304,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022-09-01 00:20:00,3208,34360739200,1,34360742720,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022-09-01 00:00:00,3209,34360739440,3209,34360739440,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022-09-01 00:02:00,12,34360741936,12,34360741936,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022-09-01 00:06:00,10,34360742400,10,34360742400,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [38]:
# Row count per receiving currency, most frequent first
df1['Receiving Currency'].value_counts()

Receiving Currency
US Dollar            1879341
Euro                 1172017
Swiss Franc           237884
Yuan                  206551
Shekel                194988
Rupee                 192065
UK Pound              181255
Ruble                 157361
Yen                   156319
Bitcoin               148151
Canadian Dollar       141357
Australian Dollar     138511
Mexican Peso          111030
Saudi Riyal            89971
Brazil Real            71544
Name: count, dtype: int64

In [39]:
# Label-encode the Receiving Currency column (Payment Currency gets its own
# encoder in a separate cell).
# Guard: once the column already holds integer codes, re-running this cell would
# re-fit the encoder on those codes and discard the currency-name -> label mapping
# kept in recv_le.classes_, so the fit is skipped in that case.
if not pd.api.types.is_integer_dtype(df1['Receiving Currency']):
    recv_le = LabelEncoder()
    df1['Receiving Currency'] = recv_le.fit_transform(df1['Receiving Currency'])


In [40]:
# Receiving Currency should now show integer codes; Payment Currency is still text here
df1.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022-09-01 00:20:00,10,34360704304,10,34360704304,3697.34,12,3697.34,US Dollar,Reinvestment,0
1,2022-09-01 00:20:00,3208,34360739200,1,34360742720,0.01,12,0.01,US Dollar,Cheque,0
2,2022-09-01 00:00:00,3209,34360739440,3209,34360739440,14675.57,12,14675.57,US Dollar,Reinvestment,0
3,2022-09-01 00:02:00,12,34360741936,12,34360741936,2806.97,12,2806.97,US Dollar,Reinvestment,0
4,2022-09-01 00:06:00,10,34360742400,10,34360742400,36682.97,12,36682.97,US Dollar,Reinvestment,0


In [41]:
# Label-encode the Payment Currency column with its own encoder.
# Guard: once the column already holds integer codes, re-running this cell would
# re-fit the encoder on those codes and discard the currency-name -> label mapping
# kept in pay_le.classes_, so the fit is skipped in that case.
if not pd.api.types.is_integer_dtype(df1['Payment Currency']):
    pay_le = LabelEncoder()
    df1['Payment Currency'] = pay_le.fit_transform(df1['Payment Currency'])
df1.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022-09-01 00:20:00,10,34360704304,10,34360704304,3697.34,12,3697.34,12,Reinvestment,0
1,2022-09-01 00:20:00,3208,34360739200,1,34360742720,0.01,12,0.01,12,Cheque,0
2,2022-09-01 00:00:00,3209,34360739440,3209,34360739440,14675.57,12,14675.57,12,Reinvestment,0
3,2022-09-01 00:02:00,12,34360741936,12,34360741936,2806.97,12,2806.97,12,Reinvestment,0
4,2022-09-01 00:06:00,10,34360742400,10,34360742400,36682.97,12,36682.97,12,Reinvestment,0


In [42]:
# Persist both fitted currency encoders so the same label mapping can be reused at inference time.
# NOTE(review): absolute, machine-specific paths; the target directory must already exist.
joblib.dump(
    value=recv_le,
    filename='/home/saif/Desktop/Anti-Money-Laundering-AML-/models/encoders_for_currency/recv_le.pkl',
)
joblib.dump(
    value=pay_le,
    filename='/home/saif/Desktop/Anti-Money-Laundering-AML-/models/encoders_for_currency/pay_le.pkl',
)

['/home/saif/Desktop/Anti-Money-Laundering-AML-/models/encoders_for_currency/pay_le.pkl']

In [43]:
# Check which currency got which label: a currency's position in this array is
# its integer code (e.g. 'US Dollar' sits at index 12, the code seen in df1.head()).
recv_le.classes_

array(['Australian Dollar', 'Bitcoin', 'Brazil Real', 'Canadian Dollar',
       'Euro', 'Mexican Peso', 'Ruble', 'Rupee', 'Saudi Riyal', 'Shekel',
       'Swiss Franc', 'UK Pound', 'US Dollar', 'Yen', 'Yuan'],
      dtype=object)

In [44]:
# Same check for the payment-currency encoder (label = position in the array)
pay_le.classes_

array(['Australian Dollar', 'Bitcoin', 'Brazil Real', 'Canadian Dollar',
       'Euro', 'Mexican Peso', 'Ruble', 'Rupee', 'Saudi Riyal', 'Shekel',
       'Swiss Franc', 'UK Pound', 'US Dollar', 'Yen', 'Yuan'],
      dtype=object)

In [46]:
# Preview the frame with both currency columns encoded; Payment Format is still text here
df1.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022-09-01 00:20:00,10,34360704304,10,34360704304,3697.34,12,3697.34,12,Reinvestment,0
1,2022-09-01 00:20:00,3208,34360739200,1,34360742720,0.01,12,0.01,12,Cheque,0
2,2022-09-01 00:00:00,3209,34360739440,3209,34360739440,14675.57,12,14675.57,12,Reinvestment,0
3,2022-09-01 00:02:00,12,34360741936,12,34360741936,2806.97,12,2806.97,12,Reinvestment,0
4,2022-09-01 00:06:00,10,34360742400,10,34360742400,36682.97,12,36682.97,12,Reinvestment,0


In [47]:
# Row count per payment format, most frequent first
df1['Payment Format'].value_counts()

Payment Format
Cheque          1864331
Credit Card     1323324
ACH              600797
Cash             490891
Reinvestment     481056
Wire             171855
Bitcoin          146091
Name: count, dtype: int64

In [48]:
# Label-encode the Payment Format column.
# Guard: once the column already holds integer codes, re-running this cell would
# re-fit the encoder on those codes and discard the format-name -> label mapping
# kept in pay_format_le.classes_, so the fit is skipped in that case.
if not pd.api.types.is_integer_dtype(df1['Payment Format']):
    pay_format_le = LabelEncoder()
    df1['Payment Format'] = pay_format_le.fit_transform(df1['Payment Format'])
df1.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022-09-01 00:20:00,10,34360704304,10,34360704304,3697.34,12,3697.34,12,5,0
1,2022-09-01 00:20:00,3208,34360739200,1,34360742720,0.01,12,0.01,12,3,0
2,2022-09-01 00:00:00,3209,34360739440,3209,34360739440,14675.57,12,14675.57,12,5,0
3,2022-09-01 00:02:00,12,34360741936,12,34360741936,2806.97,12,2806.97,12,5,0
4,2022-09-01 00:06:00,10,34360742400,10,34360742400,36682.97,12,36682.97,12,5,0


In [49]:
# Payment-format label mapping: a value's position in this array is its integer code
pay_format_le.classes_

array(['ACH', 'Bitcoin', 'Cash', 'Cheque', 'Credit Card', 'Reinvestment',
       'Wire'], dtype=object)

In [50]:
# Persist the fitted payment-format encoder for reuse at inference time.
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
joblib.dump(
    value=pay_format_le,
    filename='/home/saif/Desktop/Anti-Money-Laundering-AML-/models/encoders_for_currency/pay_format_le.pkl',
)

['/home/saif/Desktop/Anti-Money-Laundering-AML-/models/encoders_for_currency/pay_format_le.pkl']

In [52]:
# Write the preprocessed frame to data/processed without the row index.
# NOTE(review): CSV does not store dtypes, so a reader will presumably need to
# re-parse Timestamp as a datetime when loading this file; confirm downstream.
df1.to_csv(
    path_or_buf='/home/saif/Desktop/Anti-Money-Laundering-AML-/data/processed/trans_processed_data.csv',
    index=False,
)