In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import numpy as np
from hmmlearn import hmm
import random
import warnings
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
warnings.filterwarnings('ignore')

In [46]:
# Input file: transactions after categorical encoding and field selection.
data_path = '../../data/HI-Small_Trans_processed.csv'
df = pd.read_csv(data_path)
print(df.shape)  # shape as loaded
# Remove rows that are identical in every column; surviving rows keep their
# original index labels because the index is not reset here.
df = df.drop_duplicates()
print(df.shape)  # shape after de-duplication
df.head()

(5078345, 11)
(5078336, 11)


Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,IsLaundering
0,2022-09-01 00:20:00,6530,6530,8,8,12,12,5,3697.34,3697.34,0
1,2022-09-01 00:20:00,358174,176809,109,0,12,12,3,0.01,0.01,0
2,2022-09-01 00:00:00,358476,358476,110,110,12,12,5,14675.57,14675.57,0
3,2022-09-01 00:02:00,74640,74640,10,10,12,12,5,2806.97,2806.97,0
4,2022-09-01 00:06:00,6538,6538,8,8,12,12,5,36682.97,36682.97,0


In [47]:
def group_by_accounts(df, col1, col2, other_columns):
    """Collect, for every account, the rows of ``df`` in which it takes part.

    An account takes part in a row when its id appears in ``col1`` or in
    ``col2`` of that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split; it is not modified.
    col1, col2 : str
        Names of the two account-id columns.
    other_columns : list of str
        Not needed to build the groups; accepted only so existing callers
        keep working.

    Returns
    -------
    dict
        Maps each account id (keys in ascending order, missing ids skipped)
        to a DataFrame holding that account's distinct rows with all columns
        of ``df`` and the original index labels. Inside each frame the rows
        where the account is in ``col1`` come first (in ``df`` order),
        followed by the remaining rows where it is in ``col2``. A row whose
        ``col1`` and ``col2`` are the same account appears once.
    """
    # Rows that are exact duplicates share both account ids, so they always
    # fall into the same groups in the same relative order. Dropping them
    # once here gives the same result as de-duplicating every group
    # separately, without paying that cost once per account.
    unique_rows = df.drop_duplicates()
    n_rows = len(unique_rows)

    # Long (account, row position) table built from the two account columns
    # only. Row positions are used instead of index labels so the function
    # does not depend on the index being unnamed or on the absence of a
    # column called 'index'.
    membership = pd.DataFrame({
        'Account': pd.concat([unique_rows[col1], unique_rows[col2]], ignore_index=True),
        'position': np.tile(np.arange(n_rows), 2),
    })

    # unique() keeps first-seen order, which yields the col1-then-col2 row
    # order described above and collapses rows listed under both columns.
    account_positions = membership.groupby('Account')['position'].unique()

    return {
        account: unique_rows.iloc[positions]
        for account, positions in account_positions.items()
    }

In [48]:
%%time

# Every column except the two account-id columns is handed over as `other_columns`.
other_columns = [col for col in df.columns if col not in ('FromAccount', 'ToAccount')]
grouped_data = group_by_accounts(
    df,
    col1='FromAccount',
    col2='ToAccount',
    other_columns=other_columns,
)

CPU times: total: 18min 45s
Wall time: 18min 43s


In [49]:
# Number of keys in grouped_data, i.e. distinct accounts seen in FromAccount or ToAccount.
print(len(grouped_data))

515088


In [50]:
# Show only the first (account, transactions) entry of grouped_data, then stop.
for account, account_df in grouped_data.items():
    print(account)
    print(account_df)
    break

0
                  Timestamp  FromAccount  ToAccount  FromBank  ToBank  \
212996  2022-09-01 00:22:00       320763          0      6940     598   

        ReceivingCurrency  PaymentCurrency  PaymentFormat  AmountPaid  \
212996                 13               13              3        0.01   

        AmountReceived  IsLaundering  
212996            0.01             0  


In [71]:
min_len = 35      # accounts with fewer rows than this are never split
test_ratio = 0.2  # share of a long enough account's rows held out as its test tail

import tqdm
X_train, X_test = [], []
for account, account_df in tqdm.tqdm(grouped_data.items()):
    n_rows = len(account_df)
    if n_rows >= min_len:
        # Leading rows go to train, the trailing int(n_rows * test_ratio) rows to test.
        n_train = n_rows - int(n_rows * test_ratio)
        X_train.append(account_df.iloc[:n_train])
        X_test.append(account_df.iloc[n_train:])
    else:
        # Too short to split: the whole group is used for training.
        X_train.append(account_df)

print(len(X_train))
print(len(X_test))


100%|██████████| 515088/515088 [00:12<00:00, 41458.39it/s] 

515088
92971





In [72]:
# Stack the per-account slices into one frame per split. Accounts are grouped on
# both FromAccount and ToAccount, so one transaction can be collected from two
# different groups; drop_duplicates() removes such repeated rows (compared by
# column values).
train_data_seq = pd.concat(X_train).drop_duplicates()
test_data_seq = pd.concat(X_test).drop_duplicates()

In [73]:
# Shapes of the two stacked frames. They are built independently of each other,
# so nothing here guarantees that they are disjoint.
print(train_data_seq.shape)
print(test_data_seq.shape)

(4948983, 11)
(1069155, 11)


In [75]:
# Look the test rows up in df by index label: a row is selected when it was
# placed in the held-out tail of at least one account.
test_data = df.loc[test_data_seq.index]
test_data.shape

(1069155, 11)

In [76]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,IsLaundering
2798149,2022-09-06 00:45:00,281020,2,19,598,13,13,3,1391409.78,1391409.78,0
3023096,2022-09-06 12:22:00,152746,2,81,598,13,13,3,1983640.26,1983640.26,0
3023098,2022-09-06 12:09:00,152746,2,81,598,13,13,4,1481597.58,1481597.58,0
3719466,2022-09-07 23:21:00,152746,2,81,598,13,13,3,1983640.26,1983640.26,0
3719468,2022-09-07 23:05:00,152746,2,81,598,13,13,4,1481597.58,1481597.58,0


In [77]:
# Training rows are every row of df whose index label is not in the test split.
# train_data_seq is not used here, so a row that is in one account's train
# slice and another account's test slice ends up in test only.
train_data = df.loc[~df.index.isin(test_data_seq.index)]
train_data.shape

(4009181, 11)

In [78]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,Timestamp,FromAccount,ToAccount,FromBank,ToBank,ReceivingCurrency,PaymentCurrency,PaymentFormat,AmountPaid,AmountReceived,IsLaundering
0,2022-09-01 00:20:00,6530,6530,8,8,12,12,5,3697.34,3697.34,0
1,2022-09-01 00:20:00,358174,176809,109,0,12,12,3,0.01,0.01,0
2,2022-09-01 00:00:00,358476,358476,110,110,12,12,5,14675.57,14675.57,0
3,2022-09-01 00:02:00,74640,74640,10,10,12,12,5,2806.97,2806.97,0
4,2022-09-01 00:06:00,6538,6538,8,8,12,12,5,36682.97,36682.97,0


In [79]:
# Count of each IsLaundering value in the training split.
train_data['IsLaundering'].value_counts()

IsLaundering
0    4005109
1       4072
Name: count, dtype: int64

In [80]:
# Count of each IsLaundering value in the test split.
test_data['IsLaundering'].value_counts()

IsLaundering
0    1068050
1       1105
Name: count, dtype: int64

In [81]:
# Write both splits as CSV using relative paths; index=False leaves the row labels out.
test_data.to_csv('HI-Small_Trans_processed_test.csv', index=False)
train_data.to_csv('HI-Small_Trans_processed_train.csv', index=False)