# Anti Money Laundering Analysis
Dataset taken from [Kaggle](https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml?select=HI-Small_Trans.csv) and contains 5M records with a high volume of fraudulent activity

In [160]:
import torch
import pandas as pd
import seaborn as sns
import numpy as np 
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
from typing import Callable, Optional
from sklearn import preprocessing
from torch_geometric.data import (
    Data,
    InMemoryDataset
)
import torch_geometric.transforms as T
from torch_geometric.loader import NeighborLoader

In [57]:
df = pd.read_csv("/Users/pavemakouski/Documents/RP_records/Code/datasets/HI-Small_Trans.csv")

In [58]:
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [12]:
#Finding the number of banks (nodes)
unique_to_banks = df['To Bank'].nunique()
unique_from_banks = df['From Bank'].nunique()
print(f"The number of sender is {unique_from_banks}, and the number of recepient banks is {unique_to_banks}")
#Making sure we have all the banks (unique) in one list
unique_banks = list(set(df['To Bank'].to_list() + df['From Bank'].to_list()))


The number of sender is 30470, and the number of recepients is 15811
30470


In [16]:
#Performing analagous operation for the individual accounts
#Accounts will serve as nodes in our graph analysis
unique_accounts = list(set(df['Account'].to_list() + df['Account.1'].to_list()))
print(len(unique_accounts))

515080


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Timestamp           object 
 1   From Bank           int64  
 2   Account             object 
 3   To Bank             int64  
 4   Account.1           object 
 5   Amount Received     float64
 6   Receiving Currency  object 
 7   Amount Paid         float64
 8   Payment Currency    object 
 9   Payment Format      object 
 10  Is Laundering       int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 426.2+ MB


In [14]:
df.apply(lambda x: x.isna().sum())

Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64

In [27]:
payment_format_counts = df['Payment Format'].value_counts().reset_index()

payment_format_counts.columns = ['Payment Format', 'Number of Rows']

# Plotting the bar chart using Plotly
fig = px.bar(payment_format_counts, 
             x='Payment Format', 
             y='Number of Rows', 
             title='Number of Rows by Payment Format',
             labels={'Number of Rows': 'Number of Records', 'Payment Format': 'Payment Format'},
             color_discrete_sequence=['lightblue'])

fig.show()

In [19]:
#Checking if the amount received is equivalent to the amount paid 
filter_df = df[df['Amount Paid'] != df['Amount Received']]
print(filter_df.shape[0])
#This difference may be due to the currency and transaction fees between countries
currency_df = df[df['Payment Currency'] != df['Receiving Currency']]
print(currency_df.shape[0])

72158
72170


In [33]:
#Looking at the proportion of target labels (money laundering) within the data

df.groupby("Is Laundering")['Timestamp'].count()

#A very unbalanced sample

Is Laundering
0    5073168
1       5177
Name: Timestamp, dtype: int64

## Feature Generation

In [60]:
#Defining pre-processing functions
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


#Convering string values into numberical representations for graph 
def encode_labels(df, columns):
    le = LabelEncoder()  
    for i in columns:                  
        df[i] = le.fit_transform(df[i].astype(str))  
    return df

#Normalizing time (converting to time elapsed)

def normal_time(df, time_column):
    df[time_column] = pd.to_datetime(df[time_column])
    df[time_column] = df[time_column].apply(lambda x: x.value)
    scaler = MinMaxScaler()
    df[time_column] = scaler.fit_transform(df[[time_column]])
    return df

#Creating unique account ids
def unique_account(col1, col2):
    return col1 + "_" + col2
    
    
    
def pre_process_df(df):
    df = encode_labels(df, ['Receiving Currency', 'Payment Currency', 'Payment Format'])
    df = normal_time(df, 'Timestamp')
    df['account_id'] = df.apply(lambda row: unique_account(str(row['From Bank']), row['Account']), axis=1)
    df['r_account_id'] = df.apply(lambda row: unique_account(str(row['To Bank']), row['Account.1']), axis=1)
    df =df.sort_values(by = ['account_id'])
    return df

trans_df = pre_process_df(df)



In [66]:
trans_df.head(10)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,account_id,r_account_id
4278714,0.45632,10057,803A115E0,29467,803E020C0,787197.11,13,787197.11,13,3,0,10057_803A115E0,29467_803E020C0
2798190,0.285018,10057,803A115E0,29467,803E020C0,787197.11,13,787197.11,13,3,0,10057_803A115E0,29467_803E020C0
2798191,0.284233,10057,803A115E0,29467,803E020C0,681262.19,13,681262.19,13,4,0,10057_803A115E0,29467_803E020C0
3918769,0.417079,10057,803A115E0,29467,803E020C0,681262.19,13,681262.19,13,4,0,10057_803A115E0,29467_803E020C0
213094,0.000746,10057,803A115E0,10057,803A115E0,146954.27,13,146954.27,13,5,0,10057_803A115E0,10057_803A115E0
3918768,0.416608,10057,803A115E0,29467,803E020C0,787197.11,13,787197.11,13,3,0,10057_803A115E0,29467_803E020C0
3437163,0.360162,10057,803A115E0,29467,803E020C0,787197.11,13,787197.11,13,3,0,10057_803A115E0,29467_803E020C0
3437164,0.360083,10057,803A115E0,29467,803E020C0,681262.19,13,681262.19,13,4,0,10057_803A115E0,29467_803E020C0
2570346,0.259211,10057,803A115E0,29467,803E020C0,787197.11,13,787197.11,13,3,0,10057_803A115E0,29467_803E020C0
2570347,0.258976,10057,803A115E0,29467,803E020C0,681262.19,13,681262.19,13,4,0,10057_803A115E0,29467_803E020C0


In [65]:
#Making sure the time variable has been transformed correctly 

min_value = trans_df['Timestamp'].min()
max_value = trans_df['Timestamp'].max()
median_value = trans_df['Timestamp'].median()

print(f"Min: {min_value}, Max: {max_value}, Median: {median_value}")


Min: 0.0, Max: 1.0, Median: 0.25516537041380616


In [90]:
#Creating separate dataframes for senders and receivers
sending_df = trans_df[['account_id', 'Payment Currency', 'Amount Paid', 'Payment Format']]
receiving_df = trans_df[['r_account_id', 'Receiving Currency', 'Amount Received', 'Payment Format']].rename({"r_account_id": "account_id"}, axis = 1)

In [89]:
sending_df.head()

Unnamed: 0,account_id,Payment Currency,Amount Paid,Payment Format
4278714,10057_803A115E0,13,787197.11,3
2798190,10057_803A115E0,13,787197.11,3
2798191,10057_803A115E0,13,681262.19,4
3918769,10057_803A115E0,13,681262.19,4
213094,10057_803A115E0,13,146954.27,5


In [91]:
receiving_df.head()

Unnamed: 0,account_id,Receiving Currency,Amount Received,Payment Format
4278714,29467_803E020C0,13,787197.11,3
2798190,29467_803E020C0,13,787197.11,3
2798191,29467_803E020C0,13,681262.19,4
3918769,29467_803E020C0,13,681262.19,4
213094,10057_803A115E0,13,146954.27,5


In [76]:
#Checking that currency labelling was sucessful 

print(receiving_df['Receiving Currency'].unique())
print(sending_df['Payment Currency'].unique())

[13  1 11  0  2 10  4 12  9 14  7  6  5  8  3]
[13  1 11  0  2 10  4 12  9 14  7  6  5  8  3]


## Creating node features

In [123]:
#Creating a basic list of nodes
def get_node_df(df):
        #Clean accounts first
        clean_df = df[df['Is Laundering'] == 0]
        clean_accounts = list(set(clean_df['account_id'].to_list() + clean_df['r_account_id'].to_list()))
        clean_nodes_df = pd.DataFrame({'account_id': clean_accounts, 'Is Laundering': 0})
        
        #Dirty accounts
        dirty_df = df[df['Is Laundering'] == 1]
        dirty_accounts = list(set(dirty_df['account_id'].to_list() + dirty_df['r_account_id'].to_list()))
        dirty_nodes_df =  pd.DataFrame({'account_id': dirty_accounts, 'Is Laundering': 1})
        
        combined_df = pd.concat([clean_nodes_df, dirty_nodes_df])
        nodes_df = combined_df.drop_duplicates(subset='account_id', keep='last')
        nodes_df = nodes_df.sort_values(by = ['account_id']).reset_index()
        return nodes_df


node_df = get_node_df(trans_df)
node_df.head()

Unnamed: 0,index,account_id,Is Laundering
0,158868,10057_801FB1090,0
1,50742,10057_803A115E0,0
2,312916,10057_803AA8E90,0
3,62957,10057_803AAB430,0
4,409251,10057_803AACE20,0


In [145]:
#Generating additional node features 
# - most used payment currency
# - frequency of using different payment methods (payment and receiving)
# - average amount sent in each currency
# - most used receiving currency 
# - average amount received in each currency 


def generate_features(mini_df, node_df, column_name):
    '''
    mini_df -- refers to a pd.df object containing either payment or receiving account data 
    node_df -- refers to a pd.df object with a list of all accounts and their status (whether or not they have been involved in laundering)
    column_name (str) -- either 'Payment' or 'Receiving' to indicate which dataframe you are drawing data from
    '''
    #I checked that each df contains all currencies
    if column_name == "Payment":
        group_column = 'Amount Paid'
    elif column_name == "Receiving":
        group_column = 'Amount Received'
    else:
        return "You are entering an invalid column name"
        
    currency_list = mini_df[f'{column_name} Currency'].unique()
    currency_list = sorted([int(x) for x in currency_list])
    #Getting data on payment methods
    pm_df = (mini_df.groupby(['account_id', 'Payment Format'])['account_id']
             .count()
             .reset_index(name = 'format_count')
             .pivot(index = "account_id", columns = 'Payment Format', values = 'format_count')
             .reset_index()
             .fillna(0))
    #Renaming the columns 
    pm_df.columns = [f'{column_name}_{col}' if i != 0 else col for i, col in enumerate(pm_df.columns)]
    #Calculating average amount for each currency and  number of transactions in each
    pf_df = (mini_df.groupby(['account_id', f'{column_name} Currency'])['Payment Format']
    .count()
    .reset_index(name = f'currency_count'))
    pf_df = pf_df.sort_values(by = [f'currency_count'], ascending = False)
    #NAs in top df occur if an account was not used for payment or did not receive any money
    top_df = pf_df.loc[pf_df.groupby('account_id')[f'currency_count'].idxmax()].reset_index(drop=True)
    top_df = (top_df[['account_id', f'{column_name} Currency']].
        rename({f'{column_name} Currency': f"{column_name}_currency_top"}, axis =1))
    #Manipulating currencies
    for currency in currency_list:
        currency_df = mini_df[mini_df[f"{column_name} Currency"] == currency]
        currency_df = currency_df[['account_id', group_column]].groupby('account_id').mean(group_column)
        node_df = node_df.merge(currency_df, how='left', on = 'account_id')
        node_df.rename(columns = {node_df.columns[-1]: f'avg_{currency}_{column_name}'}, inplace = True)
    node_df = node_df.fillna(0)
    node_df = node_df.merge(top_df, how = "left", on = "account_id")
    node_df = node_df.merge(pm_df, how = "left", on = "account_id")
    return node_df


payment_df = generate_features(sending_df, node_df, 'Payment')
received_df = generate_features(receiving_df, node_df, 'Receiving').drop(columns = ['Is Laundering', 'index'])
#Putting all the features together into one dataframe
full_df = payment_df.merge(received_df, how = "left", on = 'account_id')
full_df.head(10)


Unnamed: 0,index,account_id,Is Laundering,avg_0_Payment,avg_1_Payment,avg_2_Payment,avg_3_Payment,avg_4_Payment,avg_5_Payment,avg_6_Payment,...,avg_13_Receiving,avg_14_Receiving,Receiving_currency_top,Receiving_0,Receiving_1,Receiving_2,Receiving_3,Receiving_4,Receiving_5,Receiving_6
0,158868,10057_801FB1090,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,13.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50742,10057_803A115E0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,146954.3,0.0,13.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,312916,10057_803AA8E90,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1590448.0,0.0,13.0,0.0,0.0,0.0,17.0,7.0,0.0,0.0
3,62957,10057_803AAB430,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79150.11,0.0,13.0,0.0,0.0,7.0,8.0,0.0,1.0,0.0
4,409251,10057_803AACE20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3038217.0,0.0,13.0,0.0,0.0,0.0,8.0,8.0,2.0,0.0
5,25486,10057_803AB4F70,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,829162.6,0.0,13.0,0.0,0.0,0.0,8.0,2.0,1.0,0.0
6,38325,10057_803AB6210,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,45436.43,0.0,13.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0
7,236809,10057_803AB6EC0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7665845.0,0.0,13.0,0.0,0.0,0.0,3.0,5.0,2.0,0.0
8,256244,10057_803AB8EB0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28781.08,0.0,13.0,0.0,0.0,12.0,12.0,10.0,0.0,0.0
9,90479,10057_803AB8F00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,972771000.0,0.0,13.0,2.0,0.0,9.0,9.0,2.0,2.0,2.0


In [150]:
#This code creates node tensors
def create_node_tensors(main_df):
    label_tensor = torch.from_numpy(main_df['Is Laundering'].values).to(torch.float)
    main_df['Bank'] = main_df['account_id'].str.extract(r'(\d{5})_')
    main_df = main_df.drop(['account_id', 'Is Laundering'], axis=1)
    main_df = encode_labels(main_df,['Bank'])
    return main_df, label_tensor

node_attr_df, label_tensor = create_node_tensors(full_df)
print(node_attr_df)


         index  avg_0_Payment  avg_1_Payment  avg_2_Payment  avg_3_Payment  \
0       158868            0.0            0.0            0.0            0.0   
1        50742            0.0            0.0            0.0            0.0   
2       312916            0.0            0.0            0.0            0.0   
3        62957            0.0            0.0            0.0            0.0   
4       409251            0.0            0.0            0.0            0.0   
...        ...            ...            ...            ...            ...   
515083  318246            0.0            0.0            0.0            0.0   
515084  383863            0.0            0.0            0.0            0.0   
515085  151690            0.0            0.0            0.0            0.0   
515086   33884            0.0            0.0            0.0            0.0   
515087  215974            0.0            0.0            0.0            0.0   

        avg_4_Payment  avg_5_Payment  avg_6_Payment  avg_7_Paym

## Creating Edge features

Each edge in the graph is a transaction. The following features will be used: time of the transaction, amount, currency, and format


In [154]:
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,account_id,r_account_id,From,To
0,0.000786,10,8000EBD30,10,8000EBD30,3697.34,12,3697.34,12,5,0,10_8000EBD30,10_8000EBD30,,
1,0.000786,3208,8000F4580,1,8000F5340,0.01,12,0.01,12,3,0,3208_8000F4580,1_8000F5340,,
2,0.0,3209,8000F4670,3209,8000F4670,14675.57,12,14675.57,12,5,0,3209_8000F4670,3209_8000F4670,,
3,7.9e-05,12,8000F5030,12,8000F5030,2806.97,12,2806.97,12,5,0,12_8000F5030,12_8000F5030,,
4,0.000236,10,8000F5200,10,8000F5200,36682.97,12,36682.97,12,5,0,10_8000F5200,10_8000F5200,,


In [155]:
def create_edge_attr(node_df, df):
        node_df = node_df.reset_index(drop=True)
        node_df['ID'] = node_df.index
        mapping_dict = dict(zip(node_df['account_id'], node_df['ID']))
        df['From'] = df['account_id'].map(mapping_dict)
        df['To'] = df['r_account_id'].map(mapping_dict)
        df = df.drop(['Account', 'Account.1', 'From Bank', 'To Bank', 'account_id', 'r_account_id'], axis=1)
        edge_index = torch.stack([torch.from_numpy(df['From'].values), torch.from_numpy(df['To'].values)], dim=0)
        df = df.drop(['Is Laundering', 'From', 'To'], axis=1)
        edge_attr = df
        return edge_attr, edge_index

edge_attr, edge_index = create_edge_attr(node_df, df)
print(edge_attr.head())


   Timestamp  Amount Received  Receiving Currency  Amount Paid  \
0   0.000786          3697.34                  12      3697.34   
1   0.000786             0.01                  12         0.01   
2   0.000000         14675.57                  12     14675.57   
3   0.000079          2806.97                  12      2806.97   
4   0.000236         36682.97                  12     36682.97   

   Payment Currency  Payment Format  
0                12               5  
1                12               3  
2                12               5  
3                12               5  
4                12               5  


## Building the network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv, Linear


#This class defines the layers of the function (Two GAT layers followed by a linear one and sigmoid output)
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * heads, int(hidden_channels/4), heads=1, concat=False, dropout=0.6)
        self.lin = Linear(int(hidden_channels/4), out_channels)
        self.sigmoid = nn.Sigmoid()

#The forward steps of the neural network
    def forward(self, x, edge_index, edge_attr):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index, edge_attr))
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv2(x, edge_index, edge_attr))
        x = self.lin(x)
        x = self.sigmoid(x)
        
        return x

### Defining training functions 

In [161]:
class AMLtoGraph(InMemoryDataset):

    def __init__(self, root: str, edge_window_size: int = 10,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        self.edge_window_size = edge_window_size
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> str:
        return '/Users/pavemakouski/Documents/RP_records/Code/datasets/HI-Small_Trans.csv'

    @property
    def processed_file_names(self) -> str:
        return '/Users/pavemakouski/Documents/RP_records/Code/datasets/data.pt'

    @property
    def num_nodes(self) -> int:
        return self._data.edge_index.max().item() + 1

    def encode_labels(df, columns):
        le = LabelEncoder()  
        for i in columns:                  
            df[i] = le.fit_transform(df[i].astype(str))  
        return df
    '''
    Inserting all previous functions into classes
    '''
    #Normalizing time (converting to time elapsed)

    def normal_time(df, time_column):
        df[time_column] = pd.to_datetime(df[time_column])
        df[time_column] = df[time_column].apply(lambda x: x.value)
        scaler = MinMaxScaler()
        df[time_column] = scaler.fit_transform(df[[time_column]])
        return df

    #Creating unique account ids
    def unique_account(col1, col2):
        return col1 + "_" + col2
    
    
    
    def pre_process_df(df):
        df = encode_labels(df, ['Receiving Currency', 'Payment Currency', 'Payment Format'])
        df = normal_time(df, 'Timestamp')
        df['account_id'] = df.apply(lambda row: unique_account(str(row['From Bank']), row['Account']), axis=1)
        df['r_account_id'] = df.apply(lambda row: unique_account(str(row['To Bank']), row['Account.1']), axis=1)
        df =df.sort_values(by = ['account_id'])
        return df

    def get_node_df(df):
        #Clean accounts first
        clean_df = df[df['Is Laundering'] == 0]
        clean_accounts = list(set(clean_df['account_id'].to_list() + clean_df['r_account_id'].to_list()))
        clean_nodes_df = pd.DataFrame({'account_id': clean_accounts, 'Is Laundering': 0})
        
        #Dirty accounts
        dirty_df = df[df['Is Laundering'] == 1]
        dirty_accounts = list(set(dirty_df['account_id'].to_list() + dirty_df['r_account_id'].to_list()))
        dirty_nodes_df =  pd.DataFrame({'account_id': dirty_accounts, 'Is Laundering': 1})
        
        combined_df = pd.concat([clean_nodes_df, dirty_nodes_df])
        nodes_df = combined_df.drop_duplicates(subset='account_id', keep='last')
        nodes_df = nodes_df.sort_values(by = ['account_id']).reset_index()
        return nodes_df
    
    def generate_features(mini_df, node_df, column_name):
        '''
        mini_df -- refers to a pd.df object containing either payment or receiving account data 
        node_df -- refers to a pd.df object with a list of all accounts and their status (whether or not they have been involved in laundering)
        column_name (str) -- either 'Payment' or 'Receiving' to indicate which dataframe you are drawing data from
        '''
        #I checked that each df contains all currencies
        if column_name == "Payment":
            group_column = 'Amount Paid'
        elif column_name == "Receiving":
            group_column = 'Amount Received'
        else:
            return "You are entering an invalid column name"
            
        currency_list = mini_df[f'{column_name} Currency'].unique()
        currency_list = sorted([int(x) for x in currency_list])
        #Getting data on payment methods
        pm_df = (mini_df.groupby(['account_id', 'Payment Format'])['account_id']
                .count()
                .reset_index(name = 'format_count')
                .pivot(index = "account_id", columns = 'Payment Format', values = 'format_count')
                .reset_index()
                .fillna(0))
        #Renaming the columns 
        pm_df.columns = [f'{column_name}_{col}' if i != 0 else col for i, col in enumerate(pm_df.columns)]
        #Calculating average amount for each currency and  number of transactions in each
        pf_df = (mini_df.groupby(['account_id', f'{column_name} Currency'])['Payment Format']
        .count()
        .reset_index(name = f'currency_count'))
        pf_df = pf_df.sort_values(by = [f'currency_count'], ascending = False)
        #NAs in top df occur if an account was not used for payment or did not receive any money
        top_df = pf_df.loc[pf_df.groupby('account_id')[f'currency_count'].idxmax()].reset_index(drop=True)
        top_df = (top_df[['account_id', f'{column_name} Currency']].
            rename({f'{column_name} Currency': f"{column_name}_currency_top"}, axis =1))
        #Manipulating currencies
        for currency in currency_list:
            currency_df = mini_df[mini_df[f"{column_name} Currency"] == currency]
            currency_df = currency_df[['account_id', group_column]].groupby('account_id').mean(group_column)
            node_df = node_df.merge(currency_df, how='left', on = 'account_id')
            node_df.rename(columns = {node_df.columns[-1]: f'avg_{currency}_{column_name}'}, inplace = True)
        node_df = node_df.fillna(0)
        node_df = node_df.merge(top_df, how = "left", on = "account_id")
        node_df = node_df.merge(pm_df, how = "left", on = "account_id")
        return node_df
    
    def create_node_tensors(main_df):
        label_tensor = torch.from_numpy(main_df['Is Laundering'].values).to(torch.float)
        main_df['Bank'] = main_df['account_id'].str.extract(r'(\d{5})_')
        main_df = main_df.drop(['account_id', 'Is Laundering'], axis=1)
        main_df = encode_labels(main_df,['Bank'])
        return main_df, label_tensor

    def create_edge_attr(node_df, df):
        node_df = node_df.reset_index(drop=True)
        node_df['ID'] = node_df.index
        mapping_dict = dict(zip(node_df['account_id'], node_df['ID']))
        df['From'] = df['account_id'].map(mapping_dict)
        df['To'] = df['r_account_id'].map(mapping_dict)
        df = df.drop(['Account', 'Account.1', 'From Bank', 'To Bank', 'account_id', 'r_account_id'], axis=1)
        edge_index = torch.stack([torch.from_numpy(df['From'].values), torch.from_numpy(df['To'].values)], dim=0)
        df = df.drop(['Is Laundering', 'From', 'To'], axis=1)
        edge_attr = df
        return edge_attr, edge_index

    def process(self):
        df = pd.read_csv(self.raw_paths[0])
        trans_df = self.pre_process_df(df)
        sending_df = trans_df[['account_id', 'Payment Currency', 'Amount Paid', 'Payment Format']]
        receiving_df = (trans_df[['r_account_id', 'Receiving Currency', 'Amount Received', 'Payment Format']]
                        .rename({"r_account_id": "account_id"}, axis = 1))
        node_df = self.get_node_df(trans_df)
        payment_df = self.generate_features(sending_df, node_df, 'Payment')
        received_df = self.generate_features(receiving_df, node_df, 'Receiving').drop(columns = ['Is Laundering', 'index'])
        #Putting all the features together into one dataframe
        full_df = payment_df.merge(received_df, how = "left", on = 'account_id')
        node_attr_df, label_tensor = self.create_node_tensors(full_df)
        edge_attr, edge_index = self.create_edge_attr(node_df, df)
        #Building a graph in pytorch geometric
        data = Data(x=node_attr_df,
                    edge_index=edge_index,
                    y=label_tensor,
                    edge_attr=edge_attr
                    )
        
        data_list = [data] 
        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

## Training the model

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = AMLtoGraph('data')
data = dataset[0]
epoch = 100

model = GAT(in_channels=data.num_features, hidden_channels=16, out_channels=1, heads=8)
model = model.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

split = T.RandomNodeSplit(split='train_rest', num_val=0.1, num_test=0)
data = split(data)

train_loader = loader = NeighborLoader(
    data,
    num_neighbors=[30] * 2,
    batch_size=256,
    input_nodes=data.train_mask,
)

test_loader = loader = NeighborLoader(
    data,
    num_neighbors=[30] * 2,
    batch_size=256,
    input_nodes=data.val_mask,
)

for i in range(epoch):
    total_loss = 0
    model.train()
    for data in train_loader:
        optimizer.zero_grad()
        data.to(device)
        pred = model(data.x, data.edge_index, data.edge_attr)
        ground_truth = data.y
        loss = criterion(pred, ground_truth.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += float(loss)
    if epoch%10 == 0:
        print(f"Epoch: {i:03d}, Loss: {total_loss:.4f}")
        model.eval()
        acc = 0
        total = 0
        for test_data in test_loader:
            test_data.to(device)
            pred = model(test_data.x, test_data.edge_index, test_data.edge_attr)
            ground_truth = test_data.y
            correct = (pred == ground_truth.unsqueeze(1)).sum().item()
            total += len(ground_truth)
            acc += correct
        acc = acc/total
        print('accuracy:', acc)