In [1]:
import pandas as pd
import numpy as np
import torch
from preprocessing.feature_engineering import FeatureEngineering
from preprocessing.Resampling import Resampling
from graph.graph_construction import GraphConstruction
from models.GNNs import GraphSAGE, GAT, GraphSAGE2, GAT2, GAT3
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch_geometric.nn import to_hetero
from torch_geometric.loader import HGTLoader
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Load the original dataset CSV.
# NOTE(review): absolute, user-specific path; it only resolves on the author's
# machine. Consider a configurable data directory.
dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/original_dataset.csv')

# Apply feature engineering on the dataset
fe = FeatureEngineering(dataset)
processed_dataset = fe.apply_feature_engineering()

# Apply resampling on the dataset (fixed random_state for repeatable runs).
# Per the recorded output, the training fraud rate goes from 0.54% to 50.00%
# while the testing fraud rate stays at 0.62%.
resampler = Resampling(processed_dataset, test_size=0.4, random_state=42)
final_dataset = resampler.apply_resampling()

Feature engineering completed.
Fraud rate in training set before resampling: 0.54%
Fraud rate in testing set: 0.62%
Fraud rate in training set after resampling: 50.00%
Fraud rate in testing set after resampling: 0.62%
Length of training set: 953192
Length of testing set: 198889


In [3]:
# Persist the engineered and resampled dataset so it can be reloaded from disk
# without re-running the preprocessing steps.
final_dataset.to_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv', index=False)

# Preview the first rows. Only the last expression of a cell is rendered, so
# the preview has to come after the write to be displayed at all.
final_dataset.head()

In [2]:
# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [3]:
# Reload the preprocessed dataset from disk.
# NOTE(review): absolute, user-specific path; see the preprocessing cell that writes it.
final_dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv')

# The fraud percentages of the test and validation masks differ slightly
# because the split is not stratified.
graph_constructor = GraphConstruction(final_dataset)
# `data` is the graph object consumed by every later cell.
data = graph_constructor.apply_graph_construction()

Fraud Percentage in Train Mask: 50.00%
Fraud Percentage in Test Mask: 0.61%
Fraud Percentage in Val Mask: 0.63%
Graph Construction Successful!


In [4]:
# Show the node stores, edge stores and tensor shapes of the constructed graph.
print(data)

HeteroData(
  transaction={
    x=[1152081, 7],
    y=[1152081],
    num_classes=2,
    train_mask=[1152081],
    test_mask=[1152081],
    val_mask=[1152081],
  },
  client={ x=[983, 5] },
  merchant={ x=[693, 1] },
  (client, pays, transaction)={ edge_index=[2, 1152081] },
  (transaction, received by, merchant)={ edge_index=[2, 1152081] },
  (transaction, rev_pays, client)={ edge_index=[2, 1152081] },
  (merchant, rev_received by, transaction)={ edge_index=[2, 1152081] }
)


In [18]:
import torch
import torch.nn as nn
from torch_geometric.nn import GINConv, HeteroConv

class HeteroGINLayer(nn.Module):
    """One heterogeneous layer made of one ``GINConv`` per edge type.

    The first layer of a stack projects each node type from its own input
    width to ``hidden_channels`` with a per-type ``nn.Linear``; later layers
    use ``nn.Identity`` because their inputs already have that width.

    Args:
        in_channels_dict: Mapping from node type to input feature width. Only
            read when ``is_first_layer`` is True.
        hidden_channels: Width of the projected features and of every GIN MLP.
        node_types: Iterable of node type names; used as keys of ``lin_dict``.
        edge_types: Iterable of edge types; one ``GINConv`` is built for each.
        is_first_layer: Whether this layer owns the input projections.

    NOTE(review): no normalisation is applied inside the layer. If training
    diverges, adding one here is a candidate fix; TODO confirm.
    """

    def __init__(self, in_channels_dict, hidden_channels, node_types, edge_types, is_first_layer=False):
        super().__init__()

        # Per-node-type input adapters; created before the convolutions so the
        # parameter initialisation order matches the original implementation.
        self.lin_dict = nn.ModuleDict()
        for node_type in node_types:
            if is_first_layer:
                self.lin_dict[node_type] = nn.Linear(in_channels_dict[node_type], hidden_channels)
            else:
                # No projection needed after the first layer.
                self.lin_dict[node_type] = nn.Identity()

        def make_gin_mlp():
            """Build a fresh two-layer MLP for a single ``GINConv``."""
            # The input width is always ``hidden_channels``: features are
            # projected (first layer) or already hidden-sized (later layers)
            # before they reach the convolution.
            return nn.Sequential(
                nn.Linear(hidden_channels, hidden_channels),
                nn.ReLU(),
                nn.Linear(hidden_channels, hidden_channels),
            )

        # Each edge type gets its own independent GIN convolution.
        self.conv = HeteroConv({
            edge_type: GINConv(make_gin_mlp())
            for edge_type in edge_types
        })

    def forward(self, x_dict, edge_index_dict):
        """Adapt every node type's features, then run the heterogeneous convolution.

        Args:
            x_dict: Mapping from node type to its feature tensor.
            edge_index_dict: Mapping from edge type to its edge index tensor.

        Returns:
            The output of ``HeteroConv`` for the adapted features.
        """
        x_dict = {key: self.lin_dict[key](x) for key, x in x_dict.items()}
        return self.conv(x_dict, edge_index_dict)

In [19]:
class HeteroGIN(nn.Module):
    """Stack of ``HeteroGINLayer`` modules with a linear head on ``'transaction'`` nodes.

    Args:
        in_channels_dict: Mapping from node type to input feature width,
            forwarded to every layer.
        hidden_channels: Hidden width shared by all layers and the head input.
        num_layers: Number of ``HeteroGINLayer`` modules to stack.
        data: Graph object providing ``node_types``, ``edge_types`` and
            ``data['transaction'].num_classes``.
    """

    def __init__(self, in_channels_dict, hidden_channels, num_layers, data):
        super().__init__()

        self.node_types = data.node_types
        self.edge_types = data.edge_types

        # Only the layer at index 0 is flagged as the first layer.
        self.convs = nn.ModuleList([
            HeteroGINLayer(
                in_channels_dict,
                hidden_channels,
                self.node_types,
                self.edge_types,
                is_first_layer=(layer_idx == 0),
            )
            for layer_idx in range(num_layers)
        ])

        # Classification head: hidden features -> one logit per class.
        self.lin = nn.Linear(hidden_channels, data['transaction'].num_classes)

    def forward(self, x_dict, edge_index_dict):
        """Return the head's output for the ``'transaction'`` node features.

        Args:
            x_dict: Mapping from node type to its feature tensor.
            edge_index_dict: Mapping from edge type to its edge index tensor.
        """
        features = x_dict
        for layer in self.convs:
            features = layer(features, edge_index_dict)
            # ReLU is applied to every node type after each layer.
            features = {node_type: feat.relu() for node_type, feat in features.items()}

        return self.lin(features['transaction'])

In [21]:
# Input feature width per node type; matches the `x` shapes printed for `data`
# (transaction: 7, client: 5, merchant: 1).
in_channels_dict = {
    'transaction': 7,
    'client': 5,
    'merchant': 1
}

model = HeteroGIN(in_channels_dict=in_channels_dict, hidden_channels=64, num_layers=3, data=data)
# Move both the graph and the model to the selected device.
data, model = data.to(device), model.to(device)

# Gather per-type feature tensors and per-edge-type edge indices in the dict
# form taken by the model's forward().
x_dict = {node_type: data[node_type].x for node_type in data.node_types}
edge_index_dict = {edge_type: data[edge_type].edge_index for edge_type in data.edge_types}

# Single forward pass over the whole graph.
# NOTE(review): this runs with autograd enabled; if it is only a sanity check,
# wrapping it in torch.no_grad() would avoid keeping the graph alive via `out`.
out = model(x_dict, edge_index_dict)

In [28]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

def train(model, optimizer, data):
    """Perform one full-batch optimisation step on the training transactions.

    Args:
        model: Module called as ``model(data.x_dict, data.edge_index_dict)``
            that returns one row of class logits per transaction node.
        optimizer: Optimizer holding the model's parameters.
        data: Graph object exposing ``x_dict``, ``edge_index_dict`` and a
            ``'transaction'`` store with ``y`` and ``train_mask``.

    Returns:
        The cross-entropy loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x_dict, data.edge_index_dict)

    # Restrict both predictions and labels to the training nodes.
    train_logits = logits[data['transaction'].train_mask].float()
    train_labels = data['transaction'].y[data['transaction'].train_mask].long()

    step_loss = F.cross_entropy(train_logits, train_labels)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

@torch.no_grad()
def test(model, data, mask):
    """Evaluate the model on the transaction nodes selected by ``mask``.

    Args:
        model: Module called as ``model(data.x_dict, data.edge_index_dict)``.
        data: Graph object exposing ``x_dict``, ``edge_index_dict`` and
            ``data['transaction'].y``.
        mask: Index/boolean mask applied to both the model output and labels.

    Returns:
        Tuple ``(accuracy, f1)``; F1 is computed with ``average='binary'``.
    """
    model.eval()

    logits = model(data.x_dict, data.edge_index_dict)

    # Predicted class is the arg-max over the class dimension.
    y_pred = logits[mask].float().argmax(dim=1).cpu().numpy()
    y_true = data['transaction'].y[mask].long().cpu().numpy()

    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='binary'),
    )

def main(data, model, epochs=100, lr=0.01, weight_decay=5e-4):
    """Train ``model`` full-batch, keep the best validation-F1 weights, report test metrics.

    Args:
        data: Heterogeneous graph with a ``'transaction'`` store holding ``y``,
            ``train_mask``, ``val_mask`` and ``test_mask``.
        model: Module to train; moved to the selected device.
        epochs: Number of full-batch training epochs.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Prints per-epoch training/validation metrics and the final test metrics
    obtained with the weights of the best validation-F1 epoch.
    """
    # Local import: only needed here to snapshot the best weights.
    import copy

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = data.to(device)

    # `data.x_dict` is assembled on access, so assigning into it would only
    # modify a temporary dict. Write the cast tensors back through the node
    # stores so the float conversion actually takes effect.
    for node_type, features in data.x_dict.items():
        data[node_type].x = features.float()
    data['transaction'].y = data['transaction'].y.long()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Start below any attainable F1 so the first epoch is always recorded,
    # even when validation F1 is 0 throughout.
    best_val_f1 = -1.0
    best_model = None

    for epoch in tqdm(range(epochs)):
        loss = train(model, optimizer, data)
        train_acc, train_f1 = test(model, data, data['transaction'].train_mask)
        val_acc, val_f1 = test(model, data, data['transaction'].val_mask)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() returns references to the live tensors; deep-copy so
            # later optimisation steps cannot overwrite the saved snapshot.
            best_model = copy.deepcopy(model.state_dict())

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, '
              f'Train F1: {train_f1:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}')

    # Load best model and test (nothing to restore when no epoch was run).
    if best_model is not None:
        model.load_state_dict(best_model)
    test_acc, test_f1 = test(model, data, data['transaction'].test_mask)
    print(f'Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}')

In [29]:
# Input feature width per node type (transaction: 7, client: 5, merchant: 1).
in_channels_dict = {
    'transaction': 7,
    'client': 5,
    'merchant': 1
}

# Build a new model instance so training starts from freshly initialised weights.
model = HeteroGIN(in_channels_dict=in_channels_dict, hidden_channels=64, num_layers=3, data=data)

# Run the training and testing pipeline
# NOTE(review): in the recorded output the loss grows from 4.07 to 389.43 over
# the first three epochs and Val F1 drops to 0. The optimisation looks
# unstable with these settings; confirm lr / model configuration.
main(data, model, epochs=100, lr=0.01, weight_decay=5e-4)

  1%|          | 1/100 [02:14<3:41:16, 134.11s/it]

Epoch: 000, Loss: 4.0738, Train Acc: 0.5169, Train F1: 0.6702, Val Acc: 0.0447, Val F1: 0.0130


  2%|▏         | 2/100 [04:26<3:37:06, 132.92s/it]

Epoch: 001, Loss: 74.3193, Train Acc: 0.5000, Train F1: 0.0000, Val Acc: 0.9937, Val F1: 0.0000


  3%|▎         | 3/100 [06:22<3:22:31, 125.27s/it]

Epoch: 002, Loss: 389.4279, Train Acc: 0.5000, Train F1: 0.0000, Val Acc: 0.9937, Val F1: 0.0000
