In [16]:
import pandas as pd
import numpy as np
import torch
from preprocessing.feature_engineering import FeatureEngineering
from preprocessing.Resampling import Resampling
from graph.graph_construction import GraphConstruction
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, roc_auc_score
from torch_geometric.nn import MetaPath2Vec

In [17]:
dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/original_dataset.csv')

# Apply feature engineering on the dataset
fe = FeatureEngineering(dataset)
processed_dataset = fe.apply_feature_engineering()

# Apply resampling on the dataset
resampler = Resampling(processed_dataset, test_size=0.4, random_state=42)
final_dataset = resampler.apply_resampling()

Feature engineering completed.
Fraud rate in training set before resampling: 0.54%
Fraud rate in testing set: 0.62%
Fraud rate in training set after resampling: 50.00%
Fraud rate in testing set after resampling: 0.62%
Length of training set: 953192
Length of testing set: 198889


In [18]:
final_dataset.head()
final_dataset.to_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv', index=False)

In [42]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [43]:
final_dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv')

# Percentage in Test and Val set is not the same because there is no stratified split performed
graph_constructor = GraphConstruction(final_dataset)
data = graph_constructor.apply_graph_construction()

Fraud Percentage in Train Mask: 50.00%
Fraud Percentage in Test Mask: 0.61%
Fraud Percentage in Val Mask: 0.63%
Graph Construction Successful!


In [44]:
data['transaction'].y_index = torch.from_numpy(final_dataset['index'].values)
print(data)

HeteroData(
  transaction={
    x=[1152081, 7],
    y=[1152081],
    num_classes=2,
    train_mask=[1152081],
    test_mask=[1152081],
    val_mask=[1152081],
    y_index=[1152081],
  },
  client={ x=[983, 5] },
  merchant={ x=[693, 1] },
  (client, pays, transaction)={ edge_index=[2, 1152081] },
  (transaction, received by, merchant)={ edge_index=[2, 1152081] },
  (transaction, rev_pays, client)={ edge_index=[2, 1152081] },
  (merchant, rev_received by, transaction)={ edge_index=[2, 1152081] }
)


In [45]:
metapath = [
    ('client', 'pays', 'transaction'),
    ('transaction', 'received by', 'merchant'),
]

In [46]:
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                     metapath=metapath, walk_length=2, context_size=1,
                     walks_per_node=5, num_negative_samples=5,
                     sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [54]:
from tqdm import tqdm
def train(model, epoch, log_steps=100, eval_steps=2000):
    model.train()
    global loss
    total_loss = 0
    for i, (pos_rw, neg_rw) in tqdm(enumerate(loader)):
      optimizer.zero_grad()
      loss = model.loss(pos_rw.to(device), neg_rw.to(device))
      loss.backward()
      optimizer.step()
      total_loss += loss.item()

In [55]:
epoch_loss = []
for epoch in range(1, 11):
    train(model,epoch)
    print(f"epoch: {epoch}, loss: {loss}")
    epoch_loss.append(loss)

8it [00:10,  1.31s/it]


epoch: 1, loss: nan


8it [00:10,  1.29s/it]


epoch: 2, loss: nan


8it [00:10,  1.28s/it]


epoch: 3, loss: nan


0it [00:05, ?it/s]


KeyboardInterrupt: 

In [47]:
def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                  f'Loss: {total_loss / log_steps:.4f}')
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                  f'Acc: {acc:.4f}')


In [48]:
@torch.no_grad()
def test(train_ratio=0.1):
    model.eval()

    z = model('transaction', batch=data['transaction'].y_index.to(device))
    y = data['transaction'].y

    perm = torch.randperm(z.size(0))
    train_perm = perm[:int(z.size(0) * train_ratio)]
    test_perm = perm[int(z.size(0) * train_ratio):]

    return model.test(z[train_perm], y[train_perm], z[test_perm], y[test_perm],
                      max_iter=150)

In [49]:
for epoch in range(1, 6):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

Epoch: 1, Accuracy: 0.5852
Epoch: 2, Accuracy: 0.5850
Epoch: 3, Accuracy: 0.5852
Epoch: 4, Accuracy: 0.5851
Epoch: 5, Accuracy: 0.5856
