In [1]:
import sys
sys.path.append("./Preprocessing")
sys.path.append("./Graph")
sys.path.append("./Models")

import pandas as pd
import torch
from feature_engineering import FeatureEngineering
from Resampling import Resampling
from graph_construction import GraphConstruction
from GNNs import GraphSAGE, GCN, GAT
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch_geometric.nn import to_hetero
from torch_geometric.loader import HGTLoader
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm

In [2]:
# Load the raw transactions CSV.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/original_dataset.csv')

# Apply feature engineering on the dataset
fe = FeatureEngineering(dataset)
processed_dataset = fe.apply_feature_engineering()

# Apply resampling on the dataset
# (the recorded cell output reports the training fraud rate going from 0.54% to 50.00%
# while the testing fraud rate stays at 0.62%)
resampler = Resampling(processed_dataset, test_size=0.4, random_state=42)
final_dataset = resampler.apply_resampling()

Feature engineering completed.
Fraud rate in training set before resampling: 0.54%
Fraud rate in testing set: 0.62%
Fraud rate in training set after resampling: 50.00%
Fraud rate in testing set after resampling: 0.62%
Length of training set: 953192
Length of testing set: 198889


In [None]:
# Cache the engineered and resampled frame so later cells can reload it from disk
# instead of recomputing it.
final_dataset.to_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv', index=False)

# Preview the frame. Jupyter only renders the value of a cell's LAST expression, so the
# preview has to come after the write (to_csv returns None when given a path).
final_dataset.head()

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [3]:
# Reload the frame cached by the earlier `to_csv` cell, so the graph can be rebuilt
# without re-running feature engineering and resampling.
# NOTE(review): absolute, machine-specific path.
final_dataset = pd.read_csv('C:/Users/ruben/OneDrive/Desktop/Datasets/final_dataset.csv')

# The fraud percentages reported for the test and validation masks differ slightly
# because the split is not stratified.
# `data` is the graph object used by the rest of the notebook: later cells read
# `data.metadata()`, `data.node_types` and the train/test/val masks stored on its
# 'transaction' node type (presumably a PyG HeteroData - confirm in graph_construction).
graph_constructor = GraphConstruction(final_dataset)
data = graph_constructor.apply_graph_construction()

Fraud Percentage in Train Mask: 50.00%
Fraud Percentage in Test Mask: 0.61%
Fraud Percentage in Val Mask: 0.63%
Graph Construction Successful!


In [12]:
# Build the GraphSAGE template, convert it with `to_hetero` using the node/edge types
# reported by `data.metadata()` (messages from different relations are summed), and
# move the resulting model to the selected device.
model = to_hetero(
    GraphSAGE(hidden_channels=64, out_channels=1, dropout_prob=0.5),
    data.metadata(),
    aggr='sum',
).to(device)

# Loss on raw logits (the model emits one logit per node; sigmoid is applied inside the loss)
criterion = torch.nn.BCEWithLogitsLoss()

# Adam with a small weight decay; the learning rate is multiplied by 0.1 every 10 scheduler steps
optimizer = Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)



In [14]:
# create loaders to facilitate mini-batch training
def _make_transaction_loader(mask, shuffle=False):
    """Build an HGTLoader whose seed nodes are the 'transaction' nodes selected by `mask`.

    Args:
        mask: node mask over the 'transaction' node type (e.g. `train_mask`).
        shuffle: forwarded to the underlying DataLoader; when True the seed nodes
            are visited in a new random order every epoch.

    Returns:
        An HGTLoader over the notebook-global `data`, sampling 512 nodes per node
        type (a single sampling iteration) with 512 seed nodes per mini-batch.
    """
    return HGTLoader(
        data,
        num_samples={key: [512] for key in data.node_types},
        batch_size=512,
        shuffle=shuffle,
        input_nodes=('transaction', mask),
    )


# trainloader - shuffled, so each epoch sees the (resampled) training nodes in a
# different order instead of replaying identical mini-batches
train_loader = _make_transaction_loader(data['transaction'].train_mask, shuffle=True)

# test loader - fixed order, evaluation only
test_loader = _make_transaction_loader(data['transaction'].test_mask)

# val loader - fixed order, evaluation only
val_loader = _make_transaction_loader(data['transaction'].val_mask)

In [15]:
# Initialize parameters
@torch.no_grad()
def init_params():
    """Run one gradient-free forward pass of `model` on the first training mini-batch.

    The output is discarded; the call exists only for its side effect on the model
    (presumably materialising lazily-shaped layers - confirm in GNNs.py).
    """
    first_batch = next(iter(train_loader)).to(device)
    model(first_batch.x_dict, first_batch.edge_index_dict)

In [16]:
# Training function
def train():
    """Run one training epoch over `train_loader`.

    Returns:
        The mean loss per scored 'transaction' node, i.e. each mini-batch loss
        weighted by that batch's `batch_size`.
    """
    model.train()
    seen = 0
    loss_sum = 0

    for batch in tqdm(train_loader):
        batch = batch.to(device)
        n_seed = batch['transaction'].batch_size

        optimizer.zero_grad()

        # Only the first `batch_size` 'transaction' rows are scored; the remaining
        # rows of the sampled subgraph are ignored by the loss.
        logits = model(batch.x_dict, batch.edge_index_dict)['transaction'][:n_seed]
        targets = batch['transaction'].y[:n_seed].float()
        loss = criterion(logits.squeeze(1), targets)
        loss.backward()

        # Gradient clipping is currently disabled:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        seen += n_seed
        loss_sum += loss.item() * n_seed

    # The learning-rate schedule advances once per epoch, not once per batch
    scheduler.step()
    return loss_sum / seen

In [17]:
# Testing function
@torch.no_grad()
def test(loader):
    """Evaluate `model` on every mini-batch produced by `loader`.

    Args:
        loader: mini-batch loader whose batches expose a 'transaction' node store
            with `batch_size` and `y`.

    Returns:
        A tuple `(y_true, y_pred_proba, mean_loss)`:
        y_true is a NumPy array of the labels of all scored nodes;
        y_pred_proba is a NumPy array of sigmoid probabilities, one row per scored
        node (the model's trailing output dimension is kept);
        mean_loss is the loss averaged over all scored nodes.

    Raises:
        ValueError: if `loader` yields no nodes to score.
    """
    model.eval()
    y_pred_probas = []
    y_trues = []
    total_examples = total_loss = 0

    for batch in tqdm(loader):
        batch = batch.to(device)
        batch_size = batch['transaction'].batch_size
        # Only the first `batch_size` 'transaction' rows are scored
        y = batch['transaction'].y[:batch_size]
        y_hat = model(batch.x_dict, batch.edge_index_dict)['transaction'][:batch_size]
        loss = criterion(y_hat.squeeze(1), y.float())

        total_examples += batch_size
        total_loss += float(loss) * batch_size
        y_pred_probas.append(torch.sigmoid(y_hat.cpu()))
        y_trues.append(y.cpu())

    if total_examples == 0:
        raise ValueError('loader produced no examples to evaluate')

    # Concatenate with torch and convert once at the end: the notebook's import cell
    # does not import NumPy, so the former `np.concatenate` raised NameError.
    y_true = torch.cat(y_trues).numpy()
    y_pred_proba = torch.cat(y_pred_probas).numpy()

    return y_true, y_pred_proba, total_loss / total_examples

In [22]:
init_params() # initialize parameters

# Run configuration, named once instead of being repeated inline
MAX_EPOCHS = 999
MODEL_PATH = 'C:/Users/ruben/OneDrive/Desktop/GNN/model_80_epochs.pth'

training_losses = []
validation_losses = []

# Early stopping: stop after this many epochs in a row whose validation loss is
# higher than the previous epoch's
consecutive_increases = 0
max_consecutive_increases = 5

# Snapshot of the weights with the lowest validation loss seen so far. Without it,
# early stopping would save weights that follow several epochs of rising validation loss.
best_val_loss = float('inf')
best_epoch = None
best_state = None

for epoch in range(MAX_EPOCHS):
    loss = train()
    print(f'Epoch {epoch+1:02d}, Training loss: {loss:.4f}')
    training_losses.append(loss)

    # Re-score the training set in eval mode (dropout off) alongside the validation set
    y_train, y_train_pred_proba, train_loss = test(train_loader)
    y_val, y_val_pred_proba, val_loss = test(val_loader)
    print(f'Epoch {epoch+1:02d}, Validation loss: {val_loss:.4f}')
    validation_losses.append(val_loss)

    ap_train = average_precision_score(y_train, y_train_pred_proba)
    roc_train = roc_auc_score(y_train, y_train_pred_proba)

    ap_val = average_precision_score(y_val, y_val_pred_proba)
    roc_val = roc_auc_score(y_val, y_val_pred_proba)

    print(f'Epoch {epoch+1:02d}, Average precision training set: {ap_train:.2f}, ROC AUC training set: {roc_train:.2f}')
    print(f'Epoch {epoch+1:02d}, Average precision validation set: {ap_val:.2f}, ROC AUC validation set: {roc_val:.2f}')

    # (A disabled precision/recall/F1 report at a 0.4 threshold used to sit here as a
    # bare string literal; it never executed and has been removed.)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch + 1
        # Clone every tensor: state_dict() hands back references to the live
        # parameters, which keep changing as training continues.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # Check for early stopping
    if epoch > 0 and val_loss > validation_losses[-2]:
        consecutive_increases += 1
        if consecutive_increases >= max_consecutive_increases:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break
    else:
        consecutive_increases = 0

# Persist the best-validation weights rather than whatever the last epoch left behind
if best_state is not None:
    model.load_state_dict(best_state)
    print(f'Restored weights from epoch {best_epoch} (validation loss {best_val_loss:.4f})')

torch.save(model.state_dict(), MODEL_PATH)
print(f'Model saved to {MODEL_PATH}')

 19%|█▉        | 354/1862 [00:30<02:11, 11.48it/s]


KeyboardInterrupt: 