In [None]:
import os
import torch
import torch.nn.functional as F
# PyTorch Geometric is imported as `torch_geometric` (underscore); there is no
# `torch.geometric` submodule, so the dotted spelling fails at import time.
from torch_geometric.nn import SAGEConv
from torch_geometric.datasets import DGraphFin
from torch_geometric.loader import NeighborLoader
from torch_geometric.transforms import RandomNodeSplit
from torch_geometric.data import Data
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Root folder handed to DGraphFin. Set the DGRAPHFIN_ROOT environment variable
# to point somewhere else; the fallback is the original hard-coded location.
DATASET_PATH = os.environ.get(
    "DGRAPHFIN_ROOT",
    "/Users/kostas/Documents/GitHub/SMCS_FraudDetection/dataset",
)
dataset = DGraphFin(root=DATASET_PATH)[0].to(DEVICE)
# Remove attributes that the rest of the notebook never reads.
dataset.pop("edge_type")
dataset.pop("edge_time")
# Normalize features column-wise (z-score). A constant column has zero standard
# deviation, which would turn the division into NaN/inf, so such columns are
# divided by 1 instead (they end up as all zeros after centring).
feature_mean = dataset.x.mean(dim=0)
feature_std = dataset.x.std(dim=0)
feature_std = torch.where(feature_std > 0, feature_std, torch.ones_like(feature_std))
dataset.x = (dataset.x - feature_mean) / feature_std
# Class ids are taken to be 0..max, so the class count is the largest label + 1.
NUM_CLASSES = dataset.y.max().item()+1
NUM_FEATURES = dataset.num_features


In [None]:
# Number of seed nodes per mini-batch, shared by both loaders.
BATCH_SIZE = 1024

# Training loader: seeds are drawn (shuffled) from the nodes flagged in train_mask,
# with a [10, 5] neighbour fan-out over two sampling hops.
train_loader = NeighborLoader(
    dataset,
    num_neighbors=[10, 5],
    batch_size=BATCH_SIZE,
    input_nodes=dataset.train_mask,
    shuffle=True,
)

# Evaluation loader: same fan-out, seeds taken from test_mask in a fixed order.
test_loader = NeighborLoader(
    dataset,
    num_neighbors=[10, 5],
    batch_size=BATCH_SIZE,
    input_nodes=dataset.test_mask,
    shuffle=False,
)

In [15]:
class GraphSAGE(torch.nn.Module):
    """Node classifier made of stacked ``SAGEConv`` layers.

    Every layer except the last is followed by ReLU and dropout; the last
    layer's output is turned into per-class log-probabilities.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of every intermediate layer.
        out_channels: Number of output classes.
        num_layers: Total number of ``SAGEConv`` layers (at least 1).
        dropout: Dropout probability applied after each non-final layer.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout=0.5):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.num_layers = num_layers
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        # Layer widths: input -> hidden (repeated) -> output. With a single
        # layer this collapses to one in->out convolution.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.convs.append(SAGEConv(fan_in, fan_out))

    def forward(self, x, edge_index):
        """Return log-probabilities with one row per node in ``x``."""
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, edge_index)
        return F.log_softmax(x, dim=1)

In [None]:
# GraphSAGE classifier (default depth) placed on the selected device.
model = GraphSAGE(in_channels=NUM_FEATURES, hidden_channels=128, out_channels=NUM_CLASSES).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.005, weight_decay=5e-4)
# The model's forward pass already ends in log_softmax, so the matching
# criterion is NLLLoss. CrossEntropyLoss expects raw logits and would apply
# log_softmax a second time.
loss_fn = torch.nn.NLLLoss()

In [None]:
def train():
    """Run one optimisation epoch over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``DEVICE``.

    Returns:
        float: Mean of the per-mini-batch losses seen during the epoch.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        batch = batch.to(DEVICE)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        # NeighborLoader puts the seed nodes of the mini-batch first; the
        # remaining rows are sampled neighbours. Only the first
        # `batch.batch_size` rows are the training nodes this batch was built
        # for, so the loss is restricted to them rather than to every
        # train-mask node that was pulled in as a neighbour.
        num_seeds = batch.batch_size
        loss = loss_fn(out[:num_seeds], batch.y[:num_seeds])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)

In [18]:
@torch.no_grad()
def test():
    """Evaluate the notebook-level ``model`` on ``test_loader``.

    Returns:
        tuple: ``(accuracy, f1)``, where ``accuracy`` is the fraction of correctly
        classified test nodes and ``f1`` is the micro-averaged F1 score.
        Note: for single-label multi-class predictions micro-F1 coincides
        with accuracy.
    """
    model.eval()
    correct = 0
    total = 0
    all_preds, all_labels = [], []

    for batch in test_loader:
        batch = batch.to(DEVICE)
        out = model(batch.x, batch.edge_index)
        # Score only the seed nodes (the first `batch.batch_size` rows).
        # Filtering with `batch.test_mask` would also count test nodes that
        # were merely sampled as neighbours, so the same node could be scored
        # in several batches.
        num_seeds = batch.batch_size
        pred = out[:num_seeds].argmax(dim=1)
        labels = batch.y[:num_seeds]
        correct += (pred == labels).sum().item()
        total += num_seeds
        all_preds.append(pred.cpu())
        all_labels.append(labels.cpu())

    # Compute accuracy
    accuracy = correct / total

    # Compute F1-score
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    f1 = f1_score(all_labels.numpy(), all_preds.numpy(), average="micro")

    return accuracy, f1



In [22]:
# Number of full passes over the training loader.
EPOCHS = 10

for epoch in range(EPOCHS):
    loss = train()
    acc, f1 = test()
    # `.4f` gives four decimals; the former `4f` spec meant "minimum width 4",
    # which printed the F1 score with the default six decimals.
    print(f"Epoch{epoch+1}/{EPOCHS}, Loss: {loss:.4f}, Accuracy: {acc:.4f},F1-score:{f1:.4f}")
print("training completed")


Training:   0%|          | 0/2530 [00:00<?, ?it/s]


RuntimeError: Numpy is not available

## old

In [None]:
import os
import time
import torch
import torch.nn.functional as F
import tqdm
from sklearn.metrics import f1_score
from torch_geometric.nn import GraphSAGE
from torch_geometric.loader import NeighborLoader
from data_processing import load_dgraphfin, split_into_batches


In [None]:
# Pick the compute device: CUDA when it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Step 1: load the graph; the helper's result is unpacked into the graph object
# and the class count (helper lives in the project-local data_processing module).
dataset, num_classes = load_dgraphfin(
    path_to_folder="/Users/kostas/Documents/GitHub/SMCS_FraudDetection/",
)

# Step 2: Create Data Loaders
# Both loaders use identical sampling settings; only shuffling differs.
train_loader = split_into_batches(
    graph=dataset,
    num_batches=512,
    num_neighbours=10,
    num_hops=2,
    shuffle=True,
)
test_loader = split_into_batches(
    graph=dataset,
    num_batches=512,
    num_neighbours=10,
    num_hops=2,
    shuffle=False,
)


In [None]:
# Step 3: Define the GraphSAGE Model
class GraphSAGEModel(torch.nn.Module):
    """Two stacked ``GraphSAGE`` modules separated by a ReLU.

    The first module maps ``in_channels`` to ``hidden_channels``; the second
    maps ``hidden_channels`` to ``out_channels``. Each is built with
    ``num_layers=2``. The forward pass returns per-node log-probabilities.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # First stage: input features -> hidden representation.
        self.conv1 = GraphSAGE(
            in_channels,
            hidden_channels,
            num_layers=2,
            out_channels=hidden_channels,
        )
        # Second stage: hidden representation -> class scores.
        self.conv2 = GraphSAGE(
            hidden_channels,
            hidden_channels,
            num_layers=2,
            out_channels=out_channels,
        )

    def forward(self, x, edge_index):
        """Return log-softmax class scores with one row per node in ``x``."""
        hidden = F.relu(self.conv1(x, edge_index))
        scores = self.conv2(hidden, edge_index)
        return F.log_softmax(scores, dim=1)


In [None]:
# Build the classifier, move it to the selected device, and attach Adam.
model = GraphSAGEModel(
    dataset.num_features,
    hidden_channels=64,
    out_channels=num_classes,
)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
def train():
    """Run one training epoch over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.

    Returns:
        float: Loss averaged over all train-mask nodes seen in the epoch, or
        ``nan`` when no batch contained a train-mask node.
    """
    model.train()
    total_loss = 0.0
    total_examples = 0

    for batch in tqdm.tqdm(train_loader):
        batch = batch.to(device)
        num_train = int(batch.train_mask.sum())
        if num_train == 0:
            # A mean loss over zero nodes is NaN; adding `nan * 0` to the
            # running total would turn the whole epoch loss into NaN.
            continue

        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)

        # The model already returns log-probabilities (log_softmax), so the
        # matching criterion is nll_loss rather than cross_entropy, which
        # would apply log_softmax again.
        loss = F.nll_loss(out[batch.train_mask], batch.y[batch.train_mask])
        loss.backward()
        optimizer.step()

        # Weight each batch by its node count so the result is a per-node mean.
        total_loss += float(loss) * num_train
        total_examples += num_train

    if total_examples == 0:
        return float("nan")
    return total_loss / total_examples

In [None]:
# Step 5: Define Evaluation Function
@torch.no_grad()
def test(loader):
    """Return the micro-averaged F1 score of ``model`` over ``loader``.

    Only nodes selected by each batch's ``test_mask`` contribute to the score.
    """
    model.eval()
    predicted_chunks = []
    target_chunks = []

    for batch in loader:
        batch = batch.to(device)
        scores = model(batch.x, batch.edge_index)
        predicted = scores.argmax(dim=1)
        # Collect results on the CPU so they can be handed to scikit-learn.
        predicted_chunks.append(predicted[batch.test_mask].cpu())
        target_chunks.append(batch.y[batch.test_mask].cpu())

    y_pred = torch.cat(predicted_chunks, dim=0)
    y_true = torch.cat(target_chunks, dim=0)
    return f1_score(y_true.numpy(), y_pred.numpy(), average="micro")

In [21]:
# Diagnostic cell: print the installed PyTorch version string.
import torch
print(torch.__version__)

2.4.1


In [None]:
# Step 6: Train and Evaluate the Model
# Runs epochs 1 through 10. Each epoch trains once, scores `test_loader`, and
# prints the loss, the test F1 score, and the elapsed wall-clock time.
for epoch in range(1, 11):
    start = time.time()  # timestamp used for the per-epoch duration below
    loss = train()
    f1_score_test = test(test_loader)

    print(f"Epoch: {epoch}, Loss: {loss:.4f}, Test F1: {f1_score_test:.4f}, Time: {time.time() - start:.2f}s")


In [None]:
from typing import Union
import torch
from torch_geometric.loader import NeighborSampler

def check_sampling_dependencies() -> bool:
    """
    Report whether a neighbor-sampling backend can be imported.

    Tries ``pyg_lib`` first and ``torch_sparse`` second, stopping at the
    first one that imports successfully.

    Returns:
        bool: True if either pyg-lib or torch-sparse is available
    """
    for backend in ("pyg_lib", "torch_sparse"):
        try:
            __import__(backend)
        except ImportError:
            # Backend missing; move on to the next candidate.
            continue
        return True
    return False

def create_neighbor_sampler(
    edge_index: torch.Tensor,
    sizes: list[int],
    num_nodes: int,
    batch_size: int,
    shuffle: bool = True
) -> Union[NeighborSampler, None]:
    """
    Build a NeighborSampler over all nodes after verifying its dependencies.

    Args:
        edge_index (torch.Tensor): Graph edge indices
        sizes (list[int]): Number of neighbors to sample per layer
        num_nodes (int): Total number of nodes in the graph; the sampler
            iterates over node ids ``0 .. num_nodes - 1``
        batch_size (int): Size of batches
        shuffle (bool, optional): Whether to shuffle the nodes. Defaults to True.

    Returns:
        Union[NeighborSampler, None]: The sampler, or None when constructing it
        raised an exception (the error is printed, not re-raised)

    Raises:
        ImportError: If neither pyg-lib nor torch-sparse is installed
    """
    backend_available = check_sampling_dependencies()
    if backend_available:
        try:
            all_nodes = torch.arange(num_nodes)
            sampler = NeighborSampler(
                edge_index,
                node_idx=all_nodes,
                sizes=sizes,
                batch_size=batch_size,
                shuffle=shuffle,
            )
        except Exception as e:
            # Best effort: report the failure and signal it with None.
            print(f"Error creating NeighborSampler: {str(e)}")
            sampler = None
        return sampler

    raise ImportError(
        "NeighborSampler requires either 'pyg-lib' or 'torch-sparse'. "
        "Please install at least one of them:\n"
        "pip install pyg-lib torch-sparse\n"
        "or\n"
        "conda install pyg -c pyg"
    )

# Usage example: build a sampler and report failures instead of raising.
# NOTE(review): `data` is not assigned in any cell visible in this notebook
# (the graph object is called `dataset` elsewhere) — confirm where it comes
# from; if it is undefined, the NameError is caught by the generic handler below.
# NOTE(review): this rebinds `train_loader`, replacing the loader created earlier.
try:
    train_loader = create_neighbor_sampler(
        edge_index=data.edge_index,
        sizes=[25, 10],  # Sample 25 neighbors for first hop, 10 for second hop
        num_nodes=data.num_nodes,
        batch_size=128
    )
    if train_loader is None:
        # Raised inside the try block, so it is caught by `except Exception`
        # below and printed rather than propagated.
        raise RuntimeError("Failed to create NeighborSampler")
        
except ImportError as e:
    print(f"Dependency Error: {str(e)}")
    # Handle the error appropriately (e.g., fall back to a different sampling method)
except Exception as e:
    print(f"Unexpected error: {str(e)}")

In [None]:
from utils import DGraphFin
from utils.utils import prepare_folder
from utils.evaluator import Evaluator
from models import MLP, MLPLinear, GCN, SAGE, GAT, GATv2
from logger import Logger

import argparse

import torch
import torch.nn.functional as F
import torch.nn as nn

import torch_geometric.transforms as T
from torch_sparse import SparseTensor
from torch_geometric.utils import to_undirected
import pandas as pd

# Hyper-parameter settings for the SAGE model configuration.
sage_parameters = dict(
    lr=0.01,
    num_layers=2,
    hidden_channels=128,
    dropout=0,
    batchnorm=False,
    l2=5e-7,
)
def train(model, data, train_idx, optimizer, no_conv=False):
    """Perform a single full-batch optimisation step and return its loss.

    Args:
        model: Module to optimise. Called as ``model(x)`` when ``no_conv`` is
            true, otherwise as ``model(x, adj_t)``.
        data: Object exposing ``x``, ``y`` (labels of shape (N, )) and, when
            ``no_conv`` is false, ``adj_t``.
        train_idx: Index selecting the training nodes.
        optimizer: Optimiser bound to ``model``'s parameters.
        no_conv: If true, feed only the training rows of ``data.x`` to the
            model; otherwise run on the whole graph and select the training
            rows from the output.

    Returns:
        float: Negative log-likelihood loss on the training nodes.
    """
    model.train()
    optimizer.zero_grad()

    targets = data.y[train_idx]
    if not no_conv:
        predictions = model(data.x, data.adj_t)[train_idx]
    else:
        predictions = model(data.x[train_idx])

    step_loss = F.nll_loss(predictions, targets)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()
