In [1]:
import pandas as pd
import numpy as np
import gc
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch.optim import Adam
import networkx as nx

### Elliptic load

In [2]:
#load data
# The three Elliptic CSV files live in one directory; state that location once.
ELLIPTIC_DIR = '/Users/kostas/Documents/Data Science/p3/scientific/datasets/elliptic_bitcoin_dataset'
classes = pd.read_csv(f'{ELLIPTIC_DIR}/elliptic_txs_classes.csv')
features = pd.read_csv(f'{ELLIPTIC_DIR}/elliptic_txs_features.csv', header=None)
edges = pd.read_csv(f'{ELLIPTIC_DIR}/elliptic_txs_edgelist.csv')

#Convert classes to numerical format
# "1"/"2" are the raw codes of the published Elliptic classes file
# (1 = illicit, 2 = licit per the dataset description); the word forms are
# kept so an already-relabelled file maps exactly as before.
class_map = {"unknown": 2, "licit": 0, "illicit": 1, "1": 1, "2": 0}
raw_labels = classes["class"].astype(str).str.strip()
mapped_labels = raw_labels.map(class_map)
if mapped_labels.isna().any():
  # An unmapped label would become NaN and then a garbage integer in the
  # long tensor below, so fail loudly instead.
  unmapped = sorted(raw_labels[mapped_labels.isna()].unique())
  raise ValueError(f"unmapped class labels in classes file: {unmapped}")
classes["class"] = mapped_labels.astype("int64")

#extract IDs, features and labels
# Column 0 of the features file is the transaction id; the rest are features.
node_id = features[0].values
node_index = pd.Index(node_id)
if not node_index.is_unique:
  raise ValueError("duplicate transaction ids in the features file")
x = torch.tensor(features.iloc[:, 1:].values, dtype=torch.float)

# Align labels to the feature-row order by transaction id instead of relying
# on both files being sorted identically.
# Assumes the first column of the classes file is the transaction id.
labels = classes.set_index(classes.columns[0])["class"].reindex(node_index)
if labels.isna().any():
  raise ValueError("some transactions in the features file have no entry in the classes file")
y = torch.tensor(labels.values.astype(np.int64), dtype=torch.long)

#edges to tensor
# edge_index must hold node positions (row numbers of x, 0..num_nodes-1),
# not raw transaction ids, so translate both endpoint columns.
# get_indexer returns -1 for ids that are not present.
src = node_index.get_indexer(edges.iloc[:, 0].values)
dst = node_index.get_indexer(edges.iloc[:, 1].values)
if (src < 0).any() or (dst < 0).any():
  raise ValueError("edge list references transaction ids missing from the features file")
edge_index = torch.tensor(np.stack([src, dst]), dtype=torch.long)

#create data object
data = Data(x=x, edge_index=edge_index, y=y)
print(data)

Data(x=[203769, 166], edge_index=[2, 234355], y=[203769])


### DGraphFin load

In [10]:
import torch_geometric

# check your torch_geometric version and make sure it is not lower than 2.2.0
print(torch_geometric.__version__)


# Please download the DGraphFin dataset file 'DGraphFin.zip' from 'https://dgraph.xinye.com'
# and place it in the 'raw' sub-directory of the `root` passed below.
# Otherwise an error pops up: "Dataset not found. Please download 'DGraphFin.zip' from
# 'https://dgraph.xinye.com' and move it to './raw' "
from torch_geometric.datasets import DGraphFin

dataset = DGraphFin(root='/Users/kostas/Documents/Data Science/p3/scientific/datasets')

# Bind the graph to `graph_data` (the name the DGraphFin NeighborLoader cell
# reads) rather than `data`, so the Elliptic graph loaded earlier is not
# silently replaced for the cells that train on it.
graph_data = dataset[0]

2.6.1


Processing...
Done!


In [12]:
python gnn.py --model sage --dataset DGraphFin --epochs 200 --runs 10 --device 0

SyntaxError: invalid syntax (977009493.py, line 1)

In [None]:
class GraphSAGE(torch.nn.Module):
  """Two-layer GraphSAGE network producing per-node log-probabilities.

  Args:
    in_channels: size of each input node feature vector.
    hidden_channels: width of the representation between the two layers.
    out_channels: number of output classes.
  """

  def __init__(self, in_channels, hidden_channels, out_channels):
    super().__init__()
    self.conv1 = SAGEConv(in_channels, hidden_channels)
    self.conv2 = SAGEConv(hidden_channels, out_channels)

  def forward(self, x, edge_index):
    """Run conv1 -> ReLU -> conv2 and return the log-softmax over dim 1."""
    hidden = F.relu(self.conv1(x, edge_index))
    logits = self.conv2(hidden, edge_index)
    return F.log_softmax(logits, dim=1)
# Build the network: input width follows the loaded graph's feature count,
# with a 32-unit hidden layer and two output classes.
model = GraphSAGE(
  in_channels=data.num_features,
  hidden_channels=32,
  out_channels=2,
)

In [None]:
# Force a full garbage-collection pass to release unreferenced objects.
# As the cell's last expression, the count returned by gc.collect() is displayed.
gc.collect()

In [None]:
# Report the memory psutil lists as currently available, converted from bytes
# to decimal gigabytes (divided by 1e9).
import psutil
print(f"Available RAM: {psutil.virtual_memory().available / 1e9} GB")

### Setup GraphSAGE for Elliptic

In [None]:
# Mini-batch loader with neighborhood sampling: two hops, with fan-outs of
# 10 and 5, around batches of 8 seed nodes, loaded in the main process.
train_loader = NeighborLoader(
  data.to('cpu'),
  num_neighbors=[10, 5],
  batch_size=8,
  shuffle=True,
  num_workers=0,
  pin_memory=False,
)

# Loss function and optimizer (Adam with L2 weight decay).
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
  """Run one training epoch over `train_loader` and return the mean batch loss.

  Uses the module-level `model`, `train_loader`, `optimizer` and `loss_fn`.

  For each sampled batch the loss is computed only on the seed nodes (the
  first `batch.batch_size` rows) whose label is not 2 ("unknown"). Sampled
  neighbors are excluded because they only provide context for the seeds.
  Batches with no labelled seed node are skipped.

  Returns:
    The average loss over the batches that contributed an optimisation step,
    or 0.0 if no batch did. If the model output contains NaN, a warning is
    printed and the epoch stops early, returning the average so far.
  """
  model.train()
  total_loss = 0.0
  num_steps = 0

  for batch in train_loader:
    optimizer.zero_grad()
    out = model(batch.x, batch.edge_index)

    if torch.isnan(out).any():
      print("warning:NaN detected in output")
      break

    # Only the first `batch_size` nodes of a NeighborLoader batch are seeds.
    num_seeds = batch.batch_size
    seed_out = out[:num_seeds]
    seed_y = batch.y[:num_seeds]

    mask = seed_y != 2
    if not mask.any():
      # An empty selection makes the mean-reduced loss NaN, which would
      # poison the epoch total; there is nothing to learn from this batch.
      continue

    loss = loss_fn(seed_out[mask], seed_y[mask])
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    num_steps += 1

  return total_loss / num_steps if num_steps else 0.0

# Train for a fixed number of epochs, printing the loss returned by each one
# (epochs are reported 1-based).
NUM_EPOCHS = 20
for epoch in range(NUM_EPOCHS):
  loss = train()
  print(f"epoch {epoch+1}, loss: {loss:.4f}")

### Setup GraphSAGE for DGraphFin

In [None]:
# Neighbor-sampling loader over the DGraphFin graph; `graph_data` must already
# be bound to that graph object before this cell runs.
# Same sampling settings as the Elliptic loader: fan-outs 10 and 5, 8 seeds per batch.
train_loader = NeighborLoader(
  graph_data,
  num_neighbors=[10, 5],
  batch_size=8,
  shuffle=True,
  num_workers=0,
  pin_memory=False,
)

In [None]:
#evaluation
from sklearn.metrics import f1_score

model.eval()
# Inference only: skip autograd bookkeeping for the full-graph forward pass.
with torch.no_grad():
  out = model(data.x, data.edge_index)
pred = out.argmax(dim=1)

# Score only nodes whose label is a class the model can actually predict
# (one column of `out` per class). Labels outside that range, such as 2 for
# "unknown", can never be matched and would drag the macro average down.
labeled = (data.y >= 0) & (data.y < out.size(1))

# NOTE(review): no held-out split is visible in this notebook, so this scores
# nodes the model may have been trained on.
f1 = f1_score(data.y[labeled].cpu().numpy(), pred[labeled].cpu().numpy(), average="macro")
print(f"f1 score:{f1:.4f}")