In [1]:
import gc
import os
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv

### Load the Elliptic data and define the GraphSAGE model

In [2]:
# Load data.
# The directory defaults to the original location; set the ELLIPTIC_DATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get(
    "ELLIPTIC_DATA_DIR",
    "/Users/kostas/Documents/Data Science/p3/scientific/datasets/elliptic_bitcoin_dataset",
))
classes = pd.read_csv(DATA_DIR / "elliptic_txs_classes.csv")
features = pd.read_csv(DATA_DIR / "elliptic_txs_features.csv", header=None)
edges = pd.read_csv(DATA_DIR / "elliptic_txs_edgelist.csv")

# Convert classes to numerical format: licit -> 0, illicit -> 1, unknown -> 2.
# NOTE(review): "1"/"2" are believed to be the raw encoding of the published
# Elliptic CSV (1 = illicit, 2 = licit) -- confirm against your copy of the file.
class_map = {"unknown": 2, "licit": 0, "illicit": 1, "1": 1, "2": 0}
encoded_class = classes["class"].map(class_map)
if encoded_class.isna().any():
    # Series.map yields NaN for values missing from the mapping; casting NaN to
    # an integer tensor would silently produce garbage labels, so fail loudly.
    unmapped = sorted(classes.loc[encoded_class.isna(), "class"].astype(str).unique())
    raise ValueError(f"class column contains values missing from class_map: {unmapped}")
classes["class"] = encoded_class.astype("int64")

# Labels are paired with feature rows by position, so both tables must have
# the same number of rows.
if len(classes) != len(features):
    raise ValueError(
        f"classes has {len(classes)} rows but features has {len(features)} rows"
    )

# Extract IDs, features and labels (column 0 of the feature table is the node id).
node_id = features[0].values
x = torch.tensor(features.iloc[:, 1:].values, dtype=torch.float)
y = torch.tensor(classes["class"].values, dtype=torch.long)

# Edges to tensor. The edge list stores node ids, but edge_index must contain
# row positions into x, so translate every id to its position in node_id.
node_position = pd.Series(np.arange(len(node_id)), index=node_id)
if not node_position.index.is_unique:
    raise ValueError("feature table contains duplicate node ids")
endpoints = edges.iloc[:, :2].apply(lambda column: column.map(node_position))
if endpoints.isna().to_numpy().any():
    raise ValueError("edge list references node ids that are missing from the feature table")
edge_index = torch.tensor(endpoints.to_numpy(dtype=np.int64).T, dtype=torch.long)

# Create data object.
data = Data(x=x, edge_index=edge_index, y=y)
print(data)

Data(x=[203769, 166], edge_index=[2, 234355], y=[203769])


In [3]:
class GraphSAGE(torch.nn.Module):
    """Two-layer SAGEConv network that returns per-node log-probabilities.

    Args:
        in_channels: Input size handed to the first SAGEConv layer.
        hidden_channels: Output size of the first layer and input size of the second.
        out_channels: Output size of the second SAGEConv layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the log-softmax over dim 1."""
        hidden = F.relu(self.conv1(x, edge_index))
        scores = self.conv2(hidden, edge_index)
        return F.log_softmax(scores, dim=1)

In [4]:
# Run a full garbage-collection pass; the value it returns is displayed as the cell output.
gc.collect()

31

In [5]:
# psutil is imported in this cell rather than in the import cell at the top.
import psutil
# Print the memory psutil reports as available, divided by 1e9 and labelled as GB.
print(f"Available RAM: {psutil.virtual_memory().available / 1e9} GB")

Available RAM: 5.570863104 GB


### Set up and train GraphSAGE on the Elliptic dataset

In [None]:
# Settings for this cell.
SEED = 42
NUM_CLASSES = 2           # licit (0) and illicit (1); "unknown" (2) is never a training target
TRAIN_FRACTION = 0.7      # the first 70% of the rows form the training split
NUM_NEIGHBORS = [10, 5]   # neighbours sampled per hop, one entry per SAGEConv layer
BATCH_SIZE = 8
NUM_EPOCHS = 20

# Seed torch before the model is built so weight initialisation is repeatable.
torch.manual_seed(SEED)

# Training mask: rows in the training split that also carry a usable label.
# Without the label filter, "unknown" nodes (label 2) reach a 2-class loss,
# which raises a target-out-of-bounds error.
row_ids = torch.arange(data.num_nodes, device=data.y.device)
in_train_split = row_ids < int(TRAIN_FRACTION * data.num_nodes)
has_label = (data.y >= 0) & (data.y < NUM_CLASSES)
train_mask = in_train_split & has_label
data.train_mask = train_mask

if not bool(train_mask.any()):
    raise ValueError("no labelled nodes fall inside the training split")

# DataLoader for neighbourhood sampling. input_nodes restricts the seed nodes
# to the training mask, so every mini-batch has at least one node to learn from
# (an empty selection would make the loss NaN and corrupt the weights).
train_loader = NeighborLoader(
    data,
    num_neighbors=NUM_NEIGHBORS,
    batch_size=BATCH_SIZE,
    input_nodes=data.train_mask,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GraphSAGE(
    in_channels=data.num_features,
    hidden_channels=32,
    out_channels=NUM_CLASSES,
).to(device)
data = data.to(device)

# Define optimizer and loss function.
optimizer = Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
# GraphSAGE.forward already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss; CrossEntropyLoss would apply log_softmax again.
loss_fn = torch.nn.NLLLoss()


def train():
    """Run one training epoch over train_loader.

    Returns:
        The sum of the per-batch loss values. If NaN shows up in the model
        output, a warning is printed and the loss accumulated so far is
        returned without finishing the epoch.
    """
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)

        if torch.isnan(out).any():
            print("warning:NaN detected in output")
            return total_loss

        # NeighborLoader places the seed nodes first; the remaining rows are
        # sampled neighbours whose own neighbourhoods are truncated, so only
        # the first batch.batch_size rows contribute to the loss.
        num_seeds = batch.batch_size
        loss = loss_fn(out[:num_seeds], batch.y[:num_seeds])

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss


for epoch in range(NUM_EPOCHS):
    loss = train()
    print(f"epoch {epoch+1}, loss: {loss:.4f}")