## Data Exploration

In [1]:
import networkx as nx
import pandas as pd
import gc
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm
from torch_geometric.data import Data
from model.GCN import GCNClassifier
from model.GAT import GATClassifier
from model.SAGE import SAGEClassifier

In [2]:
edgelists = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
features = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_features.csv')
classes = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')

In [3]:
features.columns = ['txId'] + [f"V{i + 1}" for i in range(len(features.columns) - 1)]

In [4]:
print("Number of nodes:", len(features))
print("Number of edges:", len(edgelists))

Number of nodes: 203768
Number of edges: 234355


In [None]:
classes['class_mapped'] = classes['class'].replace({'1': 'illicit', '2': 'licit'})

percentage_distribution = round(100 * classes['class_mapped'].value_counts(normalize=True), 2)
class_counts = classes['class_mapped'].value_counts()

emoji_mapping = {
    'licit': '✅', 
    'illicit': '❌', 
    'unknown': '🤷'
}
classes['emoji'] = classes['class_mapped'].map(emoji_mapping)

classes_df = pd.DataFrame({
    'Class Mapped': classes['class_mapped'].unique(),
    'Class Raw': classes['class'].unique(),    
    'Counts': class_counts.values,
    'Percentage': percentage_distribution.values,
    'Emoji': [emoji_mapping[class_label] for class_label in classes['class_mapped'].unique()]
})

assert len(classes_df) == 3, "There should be 3 unique classes"
assert sum(classes_df['Counts']) == len(classes), "Total counts should match the number of rows in classes"

In [6]:
classes_df

Unnamed: 0,Class Mapped,Class Raw,Counts,Percentage,Emoji
0,unknown,unknown,157205,77.15,🤷
1,licit,2,42019,20.62,✅
2,illicit,1,4545,2.23,❌


## Loading preprocessed data

In [7]:
with torch.serialization.safe_globals([Data]):
    data = torch.load('data/elliptic_bitcoin_dataset/elliptic_data.pt', weights_only=False)

print("Data loaded successfully. Number of nodes:", data.num_nodes)

known_mask = (data.y != 2)
known_indices = torch.where(known_mask)[0]

# Shuffle and split into 80% train, 20% test
perm = known_indices[torch.randperm(len(known_indices))]
split_idx = int(0.8 * len(perm))

train_indices = perm[:split_idx]
test_indices = perm[split_idx:]

# Initialize boolean masks for all nodes
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

data.train_mask[train_indices] = True
data.test_mask[test_indices] = True

print(f"Train nodes: {data.train_mask.sum().item()}")
print(f"Test nodes: {data.test_mask.sum().item()}")

Data loaded successfully. Number of nodes: 203768
Train nodes: 37251
Test nodes: 9313


## Model Initialization

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_1 = GATClassifier(data.num_node_features, hidden_dim=64, num_classes=3).to(device)
model_2 = GCNClassifier(data.num_node_features, hidden_dim=64, num_classes=3).to(device)
model_3 = SAGEClassifier(data.num_node_features, hidden_dim=64, num_classes=3).to(device)
data = data.to(device)

## Model training

In [9]:
def process(model,
            lr: float = 1e-4,
            weight_decay: float = 5e-4,
            n_epochs: int = 512):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)
    n_epochs = 512

    model.train()
    for epoch in tqdm(range(n_epochs), desc="Training", unit="epoch"):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)
        preds = logits.argmax(dim=1)
        
        correct = preds[data.test_mask].eq(data.y[data.test_mask]).sum().item()
        acc = correct / data.test_mask.sum().item()
        
        print(f"Test Accuracy on unseen nodes: {acc * 100:.2f}%")

In [10]:
process(model_1)

Training:   0%|          | 0/512 [00:00<?, ?epoch/s]

Test Accuracy on unseen nodes: 95.47%


In [11]:
process(model_2)

Training:   0%|          | 0/512 [00:00<?, ?epoch/s]

Test Accuracy on unseen nodes: 96.33%


In [12]:
process(model_3)

Training:   0%|          | 0/512 [00:00<?, ?epoch/s]

Test Accuracy on unseen nodes: 96.82%
