In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch.nn import Linear
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional

In [12]:
classes_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# Preprocess the classes DataFrame
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

# Create a mapping for all nodes
all_nodes_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

# Create edges list with all nodes
edges_list = [
    (all_nodes_dict[edges['txId1'][i]], all_nodes_dict[edges['txId2'][i]])
    for i in tqdm(range(len(edges)))
    if edges['txId1'][i] in all_nodes_dict and edges['txId2'][i] in all_nodes_dict
]
edge_index = torch.tensor(edges_list, dtype=torch.long).T

# Convert node features and labels    Parameter to (V) choose features used
#node_features = torch.tensor(df.iloc[:, 1:].values, dtype=torch.float)
time_step = torch.tensor(df['time_step'].values, dtype=torch.float)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.iloc[:, 2:].values)  # Exclude txId and time_step

# Combine time_step back with scaled features
node_features = torch.cat((time_step.unsqueeze(1), torch.tensor(scaled_features, dtype=torch.float)), dim=1)

label_mapping = {'1': 0, '2': 1, 'unknown': -1} 
labels = torch.tensor(classes['label'].map(label_mapping).values, dtype=torch.long)

# Create graph data object
data = Data(x=node_features, edge_index=edge_index, y=labels)


100%|██████████| 234355/234355 [00:01<00:00, 121538.41it/s]


In [13]:
time_step_column_index = 0 
time_step = data.x[:, time_step_column_index]

train_mask = (time_step >= 1) & (time_step <= 34)
test_mask = (time_step >= 35) & (time_step <= 49)

train_mask = train_mask.clone().detach().to(torch.bool)
test_mask = test_mask.clone().detach().to(torch.bool)

In [14]:
#Hyperparameters
# embeddings length = 128
# Leaky ReLU
# lr = 0.02
# weight_decay = 0.001
# epochs = 251
# Precision node embeddings (GCN): 0.8458498023715415
# Recall node embeddings (GCN): 0.19759926131117267
# F1 node embeddings (GCN): 0.3203592814371258
# Cross-validated F1 Score node embeddings (GCN): 0.500450874236994
# Precision embeddings + all features: 0.9718670076726342
# Recall embeddings + all features: 0.7017543859649122
# F1 embeddings + all features: 0.8150134048257373
# Cross-validated F1 Score embeddings + all features: 0.6736564758872677

embeddings_length = 128
lr = 0.02
weight_decay = 0.001
epochs = 251


# Define GCN model
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.gcn = GCNConv(num_features, embeddings_length)  # GCNConv layer, 3 output channels
        self.out = Linear(embeddings_length, num_classes)    # Linear layer for classification output
    
    def forward(self, x, edge_index):
        h = torch.nn.functional.leaky_relu(self.gcn(x, edge_index))   # Apply GCN and ReLU
        z = self.out(h)                      # Output layer
        return h, z


# Initialize the model
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = 2  # Number of classes (2 in this case)
model = GCN(num_features, num_classes)
print(model)
print(num_classes)

# Loss function and optimizer

valid_labels = classes['label'].map(label_mapping)
valid_labels = valid_labels[valid_labels != -1]  # Exclude 'unknown'

# Compute class weights only for valid labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),  # Include only valid classes
    y=valid_labels
)

# Convert to a PyTorch tensor for use in the loss function
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Define the loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

#criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Accuracy calculation function
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

# Training loop
for epoch in range(epochs):
    optimizer.zero_grad()
    h, z = model(data.x, data.edge_index)  # h: embeddings, z: logits

    # Exclude unlabeled nodes from the loss calculation
    mask = data.y[train_mask] != -1  
    loss = criterion(z[train_mask][mask], data.y[train_mask][mask]) # Compute loss
    
    loss.backward()                         # Backpropagate
    optimizer.step()                        # Update model parameters
    
    if epoch % 10 == 0:
        acc = accuracy(z[train_mask].argmax(dim=1)[mask], data.y[train_mask][mask])  # Calculate accuracy
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')

GCN(
  (gcn): GCNConv(166, 128)
  (out): Linear(in_features=128, out_features=2, bias=True)
)
2
Epoch   0 | Loss: 0.75 | Acc: 27.19%
Epoch  10 | Loss: 0.41 | Acc: 87.78%
Epoch  20 | Loss: 0.29 | Acc: 86.39%
Epoch  30 | Loss: 0.24 | Acc: 86.52%
Epoch  40 | Loss: 0.20 | Acc: 89.00%
Epoch  50 | Loss: 0.17 | Acc: 90.60%
Epoch  60 | Loss: 0.15 | Acc: 91.41%
Epoch  70 | Loss: 0.13 | Acc: 93.13%
Epoch  80 | Loss: 0.13 | Acc: 92.17%
Epoch  90 | Loss: 0.12 | Acc: 94.33%
Epoch 100 | Loss: 0.11 | Acc: 94.98%
Epoch 110 | Loss: 0.11 | Acc: 94.92%
Epoch 120 | Loss: 0.10 | Acc: 95.03%
Epoch 130 | Loss: 0.10 | Acc: 93.80%
Epoch 140 | Loss: 0.10 | Acc: 96.40%
Epoch 150 | Loss: 0.12 | Acc: 91.84%
Epoch 160 | Loss: 0.10 | Acc: 95.08%
Epoch 170 | Loss: 0.09 | Acc: 95.95%
Epoch 180 | Loss: 0.09 | Acc: 95.98%
Epoch 190 | Loss: 0.08 | Acc: 95.85%
Epoch 200 | Loss: 0.08 | Acc: 96.16%
Epoch 210 | Loss: 0.08 | Acc: 96.08%
Epoch 220 | Loss: 0.08 | Acc: 95.15%
Epoch 230 | Loss: 0.08 | Acc: 94.57%
Epoch 240 | Loss

In [15]:
# Extract node embeddings
model.eval()
with torch.no_grad():
    embeddings, _ = model(data.x, data.edge_index)  # h: embeddings

# Ensure alignment of txId and labels with embeddings
# The order in `data.x` corresponds to `classes['txId']` due to how `all_nodes_dict` was built
aligned_df = pd.DataFrame({
    'txId': classes['txId'],  # Use the original node order
    'time_step': time_step.cpu().numpy(),  # Extract time_step from the GCN input
    'label': classes['label']  # Use the original labels
})

# Add embeddings
embeddings_df = pd.concat([aligned_df, pd.DataFrame(embeddings.cpu().numpy())], axis=1)

# Save to CSV
embeddings_df.to_csv('embeddings_gcn.csv', index=False)
