In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch.nn import Linear
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional

In [11]:
classes_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# Preprocess the classes DataFrame
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

# Create a mapping for all nodes
all_nodes_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

# Create edges list with all nodes
edges_list = [
    (all_nodes_dict[edges['txId1'][i]], all_nodes_dict[edges['txId2'][i]])
    for i in tqdm(range(len(edges)))
    if edges['txId1'][i] in all_nodes_dict and edges['txId2'][i] in all_nodes_dict
]
edge_index = torch.tensor(edges_list, dtype=torch.long).T

# Convert node features and labels    Parameter to (V) choose features used
#node_features = torch.tensor(df.iloc[:, 1:].values, dtype=torch.float)
time_step = torch.tensor(df['time_step'].values, dtype=torch.float)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.iloc[:, 2:].values)  # Exclude txId and time_step

# Combine time_step back with scaled features
node_features = torch.cat((time_step.unsqueeze(1), torch.tensor(scaled_features, dtype=torch.float)), dim=1)

label_mapping = {'1': 0, '2': 1, 'unknown': -1} 
labels = torch.tensor(classes['label'].map(label_mapping).values, dtype=torch.long)

# Create graph data object
data = Data(x=node_features, edge_index=edge_index, y=labels)

100%|████████████████████████████████████████████████████████████████████████████| 234355/234355 [00:01<00:00, 121145.13it/s]


In [12]:
time_step_column_index = 0 
time_step = data.x[:, time_step_column_index]

train_mask = (time_step >= 1) & (time_step <= 34)
test_mask = (time_step >= 35) & (time_step <= 49)

train_mask = train_mask.clone().detach().to(torch.bool)
test_mask = test_mask.clone().detach().to(torch.bool)

In [16]:
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = 2  # Number of classes (2 in this case)
heads = 2
#Hyperparameters
embeddings_length = 128
lr = 0.02
weight_decay = 0.001
epochs = 251


class GAT(torch.nn.Module):
    def __init__(self, in_channels, embeddings_length, heads, num_classes):
        super().__init__()
        self.gat= GATConv(in_channels=in_channels, out_channels=embeddings_length, heads=heads)
        self.out = Linear(embeddings_length * heads, num_classes)
    
    def forward(self, x, edge_index):
        h = self.gat(x, edge_index)
        z = self.out(h)
        return h, z




# Initialize the model
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = 2  # Number of classes (2 in this case)
embeddings_length = 128
heads = 2
model = GAT(num_features, embeddings_length, heads, num_classes)
print(model)
print(num_classes)

# Loss function and optimizer

valid_labels = classes['label'].map(label_mapping)
valid_labels = valid_labels[valid_labels != -1]  # Exclude 'unknown'

# Compute class weights only for valid labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),  # Include only valid classes
    y=valid_labels
)

# Convert to a PyTorch tensor for use in the loss function
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Define the loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

#criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Accuracy calculation function
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

# Training loop
for epoch in range(epochs):
    optimizer.zero_grad()
    h, z = model(data.x, data.edge_index)  # h: embeddings, z: logits

    # Exclude unlabeled nodes from the loss calculation
    mask = data.y[train_mask] != -1  
    loss = criterion(z[train_mask][mask], data.y[train_mask][mask]) # Compute loss
    
    loss.backward()                         # Backpropagate
    optimizer.step()                        # Update model parameters
    
    if epoch % 10 == 0:
        acc = accuracy(z[train_mask].argmax(dim=1)[mask], data.y[train_mask][mask])  # Calculate accuracy
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')


GAT(
  (gat): GATConv(166, 128, heads=2)
  (out): Linear(in_features=256, out_features=2, bias=True)
)
2
Epoch   0 | Loss: 0.66 | Acc: 29.06%
Epoch  10 | Loss: 0.73 | Acc: 71.20%
Epoch  20 | Loss: 0.34 | Acc: 81.81%
Epoch  30 | Loss: 0.29 | Acc: 82.07%
Epoch  40 | Loss: 0.27 | Acc: 86.52%
Epoch  50 | Loss: 0.25 | Acc: 85.96%
Epoch  60 | Loss: 0.24 | Acc: 87.41%
Epoch  70 | Loss: 0.23 | Acc: 87.22%
Epoch  80 | Loss: 0.22 | Acc: 87.64%
Epoch  90 | Loss: 0.21 | Acc: 88.18%
Epoch 100 | Loss: 0.20 | Acc: 88.87%
Epoch 110 | Loss: 0.19 | Acc: 89.62%
Epoch 120 | Loss: 0.28 | Acc: 81.49%
Epoch 130 | Loss: 0.22 | Acc: 87.61%
Epoch 140 | Loss: 0.22 | Acc: 85.42%
Epoch 150 | Loss: 0.20 | Acc: 88.09%
Epoch 160 | Loss: 0.19 | Acc: 90.29%
Epoch 170 | Loss: 0.18 | Acc: 89.60%
Epoch 180 | Loss: 0.17 | Acc: 90.33%
Epoch 190 | Loss: 0.17 | Acc: 90.46%
Epoch 200 | Loss: 0.17 | Acc: 90.99%
Epoch 210 | Loss: 0.31 | Acc: 87.63%
Epoch 220 | Loss: 0.23 | Acc: 91.12%
Epoch 230 | Loss: 0.21 | Acc: 87.17%
Epoch 2

In [17]:
# Extract node embeddings
model.eval()
with torch.no_grad():
    embeddings, _ = model(data.x, data.edge_index)  # h: embeddings

# Ensure alignment of txId and labels with embeddings
# The order in `data.x` corresponds to `classes['txId']` due to how `all_nodes_dict` was built
aligned_df = pd.DataFrame({
    'txId': classes['txId'],  # Use the original node order
    'time_step': time_step.cpu().numpy(),  # Extract time_step from the GCN input
    'label': classes['label']  # Use the original labels
})

# Add embeddings
embeddings_df = pd.concat([aligned_df, pd.DataFrame(embeddings.cpu().numpy())], axis=1)

# Save to CSV
embeddings_df.to_csv('embeddings_gat.csv', index=False)