In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GINConv
from torch.nn import Linear, LeakyReLU, Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional


In [6]:
classes_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# Preprocess the classes DataFrame
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

# Create a mapping for all nodes
all_nodes_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

# Create edges list with all nodes
edges_list = [
    (all_nodes_dict[edges['txId1'][i]], all_nodes_dict[edges['txId2'][i]])
    for i in tqdm(range(len(edges)))
    if edges['txId1'][i] in all_nodes_dict and edges['txId2'][i] in all_nodes_dict
]
edge_index = torch.tensor(edges_list, dtype=torch.long).T

# Convert node features and labels    Parameter to (V) choose features used
#node_features = torch.tensor(df.iloc[:, 1:].values, dtype=torch.float)
time_step = torch.tensor(df['time_step'].values, dtype=torch.float)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.iloc[:, 2:].values)  # Exclude txId and time_step

# Combine time_step back with scaled features
node_features = torch.cat((time_step.unsqueeze(1), torch.tensor(scaled_features, dtype=torch.float)), dim=1)

label_mapping = {'1': 0, '2': 1, 'unknown': -1} 
labels = torch.tensor(classes['label'].map(label_mapping).values, dtype=torch.long)

# Create graph data object
data = Data(x=node_features, edge_index=edge_index, y=labels)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
data = data.to(device)


100%|██████████| 234355/234355 [00:02<00:00, 102818.58it/s]


cpu


In [7]:
time_step_column_index = 0 
time_step = data.x[:, time_step_column_index]

train_mask = (time_step >= 1) & (time_step <= 34)
test_mask = (time_step >= 35) & (time_step <= 49)

train_mask = train_mask.clone().detach().to(torch.bool)
test_mask = test_mask.clone().detach().to(torch.bool)

In [8]:
#Hyperparameters
# embeddings length = 128
# Leaky ReLU
# lr = 0.02
# weight_decay = 0.001
# epochs = 251

embeddings_length = 128
lr = 0.02
weight_decay = 0.001
epochs = 201


# Define model
class GIN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.mlp = Sequential(
            Linear(num_features, embeddings_length*2),  
            LeakyReLU(negative_slope=0.2), 
            Linear(embeddings_length*2, embeddings_length)  
        )
        self.gin = GINConv(self.mlp)
        self.out = Linear(embeddings_length, num_classes)
    
    def forward(self, x, edge_index):
        h = torch.nn.functional.leaky_relu(self.gin(x, edge_index))
        z = self.out(h)
        
        return h, z


# Initialize the model
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = 2  # Number of classes (2 in this case)
model = GIN(num_features, num_classes)
model.to(device)
print(model)
print(num_classes)

# Loss function and optimizer

valid_labels = classes['label'].map(label_mapping)
valid_labels = valid_labels[valid_labels != -1]  # Exclude 'unknown'

# Compute class weights only for valid labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),  # Include only valid classes
    y=valid_labels
)

# Convert to a PyTorch tensor for use in the loss function
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# Define the loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor).to(device)

#criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Accuracy calculation function
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

# Training loop
for epoch in range(epochs):
    optimizer.zero_grad()
    h, z = model(data.x, data.edge_index)  # h: embeddings, z: logits

    # Exclude unlabeled nodes from the loss calculation
    mask = data.y[train_mask] != -1  
    loss = criterion(z[train_mask][mask], data.y[train_mask][mask]) # Compute loss
    
    loss.backward()                         # Backpropagate
    optimizer.step()                        # Update model parameters
    
    if epoch % 10 == 0:
        acc = accuracy(z[train_mask].argmax(dim=1)[mask], data.y[train_mask][mask])  # Calculate accuracy
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')

GIN(
  (mlp): Sequential(
    (0): Linear(in_features=166, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (gin): GINConv(nn=Sequential(
    (0): Linear(in_features=166, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Linear(in_features=256, out_features=128, bias=True)
  ))
  (out): Linear(in_features=128, out_features=2, bias=True)
)
2
Epoch   0 | Loss: 0.79 | Acc: 15.91%
Epoch  10 | Loss: 0.53 | Acc: 76.89%
Epoch  20 | Loss: 0.33 | Acc: 83.11%
Epoch  30 | Loss: 0.29 | Acc: 83.87%
Epoch  40 | Loss: 0.25 | Acc: 88.42%
Epoch  50 | Loss: 0.23 | Acc: 86.48%
Epoch  60 | Loss: 0.20 | Acc: 90.50%
Epoch  70 | Loss: 0.18 | Acc: 92.76%
Epoch  80 | Loss: 0.21 | Acc: 86.41%
Epoch  90 | Loss: 0.20 | Acc: 85.81%
Epoch 100 | Loss: 0.20 | Acc: 89.42%
Epoch 110 | Loss: 0.16 | Acc: 91.73%
Epoch 120 | Loss: 0.14 | Acc: 94.19%
Epoch 130 | Loss: 0.15 | Acc: 96.10%
Epoch 140 | Loss: 0.18 | 

In [9]:
# Extract node embeddings
model.eval()
with torch.no_grad():
    embeddings, _ = model(data.x, data.edge_index)  # h: embeddings

# Ensure alignment of txId and labels with embeddings
# The order in `data.x` corresponds to `classes['txId']` due to how `all_nodes_dict` was built
aligned_df = pd.DataFrame({
    'txId': classes['txId'],  # Use the original node order
    'time_step': time_step.cpu().numpy(),  # Extract time_step from the GCN input
    'label': classes['label']  # Use the original labels
})

# Add embeddings
embeddings_df = pd.concat([aligned_df, pd.DataFrame(embeddings.cpu().numpy())], axis=1)

# Save to CSV
embeddings_df.to_csv('../data/embeddings_gin.csv', index=False)
