In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import random
from torch.nn import Linear


In [11]:
classes_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# Preprocess the classes DataFrame
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

# Create a mapping for all nodes
all_nodes_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

# Create edges list with all nodes
edges_list = [
    (all_nodes_dict[edges['txId1'][i]], all_nodes_dict[edges['txId2'][i]])
    for i in tqdm(range(len(edges)))
    if edges['txId1'][i] in all_nodes_dict and edges['txId2'][i] in all_nodes_dict
]
edge_index = torch.tensor(edges_list, dtype=torch.long).T

# Convert node features and labels    Parameter to (V) choose features used
node_features = torch.tensor(df.iloc[:, 1:].values, dtype=torch.float)
#edge_index = torch.tensor(filtered_edges_list.values.T, dtype=torch.int64)
label_mapping = {'1': 0, '2': 1, 'unknown': -1} 
labels = torch.tensor(classes['label'].map(label_mapping).values, dtype=torch.long)

# Create graph data object
data = Data(x=node_features, edge_index=edge_index, y=labels)
print(node_features)
edges = pd.read_csv(edges_path)

# Check the maximum index in the edges
print(f"Max edge index: {edges.max().max()}")
print(f"Number of nodes: {len(node_features)}")


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234355/234355 [00:01<00:00, 194178.36it/s]


tensor([[ 1.0000e+00, -1.7147e-01, -1.8467e-01,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 1.0000e+00, -1.7148e-01, -1.8467e-01,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 1.0000e+00, -1.7211e-01, -1.8467e-01,  ..., -1.8367e-01,
         -1.2061e-01, -1.1979e-01],
        ...,
        [ 4.9000e+01, -1.7201e-01, -7.8182e-02,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 4.9000e+01, -1.7284e-01, -1.7662e-01,  ..., -1.4060e-01,
          1.5197e+00,  1.5214e+00],
        [ 4.9000e+01, -1.2037e-02, -1.3228e-01,  ..., -1.4060e-01,
          1.5197e+00,  1.5214e+00]])
Max edge index: 203768
Number of nodes: 203769


In [12]:
# Define GCN model
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.gcn = GCNConv(num_features, 128)  # GCNConv layer, 3 output channels
        self.out = Linear(128, num_classes)    # Linear layer for classification output
    
    def forward(self, x, edge_index):
        h = self.gcn(x, edge_index).relu()   # Apply GCN and ReLU
        z = self.out(h)                      # Output layer
        return h, z

# Initialize the model
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = len(label_mapping)  # Number of classes (2 in this case)
model = GCN(num_features, num_classes)
print(model)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

# Accuracy calculation function
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

# Training loop
for epoch in range(201):
    optimizer.zero_grad()
    h, z = model(data.x, data.edge_index)  # h: embeddings, z: logits

    # Exclude unlabeled nodes from the loss calculation
    mask = data.y != -1  
    loss = criterion(z[mask], data.y[mask]) # Compute loss
    
    loss.backward()                         # Backpropagate
    optimizer.step()                        # Update model parameters
    
    if epoch % 10 == 0:
        acc = accuracy(z.argmax(dim=1)[mask], data.y[mask])  # Calculate accuracy
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')

GCN(
  (gcn): GCNConv(166, 128)
  (out): Linear(in_features=128, out_features=3, bias=True)
)
Epoch   0 | Loss: 2.01 | Acc: 14.45%
Epoch  10 | Loss: 0.45 | Acc: 90.27%
Epoch  20 | Loss: 0.26 | Acc: 90.43%
Epoch  30 | Loss: 0.19 | Acc: 92.66%
Epoch  40 | Loss: 0.16 | Acc: 94.02%
Epoch  50 | Loss: 0.14 | Acc: 95.30%
Epoch  60 | Loss: 0.12 | Acc: 95.83%
Epoch  70 | Loss: 0.11 | Acc: 96.17%
Epoch  80 | Loss: 0.10 | Acc: 96.45%
Epoch  90 | Loss: 0.09 | Acc: 96.78%
Epoch 100 | Loss: 0.09 | Acc: 96.99%
Epoch 110 | Loss: 0.08 | Acc: 97.18%
Epoch 120 | Loss: 0.08 | Acc: 97.28%
Epoch 130 | Loss: 0.07 | Acc: 97.53%
Epoch 140 | Loss: 0.07 | Acc: 97.61%
Epoch 150 | Loss: 0.06 | Acc: 97.76%
Epoch 160 | Loss: 0.06 | Acc: 97.90%
Epoch 170 | Loss: 0.06 | Acc: 97.95%
Epoch 180 | Loss: 0.06 | Acc: 98.09%
Epoch 190 | Loss: 0.05 | Acc: 98.19%
Epoch 200 | Loss: 0.05 | Acc: 98.29%


In [13]:
print(h.shape)
# Estrazione degli embeddings dopo aver addestrato la GCN
model.eval()
with torch.no_grad():
    embeddings, _ = model(data.x, data.edge_index)  # h: embeddings

# Converti gli embeddings in un DataFrame Pandas
embeddings_df = pd.DataFrame(embeddings.cpu().numpy())

# Salva gli embeddings su un file CSV
embeddings_df.to_csv('embeddings.csv', index=False)

torch.Size([203769, 128])
