In [13]:
import pandas as pd
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import random
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch.nn import Linear



In [14]:
classes_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/modified_elliptic_txs_features.csv"

classes = pd.read_csv(classes_path)
edges = pd.read_csv(edges_path)
feat_cols = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# Preprocess the classes DataFrame
classes.columns = ['txId', 'label']
df = classes.set_index('txId').join(feats.set_index('txId'))

# Filter out 'unknown' labels
df_filtered = df[df['label'] != 'unknown']
filtered_classes = classes[classes['label'] != 'unknown']

filtered_edges_dict = {tx_id: i for i, tx_id in enumerate(filtered_classes['txId'])}

filtered_edges_list = [
    (filtered_edges_dict[edges['txId1'][i]], filtered_edges_dict[edges['txId2'][i]])
    for i in tqdm(range(len(edges)))
    if edges['txId1'][i] in filtered_classes['txId'].values and
       edges['txId2'][i] in filtered_classes['txId'].values
]
filtered_edge_index = torch.tensor(filtered_edges_list, dtype=torch.long).T

# Convert node features and labels    Parameter to (V) choose features used
node_features = torch.tensor(df_filtered.iloc[:, 1:94].values, dtype=torch.float)
#edge_index = torch.tensor(filtered_edges_list.values.T, dtype=torch.int64)
label_mapping = {'1': 0, '2': 1} 
labels = torch.tensor(filtered_classes['label'].map(label_mapping).values, dtype=torch.long)

# Create graph data object
data = Data(x=node_features, edge_index=filtered_edge_index, y=labels)
print(node_features)
edges = pd.read_csv(edges_path)

# Check the maximum index in the edges
print(f"Max edge index: {edges.max().max()}")
print(f"Number of nodes: {len(node_features)}")


100%|██████████| 234355/234355 [00:04<00:00, 56936.69it/s]

tensor([[ 1.0000e+00,  1.6305e-01,  1.9638e+00,  ..., -6.9424e-01,
          2.0847e+00,  2.5308e-02],
        [ 1.0000e+00, -5.0271e-03,  5.7894e-01,  ..., -6.9424e-01,
          2.0847e+00,  2.5308e-02],
        [ 1.0000e+00, -1.4785e-01, -1.8467e-01,  ..., -6.3365e-01,
         -6.7628e-01, -1.0849e+00],
        ...,
        [ 4.9000e+01, -1.7041e-01, -7.8164e-02,  ..., -6.9399e-01,
         -7.2078e-01, -1.0849e+00],
        [ 4.9000e+01, -9.3732e-02, -1.1616e-01,  ...,  1.7883e+00,
          1.8226e+00,  1.1355e+00],
        [ 4.9000e+01, -1.7201e-01, -7.8182e-02,  ..., -5.2008e-01,
         -5.9215e-01,  1.1355e+00]])
Max edge index: 203768
Number of nodes: 46564





In [15]:
# Define GCN model
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.gcn = GCNConv(num_features, 128)  # GCNConv layer, 3 output channels
        self.out = Linear(128, num_classes)    # Linear layer for classification output
    
    def forward(self, x, edge_index):
        h = self.gcn(x, edge_index).relu()   # Apply GCN and ReLU
        z = self.out(h)                      # Output layer
        return h, z

# Initialize the model
num_features = data.x.shape[1]  # Number of features (columns in x)
num_classes = len(label_mapping)  # Number of classes (2 in this case)
model = GCN(num_features, num_classes)
print(model)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

# Accuracy calculation function
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

# Training loop
for epoch in range(201):
    optimizer.zero_grad()
    h, z = model(data.x, data.edge_index)  # h: embeddings, z: logits
    loss = criterion(z, data.y)            # Compute loss
    loss.backward()                        # Backpropagate
    optimizer.step()                       # Update model parameters
    
    if epoch % 1== 0:
        acc = accuracy(z.argmax(dim=1), data.y)  # Calculate accuracy
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')

GCN(
  (gcn): GCNConv(93, 128)
  (out): Linear(in_features=128, out_features=2, bias=True)
)
Epoch   0 | Loss: 2.78 | Acc: 10.01%
Epoch   1 | Loss: 0.55 | Acc: 90.23%
Epoch   2 | Loss: 0.91 | Acc: 90.24%
Epoch   3 | Loss: 1.07 | Acc: 90.24%
Epoch   4 | Loss: 1.08 | Acc: 90.24%
Epoch   5 | Loss: 1.02 | Acc: 90.24%
Epoch   6 | Loss: 0.92 | Acc: 90.24%
Epoch   7 | Loss: 0.80 | Acc: 90.24%
Epoch   8 | Loss: 0.69 | Acc: 90.24%
Epoch   9 | Loss: 0.59 | Acc: 90.24%
Epoch  10 | Loss: 0.49 | Acc: 90.24%
Epoch  11 | Loss: 0.42 | Acc: 90.24%
Epoch  12 | Loss: 0.37 | Acc: 90.24%
Epoch  13 | Loss: 0.33 | Acc: 90.24%
Epoch  14 | Loss: 0.31 | Acc: 90.27%
Epoch  15 | Loss: 0.29 | Acc: 90.28%
Epoch  16 | Loss: 0.28 | Acc: 90.25%
Epoch  17 | Loss: 0.27 | Acc: 90.24%
Epoch  18 | Loss: 0.26 | Acc: 90.25%
Epoch  19 | Loss: 0.25 | Acc: 90.25%
Epoch  20 | Loss: 0.24 | Acc: 90.25%
Epoch  21 | Loss: 0.23 | Acc: 90.28%
Epoch  22 | Loss: 0.23 | Acc: 90.31%
Epoch  23 | Loss: 0.22 | Acc: 90.31%
Epoch  24 | Loss: 0

In [16]:
print(h.shape)
# Estrazione degli embeddings dopo aver addestrato la GCN
model.eval()
with torch.no_grad():
    embeddings, _ = model(data.x, data.edge_index)  # h: embeddings

# Converti gli embeddings in un DataFrame Pandas
embeddings_df = pd.DataFrame(embeddings.cpu().numpy())

# Salva gli embeddings su un file CSV
embeddings_df.to_csv('embeddings.csv', index=False)

torch.Size([46564, 128])
