In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
import pickle
import copy
import numpy as np


In [2]:
# load G_dgl
# read G_dgl graph
import pickle as pkl
with open('/home/qian/HNE/Model/GCN/Ethereum/G_dgl.pkl', 'rb') as f:
    G_dgl = pkl.load(f)

In [3]:
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, out_feats, dropout_rate):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, hidden_size)  # added layer
        self.conv3 = GraphConv(hidden_size, out_feats)  # final layer
        self.dropout = nn.Dropout(dropout_rate)  # dropout layer
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)  # batchnorm layer

    def forward(self, g, features):
        x = F.relu(self.conv1(g, features))
        x = self.dropout(x)  # apply dropout
        x = self.batchnorm1(x)  # apply batchnorm
        x = F.relu(self.conv2(g, x))
        x = self.dropout(x)  # apply dropout
        x = self.conv3(g, x)
        return x

In [4]:
# store all edge_indices in separate files
import pickle as pkl
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_train_edge_indices.pkl', 'rb') as f:
    positive_train_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_train_edge_indices.pkl', 'rb') as f:
    negative_train_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_validation_edge_indices.pkl', 'rb') as f:
    positive_validation_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_validation_edge_indices.pkl', 'rb') as f:
    negative_validation_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_test_edge_indices.pkl', 'rb') as f:
    positive_test_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_test_edge_indices.pkl', 'rb') as f:
    negative_test_edge_indices = pkl.load(f)

In [5]:
# get train, validation, test set
def generate_edge_embeddings(h, edges):
    # Extract the source and target node indices from the edges
    src, dst = edges[0], edges[1]
    
    # Use the node indices to get the corresponding node embeddings
    src_embed = h[src]
    dst_embed = h[dst]

    # Concatenate the source and target node embeddings
    edge_embs = torch.cat([src_embed, dst_embed], dim=1)

    return edge_embs

In [7]:
# write a five loop to get the result and document them
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import copy
linear = nn.Linear(256, 1)
for i in range(5):
    model = GCN(8, 128, 128, 0.1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()
    best_val_loss = float('inf')
    best_model = None
    num_epochs = 200
    patience = 30
    early_stopping_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        
        # forward pass
        logits = model(G_dgl, G_dgl.ndata['normalized_log_features'])
        
        # generate edge embeddings
        pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
        neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)
        
        # concatenete positive and negative edge embeddings
        train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
        train_edge_labels = torch.cat([torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])], dim=0).unsqueeze(1)
        
        # print shapes of tensors for debugging
        # print(f"Train Edge Embeddings Shape: {train_edge_embs.shape}")
        # print(f"Train Edge Labels Shape: {train_edge_labels.shape}")
        
        # calculate loss
        loss = criterion(linear(train_edge_embs), train_edge_labels)
        print(f"Training Loss: {loss.item()}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        # validation
        model.eval()
        
        with torch.no_grad():
            # repeat the same process as above for validation samples
            logits = model(G_dgl, G_dgl.ndata['normalized_log_features'].float())
            pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
            neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
            val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
            val_edge_labels = torch.cat([torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])], dim=0).unsqueeze(1)
            # print shapes of tensors for debugging
            # print(f"Validation Edge Embeddings Shape: {val_edge_embs.shape}")
            # print(f"Validation Edge Labels Shape: {val_edge_labels.shape}")

            val_loss = criterion(linear(val_edge_embs), val_edge_labels)
            print(f"Validation Loss: {val_loss.item()}")
            
            # early stopping based on validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
                # save the best model
                best_model = copy.deepcopy(model)
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= patience:
                    print('early stopping due to validation loss not improving')
                    break
                
    # switch to evaluation mode
    best_model.eval()

    with torch.no_grad():
        # generate the embeddings using the best model
        logits = best_model(G_dgl, G_dgl.ndata['normalized_log_features'].float())

        # generate edge embeddings for the test samples
        pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
        neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

        # concatenate the positive and negative edge embeddings and labels
        test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
        test_edge_labels = torch.cat([torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])], dim=0)


        # test_loss = criterion(linear(test_edge_embs), val_edge_labels)
        # calculate predictions using the linear layer
        
        predictions = torch.sigmoid(linear(test_edge_embs))
        
        # reshape the predictions and the labels
        predictions = predictions.view(-1).cpu().numpy()
        test_edge_labels = test_edge_labels.cpu().numpy()

        # calculate scores and entropyloss
        
        
        auc = roc_auc_score(test_edge_labels, predictions)
        predictions_binary = (predictions > 0.5).astype(int)
        f1 = f1_score(test_edge_labels, predictions_binary)
        precision = precision_score(test_edge_labels, predictions_binary)
        recall = recall_score(test_edge_labels, predictions_binary)
        accuracy = accuracy_score(test_edge_labels, predictions_binary)

        print(f"AUC: {auc}")
        print(f"F1 Score: {f1}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"Accuracy: {accuracy}")
    # print accuracy, f1, precision, recall, auc-roc
    # print(f"Test Loss: {test_loss.item()}")
        with open('results_wo_twitter.txt', 'a') as f:
            f.write(f"AUC: {auc}\n")
            f.write(f"F1 Score: {f1}\n")
            f.write(f"Precision: {precision}\n")
            f.write(f"Recall: {recall}\n")
            f.write(f"Accuracy: {accuracy}\n")
            f.write('\n')
    

  assert input.numel() == input.storage().size(), (


Training Loss: 0.9696251749992371
Validation Loss: 0.6919221878051758
Training Loss: 1.0484257936477661
Validation Loss: 0.6920235753059387
Training Loss: 1.0569833517074585
Validation Loss: 0.6915087103843689
Training Loss: 1.0829960107803345
Validation Loss: 0.6908208727836609
Training Loss: 0.9845759272575378
Validation Loss: 0.6899517178535461
Training Loss: 0.8065875768661499
Validation Loss: 0.6893700361251831
Training Loss: 0.6759758591651917
Validation Loss: 0.6893088817596436
Training Loss: 0.7726309895515442
Validation Loss: 0.6891878247261047
Training Loss: 0.8154413104057312
Validation Loss: 0.6889891028404236
Training Loss: 0.7199773788452148
Validation Loss: 0.6890242695808411
Training Loss: 0.6711159348487854
Validation Loss: 0.6892832517623901
Training Loss: 0.7339988946914673
Validation Loss: 0.6891583204269409
Training Loss: 0.7431488037109375
Validation Loss: 0.6885975003242493
Training Loss: 0.725646436214447
Validation Loss: 0.6878105401992798
Training Loss: 0.6517

  assert input.numel() == input.storage().size(), (


Training Loss: 1.5299779176712036
Validation Loss: 0.6912275552749634
Training Loss: 0.9839853048324585
Validation Loss: 0.6887030601501465
Training Loss: 0.8305760025978088
Validation Loss: 0.6891153454780579
Training Loss: 0.9529653787612915
Validation Loss: 0.6891239285469055
Training Loss: 0.9746603965759277
Validation Loss: 0.6894117593765259
Training Loss: 0.9358487725257874
Validation Loss: 0.6899793744087219
Training Loss: 0.8992177844047546
Validation Loss: 0.6904220581054688
Training Loss: 0.8597847819328308
Validation Loss: 0.6907200813293457
Training Loss: 0.7212643623352051
Validation Loss: 0.6913257837295532
Training Loss: 0.7396906614303589
Validation Loss: 0.6915829181671143
Training Loss: 0.7506247162818909
Validation Loss: 0.6911800503730774
Training Loss: 0.7728770971298218
Validation Loss: 0.6900342702865601
Training Loss: 0.7572342753410339
Validation Loss: 0.6887076497077942
Training Loss: 0.7273079752922058
Validation Loss: 0.6878450512886047
Training Loss: 0.740

  assert input.numel() == input.storage().size(), (


Training Loss: 1.0600279569625854
Validation Loss: 0.6915921568870544
Training Loss: 0.9211241006851196
Validation Loss: 0.6902446746826172
Training Loss: 1.0527554750442505
Validation Loss: 0.689483642578125
Training Loss: 1.034454584121704
Validation Loss: 0.6884207725524902
Training Loss: 0.9382619261741638
Validation Loss: 0.6878199577331543
Training Loss: 0.8229954242706299
Validation Loss: 0.6876899003982544
Training Loss: 0.7243620753288269
Validation Loss: 0.6886441707611084
Training Loss: 0.8305832147598267
Validation Loss: 0.6894184350967407
Training Loss: 0.8929542303085327
Validation Loss: 0.6895046830177307
Training Loss: 0.8533904552459717
Validation Loss: 0.689277708530426
Training Loss: 0.8412970304489136
Validation Loss: 0.6888523697853088
Training Loss: 0.7725631594657898
Validation Loss: 0.6883193254470825
Training Loss: 0.7056075930595398
Validation Loss: 0.6880429983139038
Training Loss: 0.7014429569244385
Validation Loss: 0.6883617043495178
Training Loss: 0.761478

  assert input.numel() == input.storage().size(), (


Training Loss: 1.1993770599365234
Validation Loss: 0.6910296082496643
Training Loss: 0.7765955328941345
Validation Loss: 0.6930341124534607
Training Loss: 0.9820594191551208
Validation Loss: 0.692314088344574
Training Loss: 1.0020872354507446
Validation Loss: 0.6906334757804871
Training Loss: 0.8957234621047974
Validation Loss: 0.6895983815193176
Training Loss: 0.7601326107978821
Validation Loss: 0.6900593042373657
Training Loss: 0.7370398640632629
Validation Loss: 0.6904457807540894
Training Loss: 0.7787874937057495
Validation Loss: 0.6903923749923706
Training Loss: 0.807474672794342
Validation Loss: 0.6901775002479553
Training Loss: 0.764512300491333
Validation Loss: 0.6898587346076965
Training Loss: 0.6895157098770142
Validation Loss: 0.6897538304328918
Training Loss: 0.6690640449523926
Validation Loss: 0.6895898580551147
Training Loss: 0.7850762009620667
Validation Loss: 0.6889422535896301
Training Loss: 0.7131357192993164
Validation Loss: 0.6884966492652893
Training Loss: 0.709284

  assert input.numel() == input.storage().size(), (


Training Loss: 1.0443605184555054
Validation Loss: 0.700945258140564
Training Loss: 1.097896933555603
Validation Loss: 0.6974138021469116
Training Loss: 0.9079899787902832
Validation Loss: 0.6937010288238525
Training Loss: 0.7286074161529541
Validation Loss: 0.6915360689163208
Training Loss: 0.7376694679260254
Validation Loss: 0.6904283165931702
Training Loss: 0.8052614331245422
Validation Loss: 0.6905869245529175
Training Loss: 0.8060413599014282
Validation Loss: 0.6920697093009949
Training Loss: 0.7098308801651001
Validation Loss: 0.6931737065315247
Training Loss: 0.771602988243103
Validation Loss: 0.6929563283920288
Training Loss: 0.7238679528236389
Validation Loss: 0.6920320391654968
Training Loss: 0.7217649817466736
Validation Loss: 0.6920336484909058
Training Loss: 0.786857008934021
Validation Loss: 0.6929089426994324
Training Loss: 0.6977556347846985
Validation Loss: 0.6946346759796143
Training Loss: 0.6977478265762329
Validation Loss: 0.6977673172950745
Training Loss: 0.7764358