In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
import pickle
import copy
import numpy as np


In [2]:
import pickle as pkl
# read G_dgl_with_twitter_converted.pkl
with open('/home/qian/HNE/Model/GCN/Ethereum/G_dgl_with_twitter_features_converted.pkl', 'rb') as f:
    G_dgl_with_twitter_features_converted = pkl.load(f)

In [3]:
# again print some examples
print(G_dgl_with_twitter_features_converted.nodes[0].data['combine_normalized_pca_8_twitter_features'])

tensor([[ 1.0142e-01,  6.7766e-02,  6.7329e-02,  1.0352e-01,  6.7766e-02,
          6.7329e-02,  7.1084e-02,  4.1487e-01, -9.9902e-01,  1.0765e-02,
          4.2618e-02,  5.1163e-03,  2.6459e-04, -7.2428e-04, -5.4747e-04,
         -2.2076e-03]], dtype=torch.float64)


In [4]:
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, out_feats, dropout_rate):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, hidden_size)  
        self.conv3 = GraphConv(hidden_size, out_feats)  
        self.dropout = nn.Dropout(dropout_rate)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size) 

    def forward(self, g, features):
        x = F.relu(self.conv1(g, features))
        x = self.dropout(x)  
        x = self.batchnorm1(x)
        x = F.relu(self.conv2(g, x))
        x = self.dropout(x)
        # x = self.batchnorm1(x)
        x = self.conv3(g, x)
        return x


In [5]:
# store all edge_indices in separate files
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_train_edge_indices.pkl', 'rb') as f:
    positive_train_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_train_edge_indices.pkl', 'rb') as f:
    negative_train_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_validation_edge_indices.pkl', 'rb') as f:
    positive_validation_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_validation_edge_indices.pkl', 'rb') as f:
    negative_validation_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/positive_test_edge_indices.pkl', 'rb') as f:
    positive_test_edge_indices = pkl.load(f)
    
with open('/home/qian/HNE/Model/GCN/Ethereum/negative_test_edge_indices.pkl', 'rb') as f:
    negative_test_edge_indices = pkl.load(f)

In [6]:
def generate_edge_embeddings(h, edges):
    # Extract the source and target node indices from the edges
    src, dst = edges[0], edges[1]
    
    # Use the node indices to get the corresponding node embeddings
    src_embed = h[src]
    dst_embed = h[dst]

    # Concatenate the source and target node embeddings
    edge_embs = torch.cat([src_embed, dst_embed], dim=1)

    return edge_embs

In [7]:
# write a five loop to get the result and document them
import copy

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
linear = (
    nn.Linear(256, 1)
)

for i in range(5):
    model = GCN(16, 128, 128, 0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-4)
    criterion = nn.BCEWithLogitsLoss()
    best_val_loss = float('inf')
    best_model = None
    num_epochs = 200
    patience = 30
    early_stopping_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        
        # forward pass
        logits = model(G_dgl_with_twitter_features_converted, G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float())
        
        # generate edge embeddings
        pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
        neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)
        
        # concatenete positive and negative edge embeddings
        train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
        train_edge_labels = torch.cat([torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])], dim=0).unsqueeze(1)
        
        # print shapes of tensors for debugging
        # print(f"Train Edge Embeddings Shape: {train_edge_embs.shape}")
        # print(f"Train Edge Labels Shape: {train_edge_labels.shape}")
        
        # calculate loss
        loss = criterion(linear(train_edge_embs), train_edge_labels)
        print(f"Training Loss: {loss.item()}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        # validation
        model.eval()
        
        with torch.no_grad():
            # repeat the same process as above for validation samples
            logits = model(G_dgl_with_twitter_features_converted, G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float())
            pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
            neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
            val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
            val_edge_labels = torch.cat([torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])], dim=0).unsqueeze(1)
            # print shapes of tensors for debugging
            # print(f"Validation Edge Embeddings Shape: {val_edge_embs.shape}")
            # print(f"Validation Edge Labels Shape: {val_edge_labels.shape}")

            val_loss = criterion(linear(val_edge_embs), val_edge_labels)
            print(f"Validation Loss: {val_loss.item()}")
            
            # early stopping based on validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
                # save the best model
                best_model = copy.deepcopy(model)
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= patience:
                    print('early stopping due to validation loss not improving')
                    break
                
    # switch to evaluation mode
    best_model.eval()

    with torch.no_grad():
        # generate the embeddings using the best model
        logits = best_model(G_dgl_with_twitter_features_converted, G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float())

        # generate edge embeddings for the test samples
        pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
        neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

        # concatenate the positive and negative edge embeddings and labels
        test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
        test_edge_labels = torch.cat([torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])], dim=0)


        # test_loss = criterion(linear(test_edge_embs), val_edge_labels)
        # calculate predictions using the linear layer
        
        predictions = torch.sigmoid(linear(test_edge_embs))
        
        # reshape the predictions and the labels
        predictions = predictions.view(-1).cpu().numpy()
        test_edge_labels = test_edge_labels.cpu().numpy()

        # calculate scores and entropyloss
        
        
        auc = roc_auc_score(test_edge_labels, predictions)
        predictions_binary = (predictions > 0.5).astype(int)
        f1 = f1_score(test_edge_labels, predictions_binary)
        precision = precision_score(test_edge_labels, predictions_binary)
        recall = recall_score(test_edge_labels, predictions_binary)
        accuracy = accuracy_score(test_edge_labels, predictions_binary)

        print(f"AUC: {auc}")
        print(f"F1 Score: {f1}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"Accuracy: {accuracy}")
    # print accuracy, f1, precision, recall, auc-roc
    # print(f"Test Loss: {test_loss.item()}")
        with open('results_with_twitter.txt', 'a') as f:
            f.write(f"AUC: {auc}\n")
            f.write(f"F1 Score: {f1}\n")
            f.write(f"Precision: {precision}\n")
            f.write(f"Recall: {recall}\n")
            f.write(f"Accuracy: {accuracy}\n")
            
            f.write('\n')
    

  assert input.numel() == input.storage().size(), (


Training Loss: 0.7274777293205261
Validation Loss: 0.6988350749015808
Training Loss: 2.002140522003174
Validation Loss: 0.6957563757896423
Training Loss: 1.516695261001587
Validation Loss: 0.6916395425796509
Training Loss: 1.0195236206054688
Validation Loss: 0.6903523802757263
Training Loss: 0.9099217057228088
Validation Loss: 0.6905655264854431
Training Loss: 1.0188179016113281
Validation Loss: 0.6908146142959595
Training Loss: 0.9717609286308289
Validation Loss: 0.6911085844039917
Training Loss: 0.7496069669723511
Validation Loss: 0.6913556456565857
Training Loss: 0.8019750714302063
Validation Loss: 0.6909366250038147
Training Loss: 0.8914154767990112
Validation Loss: 0.6897105574607849
Training Loss: 0.8582203984260559
Validation Loss: 0.6894385814666748
Training Loss: 0.819326639175415
Validation Loss: 0.6903719902038574
Training Loss: 0.7880764603614807
Validation Loss: 0.6918101906776428
Training Loss: 0.7583291530609131
Validation Loss: 0.6926093101501465
Training Loss: 0.779686

  assert input.numel() == input.storage().size(), (


Training Loss: 0.9410529732704163
Validation Loss: 0.6925426125526428
Training Loss: 1.431749701499939
Validation Loss: 0.6931300163269043
Training Loss: 1.4717375040054321
Validation Loss: 0.6912415623664856
Training Loss: 1.4767076969146729
Validation Loss: 0.6906079053878784
Training Loss: 1.2135818004608154
Validation Loss: 0.6912274956703186
Training Loss: 1.059579849243164
Validation Loss: 0.6912279725074768
Training Loss: 0.7653493285179138
Validation Loss: 0.6907972693443298
Training Loss: 1.0531189441680908
Validation Loss: 0.6900766491889954
Training Loss: 0.9902530908584595
Validation Loss: 0.6880086064338684
Training Loss: 0.7730395793914795
Validation Loss: 0.6859492063522339
Training Loss: 0.7928352355957031
Validation Loss: 0.6849619150161743
Training Loss: 0.9189442992210388
Validation Loss: 0.6844463348388672
Training Loss: 0.958188533782959
Validation Loss: 0.6840711832046509
Training Loss: 0.9646299481391907
Validation Loss: 0.6835307478904724
Training Loss: 0.872176

KeyboardInterrupt: 