In [None]:
import torch
import torch_geometric
import networkx as nx
import pandas as pd
import numpy as np
import pickle as pkl
import csv
from torch_geometric.nn import Node2Vec
from torch.cuda.amp import GradScaler, autocast
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the train/val/test graphs and the DGL training graph from pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_train.gpickle', 'rb') as f:
    G_train = pkl.load(f)

with open('G_val.gpickle', 'rb') as f:
    G_val = pkl.load(f)

with open('G_test.gpickle', 'rb') as f:
    G_test = pkl.load(f)

# The inspection cells that follow read G_train_dgl; load it here so they
# work on a fresh kernel instead of depending on a later cell.
with open('G_train_dgl.gpickle', 'rb') as f:
    G_train_dgl = pkl.load(f)

In [None]:
# Show the printed representation of G_train_dgl
print(G_train_dgl)

In [None]:
# Peek at the first ten entries of the 'label' node data on G_train_dgl
print(G_train_dgl.ndata['label'][:10])

In [None]:
# Show the first ten node IDs of G_train_dgl
print(G_train_dgl.nodes()[:10])

In [None]:
# Report how many nodes G_train_dgl contains
print(G_train_dgl.number_of_nodes())

In [None]:
# Print the first 10 nodes of G_LP_connected.
# This cell runs before the cell that loads G_LP_connected, so load the graph
# here to keep the notebook runnable top-to-bottom on a fresh kernel.
with open('G_LP_connected.gpickle', 'rb') as f:
    G_LP_connected = pkl.load(f)

# A NetworkX NodeView cannot be sliced; materialise it as a list first
print(list(G_LP_connected.nodes())[0:10])

In [None]:
# Start with empty containers for the positive/negative examples of each split
# (train, validation, test); each name gets its own independent list.
train_positive, train_negative = [], []
val_positive, val_negative = [], []
test_positive, test_negative = [], []

## Run node2vec on the whole connected graph, get each node's embedding and store them

In [None]:
# Deserialize G_LP_connected from its pickle file
with open('G_LP_connected.gpickle', 'rb') as graph_file:
    G_LP_connected = pkl.load(graph_file)

In [None]:
# Show the printed representation of G_LP_connected
print(G_LP_connected)

In [None]:
# Print whether G_LP_connected is weakly connected
# (nx.is_weakly_connected is a directed-graph check in NetworkX)
print(nx.is_weakly_connected(G_LP_connected))

In [None]:
# Iterating a NetworkX graph yields its nodes; show the first ten of them
print(list(G_LP_connected)[:10])

In [None]:
# Convert G_LP_connected to a DGL graph together with an explicit
# address <-> DGL-node-ID mapping.
import dgl

# Mapping from NetworkX node (address) to DGL node (integer)
address_to_dgl_node = {address: i for i, address in enumerate(G_LP_connected.nodes())}

# Inverse mapping from DGL node (integer) to NetworkX node (address)
dgl_node_to_address = {i: address for address, i in address_to_dgl_node.items()}

# dgl.from_networkx relabels non-integer nodes itself, and the order it uses
# need not match the enumeration order of the mappings above. Hand DGL a copy
# whose nodes are already the integers 0..n-1 (inserted in ascending order),
# so DGL node i is exactly the address dgl_node_to_address[i].
G_LP_connected_int = G_LP_connected.__class__()
G_LP_connected_int.add_nodes_from(range(len(address_to_dgl_node)))
G_LP_connected_int.add_edges_from(
    (address_to_dgl_node[u], address_to_dgl_node[v]) for u, v in G_LP_connected.edges()
)

# Convert the integer-labelled graph to a DGL graph
G_LP_connected_dgl = dgl.from_networkx(G_LP_connected_int)
del G_LP_connected_int  # temporary copy, no longer needed

# The mapping must cover every DGL node
assert G_LP_connected_dgl.number_of_nodes() == len(address_to_dgl_node)

In [None]:
import pickle

# Write both lookup tables to disk ...
with open('address_to_dgl_node.pkl', 'wb') as out_file:
    pickle.dump(address_to_dgl_node, out_file)
with open('dgl_node_to_address.pkl', 'wb') as out_file:
    pickle.dump(dgl_node_to_address, out_file)

# ... and read them straight back
with open('address_to_dgl_node.pkl', 'rb') as in_file:
    address_to_dgl_node = pickle.load(in_file)
with open('dgl_node_to_address.pkl', 'rb') as in_file:
    dgl_node_to_address = pickle.load(in_file)


In [None]:
# Pickle the DGL graph to G_LP_connected_dgl.pkl
with open('G_LP_connected_dgl.pkl', 'wb') as out_file:
    pickle.dump(G_LP_connected_dgl, out_file)

In [None]:
# Show the first ten (address, DGL node ID) pairs
print(list(address_to_dgl_node.items())[:10])

In [None]:
# Show the first ten (DGL node ID, address) pairs
print(list(dgl_node_to_address.items())[:10])

In [None]:
# Show the printed representation of G_LP_connected_dgl
print(G_LP_connected_dgl)

In [None]:
# Show the first ten node IDs of G_LP_connected_dgl
print(G_LP_connected_dgl.nodes()[:10])

In [None]:
# Print the first 10 edges of G_LP_connected_dgl.
# edges() is unpacked elsewhere in this notebook as a (src, dst) pair, so
# slicing the pair itself returns both full tensors; slice each endpoint
# tensor instead.
print(tuple(endpoints[:10] for endpoints in G_LP_connected_dgl.edges()))

In [None]:
# Print the first 10 columns of the graph's edge_index
# (row 0 = source nodes, row 1 = destination nodes).
# Stack the (src, dst) pair first; slicing the pair directly would not
# truncate the tensors.
print(torch.stack(G_LP_connected_dgl.edges())[:, :10])

In [None]:
# Pick the compute device: prefer GPU 1, fall back to GPU 0 when the machine
# has only one GPU (where 'cuda:1' would be an invalid ordinal), and to the
# CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Get edge endpoints from the DGL graph
src, dst = G_LP_connected_dgl.edges()

# Build the 2 x num_edges edge_index tensor by stacking the endpoint tensors
# directly; converting them to Python lists and back is slow and memory-hungry
# on large graphs.
edge_index = torch.stack([src, dst]).to(device=device, dtype=torch.long).contiguous()


## Define a DeepWalk model (Node2Vec with default p and q)

In [None]:
# Build the random-walk embedding model (p and q are left at their defaults),
# the walk-batch loader, and a sparse optimizer for the sparse embedding table.
model = Node2Vec(
    edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=5,
    walks_per_node=40,
    num_negative_samples=1,
    sparse=True,
).to(device)
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)

In [None]:
def train():
    """Run one pass over `loader` and return the mean per-batch loss.

    Uses the notebook-level `model`, `loader`, `optimizer` and `device`.
    Each batch loss is computed under `autocast` and back-propagated through
    a `GradScaler` that is created anew on every call.
    """
    grad_scaler = GradScaler()
    model.train()
    running_loss = 0
    for pos_walks, neg_walks in loader:
        optimizer.zero_grad()
        with autocast():
            batch_loss = model.loss(pos_walks.to(device), neg_walks.to(device))
        # Scale the loss before backward, then let the scaler drive the step
        grad_scaler.scale(batch_loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [None]:
# Re-create the model, loader and optimizer with the same hyperparameters,
# replacing the instances built earlier.
model = Node2Vec(
    edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=5,
    walks_per_node=40,
    num_negative_samples=1,
    sparse=True,
).to(device)
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)

In [None]:
# Train for 100 epochs (epoch numbers 1..100), printing the mean batch loss
# returned by train() after each epoch
for epoch in range(1, 101):
    loss = train()
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))

In [None]:
# store embeddings
# This is inference only: switch to eval mode and disable autograd so the
# result is a plain tensor that does not hold on to the autograd graph.
model.eval()
with torch.no_grad():
    embeddings = model(torch.arange(edge_index.max().item() + 1, device=device))

In [None]:
# Train 5 independent models and save each run's embeddings for later use.
# `device` is deliberately not re-derived here: edge_index already lives on
# the notebook-level `device`, and the model must be placed on the same one.
for i in range(5):
    print('training round: ', i)
    model = Node2Vec(edge_index, embedding_dim=128, walk_length=20, context_size=5, walks_per_node=40, num_negative_samples=1, sparse=True,).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)
    for epoch in range(1, 101):
        loss = train()
        print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))

    print('finish training round: ', i)

    # Export the embeddings as a detached CPU tensor: the pickle then carries
    # no autograd state and can be loaded on a machine without this GPU.
    model.eval()
    with torch.no_grad():
        embeddings = model(torch.arange(edge_index.max().item() + 1).to(device)).detach().cpu()

    # save embeddings (files are numbered 1..5)
    with open('deepwalk_embeddings_' + str(i+1) + '.pkl', 'wb') as f:
        pkl.dump(embeddings, f)

    print('finish saving embeddings: ', i)

In [None]:
# Deserialize the pickled G_train_dgl and G_test_dgl graphs
with open('G_train_dgl.gpickle', 'rb') as train_file:
    G_train_dgl = pkl.load(train_file)

with open('G_test_dgl.gpickle', 'rb') as test_file:
    G_test_dgl = pkl.load(test_file)

In [None]:
# Restore the DGL-node-ID -> address lookup table from disk
with open('dgl_node_to_address.pkl', 'rb') as mapping_file:
    dgl_node_to_address = pkl.load(mapping_file)

In [None]:
# Show the first ten (DGL node ID, address) pairs of the reloaded mapping
print(list(dgl_node_to_address.items())[:10])

In [None]:
# Show the last ten (DGL node ID, address) pairs of the mapping
print(list(dgl_node_to_address.items())[-10:])

In [None]:
# Restore the address -> DGL-node-ID lookup table from disk
with open('address_to_dgl_node.pkl', 'rb') as mapping_file:
    address_to_dgl_node = pkl.load(mapping_file)

In [None]:
# Show the first ten and the last ten (address, DGL node ID) pairs
print(list(address_to_dgl_node.items())[:10])
print(list(address_to_dgl_node.items())[-10:])

In [None]:
# Show the first ten node IDs of the DGL train graph and of the DGL test graph
print(G_train_dgl.nodes()[:10])
print(G_test_dgl.nodes()[:10])

In [None]:
# Deserialize G_train from its pickle file
with open('G_train.gpickle', 'rb') as graph_file:
    G_train = pkl.load(graph_file)

In [None]:
# Deserialize G_test from its pickle file
with open('G_test.gpickle', 'rb') as graph_file:
    G_test = pkl.load(graph_file)

In [None]:
# Iterating a NetworkX graph yields its nodes; show the first ten of G_train
print(list(G_train)[:10])

In [None]:
# Iterating a NetworkX graph yields its nodes; show the first ten of G_test
print(list(G_test)[:10])

In [None]:
# Read eoa_addr_list.txt into a list with one entry per line; the list is
# used afterwards to label the nodes of G_train and G_test
with open('eoa_addr_list.txt', 'r') as addr_file:
    eoa_addr_list = addr_file.read().splitlines()

In [None]:
# Show the first ten addresses read from the file
print(eoa_addr_list[:10])

In [None]:
# Label every G_train node: 1 if its address appears in eoa_addr_list, else 0.
# Membership is tested against a set; testing against the list would rescan
# the whole list for every node.
eoa_addr_set = set(eoa_addr_list)
for node in G_train.nodes():
    G_train.nodes[node]['label'] = 1 if node in eoa_addr_set else 0

In [None]:
# Label every G_test node: 1 if its address appears in eoa_addr_list, else 0.
# Membership is tested against a set; testing against the list would rescan
# the whole list for every node.
eoa_addr_set = set(eoa_addr_list)
for node in G_test.nodes():
    G_test.nodes[node]['label'] = 1 if node in eoa_addr_set else 0

In [None]:
# Load one saved embedding table for the logistic-regression node classifier.
# The training loop in this notebook writes deepwalk_embeddings_1.pkl ..
# deepwalk_embeddings_5.pkl; it never writes a _0 file, so load run 1.
with open('deepwalk_embeddings_1.pkl', 'rb') as f:
    embeddings = pkl.load(f)

## Step 2: Sample label-1 and label-0 nodes at a 1:1 ratio as the positive and negative examples

In [None]:
# Build a class-balanced (1:1) set of training examples from G_train.

# Positives: every node labelled 1
train_positive = [node for node in G_train.nodes() if G_train.nodes[node]['label'] == 1]

# Negatives: draw as many of the remaining nodes, without replacement.
# Candidates are listed in graph order (set order can differ between kernel
# sessions) and drawn with a seeded generator, so the sample is reproducible.
# rng.choice raises ValueError if there are fewer candidates than positives.
train_positive_set = set(train_positive)
train_negative_candidates = [node for node in G_train.nodes() if node not in train_positive_set]
rng = np.random.default_rng(0)
train_negative = rng.choice(train_negative_candidates, len(train_positive), replace=False)

In [None]:
# Show the first ten positive and the first ten negative training nodes
print(train_positive[:10])
print(train_negative[:10])

In [None]:
# Pickle the sampled training nodes to train_positive.pkl / train_negative.pkl
with open('train_positive.pkl', 'wb') as pos_file:
    pkl.dump(train_positive, pos_file)

with open('train_negative.pkl', 'wb') as neg_file:
    pkl.dump(train_negative, neg_file)

In [None]:
# Build a class-balanced (1:1) set of test examples from G_test.

# Positives: every node labelled 1
test_positive = [node for node in G_test.nodes() if G_test.nodes[node]['label'] == 1]

# Negatives: draw as many of the remaining nodes, without replacement.
# Candidates are listed in graph order (set order can differ between kernel
# sessions) and drawn with a seeded generator, so the sample is reproducible.
# rng.choice raises ValueError if there are fewer candidates than positives.
test_positive_set = set(test_positive)
test_negative_candidates = [node for node in G_test.nodes() if node not in test_positive_set]
rng = np.random.default_rng(0)
test_negative = rng.choice(test_negative_candidates, len(test_positive), replace=False)

In [None]:
# Show the first ten positive and the first ten negative test nodes
print(test_positive[:10])
print(test_negative[:10])

In [None]:
# Pickle the sampled test nodes to test_positive.pkl / test_negative.pkl
with open('test_positive.pkl', 'wb') as pos_file:
    pkl.dump(test_positive, pos_file)

with open('test_negative.pkl', 'wb') as neg_file:
    pkl.dump(test_negative, neg_file)

## Step 3: Fit the classifier on the node embeddings and evaluate it (AUC, F1, precision, recall, accuracy, macro-F1)

In [None]:
# Reload the sampled training nodes that were pickled to
# train_positive.pkl and train_negative.pkl
with open('train_positive.pkl', 'rb') as pos_file:
    train_positive = pkl.load(pos_file)

with open('train_negative.pkl', 'rb') as neg_file:
    train_negative = pkl.load(neg_file)

In [None]:
# Show the first ten reloaded positive and negative training nodes
print(train_positive[:10])
print(train_negative[:10])

In [None]:
# Reload the sampled test nodes that were pickled to
# test_positive.pkl and test_negative.pkl
with open('test_positive.pkl', 'rb') as pos_file:
    test_positive = pkl.load(pos_file)

with open('test_negative.pkl', 'rb') as neg_file:
    test_negative = pkl.load(neg_file)

In [None]:
# Show the first ten reloaded positive and negative test nodes
print(test_positive[:10])
print(test_negative[:10])

In [None]:
# Load a saved embedding table; its rows are looked up afterwards for the
# train/test nodes that feed the logistic-regression classifier.
# set device to cuda:6
# NOTE(review): nothing after this cell appears to use `device`; the
# embeddings are moved to the CPU before being handed to scikit-learn.
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
# The training loop in this notebook writes deepwalk_embeddings_1.pkl ..
# deepwalk_embeddings_5.pkl; it never writes a _0 file, so load run 1.
with open('deepwalk_embeddings_1.pkl', 'rb') as f:
    embeddings = pkl.load(f)

In [None]:
# Inspect a single entry (index 10) of the loaded embeddings
print(embeddings[10])

In [None]:
# Logistic-regression classifier with a fixed seed and a 3000-iteration cap
clf = LogisticRegression(max_iter=3000, random_state=0)

In [None]:
# For every positive training node, look up its embedding row via the
# address -> DGL-ID mapping and convert it to a NumPy vector
train_positive_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_positive
]

In [None]:
# Show the first ten positive training embeddings
print(train_positive_embeddings[:10])

In [None]:
# For every negative training node, look up its embedding row via the
# address -> DGL-ID mapping and convert it to a NumPy vector
train_negative_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_negative
]

In [None]:
# Standardise the training embeddings with StandardScaler.
# Fit ONE scaler on positives and negatives together and apply the same
# transform to both. Fitting separately per class would centre each class on
# zero and erase the mean difference between the classes.
scaler = StandardScaler()
scaler.fit(np.vstack([train_positive_embeddings, train_negative_embeddings]))
train_positive_embeddings = scaler.transform(train_positive_embeddings)
train_negative_embeddings = scaler.transform(train_negative_embeddings)

In [None]:
# Rebuild the per-class embedding lists from `embeddings`, using
# `address_to_dgl_node` to translate each address into its embedding row index
train_positive_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_positive
]
train_negative_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_negative
]

# Feature list: all positives followed by all negatives
train_nodes_embeddings = train_positive_embeddings + train_negative_embeddings

# Matching labels in the same order: 1 for each positive, 0 for each negative
train_nodes_labels = [1] * len(train_positive_embeddings) + [0] * len(train_negative_embeddings)


In [None]:
# Print the number of training embeddings and the number of training labels
# (the two lists are built side by side, so the counts should match)
print(len(train_nodes_embeddings))
print(len(train_nodes_labels))

In [None]:
# Class-balance check: print how many training labels are 1 and how many are 0
print(train_nodes_labels.count(1))
print(train_nodes_labels.count(0))

In [None]:
# Fit the logistic-regression classifier on the combined training embeddings
# and their 0/1 labels
clf.fit(train_nodes_embeddings, train_nodes_labels)

In [None]:
# Collect the embedding vector of every positive and every negative test node
# (address -> DGL-ID -> embedding row -> NumPy vector)
test_positive_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in test_positive
]

test_negative_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in test_negative
]

In [None]:
# Show the first ten positive and the first ten negative test embeddings
print(test_positive_embeddings[:10])
print(test_negative_embeddings[:10])

In [None]:
# Print how many positive and how many negative test embeddings were collected
print(len(test_positive_embeddings))
print(len(test_negative_embeddings))

In [None]:
# Single test feature list: all positives first, then all negatives
test_nodes_embeddings = [*test_positive_embeddings, *test_negative_embeddings]

In [None]:
# Show the first ten entries of the combined test feature list
print(test_nodes_embeddings[:10])

In [None]:
# Print the total number of test examples (positives plus negatives)
print(len(test_nodes_embeddings))

In [None]:
# Evaluate the fitted classifier on the test examples.

# Ground-truth labels in the same order as test_nodes_embeddings:
# 1 for each positive, 0 for each negative
test_nodes_labels = [1] * len(test_positive_embeddings) + [0] * len(test_negative_embeddings)

# Hard 0/1 predictions, used by the threshold-based metrics
test_nodes_predictions = clf.predict(test_nodes_embeddings)

# ROC AUC is a ranking metric and needs continuous scores; scoring it on hard
# 0/1 predictions collapses the curve to a single operating point. The
# classifier is trained on labels {0, 1}, so column 1 of predict_proba is the
# probability of the positive class.
test_nodes_scores = clf.predict_proba(test_nodes_embeddings)[:, 1]

# compute auc, f1, precision, recall, accuracy, macro-f1
auc = roc_auc_score(test_nodes_labels, test_nodes_scores)
f1 = f1_score(test_nodes_labels, test_nodes_predictions)
precision = precision_score(test_nodes_labels, test_nodes_predictions)
recall = recall_score(test_nodes_labels, test_nodes_predictions)
accuracy = (test_nodes_predictions == np.asarray(test_nodes_labels)).mean()
macro_f1 = f1_score(test_nodes_labels, test_nodes_predictions, average='macro')

# print auc, f1, precision, recall, accuracy, macro_f1
print('auc: ', auc)
print('f1: ', f1)
print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('macro_f1: ', macro_f1)