In [None]:
import torch
import torch_geometric
import networkx as nx
import pandas as pd
import numpy as np
import pickle as pkl
import csv
from torch_geometric.nn import Node2Vec
from torch.cuda.amp import GradScaler, autocast
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
from sklearn.preprocessing import StandardScaler

In [None]:
# Deserialize the pre-built train / test / validation graph objects
# (presumably DGL graphs, going by the file names -- confirm).
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('G_train_dgl_twitter.gpickle', 'rb') as train_file:
    G_train_dgl_twitter = pkl.load(train_file)

with open('G_test_dgl_twitter.gpickle', 'rb') as test_file:
    G_test_dgl_twitter = pkl.load(test_file)

with open('G_val_dgl_twitter.gpickle', 'rb') as val_file:
    G_val_dgl_twitter = pkl.load(val_file)

In [None]:
# Deserialize the graph whose edges are used to train the embeddings below.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('G_LP_connected_dgl_twitter.gpickle', 'rb') as graph_file:
    G_LP_connected_dgl_twitter = pkl.load(graph_file)

In [None]:
# Prefer the second GPU (cuda:1) when it exists. torch.cuda.is_available() only
# says that *some* GPU is visible, so clamp the index to the last visible device
# on single-GPU machines and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:{}'.format(min(1, torch.cuda.device_count() - 1)))
else:
    device = torch.device('cpu')

In [None]:
# Pull the source / destination node ids of every edge out of the loaded graph.
src, dst = G_LP_connected_dgl_twitter.edges()

# Assemble them into a 2-row integer tensor (row 0 = sources, row 1 = destinations),
# then make it contiguous and move it to the selected device.
edge_index = torch.tensor([src.tolist(), dst.tolist()], dtype=torch.long)
edge_index = edge_index.contiguous().to(device)


In [None]:
# Node2Vec embedding model over the edge list built above.
model = Node2Vec(
    edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=5,
    walks_per_node=40,
    num_negative_samples=1,
    p=0.5,
    q=2,
    sparse=True,
).to(device)

# Random-walk batches for training; sparse=True above is paired with SparseAdam here.
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)

In [None]:
def train():
    """Run one pass over ``loader`` and return the mean loss per batch.

    Relies on the notebook-level ``model``, ``loader``, ``optimizer`` and
    ``device``. Each batch is evaluated under ``autocast`` and the backward /
    optimizer step goes through a ``GradScaler`` created for this call.
    """
    grad_scaler = GradScaler()
    model.train()
    running_loss = 0
    for positive_walks, negative_walks in loader:
        optimizer.zero_grad()
        with autocast():
            batch_loss = model.loss(
                positive_walks.to(device),
                negative_walks.to(device),
            )
        # Scale the loss before backward, then step and update through the scaler.
        grad_scaler.scale(batch_loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [None]:
# Optimise the embedding model for 100 epochs (numbered 1..100) and log the
# mean batch loss returned by train() after each one.
for epoch in range(1, 101):
    loss = train()
    print(
        'Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss)
    )

In [None]:
# Train five independent Node2Vec models (100 epochs each) and persist every
# run's node embeddings as deepwalk_embeddings_1.pkl ... deepwalk_embeddings_5.pkl.

# Keep each model on the device that already holds edge_index instead of
# re-deriving a hard-coded GPU index on every iteration.
device = edge_index.device

for i in range(5):
    print('training round: ', i)
    model = Node2Vec(edge_index, embedding_dim=128, walk_length=20, context_size=5, walks_per_node=40, num_negative_samples=1, p=0.5, q=2, sparse=True,).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)
    for epoch in range(1, 101):
        loss = train()
        print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))

    print('finish training round: ', i)

    # Export the embedding of every node id in [0, max id in edge_index].
    # no_grad() keeps the autograd graph out of the saved tensor and .cpu()
    # makes the pickle loadable on machines without the training GPU.
    with torch.no_grad():
        embeddings = model(torch.arange(edge_index.max().item() + 1).to(device)).cpu()

    # save embeddings (file names are 1-based: round i is written as i + 1)
    with open('deepwalk_embeddings_' + str(i+1) + '.pkl', 'wb') as f:
        pkl.dump(embeddings, f)

    print('finish saving embeddings: ', i)

In [None]:
# Classifier evaluation inputs: the positive and negative training node
# collections, read from their pickle files.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('train_positive.pkl', 'rb') as positive_file:
    train_positive = pkl.load(positive_file)

with open('train_negative.pkl', 'rb') as negative_file:
    train_negative = pkl.load(negative_file)

In [None]:
# The positive and negative test node collections, read from their pickle files.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('test_positive.pkl', 'rb') as positive_file:
    test_positive = pkl.load(positive_file)

with open('test_negative.pkl', 'rb') as negative_file:
    test_negative = pkl.load(negative_file)

In [None]:
# Load one saved embedding matrix for the logistic-regression evaluation below.
# The training loop names its outputs with i + 1, i.e. deepwalk_embeddings_1.pkl
# through deepwalk_embeddings_5.pkl, so a "_0" file is never written by this
# notebook; read the first run instead.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# set device to cuda:6
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
with open('deepwalk_embeddings_1.pkl', 'rb') as f:
    embeddings = pkl.load(f)

In [None]:
# Lookup used below to index `embeddings`: maps each entry of the positive /
# negative node collections to a row index.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('address_to_dgl_node_twitter.pkl', 'rb') as mapping_file:
    address_to_dgl_node = pkl.load(mapping_file)

In [None]:
# Look up the embedding row of every training node (via address_to_dgl_node)
# and convert it to a NumPy vector, one list per class.
train_positive_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_positive
]
train_negative_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in train_negative
]

# Feature list: all positives first, then all negatives.
train_nodes_embeddings = train_positive_embeddings + train_negative_embeddings

# Matching labels in the same order: 1 = positive, 0 = negative.
train_nodes_labels = [1] * len(train_positive_embeddings)
train_nodes_labels += [0] * len(train_negative_embeddings)


In [None]:
# Logistic-regression classifier with a fixed seed and a raised iteration cap.
clf = LogisticRegression(
    max_iter=3000,
    random_state=0,
)

In [None]:
# Standardize the training embeddings with ONE scaler fitted on the combined
# training matrix. Calling fit_transform separately on the positive and the
# negative set would normalize each class with its own mean / variance and
# leave `scaler` fitted on the negatives only.
# NOTE(review): if the scaled arrays are used to train the classifier, the same
# fitted `scaler` must also be applied (transform only) to the test features.
scaler = StandardScaler()
scaler.fit(train_nodes_embeddings)
train_positive_embeddings = scaler.transform(train_positive_embeddings)
train_negative_embeddings = scaler.transform(train_negative_embeddings)

In [None]:
# Rebuild the training features from the raw embeddings, replacing whatever
# train_positive_embeddings / train_negative_embeddings held before this cell.
train_positive_embeddings = []
for node in train_positive:
    positive_vector = embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    train_positive_embeddings.append(positive_vector)

train_negative_embeddings = []
for node in train_negative:
    negative_vector = embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    train_negative_embeddings.append(negative_vector)

# Positives first, then negatives.
train_nodes_embeddings = train_positive_embeddings + train_negative_embeddings

# Labels follow the same ordering: 1 for positives, 0 for negatives.
train_nodes_labels = (
    [1] * len(train_positive_embeddings)
    + [0] * len(train_negative_embeddings)
)


In [None]:
# Train the classifier on the training embeddings and their 0/1 labels.
clf.fit(
    train_nodes_embeddings,
    train_nodes_labels,
)

In [None]:
# Look up the embedding row of every test node (via address_to_dgl_node) and
# convert it to a NumPy vector, one list per class.
test_positive_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in test_positive
]

test_negative_embeddings = [
    embeddings[address_to_dgl_node[node]].detach().cpu().numpy()
    for node in test_negative
]

In [None]:
# Test feature list: all positive vectors followed by all negative vectors.
test_nodes_embeddings = [*test_positive_embeddings, *test_negative_embeddings]

In [None]:
# Evaluate the fitted classifier on the test embeddings.

# Ground-truth labels in the same order as test_nodes_embeddings:
# 1 for positive nodes, 0 for negative nodes.
test_nodes_labels = [1] * len(test_positive_embeddings) + [0] * len(test_negative_embeddings)

# Hard 0/1 predictions feed the threshold-based metrics (F1, precision, recall, accuracy).
test_nodes_predictions = clf.predict(test_nodes_embeddings)

# ROC AUC is a ranking metric and needs continuous scores: passing hard labels
# collapses the ROC curve to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (classes_ == [0, 1]).
test_nodes_scores = clf.predict_proba(test_nodes_embeddings)[:, 1]

# compute auc, f1, precision, recall, accuracy, macro-f1
auc = roc_auc_score(test_nodes_labels, test_nodes_scores)
f1 = f1_score(test_nodes_labels, test_nodes_predictions)
precision = precision_score(test_nodes_labels, test_nodes_predictions)
recall = recall_score(test_nodes_labels, test_nodes_predictions)
# Compare array against array so the element-wise equality is explicit.
accuracy = (test_nodes_predictions == np.asarray(test_nodes_labels)).mean()
macro_f1 = f1_score(test_nodes_labels, test_nodes_predictions, average='macro')

# print auc, f1, precision, recall, accuracy, macro_f1
print('auc: ', auc)
print('f1: ', f1)
print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('macro_f1: ', macro_f1)