In [2]:
from torch_geometric.datasets import Twitch
import os.path as osp

import torch
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling

In [3]:
# Choose the compute device once for the whole notebook: CUDA when present,
# otherwise Apple's MPS backend when this torch build exposes it and it is
# available, otherwise the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)

In [4]:
# Load the 'EN' Twitch graph; dataset files are kept under data/Twitch.
dataset = Twitch(root='data/Twitch', name='EN')
# Show the first (index 0) graph object held by the dataset.
print(dataset[0])
# Blank line between the graph repr above and the summary below.
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Data(x=[7126, 128], edge_index=[2, 77774], y=[7126])

Dataset: Twitch():
Number of graphs: 1
Number of features: 128
Number of classes: 2


In [5]:
from torch_geometric import seed_everything
from torch_geometric.utils import train_test_split_edges

# Seed the Python, NumPy and PyTorch RNGs before the first stochastic step so
# that the edge split, the model initialisation and the negative sampling are
# repeatable on a fresh "Restart & Run All".
seed_everything(42)

# Split the graph's edges into train / validation / test sets. The result
# carries train_pos_edge_index plus positive and negative edge sets for
# validation and test (see the prints below).
# NOTE(review): train_test_split_edges is reported as deprecated in newer PyG
# releases in favour of transforms.RandomLinkSplit -- confirm against the
# installed version before upgrading.
data = train_test_split_edges(dataset[0])

print('Train edges:', data.train_pos_edge_index.size(1))
print('Validation edges (positive):', data.val_pos_edge_index.size(1))
print('Validation edges (negative):', data.val_neg_edge_index.size(1))
print('Test edges (positive):', data.test_pos_edge_index.size(1))
print('Test edges (negative):', data.test_neg_edge_index.size(1))

print(data)



Train edges: 60052
Validation edges (positive): 1766
Validation edges (negative): 1766
Test edges (positive): 3532
Test edges (negative): 3532
Data(x=[7126, 128], y=[7126], val_pos_edge_index=[2, 1766], test_pos_edge_index=[2, 3532], train_pos_edge_index=[2, 60052], train_neg_adj_mask=[7126, 7126], val_neg_edge_index=[2, 1766], test_neg_edge_index=[2, 3532])


In [6]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder with an inner-product link decoder.

    ``encode`` / ``decode`` / ``decode_all`` are the pieces used for training
    and evaluation; ``forward`` runs the same computation end to end and
    returns every intermediate tensor in a dict (used for inspection and for
    the ONNX export).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two GCNConv layers: in -> hidden -> out channels."""
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1, ReLU, then conv2 over ``edge_index``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score each candidate edge as the dot product of its endpoint embeddings.

        ``edge_label_index[0]`` and ``edge_label_index[1]`` index the two
        endpoints in ``z``; the result has one raw score (logit) per column.
        """
        endpoints_a = z[edge_label_index[0]]
        endpoints_b = z[edge_label_index[1]]
        return (endpoints_a * endpoints_b).sum(dim=-1)

    def decode_all(self, z):
        """Return a 2 x K index tensor of all node pairs whose inner product is > 0."""
        pairwise_scores = z @ z.t()
        return torch.nonzero(pairwise_scores > 0, as_tuple=False).t()

    def forward(self, x, edge_index, edge_label_index):
        """Run encoder and both decoders, returning every intermediate result.

        Returns a dict whose keys, in insertion order, are ``conv1``,
        ``conv2``, ``decode_mul``, ``decode_sum``, ``prob_adj`` and
        ``decode_all_final``. Despite its name, ``prob_adj`` holds raw inner
        products (no sigmoid is applied here).
        """
        # Encoder (same operations as `encode`, keeping the hidden activations).
        hidden = torch.relu(self.conv1(x, edge_index))
        z = self.conv2(hidden, edge_index)

        # Per-edge decoder (same operations as `decode`, keeping the products).
        pair_products = z[edge_label_index[0]] * z[edge_label_index[1]]
        pair_scores = pair_products.sum(dim=-1)

        # Dense all-pairs decoder (same operations as `decode_all`).
        prob_adj = z @ z.t()
        positive_pairs = torch.nonzero(prob_adj > 0, as_tuple=False).t()

        return {
            "conv1": hidden,
            "conv2": z,
            "decode_mul": pair_products,
            "decode_sum": pair_scores,
            "prob_adj": prob_adj,
            "decode_all_final": positive_pairs,
        }
        


# Build the model: input width = dataset feature count, 64 hidden channels,
# 32-dimensional output embeddings; then move it to the selected device.
model = Net(dataset.num_features, 64, 32).to(device)
# Adam over all model parameters with learning rate 0.01.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
# Binary cross-entropy on raw scores (the sigmoid is applied inside the loss).
criterion = torch.nn.BCEWithLogitsLoss()
print(device)

cpu


In [7]:
def train():
    """Run one full-batch training step and return the loss as a float.

    Positive examples are the training edges; an equal number of negative
    edges is sampled on every call. The returned value is the sum of the
    positive and the negative BCE-with-logits terms (each a mean over its
    own set of edges).
    """
    model.train()
    optimizer.zero_grad()

    # The training edges serve both as the message-passing graph and as the
    # positive supervision targets.
    train_edges = data.train_pos_edge_index.to(device)
    z = model.encode(data.x.to(device), train_edges)

    pos_logits = model.decode(z, train_edges)
    pos_targets = torch.ones(pos_logits.size(0), device=device)
    pos_loss = criterion(pos_logits, pos_targets)

    # Sample as many non-edges as there are positive training edges.
    sampled_edges = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=train_edges.size(1),
    )
    neg_logits = model.decode(z, sampled_edges.to(device))
    neg_targets = torch.zeros(neg_logits.size(0), device=device)
    neg_loss = criterion(neg_logits, neg_targets)

    total_loss = pos_loss + neg_loss
    total_loss.backward()
    optimizer.step()
    return total_loss.item()

In [8]:
def test(pos_edge_index, neg_edge_index):
    """Evaluate link prediction on one set of positive and negative edges.

    Embeddings are computed from the training edges only. Returns
    ``(loss, accuracy)``: the BCE-with-logits loss over all given edges and
    the fraction classified correctly at a 0.5 probability threshold.
    """
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x.to(device), data.train_pos_edge_index.to(device))

    pos_logits = model.decode(z, pos_edge_index.to(device))
    neg_logits = model.decode(z, neg_edge_index.to(device))
    logits = torch.cat([pos_logits, neg_logits])

    # Label 1 for the positive edges followed by 0 for the negative ones,
    # in the same order as `logits`.
    labels = torch.cat([
        torch.ones(pos_logits.size(0), device=device),
        torch.zeros(neg_logits.size(0), device=device),
    ])

    loss = criterion(logits, labels).item()

    predicted = torch.sigmoid(logits) > 0.5
    num_correct = predicted.eq(labels).sum().item()
    acc = num_correct / labels.size(0)
    return loss, acc

In [9]:
# Train for 100 epochs (1..100). After every optimisation step, evaluate on
# both the validation and the test edges and log one line per epoch.
for epoch in range(1, 101):
    loss = train()
    val_loss, val_acc = test(data.val_pos_edge_index, data.val_neg_edge_index)
    test_loss, test_acc = test(data.test_pos_edge_index, data.test_neg_edge_index)
    # Note: `loss` is the sum of two mean terms (see train()), whereas the
    # val/test losses are a single mean, so the two scales are not comparable.
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Loss: 1.6867, Val Loss: 0.6293, Val Acc: 0.5459, Test Loss: 0.6349, Test Acc: 0.5447
Epoch: 002, Loss: 1.2362, Val Loss: 0.6210, Val Acc: 0.5461, Test Loss: 0.6245, Test Acc: 0.5467
Epoch: 003, Loss: 1.2101, Val Loss: 0.6124, Val Acc: 0.5544, Test Loss: 0.6179, Test Acc: 0.5558
Epoch: 004, Loss: 1.1861, Val Loss: 0.5962, Val Acc: 0.5665, Test Loss: 0.6046, Test Acc: 0.5725
Epoch: 005, Loss: 1.1545, Val Loss: 0.5864, Val Acc: 0.5793, Test Loss: 0.5965, Test Acc: 0.5834
Epoch: 006, Loss: 1.1372, Val Loss: 0.5803, Val Acc: 0.5923, Test Loss: 0.5907, Test Acc: 0.5927
Epoch: 007, Loss: 1.1231, Val Loss: 0.5757, Val Acc: 0.5980, Test Loss: 0.5859, Test Acc: 0.6016
Epoch: 008, Loss: 1.1140, Val Loss: 0.5738, Val Acc: 0.6084, Test Loss: 0.5834, Test Acc: 0.6045
Epoch: 009, Loss: 1.1002, Val Loss: 0.5726, Val Acc: 0.6152, Test Loss: 0.5818, Test Acc: 0.6138
Epoch: 010, Loss: 1.0957, Val Loss: 0.5688, Val Acc: 0.6285, Test Loss: 0.5778, Test Acc: 0.6243
Epoch: 011, Loss: 1.0826, Val 

In [10]:
# Fetch the graph again from the dataset: unlike `data` (the split result
# printed earlier), this object still has its complete `edge_index`.
gData = dataset[0]
print(gData)

Data(x=[7126, 128], edge_index=[2, 77774], y=[7126])


In [11]:
def get_neighbor_count(data, node_index):
    """Return the 1-hop and 2-hop neighbourhood sizes of a node.

    Args:
        data: Graph object exposing ``edge_index`` (a 2 x E integer tensor of
            source/target rows) and ``num_nodes``.
        node_index: Index of the node to inspect.

    Returns:
        ``(num_neighbors, num_second_hop)`` as plain ints, where
        ``num_neighbors`` is the number of edges leaving ``node_index`` and
        ``num_second_hop`` is the total number of edges leaving those
        neighbours. Entries are counted per edge, so they are not
        de-duplicated and may include ``node_index`` itself.

    Raises:
        ValueError: If ``node_index`` is outside ``[0, data.num_nodes)``.
    """
    if node_index < 0 or node_index >= data.num_nodes:
        raise ValueError("exceed the dataset")

    sources, targets = data.edge_index[0], data.edge_index[1]
    neighbors = targets[sources == node_index]

    # Out-degree of every node in a single pass, instead of re-scanning the
    # whole edge list once per neighbour. The table is sized to cover every
    # index that can appear in `neighbors`, so the lookup below cannot go out
    # of range even if an edge points past `num_nodes`.
    num_bins = data.num_nodes
    if targets.numel() > 0:
        num_bins = max(num_bins, int(targets.max()) + 1)
    out_degree = torch.bincount(sources, minlength=num_bins)

    neighbor_of_neighbor_count = int(out_degree[neighbors].sum())
    return neighbors.size(0), neighbor_of_neighbor_count

In [12]:
# Print the neighbourhood sizes for every ordered pair of distinct nodes
# among the first 50 nodes.
# Each line: node i, node j, counts for node i, counts for node j, where
# counts = (number of neighbours of the node, total neighbours of those neighbours).
# The counts depend on a single node only, so compute them once per node
# rather than once per (i, j) pair.
neighbor_counts = [get_neighbor_count(gData, node) for node in range(50)]
for i in range(50):
    for j in range(50):
        if i != j:
            print(i, j, neighbor_counts[i], neighbor_counts[j])

0 1 (2, 6) (27, 801)
0 2 (2, 6) (2, 339)
0 3 (2, 6) (8, 133)
0 4 (2, 6) (2, 81)
0 5 (2, 6) (5, 833)
0 6 (2, 6) (11, 194)
0 7 (2, 6) (3, 13)
0 8 (2, 6) (2, 13)
0 9 (2, 6) (13, 1840)
0 10 (2, 6) (2, 6)
0 11 (2, 6) (5, 103)
0 12 (2, 6) (3, 9)
0 13 (2, 6) (13, 290)
0 14 (2, 6) (11, 648)
0 15 (2, 6) (6, 551)
0 16 (2, 6) (6, 96)
0 17 (2, 6) (7, 131)
0 18 (2, 6) (3, 474)
0 19 (2, 6) (10, 237)
0 20 (2, 6) (8, 176)
0 21 (2, 6) (8, 329)
0 22 (2, 6) (3, 732)
0 23 (2, 6) (59, 2283)
0 24 (2, 6) (123, 5005)
0 25 (2, 6) (2, 76)
0 26 (2, 6) (92, 4983)
0 27 (2, 6) (2, 5)
0 28 (2, 6) (3, 80)
0 29 (2, 6) (4, 392)
0 30 (2, 6) (38, 1803)
0 31 (2, 6) (3, 138)
0 32 (2, 6) (10, 377)
0 33 (2, 6) (4, 49)
0 34 (2, 6) (12, 755)
0 35 (2, 6) (14, 272)
0 36 (2, 6) (7, 837)
0 37 (2, 6) (5, 1161)
0 38 (2, 6) (3, 798)
0 39 (2, 6) (5, 134)
0 40 (2, 6) (5, 810)
0 41 (2, 6) (8, 191)
0 42 (2, 6) (5, 77)
0 43 (2, 6) (14, 333)
0 44 (2, 6) (6, 114)
0 45 (2, 6) (2, 23)
0 46 (2, 6) (10, 1675)
0 47 (2, 6) (2, 46)
0 48 (2, 6) (11

In [13]:
def predict_edge(model, node_index1, node_index2):
    """Run the model for a single candidate edge and return all its outputs.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        node_index1: Index of the first endpoint.
        node_index2: Index of the second endpoint.

    Returns:
        The dict produced by the model's forward pass (intermediate layer
        outputs and decoder results), not a single probability. The raw score
        for the requested pair is under ``'decode_sum'``.

    Uses the notebook-level ``data`` (features and training edges) and
    ``device``.
    """
    model.eval()
    with torch.no_grad():
        # Shape 2 x 1: one column holding the (source, target) pair.
        edge_label_index = torch.tensor([[node_index1], [node_index2]], device=device)
        # Call the module itself rather than `.forward()` so that any
        # registered forward hooks are honoured.
        prediction = model(data.x.to(device), data.train_pos_edge_index.to(device), edge_label_index)
        return prediction

# Example node pair. NOTE(review): these two names are not read by any code
# visible in this notebook -- confirm before removing.
node_index1 = 45
node_index2 = 17

# Disabled sweep over the first 20 x 20 node pairs. Note that predict_edge
# returns a dict of tensors, so the message below would print that whole
# dict rather than a probability.
# for i in range(20):
#     for j in range(20):
#         if i != j:
#             prediction = predict_edge(model, i, j)
#             print(f"predict node {i} and node {j} probability that has an edge: {prediction}")

In [14]:


# Run the model for the candidate edge (12, 70) and dump every returned tensor.
prediction = predict_edge(model, 12, 70)
print(prediction)

{'conv1': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.3380, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.5520,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7719, 0.1896],
        [0.0000, 0.0528, 0.1755,  ..., 0.0000, 0.0000, 1.0192],
        [0.0000, 0.0000, 0.0000,  ..., 0.1210, 0.0000, 0.9730]]), 'conv2': tensor([[ 0.1015, -0.0169,  0.0972,  ..., -0.0432, -0.1360,  0.0791],
        [ 0.2822, -0.3765, -0.8265,  ..., -0.1868, -0.6209, -0.6068],
        [-0.1269,  0.3079, -0.0863,  ...,  0.2367, -0.1714,  0.1181],
        ...,
        [-0.5259, -0.1410, -0.2117,  ..., -0.3948, -0.1146,  1.1619],
        [-0.7714,  0.0036, -0.1814,  ..., -0.3676,  0.2210,  0.5061],
        [ 0.1555, -0.2788, -0.1260,  ...,  0.1105, -0.0262, -0.0221]]), 'decode_mul': tensor([[ 0.1397, -0.0751, -0.0087, -0.0483, -0.0354,  0.0125, -0.1248, -0.0448,
          0.0123,  0.0499, -0.0774, -0.0273, -0.0259

In [15]:
# Report the shape of the tensors returned by the model.
# The f-strings use double quotes on the outside so the ['key'] lookups inside
# the braces stay valid on Python versions before 3.12, where reusing the
# enclosing quote character inside an f-string is a SyntaxError.
# Label key: "conv3" is the 'decode_mul' tensor; the two "final" lines are
# 'prob_adj' and 'decode_all_final' respectively.
print(f"conv1 shape: {list(prediction['conv1'].shape)}")
print(f"conv2 shape: {list(prediction['conv2'].shape)}")
print(f"conv3 shape: {list(prediction['decode_mul'].shape)}")
print(f"final shape: {list(prediction['prob_adj'].shape)}")
print(f"final shape: {list(prediction['decode_all_final'].shape)}")

conv1 shape: [7126, 64]
conv2 shape: [7126, 32]
conv3 shape: [1, 32]
final shape: [7126, 7126]
final shape: [2, 24355794]


In [16]:
# dummy input
x = torch.randn(7126, 128)
edge_index = torch.randint(0, 7126, (2, 77774))
dummy_input = (x, edge_index, edge_index)
prediction = model.forward(x, edge_index, edge_index)


In [17]:
# Dump the outputs of the dummy-input forward pass from the previous cell.
print(prediction)

{'conv1': tensor([[0.0000, 0.7129, 0.2120,  ..., 0.1438, 0.2927, 0.4204],
        [0.3989, 0.6252, 0.1501,  ..., 0.0000, 0.0000, 0.0000],
        [0.0400, 0.3666, 0.8766,  ..., 0.0000, 0.0827, 0.1051],
        ...,
        [0.0353, 0.0000, 0.0000,  ..., 0.0000, 0.0316, 0.0067],
        [0.0000, 0.0000, 0.0000,  ..., 0.7054, 0.0000, 0.0000],
        [0.0470, 0.0000, 0.0000,  ..., 0.4038, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>), 'conv2': tensor([[-0.1303,  0.0965,  0.2786,  ..., -0.2742,  0.1943,  0.0804],
        [ 0.1424, -0.2110,  0.2098,  ...,  0.0685, -0.0397,  0.3343],
        [-0.0070, -0.1382,  0.1839,  ...,  0.1299,  0.1559,  0.3097],
        ...,
        [-0.1183, -0.1159,  0.1970,  ..., -0.0684,  0.0632,  0.2109],
        [ 0.0006, -0.0153, -0.0956,  ..., -0.0583,  0.0209,  0.1832],
        [-0.0232, -0.0084,  0.1879,  ...,  0.0097,  0.0485,  0.2621]],
       grad_fn=<AddBackward0>), 'decode_mul': tensor([[-0.0037,  0.0052, -0.0201,  ..., -0.0127,  0.0071,  0.0590],


In [18]:
# python model to ONNX model
torch.onnx.export(model,               # model being run
                  dummy_input,         # model input 
                  "gnn_link_model.onnx",    # where to save the model
                  export_params=True,  # store the trained parameter weights inside the model file
                  opset_version=17,    # the ONNX version to export the model to
                #   do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['x', 'edge_index', 'edge_label_index'],   # the model's input names
                  output_names=['conv1', 'conv2', 'decode_mul', 'decode_sum', 'prob_adj', 'decode_all_final'],
                  dynamic_axes={'x': {0: 'num_nodes'},
                                'edge_index': {1: 'num_edges'},
                                'output': {0: 'batch_size'}})  # which axes should be considered dynamic)



In [19]:
# export data from dataset
import json, torch
def data_to_json(data):
    """Convert a graph object into a JSON-serialisable dict.

    Args:
        data: Object exposing ``x``, ``edge_index`` and ``y`` attributes, each
            either a tensor or ``None``.

    Returns:
        A dict with, in this order, the keys ``'x'``, ``'edge_index'`` and
        ``'y'`` (each present only when the attribute is not ``None``, stored
        as nested Python lists) followed by ``'batch'``: a list with one ``0``
        per node, i.e. every node is assigned to graph 0.

    The node count is taken from ``x``. When ``x`` is ``None`` it falls back
    to ``data.num_nodes`` if that is available, otherwise to 0, instead of
    failing on ``None.size``.
    """
    json_data = {}

    # Node features -> list of per-node lists.
    x = data.x
    if x is not None:
        json_data['x'] = x.tolist()

    # Edge index -> two lists (source row, target row).
    if data.edge_index is not None:
        json_data['edge_index'] = data.edge_index.tolist()

    # Labels -> flat list.
    if data.y is not None:
        json_data['y'] = data.y.tolist()

    if x is not None:
        num_nodes = x.size(0)
    else:
        num_nodes = getattr(data, 'num_nodes', None) or 0

    # All-zero batch vector, built directly as a Python list.
    json_data['batch'] = [0] * num_nodes

    return json_data

# Serialise the graph itself (dataset[0]) rather than the dataset wrapper, so
# the export does not depend on the dataset forwarding attribute access to the
# graph it holds.
json_data = data_to_json(dataset[0])
with open('twitch.json', 'w') as f:
    json.dump(json_data, f, indent=4)