In [1]:
import itertools

import dgl
import dgl.data
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
dataset = dgl.data.CoraGraphDataset("./")

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [3]:
g = dataset[0]

### prepare training and testing set

In [4]:
u, v = g.edges()
eids = np.arange(g.num_edges())
eids = np.random.permutation(eids)

In [6]:
test_size = int(len(eids) * 0.2)
train_size = len(eids) - test_size

In [102]:
# 数据集读出来的边都是正样本
test_u_pos, test_v_pos = u[eids[:test_size]], v[eids[:test_size]]
train_u_pos, train_v_pos = u[eids[test_size:]], v[eids[test_size:]]
n_nodes = g.num_nodes()
print("n_nodes:", n_nodes)

# coo_matrix api: (data,(row_idx,col_idx))
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.toarray() - np.eye(n_nodes)
neg_u, neg_v = np.where(adj_neg == 1)

# g.num_edges()是正样本个数，负样本取相同数量
neg_eids = np.random.choice(len(neg_u), g.num_edges())
test_u_neg, test_v_neg = (neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]])
train_u_neg, train_v_neg = (neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]])

print("训练集正样本边个数：", len(train_u_pos))
print("训练集负样本边个数：", len(train_u_neg))
print("测试集正样本边个数：", len(test_u_pos))
print("测试集负样本边个数：", len(test_u_neg))

n_nodes: 2708
训练集正样本边个数： 8445
训练集负样本边个数： 8445
测试集正样本边个数： 2111
测试集负样本边个数： 2111


In [117]:
# 移除测试边
train_g = dgl.remove_edges(g, eids[:test_size])
print(train_g)

Graph(num_nodes=2708, num_edges=8445,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'label': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'feat': Scheme(shape=(1433,), dtype=torch.float32)}
      edata_schemes={})


### 模型

In [208]:
from dgl.nn import SAGEConv

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feats):
        h = self.conv1(g, in_feats)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [209]:
train_g_pos = dgl.graph((train_u_pos, train_v_pos), num_nodes=g.num_nodes())
train_g_neg = dgl.graph((train_u_neg, train_v_neg), num_nodes=g.num_nodes())

test_g_pos = dgl.graph((test_u_pos, test_v_pos), num_nodes=g.num_nodes())
test_g_neg = dgl.graph((test_u_neg, test_v_neg), num_nodes=g.num_nodes())

tensor([1, 2, 3])

In [371]:
import dgl.function as fn

# 预测器


class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.apply_edges(fn.u_dot_v("h", "h", "score"))
            return g.edata["score"][:, 0]


class MLPPredictor(nn.Module):
    def __init__(self, n_feats):
        super(MLPPredictor, self).__init__()
        self.w1 = nn.Linear(n_feats * 2, n_feats)
        self.w2 = nn.Linear(n_feats, 1)

    def apply_edges(self, edges):
        h1 = edges.src["h"]
        h2 = edges.dst["h"]
        h = torch.cat([h1, h2], dim=1)
        h = self.w2(F.relu(self.w1(h)))
        return {"score": h.squeeze()}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.apply_edges(self.apply_edges)

            return g.edata["score"]
            # return g.edata["score"][:, 0]

In [372]:
from sklearn.metrics import roc_auc_score


def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    )
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

In [373]:
model = GraphSAGE(train_g.ndata["feat"].shape[1], 16)
# pred_func = DotPredictor()
pred_func = MLPPredictor(16)

In [374]:
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred_func.parameters()), lr=0.01
)

### training

In [381]:
for e in range(100):
    h = model(train_g, train_g.ndata["feat"])
    pos_score = pred_func(train_g_pos, h)
    neg_score = pred_func(train_g_neg, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 10 == 0:
        print(f"epoch {e}, loss: {loss}")

epoch 0, loss: 0.0009329569293186069
epoch 10, loss: 0.0007845202926546335
epoch 20, loss: 0.0006720645469613373
epoch 30, loss: 0.0005841287784278393
epoch 40, loss: 0.0005138266133144498
epoch 50, loss: 0.0004566173884086311
epoch 60, loss: 0.0004093478200957179
epoch 70, loss: 0.00036978654679842293
epoch 80, loss: 0.0003363025316502899
epoch 90, loss: 0.00030760039226152003


In [379]:
with torch.no_grad():
    model.eval()
    pos_score = pred_func(test_g_pos, h)
    neg_score = pred_func(test_g_neg, h)
    print("AUC", compute_auc(pos_score, neg_score))

AUC 0.7948305339763451
