In [1]:
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
content=pd.read_csv('cora.content.txt',sep='\t',header=None)

In [3]:
el=pd.read_csv('cora.cites.txt',sep='\t',header=None)
el.columns=['source','target']
G=nx.from_pandas_edgelist(el)

In [4]:
node=list(content[0])
wordvec=[]
for i in range(len(content)):
    wordvec.append(list(content.loc[i,1:1433]))
field=list(content[1434])

In [5]:
for i,vertex in enumerate(node):
    # G.nodes[vertex]['content']=wordvec[i]
    G.nodes[vertex]['field']=field[i]

In [6]:

def random_mat(n,m):
    np.random.seed(1000)
    rdmat=np.zeros((n,m))
    for i in range(n):
        for j in range(m):
            rdmat[i,j]=np.random.rand()+1
    rdmat=rdmat/np.sum(rdmat,axis=1)[:,np.newaxis]
    return rdmat

def loged_division_scale(bs,maxnum,minnum):
    adj=(maxnum+minnum)/2
    b=np.array(bs)-adj
    return np.exp(b)/sum(np.exp(b))

def loged_division(bs):
    maxnum=max(bs)
    a=np.zeros(len(bs))
    ind=bs>=(maxnum-1000)
    a[ind]=loged_division_scale(bs[ind],maxnum=maxnum,minnum=min(bs[ind]))
    return a

In [5]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)  # 将顺序打乱

test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))  
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes()) 
neg_u, neg_v = np.where(adj_neg != 0)  

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

train_g = dgl.remove_edges(g, eids[:test_size])


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [23]:
from dgl.nn import SAGEConv


class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats,aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, h_feats,aggregator_type='mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [24]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())


In [25]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

In [41]:
model = GraphSAGE(g.ndata['feat'].shape[1], 7)
pred = DotPredictor()
def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
all_logits = []
for e in range(500):
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 0.6930727362632751
In epoch 5, loss: 0.6797953248023987
In epoch 10, loss: 0.6405137181282043
In epoch 15, loss: 0.5821075439453125
In epoch 20, loss: 0.5371377468109131
In epoch 25, loss: 0.5234941840171814
In epoch 30, loss: 0.5015429854393005
In epoch 35, loss: 0.4823758900165558
In epoch 40, loss: 0.465501070022583
In epoch 45, loss: 0.448535680770874
In epoch 50, loss: 0.43461722135543823
In epoch 55, loss: 0.42184972763061523
In epoch 60, loss: 0.408948689699173
In epoch 65, loss: 0.3967340290546417
In epoch 70, loss: 0.38518255949020386
In epoch 75, loss: 0.373823344707489
In epoch 80, loss: 0.36286282539367676
In epoch 85, loss: 0.3520471751689911
In epoch 90, loss: 0.3414112329483032
In epoch 95, loss: 0.33100834488868713
In epoch 100, loss: 0.32093438506126404
In epoch 105, loss: 0.31095990538597107
In epoch 110, loss: 0.3012780547142029
In epoch 115, loss: 0.29179197549819946
In epoch 120, loss: 0.2825547158718109
In epoch 125, loss: 0.2735680043697357
In e

In [51]:
to_pred_edge = dgl.graph((np.array([1]), np.array([4])), num_nodes=g.number_of_nodes())
pred(to_pred_edge,h)

tensor([-1.2234], grad_fn=<SelectBackward>)