Import

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

Loading graph and features

Reading data

In [2]:
node_feats = []
with open('node_feat.txt', 'r') as f:
    for line in f:
        node_feats.append(list(map(float, line.strip().split())))
node_feats = torch.Tensor(node_feats)
node_feats[0][:5]

tensor([ 0.0770, -0.0650, -0.1356,  0.0073, -0.0196])

In [3]:
edges = []
with open('train_edges.txt', 'r') as f:
    for line in f:
        edges.append(tuple(map(int, line.strip().split())))
edges = torch.tensor(edges, dtype=torch.int64)
edges[:5]

tensor([[ 0,  2],
        [ 1, 19],
        [ 1, 21],
        [ 1, 23],
        [ 1, 15]])

In [4]:
new_edges = []
with open('unlabeled_edges.txt', 'r') as f:
    for line in f:
        new_edges.append(tuple(map(int, line.strip().split())))
new_edges = torch.tensor(new_edges, dtype=torch.int64)

test_g = dgl.DGLGraph()
test_g.add_nodes(len(node_feats), {'feat': node_feats})
test_g.add_edges(new_edges[:, 0], new_edges[:, 1])
test_g



Graph(num_nodes=12588, num_edges=44014,
      ndata_schemes={'feat': Scheme(shape=(32,), dtype=torch.float32)}
      edata_schemes={})

Creating graph based on read data

In [5]:
g = dgl.DGLGraph()
g.add_nodes(len(node_feats), {'feat': node_feats})
g.add_edges(edges[:, 0], edges[:, 1])
g

Graph(num_nodes=12588, num_edges=14322,
      ndata_schemes={'feat': Scheme(shape=(32,), dtype=torch.float32)}
      edata_schemes={})

prepare training and testing sets

In [20]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.2)

train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
n = max(max(u), max(v)) + 1
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(n, n))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), 20 * g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [21]:
train_g = dgl.remove_edges(g, eids[:test_size])

Define a GraphSAGE model

In [22]:
from dgl.nn import SAGEConv, GraphConv

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'gcn')
        self.conv2 = SAGEConv(h_feats, h_feats, 'gcn')
        #self.conv3 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        # h = F.relu(h)
        # h = self.conv3(g, h)
        return h

Positive and negative graphs for train and test

In [23]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [24]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [25]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

Training loop

In [26]:
from sklearn.metrics import accuracy_score
n_hidden = 64
model = GraphSAGE(
    train_g.ndata['feat'].shape[1], 
    n_hidden,
)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(n_hidden)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_accuracy(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    scores = np.where(scores > 0, 1, 0)
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return accuracy_score(labels, scores)

In [27]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
losses = []
for e in range(300):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    losses.append(loss)
    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 50 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('accuracy', compute_accuracy(pos_score, neg_score))

In epoch 0, loss: 5.008874893188477
In epoch 50, loss: 0.6923273205757141
In epoch 100, loss: 0.6916512846946716
In epoch 150, loss: 0.6914926767349243
In epoch 200, loss: 0.6913081407546997
In epoch 250, loss: 0.6911448240280151
accuracy 0.6586941340782123


In [28]:
test_g.edges()

(tensor([11468,  3508,  6724,  ..., 11810, 12336,  1262]),
 tensor([ 1677,  8904,  2318,  ...,  4003, 12428,  2232]))

In [29]:
test_edges = test_g.edges()
preds = pred(test_g, h)
d = {}
for i in range(len(test_edges[0])):
    d[(test_edges[0][i].item(), test_edges[1][i].item())] = preds[i].item()
    d[(test_edges[1][i].item(), test_edges[0][i].item())] = preds[i].item()

new_edges_pred = []
for edge in new_edges:
    new_edges_pred.append(d[edge[0].item(), edge[1].item()])
new_edges_pred = np.array(new_edges_pred)
final_preds = np.where(new_edges_pred > 0, 1, 0)

In [30]:
final_preds

array([0, 0, 0, ..., 1, 0, 1])

In [31]:
with open('submit.txt', 'w') as f:
    for line in final_preds:
        f.write(f"{line}\n")