In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools

In [2]:
cc=pd.read_csv("dummy_cctrain.csv")
cc=cc.dropna()
cc=cc.drop_duplicates()
cc = cc.drop(index=[92202])
G = nx.from_pandas_edgelist(cc, source='src', target='dst') 
#nx.draw(g)

unique_conc=list(set(cc[["src","dst"]].values.ravel("K")))
total_conc=len(unique_conc)

print(f"Total concepts in edges_cc {len(unique_conc)}")
print("")
sub_grps=[G.subgraph(c).copy() for c in nx.connected_components(G)]

print(f"No. of connected subgraphs {len(sub_grps)}")
print("'")
for sub_grp in sub_grps:
    
    print(f"Total nodes in this sub graph are {sub_grp.number_of_nodes()}")

Total concepts in edges_cc 40312

No. of connected subgraphs 374
'
Total nodes in this sub graph are 39554
Total nodes in this sub graph are 3
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 3
Total nodes in this sub graph are 2
Total nodes in this sub graph are 4
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 4
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph are 2
Total nodes in this sub graph

In [3]:
G_main=sub_grps[0];
G_main_nodes = list(G_main);
total_conc_main=G_main.number_of_nodes()
print("")
print(f"Total number of nodes removed after finding connected graph {total_conc-total_conc_main}")


Total number of nodes removed after finding connected graph 758


In [4]:
g = dgl.from_networkx(G_main)

In [5]:
g.ndata["feat"] = torch.eye(g.num_nodes())

In [6]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing

""" 
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
""" 


' \nadj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))\nadj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())\nneg_u, neg_v = np.where(adj_neg != 0)\n\nneg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)\ntest_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]\ntrain_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]\n'

In [7]:
train_neg_u = np.load("train_neg_u.npy")
train_neg_v = np.load("train_neg_v.npy")
test_neg_u = np.load("test_neg_u.npy")
test_neg_v = np.load("test_neg_v.npy")


In [8]:
train_neg_u = torch.tensor(train_neg_u,dtype = torch.int64)
train_neg_v = torch.tensor(train_neg_v,dtype = torch.int64)
test_neg_u = torch.tensor(test_neg_u,dtype = torch.int64)
test_neg_v = torch.tensor(test_neg_v,dtype = torch.int64)

In [9]:
train_g = dgl.remove_edges(g, eids[:test_size])
train_g

Graph(num_nodes=39554, num_edges=337550,
      ndata_schemes={'feat': Scheme(shape=(39554,), dtype=torch.float32)}
      edata_schemes={})

In [10]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [11]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [12]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [13]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [14]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [17]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


    if e % 5 == 0:
        print('In epoch {}, train loss: {}'.format(e, loss))
        

In epoch 0, train loss: 0.6928998827934265
In epoch 5, train loss: 0.534268319606781
In epoch 10, train loss: 0.41612228751182556
In epoch 15, train loss: 0.3547072410583496
In epoch 20, train loss: 0.32018256187438965
In epoch 25, train loss: 0.2953540086746216
In epoch 30, train loss: 0.2667562961578369
In epoch 35, train loss: 0.23658570647239685
In epoch 40, train loss: 0.20967960357666016
In epoch 45, train loss: 0.18366242945194244
In epoch 50, train loss: 0.15911921858787537
In epoch 55, train loss: 0.13697385787963867
In epoch 60, train loss: 0.1169351115822792
In epoch 65, train loss: 0.09905430674552917
In epoch 70, train loss: 0.08312878757715225
In epoch 75, train loss: 0.06892715394496918
In epoch 80, train loss: 0.056369487196207047
In epoch 85, train loss: 0.04533293843269348
In epoch 90, train loss: 0.035783551633358
In epoch 95, train loss: 0.027776071801781654


In [18]:
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    #h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


AUC 0.8902456807918656


In [14]:
s = np.load("dummy_data/train_neg_v.npy")

In [15]:
s.shape

(168931,)

In [12]:
t = np.load("dummy_data/test_neg_v.npy")

In [13]:
t.shape[0]

37542