In [1]:
import dgl
import numpy as np
import networkx as nx
from node2vec import Node2Vec
import matplotlib.pyplot as plt
from operator import itemgetter
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors
import itertools

import dgl.data
import dgl.function as fn
import dgl.nn.pytorch as dglnn
from dgl.nn import GraphConv
from dgl.nn import SumPooling
from dgl.nn import DenseGraphConv
from dgl.nn import SAGEConv

from tqdm import tqdm
import time

Using backend: pytorch


In [3]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [4]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [5]:
class FC(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats*2, h_feats*2)
        self.W2 = nn.Linear(h_feats*2, h_feats)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [6]:
def positive_sample(graph, test_size=0.1):
    u, v = graph.edges()
    eids = np.random.permutation(np.arange(graph.number_of_edges())) #random index edges
    test_size_idx = int(len(eids) * test_size) #size positive sample by index

    test_pos_u, test_pos_v = u[eids[:test_size_idx]], v[eids[:test_size_idx]]
    train_pos_u, train_pos_v = u[eids[test_size_idx:]], v[eids[test_size_idx:]] 
    
    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=graph.number_of_nodes())
    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=graph.number_of_nodes())
    
    return train_pos_g, test_pos_g

In [27]:
spmat = sp.rand(500, 500, density=0.01) # 5% nonzero entries
g = dgl.from_scipy(spmat)
g

Graph(num_nodes=500, num_edges=2500,
      ndata_schemes={}
      edata_schemes={})

In [28]:
# dataset = dgl.data.CoraGraphDataset()
# g = dataset[0]

# g = dgl.remove_edges(g, eids[:1000], ) #subgraph
# g = dgl.remove_nodes(g, range(2000))
# print(g)

G = g.to_networkx()

print(len(list(G.nodes())))
print(len(list(G.edges())))

500
2500


In [29]:
print(list(G.nodes())[:10])
print(list(g.nodes())[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[tensor(0), tensor(1), tensor(2), tensor(3), tensor(4), tensor(5), tensor(6), tensor(7), tensor(8), tensor(9)]


In [30]:
print(list(G.edges())[:10])
print(list(g.edges())[0][:10], list(g.edges())[1][:10])

[(0, 198), (0, 432), (1, 62), (1, 440), (1, 82), (1, 463), (1, 180), (2, 394), (2, 14), (2, 222)]
tensor([476, 408, 131, 428, 276, 493, 306,   5, 378, 327]) tensor([ 28, 360, 476, 284, 142,  93,   5,  85, 123, 289])


In [31]:
def alternate_list(a,b):
    c = list()
    for x in range(len(a)):
        c.extend([a[x], b[x]])
    return c

In [32]:
node2vec = Node2Vec(G, dimensions=3, walk_length=3)#, workers=6)
model_n2v = node2vec.fit(window=3, min_count=1)
embeddings = np.array([model_n2v.wv[x] for x in list(G.nodes())])
embeddings = torch.from_numpy(embeddings)
g.ndata['feat'] = embeddings                       # f_u

Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 195.93it/s]


In [33]:
print(list(G.nodes())[:10])
print(list(g.nodes())[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[tensor(0), tensor(1), tensor(2), tensor(3), tensor(4), tensor(5), tensor(6), tensor(7), tensor(8), tensor(9)]


In [34]:
embeddings[:5]

tensor([[-0.4910,  0.5045,  1.1666],
        [-0.4397,  0.4268,  1.1207],
        [-0.3721,  0.8706,  1.0431],
        [-0.7498,  0.7491,  0.9239],
        [-0.9081,  0.6458,  0.9013]])

In [35]:
g.ndata['feat'][:5]

tensor([[-0.4910,  0.5045,  1.1666],
        [-0.4397,  0.4268,  1.1207],
        [-0.3721,  0.8706,  1.0431],
        [-0.7498,  0.7491,  0.9239],
        [-0.9081,  0.6458,  0.9013]])

In [36]:
model_n2v.wv[0]

array([-0.49101672,  0.5044954 ,  1.1666161 ], dtype=float32)

In [37]:
eids = np.random.permutation(np.arange(g.number_of_edges()))
train_g = g#dgl.remove_edges(g, eids[:int(len(eids) * 0.1)]) #subgraph
print(train_g)

Graph(num_nodes=500, num_edges=2500,
      ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={})


### line graph

In [38]:
temp_G = nx.DiGraph()
temp_G.add_edges_from(list(G.edges()))
LnxG = nx.line_graph(temp_G)
# #lg = g.line_graph(backtracking=False)
# #lg = dgl.from_networkx(G).line_graph(backtracking=False)
# lg = dgl.from_networkx(LnxG)
# LG = lg.to_networkx()

# dual_nodes_dict = {}
# for idx, val in enumerate(list(LnxG.nodes())):
#     dual_nodes_dict[val] = idx


In [39]:
print(len(list(LnxG.nodes())), sorted(list(LnxG.nodes()),reverse=False)[:10])
print(len(list(LnxG.edges())), list(LnxG.edges())[:10])

2500 [(0, 198), (0, 432), (1, 62), (1, 82), (1, 180), (1, 440), (1, 463), (2, 14), (2, 16), (2, 147)]
12508 [((0, 198), (198, 435)), ((0, 198), (198, 210)), ((0, 198), (198, 498)), ((198, 435), (435, 263)), ((198, 435), (435, 232)), ((198, 435), (435, 209)), ((198, 435), (435, 421)), ((198, 435), (435, 336)), ((198, 435), (435, 332)), ((198, 435), (435, 186))]


In [40]:
def create_dgl_nx_dual_graph(line_nx_graph):
    nodes = sorted(list(line_nx_graph.nodes()),reverse=False)
    edges = list(line_nx_graph.edges())
    nodes_dict = {}
    new_u, new_v = [], []
    
    for idx, val in enumerate(nodes):
        nodes_dict[val] = idx
    
    for edge in edges:
        new_u.append(nodes_dict[edge[0]])
        new_v.append(nodes_dict[edge[1]])
    
    u = torch.tensor(new_u)
    v = torch.tensor(new_v)
    g = dgl.graph((u, v))
    G = g.to_networkx()
    return g, G
    
    
lg, LG = create_dgl_nx_dual_graph(LnxG)
dual_edges_dict = {edge: num for num, edge in enumerate(list(LG.edges()))}
dual_nodes_dict = {node: num for num, node in enumerate(sorted(list(LnxG.nodes()),reverse=False))}

In [57]:
# print(dual_nodes_dict[(0, 633)],dual_nodes_dict[(0, 1862)])

In [42]:
#print(dual_nodes_dict[:10])
#print(dual_edges_dict[:10])
print(len(list(LG.nodes)), list(LG.nodes)[:10])
print(len(list(LG.edges)))
print(list(LG.edges())[:10])

2500 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
12508
[(0, 993), (0, 992), (0, 994), (1, 2182), (1, 2178), (1, 2179), (1, 2181), (1, 2180), (2, 312), (2, 313)]


In [43]:
print(len(list(lg.nodes())), list(lg.nodes())[:10])
print(len(list(lg.edges())[0]))
print(list(lg.edges())[0][:10], list(lg.edges())[0][:10])

2500 [tensor(0), tensor(1), tensor(2), tensor(3), tensor(4), tensor(5), tensor(6), tensor(7), tensor(8), tensor(9)]
12508
tensor([  0,   0,   0, 993, 993, 993, 993, 993, 993, 993]) tensor([  0,   0,   0, 993, 993, 993, 993, 993, 993, 993])


In [56]:
# model_n2v_dual.wv[1]

In [45]:
#m = nn.AvgPool1d(2, stride=2)
node2vec = Node2Vec(LnxG, dimensions=3, walk_length=3)
model_n2v_dual = node2vec.fit(window=3, min_count=1)
#embeddings_dual = [[alternate_list(model_n2v_dual.wv[x][0],model_n2v_dual.wv[x][1]) for x in list(LnxG.nodes)]]
embeddings_dual = [model_n2v_dual.wv[x] for x in list(LG.nodes)]
#embeddings_dual = m(torch.tensor(embeddings_dual))[0]
#embeddings_dual = (torch.tensor(embeddings_dual))[0]
embeddings_dual = torch.tensor(embeddings_dual)
lg.ndata['feat'] = embeddings_dual                #f_uv^*

Computing transition probabilities:   0%|          | 0/2500 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 35.73it/s]


In [46]:
lg.ndata['feat'][:5]

tensor([[ 0.3280,  0.6446,  2.4208],
        [-0.2927,  0.0932,  1.8933],
        [ 0.0603,  2.3261,  1.2868],
        [-1.6834,  2.0137,  0.6531],
        [-1.1967,  1.5019,  1.0093]])

In [47]:
embeddings_dual[:5]

tensor([[ 0.3280,  0.6446,  2.4208],
        [-0.2927,  0.0932,  1.8933],
        [ 0.0603,  2.3261,  1.2868],
        [-1.6834,  2.0137,  0.6531],
        [-1.1967,  1.5019,  1.0093]])

In [48]:
dual_eids = np.random.permutation(np.arange(lg.number_of_edges()))
train_dual_g = lg#dgl.remove_edges(dual_g, eids[:int(len(dual_eids) * 0.1)]) #subgraph
print(train_dual_g)

Graph(num_nodes=2500, num_edges=12508,
      ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float32)}
      edata_schemes={})


In [49]:
# print(list(train_dual_g.nodes))
# list(train_dual_g.edges)
#list(train_dual_g.nodes())

In [50]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 3)
#pred = MLPPredictor(20)
FC_net = FC(3)
###############################################################
model_dual = GraphSAGE(train_dual_g.ndata['feat'].shape[1], 3)
#pred_dual = MLPPredictor(20)
FC_net_dual = FC(3)
#model = SAGE(train_g.ndata['feat'].shape[1], 20, 4, F.relu, 0.25)
#pred = DotPredictor()

In [51]:
#train_pos_g, test_pos_g = positive_sample(g)
#train_neg_g, test_neg_g = negative_sample(g, 'dgl_example')
###########################################################
#train_pos_dual_g, test_pos_dual_g = positive_sample(lg)
#train_neg_dual_g, test_neg_dual_g = negative_sample(dual_g, 'dgl_example')

In [52]:
print(len(dual_nodes_dict))
print(len(dual_edges_dict))
# list(LnxG.edges())

2500
12508


In [53]:
def g_u_star(G, LnxG, pos_score_dual):
    node_features = np.zeros((G.number_of_nodes(), 3))
    counts = np.zeros((G.number_of_nodes(), 1))

    for node in list(G.nodes()):
        for i, edge in enumerate(list(LnxG.edges())):
            if (node in edge[0])or(node in edge[1]):
                n1, n2 = dual_nodes_dict[edge[0]], dual_nodes_dict[edge[1]]
                #print(edge, n1, n2)
                try:
                    num_embd_edge_G_star = dual_edges_dict[(n1, n2)]
                    node_features[node] += pos_score_dual[num_embd_edge_G_star].detach().numpy()
                    counts[node] += 1
                    #print(pos_score_dual[num_embd_edge_G_star])
                except: print('NUN', edge)
        #print(counts)
        node_features[node]/=counts[node]
        
    return torch.from_numpy(node_features)

In [54]:
def compute_loss_1(z_u, g_u_star, g_uv, z_uv_star):
    alfa, beta = 0.5, 1.
    return alfa*((z_u - g_u_star)**2).mean()+ beta*((g_uv - z_uv_star)**2).mean()
    #return alfa*F.binary_cross_entropy_with_logits(z_u, g_u_star)

In [55]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), FC_net.parameters()), lr=0.01)
optimizer_d = torch.optim.Adam(itertools.chain(model_dual.parameters(), FC_net_dual.parameters()), lr=0.01)

all_logits, diff = [], 0
for e in tqdm(range(20)):
    h = model(train_g, embeddings)                              #z_u
    h_dual = model_dual(train_dual_g, embeddings_dual)          #z_uv^*
    
    pos_score = FC_net(train_g, h)                              #g_uv
    #neg_score = pred(train_neg_g, h)                           #g_uv -
    pos_score_dual = FC_net_dual(train_dual_g, h_dual)          #g_u^*
    #neg_score_dual = pred(train_neg_dual_g, h_dual)            #g_u^* -
    start = time.time()
    g_u_s = g_u_star(G, LnxG, pos_score_dual)
    end = time.time()
    diff += int(end - start)
    #print(diff)
    
    #loss = compute_loss(pos_score, neg_score) 
    loss = compute_loss_1(h, g_u_s, pos_score, h_dual)
    
    # print(h.shape)
    # print(h_dual.shape)
    # print(pos_score.shape)
    # print(pos_score_dual.shape)
    # print(g_u_s.shape)
    # print(loss, F.mse_loss(h, g_u_s), F.mse_loss(pos_score, h_dual))
    
    optimizer.zero_grad()
    optimizer_d.zero_grad()
    loss.backward()
    optimizer.step()
    optimizer_d.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}, time_g_u_s: {} s'.format(e, loss, diff))
        diff = 0


from sklearn.metrics import roc_auc_score
#with torch.no_grad():
    #pos_score = pred(test_pos_g, h)
    #neg_score = pred(test_neg_g, h)
    #print('AUC', compute_auc(pos_score, neg_score))
    
# torch.Size([7, 3])
# torch.Size([8, 3])
# torch.Size([8, 3])
# torch.Size([12, 3])


  5%|▌         | 1/20 [00:04<01:19,  4.18s/it]

In epoch 0, loss: 6.121008622668738, time_g_u_s: 4 s


 30%|███       | 6/20 [00:24<00:58,  4.19s/it]

In epoch 5, loss: 1.9246748981456883, time_g_u_s: 19 s


 55%|█████▌    | 11/20 [00:46<00:38,  4.23s/it]

In epoch 10, loss: 0.9036029286468674, time_g_u_s: 20 s


 80%|████████  | 16/20 [01:06<00:16,  4.05s/it]

In epoch 15, loss: 0.4599778798371139, time_g_u_s: 16 s


100%|██████████| 20/20 [01:22<00:00,  4.12s/it]


In [19]:
##################################################################################################################################################################

In [40]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

all_logits = []
for e in range(100):
    h = model(train_dual_g, embeddings)  #train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))


from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 0.706829845905304
In epoch 5, loss: 0.6571868062019348
In epoch 10, loss: 0.6407327055931091
In epoch 15, loss: 0.6243399381637573
In epoch 20, loss: 0.6006797552108765
In epoch 25, loss: 0.5711714625358582
In epoch 30, loss: 0.546612560749054
In epoch 35, loss: 0.5299765467643738
In epoch 40, loss: 0.5167433023452759
In epoch 45, loss: 0.5066409111022949
In epoch 50, loss: 0.4984222650527954
In epoch 55, loss: 0.49076011776924133
In epoch 60, loss: 0.48473072052001953
In epoch 65, loss: 0.48049843311309814
In epoch 70, loss: 0.4720494747161865
In epoch 75, loss: 0.4649830758571625
In epoch 80, loss: 0.4580879807472229
In epoch 85, loss: 0.44973331689834595
In epoch 90, loss: 0.44106853008270264
In epoch 95, loss: 0.4309082329273224
AUC 0.8564272029271526


In [None]:
train_g = dgl.remove_edges(g, eids[:1000], ) #subgraph
train_g = dgl.remove_nodes(train_g, range(2000))
print(train_g)

label2 = train_g.ndata['label']
nx_G2 = train_g.to_networkx()

#visualize(label2, nx_G2)

# TRASH

In [None]:
'''
data_dir = os.path.expanduser("~/data/cora")

data_dir
#edgelist = pd.read_csv(os.path.join(data_dir, "cora.cites"), sep='\t', header=None, names=["target", "source"])

edgelist = pd.read_csv(f"./data/cora/cora.cites", sep='\t', header=None, names=["target", "source"])
edgelist["label"] = "cites"

edgelist.sample(frac=1).head(5)

Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
#nx.set_node_attributes(Gnx, "paper", "label")

In [572]:
nodelist = sorted(G.nodes())
adj_mat = sp.csr_matrix(nx.to_numpy_matrix(G, nodelist))
args_d = LnxG.number_of_nodes()
edgelist = list(LnxG.nodes())

edge_embs = h_dual
enum = {node: num for num, node in enumerate(list(G.nodes()))}
print(enum)
num_nodes = adj_mat.shape[0]
node_features = np.zeros((num_nodes, args_d))
counts = np.ones(num_nodes)
for i, edge in enumerate(edgelist):
    # средние по эмбеддингам рёбер, связанных с данной вершиной
    print(i, edge)
    u = enum[edge[0]]; v = enum[edge[1]]
    print(u, v)
    node_features[u, :] += edge_embs[i, :]
    node_features[v, :] -= edge_embs[i, :]
    counts[u] += 1; counts[v] += 1
node_features /= counts[:, np.newaxis]

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
0 (5, 6)
5 6


TypeError: Concatenation operation is not implemented for NumPy arrays, use np.concatenate() instead. Please do not rely on this error; it may not be given on all Python implementations.