/
experiments.py
339 lines (292 loc) · 14.6 KB
/
experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
from __future__ import print_function
import argparse
import numpy as np
import networkx as nx
import node2vec
import os
import pickle
import config
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
from rnn_batch_node2vec_pytorch import Node2Vec
def parse_args():
parser = argparse.ArgumentParser(description="Run node2vec.")
parser.add_argument('--input', nargs='?',
default=config.input_edgelist,
help='Input graph path')
parser.add_argument('--output', nargs='?', default=config.output_file,
help='Embeddings path')
parser.add_argument('--dimensions', type=int, default=config.dimensions,
help='Number of dimensions. Default is 128.')
parser.add_argument('--walk-length', type=int, default=config.walk_length,
help='Length of walk per source. Default is 80.')
parser.add_argument('--num-walks', type=int, default=config.num_walks,
help='Number of walks per source. Default is 10.')
parser.add_argument('--window-size', type=int, default=config.window_size,
help='Context size for optimization. Default is 10.')
parser.add_argument('--iter', default=1, type=int,
help='Number of epochs in SGD')
parser.add_argument('--workers', type=int, default=8,
help='Number of parallel workers. Default is 8.')
parser.add_argument('--p', type=float, default=config.p,
help='Return hyperparameter. Default is 1.')
parser.add_argument('--q', type=float, default=config.q,
help='Inout hyperparameter. Default is 1.')
parser.add_argument('--weighted', dest='weighted', action='store_true',
help='Boolean specifying (un)weighted. Default is unweighted.')
parser.add_argument('--unweighted', dest='unweighted', action='store_false')
parser.set_defaults(weighted=False)
parser.add_argument('--directed', dest='directed', action='store_true',
help='Graph is (un)directed. Default is undirected.')
parser.add_argument('--undirected', dest='undirected', action='store_false')
parser.set_defaults(directed=False)
return parser.parse_args()
def read_graph(file, get_connected_graph=True, remove_selfloops=True):
if args.weighted:
G = nx.read_edgelist(file, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
else:
G = nx.read_edgelist(file, nodetype=int, create_using=nx.DiGraph())
for edge in G.edges():
G[edge[0]][edge[1]]['weight'] = 1
print('Graph created!!!')
if remove_selfloops:
# remove the edges with selfloops
for node in G.nodes_with_selfloops():
G.remove_edge(node, node)
if not args.directed:
G = G.to_undirected()
if get_connected_graph and not nx.is_connected(G):
connected_sub_graph = max(nx.connected_component_subgraphs(G), key=len)
print('Initial graph not connected...returning the largest connected subgraph..')
return connected_sub_graph
else:
print('Returning undirected graph!')
return G
else:
print('Returning directed graph!')
return G
def learn_embeddings(walks, train_pos=None, train_neg=None, test_pos=None, test_neg=None, eval_bool=False):
# walks = [map(str, walk) for walk in walks] # this will work on python2 but not in python3
if not eval_bool:
print('Creating walk corpus..')
walks = [list(map(str, walk)) for walk in walks] # this is for python3
model = Node2Vec(walks=walks, output_file=args.output, walk_length=args.walk_length,
embedding_dim=args.dimensions,
epochs=args.iter, batch_size=config.batch_size, window_size=args.window_size, neg_sample_num=config.neg_samples)
print('Optimization started...')
model.train()
embeddings = model.wv
return embeddings
else:
model = Node2Vec(walks=None, output_file=args.output, walk_length=args.walk_length,
embedding_dim=args.dimensions,
epochs=args.iter, batch_size=config.batch_size, window_size=args.window_size, neg_sample_num=config.neg_samples)
print('Evaluation started...')
model.eval(train_pos, train_neg, test_pos, test_neg)
def create_train_test_splits(percent_pos, percent_neg, graph):
# This method creates the train,test splits...from the dataset
# we remove some percentage of the existing edges(ensuring the graph remains connected)
# and use them as positive samples
# then we sample the same amount of non-existing edges, using them as negative samples
# then do the same for testing set.
np.random.seed(0)
num_nodes = graph.number_of_nodes()
num_edges = graph.number_of_edges()
num_pos_train_edges = int(num_edges * percent_pos)
num_neg_train_edges = int(num_edges * percent_neg)
num_pos_test_edges = num_edges - num_pos_train_edges
num_neg_test_edges = num_edges - num_neg_train_edges
all_edges = [edge for edge in graph.edges]
all_nodes = [node for node in graph.nodes]
all_edges_set = set(all_edges) # for quick access
train_edges = set(all_edges)
test_edges = set()
counter1 = 0
counter2 = 0
if not args.directed:
original_connected_comps = nx.number_connected_components(graph)
print('Connected components: ', original_connected_comps)
print('Creating positive test samples..')
# shuffle the edges and iterate over them creating the test set
np.random.shuffle(all_edges)
# for idx, edge in enumerate(all_edges):
# if idx % 100 == 0:
# print('Edge: {}/{}'.format(idx, num_edges))
# print('Added: ', counter2)
# print('Not Added: ', counter1)
# node1 = edge[0]
# node2 = edge[1]
# # make sure that the graph remains connected
# # --from https://github.com/lucashu1/link-prediction/blob/master/gae/preprocessing.py
# graph.remove_edge(node1, node2)
# # if not args.directed:
# # if nx.number_connected_components(graph) > original_connected_comps:
# # graph.add_edge(node1, node2)
# # continue
# reachable_from_v1 = nx.connected._plain_bfs(graph, edge[0])
# if edge[1] not in reachable_from_v1:
# graph.add_edge(node1, node2)
# counter1 += 1
# continue
# # remove edges from the train_edges set and add them to the test_edges set --positive samples
# if len(test_edges) < num_pos_test_edges:
# test_edges.add(edge)
# train_edges.remove(edge)
# counter2 += 1
# elif len(test_edges) == num_pos_test_edges:
# if not args.directed:
# graph.add_edge(node1, node2)
# break
# now create false edges for test and train sets..making sure the edge is not a real edge
# and not already sampled
# first for test_set
print('Creating negative test samples..')
test_false_edges = set()
counter = 0
while len(test_false_edges) < num_neg_test_edges:
idx_i = np.random.randint(0, len(all_nodes))
idx_j = np.random.randint(0, len(all_nodes))
# we dont want to sample the same node
if idx_i == idx_j:
continue
sampled_edge = (all_nodes[idx_i], all_nodes[idx_j])
# check if we sampled a real edge or an already sampled edge
if sampled_edge in all_edges_set:
continue
if sampled_edge in test_false_edges:
continue
# everything is ok so we add the fake edge to the test_set
reachable_from_v1 = nx.connected._plain_bfs(graph, all_nodes[idx_i])
if all_nodes[idx_j] in reachable_from_v1:
#print(all_nodes[idx_i], all_nodes[idx_j])
counter += 1
test_false_edges.add(sampled_edge)
print(counter)
# do the same for the train_set
print('Creating negative training samples...')
train_false_edges = set()
while len(train_false_edges) < num_neg_train_edges:
idx_i = np.random.randint(0, num_nodes)
idx_j = np.random.randint(0, num_nodes)
# we don't want to sample the same node
if idx_i == idx_j:
continue
sampled_edge = (all_nodes[idx_i], all_nodes[idx_j])
# check if we sampled a real edge or an already sampled edge
if sampled_edge in all_edges_set:
continue
if sampled_edge in test_false_edges:
continue
if sampled_edge in train_false_edges:
continue
# everything is ok so we add the fake edge to the train_set
# reachable_from_v1 = nx.connected._plain_bfs(graph, all_nodes[idx_i])
# if all_nodes[idx_j] in reachable_from_v1:
# print(all_nodes[idx_i], all_nodes[idx_j])
train_false_edges.add(sampled_edge)
# asserts
assert test_false_edges.isdisjoint(all_edges_set)
assert train_false_edges.isdisjoint(all_edges_set)
assert test_false_edges.isdisjoint(train_false_edges)
assert test_edges.isdisjoint(train_edges)
# convert them back to lists and return them
train_pos = list(train_edges)
train_neg = list(train_false_edges)
test_pos = list(test_edges)
test_neg = list(test_false_edges)
# odir = 'tributary_of-undirected-dataset-train-test-splits'
# if not os.path.exists(odir):
# os.makedirs(odir)
#
# # save the splits
# with open("{}.p".format(os.path.join(odir, 'tributary_of_train_pos')), 'wb') as dump_file:
# pickle.dump(train_pos, dump_file)
# with open("{}.p".format(os.path.join(odir, 'tributary_of_train_neg')), 'wb') as dump_file:
# pickle.dump(train_neg, dump_file)
# with open("{}.p".format(os.path.join(odir, 'tributary_of_test_pos')), 'wb') as dump_file:
# pickle.dump(test_pos, dump_file)
# with open("{}.p".format(os.path.join(odir, 'tributary_of_test_neg')), 'wb') as dump_file:
# pickle.dump(test_neg, dump_file)
# #
# nx.write_edgelist(graph, os.path.join(odir, 'tributary_of_train_graph_undirected.edgelist'))
return train_pos, train_neg, test_pos, test_neg
# in the paper they get edge embeddings..they use a lot of methods but for link prediction they state that
# the Hadamard product is highly stable and gives the best performance.
def get_edge_embeddings(edge_list, node_embeddings):
# create a list containing edge embeddings
edge_embeddings = []
for edge in edge_list:
emb_node1 = node_embeddings[str(edge[0])]
emb_node2 = node_embeddings[str(edge[1])]
hadamard = np.multiply(emb_node1, emb_node2)
edge_embeddings.append(hadamard)
edge_embeddings = np.array(edge_embeddings)
return edge_embeddings
def load_embeddings(file):
node_embeddings = {}
odir = 'C:/Users/sotir/PycharmProjects/node2vec_average_embeddings/embeddings/'
with open("{}".format(os.path.join(odir, file)), 'r') as embeddings:
embeddings.readline()
for i, line in enumerate(embeddings):
line = line.strip().split(' ')
word = line[0]
embedding = [float(x) for x in line[1:]]
assert len(embedding) == 128
node_embeddings[word] = np.array(embedding)
return node_embeddings
def main(args):
nx_G = read_graph(file=args.input, get_connected_graph=False, remove_selfloops=True)
print(nx_G.number_of_nodes(), nx_G.number_of_edges())
train_pos = pickle.load(open(config.train_pos, 'rb'))
test_pos = pickle.load(open(config.test_pos, 'rb'))
train_neg = pickle.load(open(config.train_neg, 'rb'))
test_neg = pickle.load(open(config.test_neg, 'rb'))
# train_pos, train_neg, test_pos, test_neg = create_train_test_splits(0.5, 0.5, nx_G)
# train_neg, test_neg = create_train_test_splits(0.5, 0.5, nx_G)
print('Number of positive training samples: ', len(train_pos))
print('Number of negative training samples: ', len(train_neg))
print('Number of positive testing samples: ', len(test_pos))
print('Number of negative testing samples: ', len(test_neg))
train_graph = read_graph(
file=config.train_graph,
get_connected_graph=False,
remove_selfloops=False)
print(
'Train graph created: {} nodes, {} edges'.format(train_graph.number_of_nodes(), train_graph.number_of_edges()))
print('Number of connected components: ', nx.number_connected_components(train_graph))
G = node2vec.Graph(train_graph, args.directed, args.p, args.q)
G.preprocess_transition_probs()
walks = G.simulate_walks(args.num_walks, args.walk_length)
# walks = [['1', '2345', '3356', '4446', '5354', '6124', '7457', '8445', '9790', '1022', '1133'],
# ['6914', '1022', '9780', '8445', '7457', '6123', '5354', '4446', '3356', '2345', '1'],
# ['6914', '1022', '9790', '8445', '7457', '6123', '5354', '4446', '3356', '2345', '1'],
# ['6914', '1022', '9790', '8445', '7457', '6123', '5354', '4446', '3356', '2345', '1'],
# ['6914', '1022', '9790', '8445', '7457', '6123', '5354', '4446', '3356', '2345', '1', '9999', '5000', '2000', '1000']]
node_embeddings = learn_embeddings(walks)
# for training
train_pos_edge_embs = get_edge_embeddings(train_pos, node_embeddings)
train_neg_edge_embs = get_edge_embeddings(train_neg, node_embeddings)
train_set = np.concatenate([train_pos_edge_embs, train_neg_edge_embs])
# labels: 1-> link exists, 0-> false edge
train_labels = np.zeros(len(train_set))
train_labels[:len(train_pos_edge_embs)] = 1
# for testing
test_pos_edge_embs = get_edge_embeddings(test_pos, node_embeddings)
test_neg_edge_embs = get_edge_embeddings(test_neg, node_embeddings)
test_set = np.concatenate([test_pos_edge_embs, test_neg_edge_embs])
# labels: 1-> link exists, 0-> false edge
test_labels = np.zeros(len(test_set))
test_labels[:len(test_pos_edge_embs)] = 1
# train the classifier and evaluate in the test set
classifier = LogisticRegression(random_state=0)
classifier.fit(train_set, train_labels)
# evaluate
test_preds = classifier.predict_proba(test_set)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, test_preds)
test_auc = auc(false_positive_rate, true_positive_rate)
test_roc = roc_auc_score(test_labels, test_preds)
print('node2vec Test ROC score: ', str(test_roc))
print('node2vec Test AUC score: ', str(test_auc))
if __name__ == "__main__":
args = parse_args()
main(args)