from collections.abc import Iterable
from copy import deepcopy
import csv
import gc
import io
import networkx as nx
import numba
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import csv as pa_csv
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
import time
import warnings
from csrgraph.numba_backend.methods import (
_row_norm, _node_degrees, _src_multiply, _dst_multiply
)
from csrgraph.numba_backend.random_walks import (
_random_walk, _node2vec_walks,_node2vec_walks_with_rejective_sampling, seed_numba
)
from csrgraph.numba_backend import methods, random_walks
from csrgraph.embedding import ggvec, glove, grarep
__all__ = [
"csrgraph",
"read_edgelist",
"from_df",
"from_tuples",
]
UINT32_MAX = (2**32) - 1
UINT16_MAX = (2**16) - 1
class csrgraph():
"""
This top-level Python class dispatches either to external JIT-compiled (numba) methods
or to methods of the JIT-compiled internal graph.
"""
def __init__(self, data, nodenames=None, copy=True, threads=0):
"""
A class for larger graphs.
NOTE: this class tends to "steal" the data it is given by default.
If you pass a numpy array, scipy matrix, or csrgraph,
chances are this object will point to the same underlying data rather than a copy.
Parameters:
-------------------
data : The graph data. Can be one of:
**NetworkX Graph**
**Numpy dense matrix**
**CSR Matrix**
**(data, indices, indptr)**
**CSRGraph object**
nodenames (array of str or int) : Node names
The position in this array should correspond with the node ID
So if passing a CSR Matrix or raw data, it should be co-indexed
with the matrix/raw data arrays
copy : bool
Whether to copy the passed data when creating the new object.
Default behavior is to point to the data passed to the constructor.
For networkX graphs and numpy dense matrices a new object is created anyway.
threads : int
Number of threads to use for methods on this graph.
WARNING: this works by changing the numba environment variable,
and JIT'ed methods are recompiled whenever the thread count changes.
0 uses the numba default (usually all threads).
TODO: add numpy mmap support for very large on-disk graphs
Should be in a different class
This also requires routines to read/write edgelists, etc. from disk
TODO: subclass scipy.csr_matrix???
"""
self.read_input_data(data, copy)
# Now that we have the core csr_matrix, alias underlying arrays
assert hasattr(self, 'weights')
assert hasattr(self, 'src')
assert hasattr(self, 'dst')
# Convert to Arrow data
self.weights = pa.Array.from_pandas(self.weights)
self.src = pa.Array.from_pandas(self.src)
self.dst = pa.Array.from_pandas(self.dst)
# indptr has one more element than nnodes
self.nnodes = len(self.src) - 1
# If no node names, just a range of ints
if nodenames is not None:
self.names = nodenames
if not hasattr(self, 'names'):
self.names = pd.Series(np.arange(self.nnodes))
self.names = pa.Array.from_pandas(self.names)
if self.nnodes != len(self.names):
raise ValueError(
"number of node names must equal the number of nodes\n"
+ f"nnodes: {self.nnodes}, indptr length: {len(self.src)}\n"
+ f"number of node names: {len(self.names)}"
)
# Bounds check once here otherwise there be dragons later
max_idx = np.max(self.dst)
if self.nnodes <= max_idx:
raise ValueError(
f"Out of bounds node: {max_idx}, nnodes: {self.nnodes}"
)
self.set_threads(threads)
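# Construction sketch (illustrative only; the toy adjacency matrix and node
# names below are hypothetical, not part of the library):
#
#   adj = sparse.csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32))
#   G = csrgraph(adj, nodenames=pd.Series(["a", "b"]))
#   G.nnodes  # -> 2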
def read_input_data(self, data, copy=True):
"""
Convert from whatever input data format to our format
data : The graph data. Can be one of:
**NetworkX Graph**
**Numpy dense matrix**
**CSR Matrix**
**(data, indices, indptr)**
**CSRGraph object**
copy : bool
Whether to copy the passed data when creating the new object.
Default behavior is to point to the data passed to the constructor.
For networkX graphs and numpy dense matrices a new object is created anyway.
"""
# If input is a csrgraph, reuse (or copy) its underlying arrays
if isinstance(data, csrgraph):
if copy:
self.weights = data.weights.copy()
self.src = data.src.copy()
self.dst = data.dst.copy()
self.names = pd.Series(deepcopy(data.names))
else:
self.names = pd.Series(data.names)
self.weights = data.weights
self.src = data.src
self.dst = data.dst
# NetworkX Graph input
elif isinstance(data, (nx.Graph, nx.DiGraph)):
mat = nx.adjacency_matrix(data)
mat.data = mat.data.astype(np.float32)
self.weights = mat.data
self.src = mat.indptr
self.dst = mat.indices
self.names = pd.Series(list(data))
# CSR Matrix Input
elif isinstance(data, (sparse.csr_matrix, sparse.csr_array)):
if copy: mat = data.copy()
else: mat = data
self.weights = mat.data
self.src = mat.indptr
self.dst = mat.indices
# Numpy Array Input
elif isinstance(data, np.ndarray):
mat = sparse.csr_matrix(data)
self.weights = mat.data
self.src = mat.indptr
self.dst = mat.indices
else:
raise ValueError(f"Incorrect input data type: {type(data).__name__}")
def set_threads(self, threads):
self.threads = threads
# Manage threading through Numba hack
if type(threads) is not int:
raise ValueError("Threads argument must be an int!")
if threads == 0:
threads = numba.config.NUMBA_DEFAULT_NUM_THREADS
threads = str(threads)
# If we change the number of threads, recompile
try:
prev_numba_value = os.environ['NUMBA_NUM_THREADS']
except KeyError:
prev_numba_value = threads
if threads != prev_numba_value:
os.environ['NUMBA_NUM_THREADS'] = threads
_random_walk.recompile()
_row_norm.recompile()
_node2vec_walks.recompile()
_node_degrees.recompile()
_src_multiply.recompile()
_dst_multiply.recompile()
def __getitem__(self, node):
"""
[] operator
like networkX, gets names of neighbor nodes
"""
# Get node ID from names array
# This is O(n) by design -- we more often get names from IDs
# than we get IDs from names and we don't want to hold 2 maps
# node_id = self.names.get_loc(node)
# edges = self.dst[
# (self.src[node_id]).as_py()
# : (self.src[node_id+1]).as_py()
# ]
# Validate that the lookup key can be cast to the node names' dtype
pc.cast(node, self.names.type)
node_idx = pc.index(self.names, node).as_py()
src_idx_start = self.src[node_idx].as_py()
src_idx_end = self.src[node_idx + 1].as_py()
edge_idx = self.dst[src_idx_start: src_idx_end]
return pc.take(self.names, edge_idx)
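# Lookup sketch (node names "a"/"b" are hypothetical): G["a"] returns a
# pyarrow Array with the names of "a"'s neighbors, e.g. ["b"] for the toy
# graph above. Lookup is O(n) in the number of nodes by design.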
def nodes(self):
"""
Returns the graph's nodes, in order
"""
if self.names is not None:
return self.names
else:
return np.arange(self.nnodes)
def normalize(self, return_self=True):
"""
Normalizes edge weights per node
For any node in the Graph, the new edges' weights will sum to 1
return_self : bool
whether to change the graph's values and return itself
this lets us call `G.normalize()` directly
"""
new_weights = _row_norm(self.weights.to_numpy(), self.src.to_numpy())
if return_self:
G_new = csrgraph(sparse.csr_matrix((new_weights, self.dst, self.src)))
# Point objects to the correct places
self.weights = G_new.weights
self.src = G_new.src
self.dst = G_new.dst
G_new = None
gc.collect()
return self
else:
return csrgraph(sparse.csr_matrix(
(new_weights, self.dst, self.src)),
nodenames=self.names)
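# Normalization sketch: afterwards each node's outgoing edge weights sum to 1.
#
#   H = G.normalize(return_self=False)   # returns a new normalized graph
#   G.normalize()                         # normalizes G in place and returns it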
def random_walks(self,
walklen=10,
epochs=1,
start_nodes=None,
normalize_self=False,
return_weight=1.,
neighbor_weight=1.,
rejective_sampling=False,
seed=None):
"""
Create random walks from the transition matrix of a graph
in CSR sparse format
Parameters
----------
walklen : int
length of the random walks
epochs : int
number of times to start a walk from each node
start_nodes : np.array of int or None
node IDs to start the walks from (default None, meaning all nodes)
return_weight : float in (0, inf]
Weight on the probability of returning to the node we are coming from.
Having this higher tends the walks to be
more like a Breadth-First Search.
Having this very high (> 2) makes search very local.
Equal to the inverse of p in the Node2Vec paper.
neighbor_weight : float in (0, inf]
Weight on the probability of visiting a neighbor node
of the one we are coming from in the random walk.
Having this higher tends the walks to be
more like a Depth-First Search.
Having this very high makes search more outward.
Having this very low makes search very local.
Equal to the inverse of q in the Node2Vec paper.
rejective_sampling : bool
For deepwalk (p=1, q=1) this parameter has no effect.
For node2vec walks it determines whether to use rejection ("rejective") sampling.
Rejection sampling is faster, but less stable.
Credit to https://github.com/louisabraham/fastnode2vec for the original idea
seed : int or None
Random seed for the numba walk kernels (None leaves the current state).
Returns
-------
out : 2d np.array (n_walks, walklen)
A matrix where each row is a random walk,
and each entry is the ID of the node
"""
if seed is not None:
seed = int(seed)
seed_numba(seed)
# Make csr graph
if normalize_self:
self.normalize(return_self=True)
T = self
else:
T = self.normalize(return_self=False)
n_rows = T.nnodes
if start_nodes is None:
start_nodes = np.arange(n_rows)
sampling_nodes = np.tile(start_nodes, epochs)
# Node2Vec Biased walks if parameters specified
if return_weight != 1. or neighbor_weight != 1.:
if rejective_sampling:
walks = _node2vec_walks_with_rejective_sampling(
T.weights.to_numpy(), T.src.to_numpy(), T.dst.to_numpy(),
sampling_nodes=sampling_nodes,
walklen=walklen,
return_weight=return_weight,
neighbor_weight=neighbor_weight)
else:
walks = _node2vec_walks(
T.weights.to_numpy(), T.src.to_numpy(), T.dst.to_numpy(),
sampling_nodes=sampling_nodes,
walklen=walklen,
return_weight=return_weight,
neighbor_weight=neighbor_weight)
# much faster implementation for regular walks
else:
walks = _random_walk(
T.weights.to_numpy(), T.src.to_numpy(), T.dst.to_numpy(),
sampling_nodes, walklen)
return walks
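# Walk generation sketch (parameter values below are arbitrary examples):
#
#   walks = G.random_walks(walklen=10, epochs=5)               # uniform walks
#   biased = G.random_walks(walklen=10, epochs=5,
#                           return_weight=0.5,                  # p = 2
#                           neighbor_weight=2.0)                # q = 0.5
#
# Each result is a 2d array of node IDs with shape
# (len(start_nodes) * epochs, walklen); by default every node is a start node.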
def ggvec(self, n_components=2,
learning_rate=0.05,
tol="auto",
max_epoch=500,
negative_ratio=1.0,
order=1,
negative_decay=0.,
exponent=0.5,
max_loss=30.,
tol_samples=100,
verbose=True):
"""
GGVec: Fast global first- (and higher-) order embeddings.
This algorithm directly minimizes the distances between related nodes.
It alternates a relaxation pass (negative sampling) with a contraction pass (loss minimization)
to find stable embeddings based on the dot product of the edge weights.
Parameters:
-------------
n_components (int):
Number of individual embedding dimensions.
order : int >= 1
Meta-level of the embeddings. Improves link prediction performance.
Setting this higher than 1 ~quadratically slows down algorithm
Order = 1 directly optimizes the graph.
Order = 2 optimizes graph plus 2nd order edges
(eg. neighbours of neighbours)
Order = 3 optimizes up to 3rd order edges
Higher-order edges are automatically weighted using GraRep-style graph formation,
e.g. the higher-order graph comes from the stable high-order random walk distribution.
negative_ratio : float in [0, 1]
Negative sampling ratio.
Setting this higher will do more negative sampling.
This is slower, but can lead to higher quality embeddings.
exponent : float
Weighting exponent in the loss function.
Lowering it reduces the effect of large edge weights.
tol : float in [0, 1] or "auto"
Optimization early stopping criterion.
Stops when average loss has been < tol for tol_samples epochs.
"auto" sets tol as a function of learning_rate
tol_samples : int
Optimization early stopping criterion.
This is the number of epochs to sample for loss stability.
Once loss is stable over this number of epochs we stop early.
negative_decay : float in [0, 1]
Decay on negative ratio.
If >0 then negative ratio will decay by (1-negative_decay) ** epoch
You should usually leave this to 0.
max_epoch : int
Stopping criterion.
learning_rate : float in [0, 1]
Optimization learning rate.
max_loss : float
Loss value ceiling for numerical stability.
"""
if tol == 'auto':
tol = max(learning_rate / 2, 0.05)
# Higher order graph embeddings
# Method inspired by GraRep (powers of transition matrix)
if order > 1:
norm_weights = _row_norm(self.weights.to_numpy(), self.src.to_numpy())
tranmat = sparse.csr_matrix((norm_weights, self.dst, self.src))
target_matrix = tranmat.copy()
res = np.zeros((self.nnodes, n_components))
for _ in range(order - 1):
target_matrix = target_matrix.dot(tranmat)
w = ggvec.ggvec_main(
data=target_matrix.data,
src=target_matrix.indptr,
dst=target_matrix.indices,
n_components=n_components, tol=tol,
tol_samples=tol_samples,
max_epoch=max_epoch, learning_rate=learning_rate,
negative_ratio=negative_ratio,
negative_decay=negative_decay,
exponent=exponent,
max_loss=max_loss, verbose=verbose)
res = np.sum([res, w], axis=0)
return res
else:
return ggvec.ggvec_main(
data=self.weights.to_numpy(),
src=self.src.to_numpy(),
dst=self.dst.to_numpy(),
n_components=n_components, tol=tol,
tol_samples=tol_samples,
max_epoch=max_epoch, learning_rate=learning_rate,
negative_ratio=negative_ratio,
negative_decay=negative_decay,
exponent=exponent,
max_loss=max_loss, verbose=verbose)
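# GGVec usage sketch (embedding size chosen arbitrarily):
#
#   embeddings = G.ggvec(n_components=32)        # shape (nnodes, 32)
#   emb2 = G.ggvec(n_components=32, order=2)     # adds 2nd-order edges, slower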
def glove(self, n_components=2,
tol=0.0001, max_epoch=10_000,
max_count=50, exponent=0.5,
learning_rate=0.1, max_loss=10.,
verbose=True):
"""
Global first order embedding for positive, count-valued sparse matrices.
This algorithm is normally used in NLP on word co-occurrence matrices.
The algorithm fails if any value in the sparse matrix is < 1.
It is also a poor choice for matrices with homogeneous edge weights.
Parameters:
-------------
n_components (int):
Number of individual embedding dimensions.
tol : float in [0, 1]
Optimization early stopping criterion.
Stops when largest gradient is < tol
max_epoch : int
Stopping criterion.
max_count : int
Ceiling value on edge weights for numerical stability
exponent : float
Weighting exponent in the loss function.
Lowering it reduces the effect of large edge weights.
learning_rate : float in [0, 1]
Optimization learning rate.
max_loss : float
Loss value ceiling for numerical stability.
References:
-------------
Paper: https://nlp.stanford.edu/pubs/glove.pdf
Original implementation: https://github.com/stanfordnlp/GloVe/blob/master/src/glove.c
"""
w = glove.glove_main(
weights=self.weights.to_numpy(),
dst=self.dst.to_numpy(),
src=self.src.to_numpy(),
n_components=n_components, tol=tol,
max_epoch=max_epoch, learning_rate=learning_rate,
max_count=max_count, exponent=exponent,
max_loss=max_loss, verbose=verbose
)
return w
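# GloVe usage sketch; edge weights should be count-valued (>= 1):
#
#   embeddings = G.glove(n_components=32)        # shape (nnodes, 32)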
def grarep(self, n_components=2,
order=5,
embedder=TruncatedSVD(
n_iter=10,
random_state=42),
verbose=True):
"""Implementation of GraRep: Embeddings of powers of the PMI matrix
of the graph adj matrix.
NOTE: Unlike GGVec and GloVe, this method returns a LIST OF EMBEDDINGS
(one per order). The caller pools them, for example by summing:
`lambda x : np.sum(x, axis=0)`
or by taking only the highest-order embedding:
`lambda x : x[-1]`
Original paper: https://dl.acm.org/citation.cfm?id=2806512
Parameters :
----------------
n_components (int):
Number of individual embedding dimensions.
order (int):
Number of PMI matrix powers.
The performance degrades close to quadratically as a factor of this parameter.
Generally should be kept under 5.
embedder : (instance of sklearn API compatible model)
Should implement the `fit_transform` method:
https://scikit-learn.org/stable/glossary.html#term-fit-transform
The model should also have `n_components` as a parameter
for number of resulting embedding dimensions. See:
https://scikit-learn.org/stable/modules/manifold.html#manifold
If not compatible, set resulting dimensions in the model instance directly
"""
w_array = grarep.grarep_main(
weights=self.weights.to_numpy(),
dst=self.dst.to_numpy(),
src=self.src.to_numpy(),
n_components=n_components, order=order,
embedder=embedder, verbose=verbose
)
return w_array
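# GraRep usage sketch: the result is a list with one embedding matrix per
# order, which the caller pools (summing is just one option):
#
#   mats = G.grarep(n_components=32, order=3)
#   embeddings = np.sum(mats, axis=0)            # or mats[-1] for highest order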
def random_walk_resample(self, walklen=4, epochs=30):
"""
Create a new graph from random walk co-occurrences.
First, we generate random walks on the graph.
Then, any nodes appearing together in a walk get an edge.
Edge weights are co-occurrence counts.
Recommendation: many short walks > fewer long walks
TODO: add node2vec walk parameters
"""
walks = self.random_walks(walklen=walklen, epochs=epochs)
elist = random_walks.walks_to_edgelist(walks)
if 'weight' in elist.columns:
weights = elist.weight.to_numpy()
else:
weights = np.ones(elist.shape[0])
return methods._edgelist_to_graph(
elist.src.to_numpy(), elist.dst.to_numpy(),
weights, nnodes=self.nnodes, nodenames=self.names)
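# Resampling sketch: builds a denser co-occurrence graph from many short walks.
#
#   G2 = G.random_walk_resample(walklen=4, epochs=30)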
def to_scipy(self):
return sparse.csr_array((self.weights, self.dst, self.src))
def to_numpy(self):
return self.to_scipy().todense()
#
#
# TODO: Organize Graph method here
# Layout nodes by their 1d embedding's position
# Also layout by hilbert space filling curve
#
#
def read_edgelist(f,
directed=True,
sep="infer",
header="infer",
keep_default_na=False,
**readcsvkwargs):
"""
Creates a csrgraph from an edgelist file.
The edgelist should be in the form
[source destination]
or
[source destination edge_weight]
The first column needs to be the source, the second the destination.
If there is a third column it's assumed to be edge weights.
Otherwise, all arguments from pandas.read_csv can be used to read the file.
f : str
Filename to read
directed : bool
Whether the graph is directed or undirected.
All csrgraphs are directed, undirected graphs simply add "return edges"
sep : str
CSV-style separator. Eg. Use "," if comma separated
header : int or None
pandas read_csv parameter. Use if column names are present
keep_default_na: bool
pandas read_csv argument to prevent casting any value to NaN
read_csv_kwargs : keyword arguments
Accepted for pandas.read_csv compatibility; the current implementation
parses the file with pyarrow.csv and does not forward these kwargs.
Returns : csrgraph
"""
with open(f, encoding='utf-8', errors='ignore') if isinstance(f, str) \
else io.TextIOWrapper(f, encoding='utf-8') as fname:
csv_sniff = csv.Sniffer()
sample = fname.read(1024)
dialect = csv_sniff.sniff(sample)
# Parse header
if header == "infer":
has_header = csv_sniff.has_header(sample)
elif header is None:
has_header = False
else:
has_header = True
if sep == 'infer':
sep = dialect.delimiter
elif len(sep) != 1:
raise ValueError(f"sep must be 1 char, got {sep}")
read_options = pa_csv.ReadOptions(
use_threads=True,
block_size=None,
skip_rows=0, # Rows to ignore before column names
skip_rows_after_names=0, # Rows to ignore after column names
column_names=None,
# Autogenerate columns if there's no header only
autogenerate_column_names=not has_header,
encoding='utf8'
)
parse_options = pa_csv.ParseOptions(
delimiter=sep,
quote_char=dialect.quotechar,
double_quote=dialect.doublequote,
escape_char=False,
newlines_in_values=False,
ignore_empty_lines=True,
invalid_row_handler=None
)
convert_options = pa_csv.ConvertOptions(
check_utf8=True,
column_types=None,
null_values=None,
true_values=None,
false_values=None,
decimal_point=".",
strings_can_be_null=False,
quoted_strings_can_be_null=True,
include_columns=None,
include_missing_columns=False,
auto_dict_encode=False,
auto_dict_max_cardinality=None,
timestamp_parsers=None
)
# Reset file cursor to start for full read
fname.seek(0)
elist = pa_csv.read_csv(f,
read_options=read_options,
parse_options=parse_options,
convert_options=convert_options
).to_pandas()
return from_df(elist, directed=directed)
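# Reading sketch (the file name and separator below are hypothetical):
#
#   G = read_edgelist("edges.tsv", directed=False, sep="\t")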
def edgelist_to_ipc():
"""
Placeholder: translate an edgelist file to the Arrow IPC format (not yet implemented).
"""
def from_df(elist: pd.DataFrame, directed: bool = True) -> csrgraph:
"""
Creates a csrgraph from a DataFrame of either two or three columns.
elist :
Either a DataFrame with two columns for source and target or three
columns for source, target, and weight.
Returns : csrgraph
"""
if len(elist.columns) == 2:
elist.columns = ['src', 'dst']
elist['weight'] = np.ones(elist.shape[0])
elif len(elist.columns) == 3:
elist.columns = ['src', 'dst', 'weight']
else:
raise ValueError(f"""
Invalid columns: {elist.columns}
Expected 2 (source, destination)
or 3 (source, destination, weight)
Read File: \n{elist.head(5)}
""")
# Create name mapping to normalize node IDs
# Somehow this is 1.5x faster than np.union1d. Shame on numpy.
allnodes = list(
set(elist.src.unique())
.union(set(elist.dst.unique())))
# Factor all nodes to unique IDs
names = (
pd.Series(allnodes).astype('category')
.cat.categories
)
nnodes = names.shape[0]
# Get the input data type
dtype = np.uint16
if nnodes > UINT16_MAX:
dtype = np.uint32
if nnodes > UINT32_MAX:
dtype = np.uint64
name_dict = dict(zip(names,
np.arange(names.shape[0], dtype=dtype)))
elist.src = elist.src.map(name_dict).astype(dtype)
elist.dst = elist.dst.map(name_dict).astype(dtype)
# clean up temp data
allnodes = None
name_dict = None
gc.collect()
# If undirected graph, append edgelist to reversed self
if not directed:
other_df = elist.copy()
other_df.columns = ['dst', 'src', 'weight']
elist = pd.concat([elist, other_df])
other_df = None
gc.collect()
# Need to sort by src for _edgelist_to_graph
elist = elist.sort_values(by='src')
# extract numpy arrays and clear memory
src = elist.src.to_numpy()
dst = elist.dst.to_numpy()
weight = elist.weight.to_numpy()
elist = None
gc.collect()
G = methods._edgelist_to_graph(
src, dst, weight,
nnodes, nodenames=names
)
return G
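# DataFrame sketch (toy edges are hypothetical). Two columns mean unit weights;
# a third column is taken as the edge weight:
#
#   df = pd.DataFrame({"src": ["a", "b"], "dst": ["b", "c"]})
#   G = from_df(df, directed=False)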
def from_tuples(tuples, directed: bool = False) -> csrgraph:
"""
Creates a csrgraph from an iterable of edge tuples.
tuples : iterable[tuple[str, str]] or iterable[tuple[str, str, float]]
Either an iterable of (source, target) pairs or an iterable of
(source, target, weight) triples
Returns : csrgraph
"""
elist = pd.DataFrame(tuples)
return from_df(elist, directed=directed)
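# Tuple sketch (toy edges are hypothetical):
#
#   G = from_tuples([("a", "b"), ("b", "c"), ("c", "a")])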