### Data Preprocessing

***DO NOT RUN THIS FILE***: it is kept only to document how the edge list files (which can be found in the data folder) were generated.

**Synthetic Data:**

Creating 2 random synthetic graphs, each with 5000 nodes and 10000 edges:

- undirected
- directed

In [3]:
import networkx as nx

In [None]:
# PLEASE DO NOT RE-RUN: the generated edge list can be found in the data folder

# Undirected G(n, m) random graph: 5,000 nodes and 10,000 edges, fixed seed
G1 = nx.gnm_random_graph(5_000, 10_000, seed=42, directed=False)

# data=False -> each line holds only the two endpoint ids, no attribute dict
nx.write_edgelist(G1, "rand_undir_graph_edgelist.txt", data=False)


In [None]:
# PLEASE DO NOT RE-RUN: the generated edge list can be found in the data folder

# Directed G(n, m) random graph: 5,000 nodes and 10,000 edges, fixed seed
G2 = nx.gnm_random_graph(5_000, 10_000, seed=42, directed=True)

# data=False -> each line holds only the two endpoint ids, no attribute dict
nx.write_edgelist(G2, "rand_dir_graph_edgelist.txt", data=False)


**Real-world Data from the Paper**

Creating an edge list for the *Rome road network*

In [7]:
# Start from an empty directed graph; the arcs are added from the road-network file below.
road_network = nx.DiGraph()  # directed graph

In [8]:
# Populate road_network from the .gr file. Only lines beginning with "a" are
# read; they are split on whitespace into "a <tail> <head> <weight>".
# NOTE(review): this looks like the DIMACS shortest-path format — confirm.
with open("../data/real-world-old/road-network-rome99.gr.txt") as gr_file:
    for record in gr_file:
        if not record.startswith("a"):
            continue  # not an arc record, nothing to add
        fields = record.split()
        tail, head = int(fields[1]), int(fields[2])
        # A DiGraph stores one edge per (tail, head) pair, so a repeated arc
        # replaces the previously stored weight.
        road_network.add_edge(tail, head, weight=float(fields[3]))  # weight = distance

In [9]:
# Sanity check: report the size of the parsed road network
node_count = road_network.number_of_nodes()
edge_count = road_network.number_of_edges()
print("Nodes:", node_count)
print("Edges:", edge_count)

Nodes: 3353
Edges: 8859


In [10]:
# Write the road network as space-separated lines; data=["weight"] appends
# only the weight attribute after the two endpoint ids.
edgelist_path = "road_network_rome99_edgelist.txt"  # new output file
nx.write_edgelist(road_network, edgelist_path, delimiter=" ", data=["weight"])

Converting the .mat file to an edge list for the *Crawl network*

In [11]:
import scipy.io
import numpy as np
import scipy.sparse

In [None]:
# Read the MATLAB .mat file; loadmat returns a dict keyed by variable name
crawl_mat_path = "../data/real-world-old/crawl-network-wb-cs-stanford.mat"
mat_data = scipy.io.loadmat(crawl_mat_path)

In [3]:
# List the top-level entries of the loaded .mat dictionary
print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'Problem'])


In [None]:
# Inspect the 'Problem' entry: its Python type first, then its numpy dtype
# (the dtype lists the named fields stored in the record).
problem = mat_data['Problem']
for description in (type(problem), problem.dtype):
    print(description)

<class 'numpy.ndarray'>
[('name', 'O'), ('title', 'O'), ('A', 'O'), ('id', 'O'), ('date', 'O'), ('author', 'O'), ('ed', 'O'), ('kind', 'O'), ('notes', 'O')]


In [5]:
# Take the record at index [0, 0] of the 'Problem' array; its named fields
# (e.g. 'name', 'A') are then addressable by key.
problem_struct = mat_data['Problem'][0,0]  # access the struct

In [None]:
# Field 'A' of the record holds the adjacency matrix
adj = problem_struct['A']
# report its dimensions and container type, one per line
print(adj.shape, type(adj), sep="\n")

(9914, 9914)
<class 'scipy.sparse._csc.csc_array'>


In [14]:
# COO format exposes parallel .row / .col / .data arrays, one entry per stored value
adj_coo = adj.tocoo()  # convert to COO format

In [None]:
# One row per stored matrix entry: (row index, column index, value)
edge_list = np.column_stack((adj_coo.row, adj_coo.col, adj_coo.data))
print(edge_list.shape)


(36854, 3)


In [16]:
# Save the edge list as plain text, one space-separated "row col value" line per entry
column_formats = ["%d", "%d", "%.6f"]  # integer endpoints, value with 6 decimals
np.savetxt("crawl_network_edgelist.txt", edge_list, delimiter=" ", fmt=column_formats)


Creating an edge list for the *Cite network*

In [17]:
import scipy.io
import numpy as np
import scipy.sparse

In [18]:
# Read the MATLAB .mat file; loadmat returns a dict keyed by variable name
cite_mat_path = "../data/real-world-old/cite-network-Lederberg.mat"
mat_data = scipy.io.loadmat(cite_mat_path)

In [19]:
# List the top-level entries of the loaded .mat dictionary
print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'Problem'])


In [20]:
# Inspect the 'Problem' entry: its Python type first, then its numpy dtype
# (the dtype lists the named fields stored in the record).
problem = mat_data['Problem']
for description in (type(problem), problem.dtype):
    print(description)

<class 'numpy.ndarray'>
[('name', 'O'), ('title', 'O'), ('A', 'O'), ('id', 'O'), ('kind', 'O'), ('notes', 'O'), ('aux', 'O'), ('date', 'O'), ('author', 'O'), ('ed', 'O')]


In [21]:
# Take the record at index [0, 0] of 'Problem', then its adjacency-matrix field 'A'
problem_struct = mat_data['Problem'][0, 0]
adj = problem_struct['A']
# report dimensions and container type, one per line
print(adj.shape, type(adj), sep="\n")

(8843, 8843)
<class 'scipy.sparse._csc.csc_array'>


In [22]:
# COO format exposes parallel .row / .col / .data arrays, one entry per stored value
adj_coo = adj.tocoo()  # convert to COO format

In [23]:
# One row per stored matrix entry: (row index, column index, value)
edge_list = np.column_stack((adj_coo.row, adj_coo.col, adj_coo.data))
print(edge_list.shape)

(41601, 3)


In [24]:
# Save the edge list as plain text, one space-separated "row col value" line per entry
column_formats = ["%d", "%d", "%.6f"]  # integer endpoints, value with 6 decimals
np.savetxt("cite_network_edgelist.txt", edge_list, delimiter=" ", fmt=column_formats)