In [1]:
import networkx as nx
import random
from tqdm import tqdm
import numpy as np
import random
import networkx as nx
from tqdm import tqdm, tnrange, tqdm_notebook

In [2]:
# Edge-list file that is loaded with nx.read_edgelist further down.
input_path = './data/ca-AstroPh.txt'
# Directory prefix for every file this notebook writes; paths are built by
# plain string concatenation (working_dir + filename).
working_dir = './data/astroph/'

In [11]:
# Fraction of the graph's edges passed to make_train_test as test_frac.
TEST_FRACTION = .3
# Number of contiguous node-id batches the relabelled graph is cut into.
BATCH_COUNT = 4

# Predefined batch pairings, keyed by batch count. Each pair (a, b) names two
# batch indices whose edges are exported together.
_COMBINATIONS_BY_BATCH_COUNT = {
    8: [(0,1), (1,2), (2,3), (3,4), (4,5), (5,6), (6,7),
        (0,3), (3,6), (1,4), (2,5), (4,7),
        (0,2), (1,3), (2,4), (3,5), (4,6), (5,7),
        (0,4), (1,5), (2,6), (3,7)],
    4: [(0,1), (1,2), (2,3), (0,3)],
}

# Fail here, with a clear message, rather than with a NameError on
# COMBINATIONS in a later cell when an unsupported batch count is chosen.
if BATCH_COUNT not in _COMBINATIONS_BY_BATCH_COUNT:
    raise ValueError(
        'No COMBINATIONS defined for BATCH_COUNT=%r; supported values: %s'
        % (BATCH_COUNT, sorted(_COMBINATIONS_BY_BATCH_COUNT))
    )
COMBINATIONS = _COMBINATIONS_BY_BATCH_COUNT[BATCH_COUNT]

### 1. choose the largest connected component of the graph

In [3]:
# Output paths are built as working_dir + filename, so make sure the
# directory prefix ends with a slash.
if working_dir[-1] != '/':
    working_dir += '/'

graph = nx.read_edgelist(input_path)

# nx.connected_component_subgraphs was removed in NetworkX 2.4. Pick the
# largest component's node set first and materialise only that one subgraph;
# .copy() turns the subgraph view into an independent, mutable graph.
largest_component = max(nx.connected_components(graph), key=len)
graph = graph.subgraph(largest_component).copy()
print(len(graph.nodes),len(graph.edges), 'lcc')

17903 197031 lcc


### 2. relabel the graph's nodes

In [4]:
# Reproducible random permutation of 0..N-1 that supplies the new node ids.
new_labels = list(range(graph.number_of_nodes()))
random.Random(4).shuffle(new_labels)

# Pair each current node (in iteration order) with its new id, kept as a string.
node_label_mapping = {
    old_name: str(new_id)
    for old_name, new_id in zip(graph.nodes(), new_labels)
}
graph = nx.relabel_nodes(graph, node_label_mapping)

# Canonical edge form: integer endpoints, smaller id first.
edge_list = [
    (min(int(u), int(v)), max(int(u), int(v)))
    for u, v in graph.edges
]

# Persist the relabelled graph, one "low high" pair per line.
with open(working_dir + 'all_edges.txt', 'w+') as f:
    f.writelines(str(low) + ' ' + str(high) + '\n' for low, high in edge_list)

# The largest new id must be N-1.
assert max(int(name) for name in graph.nodes()) == graph.number_of_nodes() - 1

### 3. make train and test sets

In [6]:
def _canonical_edge(u, v):
    """Return the edge as a (low, high) tuple of string ids, ordered numerically."""
    a, b = int(u), int(v)
    return (str(min(a, b)), str(max(a, b)))


def _sample_false_edges(nodes, count, neighbours, excluded=()):
    """Draw `count` distinct canonical node pairs that are not original edges.

    nodes      -- list of node ids (strings holding integers) to sample from.
    count      -- number of distinct pairs to return.
    neighbours -- dict mapping a low endpoint to the set of its high endpoints
                  over all original edges.
    excluded   -- container of pairs that must not be drawn again.

    Sampling uses the global np.random state, two draws per attempt.
    """
    sampled = set()
    node_count = len(nodes)
    while len(sampled) < count:
        idx_i = int(nodes[np.random.randint(0, node_count)])
        idx_j = int(nodes[np.random.randint(0, node_count)])
        if idx_i == idx_j:
            continue

        candidate = (str(min(idx_i, idx_j)), str(max(idx_i, idx_j)))
        # Reject pairs that are real edges of the original graph.
        if candidate[1] in neighbours.get(candidate[0], ()):
            continue
        if candidate in excluded:
            continue
        # Adding an already-sampled pair is a no-op, so repeats are simply retried.
        sampled.add(candidate)
    return sampled


def make_train_test(graph, test_frac=.1, prevent_disconnect=True, verbose=True):
    """Split the edges of `graph` into train/test positives and sample negatives.

    Node ids must be strings holding integers: edges are canonicalised through
    int() and then looked up in the graph again as strings.

    NOTE: `graph` is modified in place -- every edge that ends up in the test
    set is removed from it, so on return it holds only the training edges.

    Parameters
    ----------
    graph : networkx graph whose edges are to be split.
    test_frac : fraction of the edges to move into the positive test set.
    prevent_disconnect : when True, an edge is only held out if the graph is
        still connected without it; otherwise it is put back.
    verbose : when True, print progress messages and show the progress bar.

    Returns
    -------
    (train_edges_true, train_edges_false, test_edges_true, test_edges_false)
        The two *_true values are lists, the two *_false values are sets; all
        contain (low, high) tuples of string ids. Negative sets have the same
        size as the requested positive counterpart (test_count and
        len(train_edges_true) respectively).

    The edge shuffle is seeded (random.Random(4)); negative sampling uses the
    unseeded global np.random state.
    """
    nodes = list(graph.nodes())

    edge_list = [_canonical_edge(u, v) for u, v in graph.edges]
    random.Random(4).shuffle(edge_list)

    test_count = int(test_frac * len(edge_list))
    if verbose:
        print('test_count', test_count)
        print('edge_list', len(edge_list))

    removed_edges = []
    progress = tqdm_notebook(edge_list, disable=not verbose)
    for step, edge in enumerate(progress, start=1):
        # Checked before removing anything so that test_count == 0 holds out
        # no edges (an equality check after the removal would never trigger).
        if len(removed_edges) >= test_count:
            break

        graph.remove_edge(edge[0], edge[1])
        if prevent_disconnect and not nx.is_connected(graph):
            # Removing this edge would split the graph: keep it for training.
            graph.add_edge(edge[0], edge[1])
        else:
            removed_edges.append(edge)

        if verbose and step % 5000 == 0:
            print('step',step,'removed_edges',len(removed_edges))

    test_edges_true = removed_edges[:]
    train_edges_true = [_canonical_edge(u, v) for u, v in graph.edges()]
    if verbose:
        print('train_edges_true, test_edges_true extracted')

    # low endpoint -> set of high endpoints, over ALL original edges, so that
    # negatives collide with neither train nor test positives. Sets give O(1)
    # membership tests during rejection sampling.
    neighbours = {}
    for low, high in edge_list:
        neighbours.setdefault(low, set()).add(high)

    test_edges_false = _sample_false_edges(nodes, test_count, neighbours)
    if verbose:
        print('test_edges_false extracted')

    train_edges_false = _sample_false_edges(
        nodes, len(train_edges_true), neighbours, excluded=test_edges_false)
    if verbose:
        print('train_edges_false extracted')

    # Sanity check 1: every edge is stored in canonical (low, high) order.
    splits = [train_edges_true, train_edges_false, test_edges_true, test_edges_false]
    for ss_idx, ss in enumerate(splits):
        for e in ss:
            assert int(e[0]) <= int(e[1]), 'problem at %d' % ss_idx

    # Sanity check 2: the four splits are pairwise disjoint.
    split_sets = [set(ss) for ss in splits]
    for i in range(len(split_sets)):
        for j in range(i + 1, len(split_sets)):
            assert split_sets[i].isdisjoint(split_sets[j])

    return train_edges_true, train_edges_false, test_edges_true, test_edges_false
    
# Build the split used by the rest of the notebook. make_train_test removes the
# held-out test edges from `graph` in place, so `graph` is changed by this call.
train_edges_true, train_edges_false, test_edges_true, test_edges_false = make_train_test(graph, test_frac=TEST_FRACTION)

test_count 59109
edge_list 197031


HBox(children=(IntProgress(value=0, max=197031), HTML(value='')))

step 5000 removed_edges 4969
step 10000 removed_edges 9943
step 15000 removed_edges 14914
step 20000 removed_edges 19884
step 25000 removed_edges 24850
step 30000 removed_edges 29821
step 35000 removed_edges 34780
step 40000 removed_edges 39731
step 45000 removed_edges 44678
step 50000 removed_edges 49635
step 55000 removed_edges 54574
train_edges_true, test_edges_true extracted
test_edges_false extracted
train_edges_false extracted


In [8]:
def write_edge_list(edge_list, filename):
    """Write each edge as "<first> <second>" on its own line, then announce the file.

    edge_list -- iterable of indexable pairs; both endpoints go through str().
    filename  -- path of the file to (over)write.
    """
    with open(filename, 'w+') as out_file:
        out_file.writelines(
            ' '.join((str(pair[0]), str(pair[1]))) + '\n' for pair in edge_list
        )
    print(filename, 'created')
    
def read_edge_list(file_path):
    """Read an edge-list file into a list of (first, second) string tuples.

    Each non-blank line must hold at least two whitespace-separated fields;
    only the first two are used. Blank lines are skipped, and the final line
    is read whether or not the file ends with a newline. A non-blank line with
    a single field raises IndexError.
    """
    edges = []
    with open(file_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            edges.append((fields[0], fields[1]))
    return edges

In [9]:
# Header line followed by the four split sizes, in the same order.
print('len(train_edges_true),len(train_edges_false),len(test_edges_true),len(test_edges_false)')
print(*(len(_split) for _split in (train_edges_true, train_edges_false,
                                   test_edges_true, test_edges_false)))

# Persist each split next to the other generated files.
for _edges, _file_name in (
        (train_edges_true, 'train_edges_true.txt'),
        (train_edges_false, 'train_edges_false.txt'),
        (test_edges_true, 'test_edges_true.txt'),
        (test_edges_false, 'test_edges_false.txt')):
    write_edge_list(_edges, working_dir + _file_name)

len(train_edges_true),len(train_edges_false),len(test_edges_true),len(test_edges_false)
137922 137922 59109 59109
./data/astroph/train_edges_true.txt created
./data/astroph/train_edges_false.txt created
./data/astroph/test_edges_true.txt created
./data/astroph/test_edges_false.txt created


In [10]:
# Load the four splits back from disk, replacing the in-memory versions with
# lists of string tuples.
(train_edges_true,
 train_edges_false,
 test_edges_true,
 test_edges_false) = [
    read_edge_list(working_dir + _file_name)
    for _file_name in ('train_edges_true.txt',
                       'train_edges_false.txt',
                       'test_edges_true.txt',
                       'test_edges_false.txt')
]

# One size per line, in the same order as above.
for _loaded in (train_edges_true, train_edges_false, test_edges_true, test_edges_false):
    print(len(_loaded))

137922
137922
59109
59109


In [12]:
# Report how many batch pairings are configured.
print(len(COMBINATIONS),'combinations')

# Width of each contiguous node-id range.
BATCH_SIZE = int(graph.number_of_nodes() / BATCH_COUNT) + 1

# One initially empty edge bucket per pairing.
combination_edges = [[] for _ in COMBINATIONS]

# Graph whose vertices are batch indices and whose edges are the pairings.
parts_graph = nx.Graph()
parts_graph.add_edges_from((pair[0], pair[1]) for pair in COMBINATIONS)

# Any two batches must be at most three pairings apart.
assert nx.diameter(parts_graph) <= 3

3 combinations


In [13]:
# Reload the complete relabelled graph from the all_edges.txt file written
# earlier. This rebinds both input_path and graph; the in-memory graph had its
# test edges removed by make_train_test.
input_path = working_dir + 'all_edges.txt'
graph = nx.read_edgelist(input_path)
    
def is_in(A1, A2, n1, n2):
    """Return True when both endpoints lie in A1 or A2 (in any combination).

    A1, A2 -- containers of integer node ids.
    n1, n2 -- edge endpoints; each is converted with int() before the lookup.
    """
    first, second = int(n1), int(n2)
    first_covered = first in A1 or first in A2
    second_covered = second in A1 or second in A2
    return first_covered and second_covered

# Contiguous node-id interval owned by each batch.
batch_ranges = [
    range(batch_idx * BATCH_SIZE, (batch_idx + 1) * BATCH_SIZE)
    for batch_idx in range(BATCH_COUNT)
]

# Nodes of the graph grouped by the interval their integer id falls into,
# keeping the graph's node order inside each batch.
batch_nodes = [
    [node for node in graph.nodes() if int(node) in id_range]
    for id_range in batch_ranges
]

# Put every edge into the bucket of each pairing that covers both endpoints;
# edges covered by no pairing are collected separately.
ignored_edges = []
for src, dst in tqdm(list(graph.edges())):
    covered = False
    for comb_idx, (first, second) in enumerate(COMBINATIONS):
        if is_in(batch_ranges[first], batch_ranges[second], src, dst):
            combination_edges[comb_idx].append((src, dst))
            covered = True
    if not covered:
        ignored_edges.append((src, dst))

print(len(ignored_edges),'edgs ignored', len(ignored_edges) / (len(train_edges_true) + len(test_edges_true)))

write_edge_list(ignored_edges, working_dir+'ignored_edges.txt')

# One edge file per pairing, named "<a>_<b>_edges.txt".
for comb_idx, (first, second) in enumerate(COMBINATIONS):
    write_edge_list(combination_edges[comb_idx],
                    working_dir + str(first) + '_' + str(second) + '_edges.txt')

# One node file per batch, named "<batch>_nodes.txt", one node id per line.
for batch_idx, members in enumerate(batch_nodes):
    f_name = working_dir + str(batch_idx) + '_nodes.txt'
    with open(f_name, 'w+') as f:
        f.writelines(str(node) + '\n' for node in members)
    print(f_name,'created')

100%|██████████| 197031/197031 [00:00<00:00, 203279.18it/s]


73194 edgs ignored 0.37148469022641106
./data/astroph/ignored_edges.txt created
./data/astroph/0_1_edges.txt created
./data/astroph/1_2_edges.txt created
./data/astroph/2_3_edges.txt created
./data/astroph/0_nodes.txt created
./data/astroph/1_nodes.txt created
./data/astroph/2_nodes.txt created
./data/astroph/3_nodes.txt created


In [None]:
# Preview the first few node ids of batch 0 rather than dumping the whole
# batch into the notebook output.
batch_nodes[0][:10]