# Precompute Validation Subgraphs
This is a simple script to go from our validation dictionary (containing nodes, neighbors, features, and labels) to a list of subgraphs on which we can evaluate. To do this, we will do the following (hopefully in parallel):
- Insert each node into the unsupervised graph one node at a time
- After inserting a node, extract its 2-hop subgraph
- Save each 2-hop subgraph in a list --> we will need it for evaluation later

In [1]:
import numpy as np
import pandas as pd
import torch as th
import dgl
import scipy
import networkx as nx
from progressbar import progressbar
import time
import random
from tqdm import tqdm_notebook as tqdm

import pickle

from dgl.data.utils import save_graphs, load_graphs, split_dataset

import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.utils import io

import multiprocessing
from concurrent.futures import ProcessPoolExecutor

Using backend: pytorch


## Extract and Store Subgraph Minibatches

In [2]:
#this needs to operate on a single validation node and its data
def compute_subgraph(dict_entry, graph_path="new_train_graphs.bin",
                     fanouts=(15, 10), expected_num_nodes=399723):
    """Insert one validation node into the saved graph and sample its neighborhood.

    The graph is loaded from ``graph_path`` on every call, so each validation
    node is added to a freshly loaded graph rather than one that already
    contains previously inserted validation nodes.

    Args:
        dict_entry: Mapping with the keys ``'features'``, ``'label'`` and
            ``'neighbors'`` for a single validation node. ``'neighbors'`` is
            iterated to obtain the ids of existing nodes to connect to
            (presumably ids valid in the saved graph -- verify against the
            code that builds the validation dictionary).
        graph_path: File passed to ``load_graphs``; the first graph in the
            file is used.
        fanouts: Per-layer neighbor fan-outs handed to
            ``MultiLayerNeighborSampler``.
        expected_num_nodes: Node count the graph must have after the insert,
            or ``None`` to skip the check. The default matches the graph this
            notebook was written for.

    Returns:
        Tuple ``(input_nodes, output_nodes, blocks)`` taken from the first
        (and only) batch of a ``NodeDataLoader`` seeded with the new node.

    Raises:
        ValueError: If ``expected_num_nodes`` is given and the graph has a
            different number of nodes after the insert.
    """
    graphs, _ = load_graphs(graph_path)
    graph = graphs[0]

    #extract node features, label, neighbors from validation entry
    feats = th.tensor(dict_entry['features'])
    label = th.tensor(dict_entry['label'])
    neighbors = list(dict_entry['neighbors'])

    #new nodes are appended at the end, so the inserted node's id is the node count before the insert
    new_node_id = graph.number_of_nodes()
    graph.add_nodes(1)
    graph.ndata['labels'][-1] = label
    graph.ndata['features'][-1] = feats

    #explicit raise instead of assert so the check survives python -O
    if expected_num_nodes is not None and graph.number_of_nodes() != expected_num_nodes:
        raise ValueError(
            f"expected {expected_num_nodes} nodes after inserting the validation node, "
            f"got {graph.number_of_nodes()}"
        )

    #connect the new node to each neighbor in both directions (forward edges first, then reversed)
    to_new_node = [new_node_id] * len(neighbors)
    source_nodes = to_new_node + neighbors
    dest_nodes = neighbors + to_new_node
    graph.add_edges(source_nodes, dest_nodes)

    #define our sampler and dataloader --> we will use this to extract the subgraph around our validation node
    sampler = dgl.dataloading.MultiLayerNeighborSampler(list(fanouts))
    dataloader = dgl.dataloading.NodeDataLoader(
        graph, [new_node_id], sampler,
        batch_size=1024,
        shuffle=True,
        drop_last=False,
        num_workers=0)

    #only one seed node is given, so the first batch is the whole sampled subgraph
    input_nodes, output_nodes, blocks = next(iter(dataloader))

    return (input_nodes, output_nodes, blocks) #return tuple all the necessary info for our validation subgraph

In [3]:
if __name__ == "__main__":
    import os  #local import: only this entry point needs it

    #load in validation dictionary
    with open('final_validation_data.pickle', 'rb') as f:
        validation_dict = pickle.load(f)

    validation_values = list(validation_dict.values())

    #split our subgraphs into batches --> if not the parallel process mems out after a while
    BATCH_SIZE = 2000
    node_batches = [validation_values[i:i + BATCH_SIZE] for i in range(0, len(validation_values), BATCH_SIZE)]

    #make sure the output directory exists before any worker result is saved
    os.makedirs('val_subgraphs', exist_ok=True)

    with ProcessPoolExecutor(max_workers=50) as executor:
        #process every batch (including batch 0); batches whose output file already
        #exists are skipped so an interrupted run can be resumed without redoing work
        for i, batch in enumerate(node_batches):
            out_path = f'val_subgraphs/subgraph{i}'
            if os.path.exists(out_path):
                continue
            #executor.map yields results in input order, so saved subgraphs line up with validation_values
            result = list(progressbar(executor.map(compute_subgraph, batch)))
            th.save(result, out_path)

| |                     #                          | 1999 Elapsed Time: 0:04:26
| |            #                                   | 1999 Elapsed Time: 0:04:27
| |               #                                | 1999 Elapsed Time: 0:04:27
| |                                      #         | 1999 Elapsed Time: 0:04:24
| |                                         #      | 1999 Elapsed Time: 0:04:23
| |                               #                | 1999 Elapsed Time: 0:04:22
| |                                         #      | 1999 Elapsed Time: 0:04:24
| |                   #                            | 1999 Elapsed Time: 0:04:21
| |            #                                   | 1999 Elapsed Time: 0:04:20
| |           #                                    | 1999 Elapsed Time: 0:04:20
| |                         #                      | 1999 Elapsed Time: 0:04:21
| |            #                                   | 1999 Elapsed Time: 0:04:20
| |                                     

## Combine Mini Subgraphs into Single File

In [17]:
#stitch the 20 per-batch result files back together, in batch order, into one flat list
full_subgraph_list = []
for batch_idx in range(20):
    full_subgraph_list += th.load(f'val_subgraphs/subgraph{batch_idx}')

#write the combined list out as a single file
th.save(full_subgraph_list, f'val_subgraphs/full_subgraph_list')

In [20]:
#reload the combined subgraph list from disk to check that the save/load round trip works
all_subgraphs = th.load('val_subgraphs/full_subgraph_list')

In [60]:
#sanity check: the label stored on each subgraph's final block must equal the label
#of the validation entry at the same position in the validation dictionary
subgraph_labels = [entry[2][-1].dstdata['labels'].item() for entry in all_subgraphs]

val_dict_labels = [record['label'] for record in list(validation_dict.values())]

assert subgraph_labels == val_dict_labels

## Old Code - Unparallelized
Kept in case it is needed for reference later.

In [None]:
# def precompute_validation_subgraphs(validation_dict):
#     subgraph_holder = [] #we will store all of our subgraphs here
    
#     keys = list(validation_dict.keys())[0:100] #keys to our validation nodes
#     for nid in progressbar(keys):
#         #temporary copy of graph that we will pass in our new nodes to
#         temp_graphs, temp_labels = load_graphs("new_train_graphs.bin")
#         temp_unsup_graph = temp_graphs[0]
#         temp_unsup_graph = temp_unsup_graph.add_self_loop()

#         #add node from validation set to our graph and extract its features, label, and neighbors
#         temp_feats = th.tensor(validation_dict[nid]['features'])
#         temp_label = th.tensor(validation_dict[nid]['label'])

#         #add node to graph
#         temp_unsup_graph.add_nodes(1) # this is saying add 1 node --> it will always be node 399722
#         temp_unsup_graph.ndata['labels'][-1] = temp_label
#         temp_unsup_graph.ndata['features'][-1] = temp_feats

#         #store edges and their reverse
#         neighb_list = validation_dict[nid]['neighbors']
#         val_neighbors = [[399722, x] for x in neighb_list]
#         val_neighbors_reversed = [[x, 399722] for x in neighb_list]
#         final_val_neighbors = [*val_neighbors, *val_neighbors_reversed]
#         source_nodes = [x[0] for x in final_val_neighbors]
#         dest_nodes = [x[1] for x in final_val_neighbors]

#         #add edges
#         temp_unsup_graph.add_edges(source_nodes, dest_nodes)
        
#         #define our sampler and dataloader --> we will use this to extract the subgraph around our validation node
#         sampler = dgl.dataloading.MultiLayerNeighborSampler([15,10])
#         temp_dataloader = dgl.dataloading.NodeDataLoader(
#         temp_unsup_graph, [399722], sampler,
#         batch_size=1024,
#         shuffle=True,
#         drop_last=False,
#         num_workers=0)

#         #get our sampled subgraph
#         temp_input_nodes, temp_output_nodes, temp_blocks = next(iter(temp_dataloader))
        
#         #check to ensure that our output node is what we want it to be
#         assert th.equal(temp_blocks[-1].dstdata['labels'][-1], th.tensor(validation_dict[nid]['label'])) #check that the labels match
#         assert th.equal(temp_blocks[-1].dstdata['features'].flatten(), th.tensor(validation_dict[nid]['features'])) #check that the features match
        
#         #append subgraph info for each validation node as a tuple to a running list
#         subgraph_holder.append((temp_input_nodes, temp_output_nodes, temp_blocks))
        
#     return subgraph_holder
# subgraph_holder = precompute_validation_subgraphs(validation_dict)