# Minibatch GNN Training

In [1]:
import numpy as np
import pandas as pd
import torch as th
import dgl
import scipy
import networkx as nx
from tqdm import tqdm

from dgl.data.utils import save_graphs, load_graphs, split_dataset

import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

Using backend: pytorch


## Load Data

In [2]:
#load our graphs from before
glist, label_dict = load_graphs("./data_final.bin")
unsup_graph, sup_graph = glist[0], glist[1]

## Prepare Data for Training

In [3]:
#fix problem where we have a label of -1 to represent the missing class
tempLabels = sup_graph.ndata['label']

#something weird - no labels from original data had class 8??? --> map our -1 class to 8 so it works with cross-entropy loss
tempLabels[tempLabels==-1] = 8

sup_graph.ndata['label'] = tempLabels
unsup_graph.ndata['label'] = tempLabels

In [4]:
#add self loops to both graphs
sup_graph = sup_graph.add_self_loop()
unsup_graph = unsup_graph.add_self_loop()

In [5]:
#split data into train,test,val
sup_split = split_dataset(sup_graph, shuffle=True, random_state=10)
unsup_split = split_dataset(sup_graph, shuffle=True, random_state=10)

#extract the splits
sup_train, sup_val, sup_test = sup_split[0], sup_split[1], sup_split[2]
unsup_train, unsup_val, unsup_test = unsup_split[0], unsup_split[1], unsup_split[2]

#convert the index based representation into boolean masks for the graphs
n = len(sup_graph) #total num nodes in each graph
train_mask, val_mask, test_mask = np.zeros(n, dtype=bool), np.zeros(n, dtype=bool), np.zeros(n, dtype=bool) #create empty arrays for train/val/test

#populate our boolean masks
train_mask[sup_train.indices] = True 
val_mask[sup_val.indices] = True
test_mask[sup_test.indices] = True

#embed these masks into our graph
sup_graph.ndata['train_mask'], sup_graph.ndata['val_mask'], sup_graph.ndata['test_mask'] = th.tensor(train_mask, dtype=bool), th.tensor(val_mask, dtype=bool), th.tensor(test_mask, dtype=bool)
unsup_graph.ndata['train_mask'], unsup_graph.ndata['val_mask'], unsup_graph.ndata['test_mask'] = th.tensor(train_mask), th.tensor(val_mask), th.tensor(test_mask)



In [6]:
#extract and store information from each graph

#sup
sup_node_features = sup_graph.ndata['features']
sup_node_labels = sup_graph.ndata['label']

#unsup
unsup_node_features = unsup_graph.ndata['features']
unsup_node_labels = unsup_graph.ndata['label']

#general graph characteristics - doesn't matter which graph
train_mask = sup_graph.ndata['train_mask']
valid_mask = sup_graph.ndata['val_mask']
test_mask = sup_graph.ndata['test_mask']
n_features = sup_node_features.shape[1]
n_labels = int(sup_node_labels.max().item() + 1)

In [7]:
#Split the graph into training graph, validation graph, and test graph by training
#and validation masks.  Suitable for inductive models.

#sup
sup_train_graph = sup_graph.subgraph(sup_graph.ndata['train_mask'])
sup_val_graph = sup_graph.subgraph(sup_graph.ndata['train_mask'] | sup_graph.ndata['val_mask'])
sup_test_graph = sup_graph

#unsup
unsup_train_graph = unsup_graph.subgraph(unsup_graph.ndata['train_mask'])
unsup_val_graph = unsup_graph.subgraph(unsup_graph.ndata['train_mask'] | unsup_graph.ndata['val_mask'])
unsup_test_graph = unsup_graph

## Build Model for Training

In [8]:
import dgl
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
from tqdm import tqdm
import traceback

In [34]:
class SAGE(nn.Module):
    def __init__(self,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
        for i in range(1, n_layers - 1):
            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, blocks, x):
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)
        h = F.softmax(h,dim=-1)
        return h

    def inference(self, g, x, batch_size):
        """
        Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
        g : the entire graph.
        x : the input of entire node set.
        The inference code is written in a fashion that it could handle any number of nodes and
        layers.
        """
        # During inference with sampling, multi-layer blocks are very inefficient because
        # lots of computations in the first few layers are repeated.
        # Therefore, we compute the representation of all nodes layer by layer.  The nodes
        # on each layer are of course splitted in batches.
        # TODO: can we standardize this?
        for l, layer in enumerate(self.layers):
            y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)

            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
            dataloader = dgl.dataloading.NodeDataLoader(
                g,
                th.arange(g.number_of_nodes()),
                sampler,
                batch_size=args.batch_size,
                shuffle=True,
                drop_last=False)

            for input_nodes, output_nodes, blocks in tqdm(dataloader):
                block = blocks[0]

                block = block.int()
                h = x[input_nodes]
                h = layer(block, h)
                if l != len(self.layers) - 1:
                    h = self.activation(h)
                    h = self.dropout(h)
                h = F.softmax(h,dim=-1)

                y[output_nodes] = h

            x = y
        return y

In [14]:
#helper functions
def compute_acc(pred, labels):
    """
    Compute the accuracy of prediction given the labels.
    """
    labels = labels.long()
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

def evaluate(model, g, inputs, labels, val_nid, batch_size):
    """
    Evaluate the model on the validation set specified by ``val_nid``.
    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_nid : the node Ids for validation.
    batch_size : Number of nodes to compute at the same time.
    device : The GPU device to evaluate on.
    """
    model.eval()
    with th.no_grad():
        pred = model.inference(g, inputs, batch_size)
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid])

# def load_subtensor(g, seeds, input_nodes, device):
#     """
#     Copys features and labels of a set of nodes onto GPU.
#     """
#     batch_inputs = g.ndata['features'][input_nodes].to(device)
#     batch_labels = g.ndata['labels'][seeds].to(device)
#     return batch_inputs, batch_labels

In [112]:
#run code
#### Entry point
def run(args, sup_data, unsup_data):
    # Unpack data - sup and unsup
    sup_in_feats, sup_n_classes, sup_train_g, sup_val_g, sup_test_g = sup_data
    unsup_in_feats, unsup_n_classes, unsup_train_g, unsup_val_g, unsup_test_g = unsup_data
    
    
    
    #get node IDs for train/text/val sets
    sup_train_nid = th.nonzero(sup_train_g.ndata['train_mask'], as_tuple=True)[0]
    sup_val_nid = th.nonzero(sup_val_g.ndata['val_mask'], as_tuple=True)[0]
    sup_test_nid = th.nonzero(~(sup_test_g.ndata['train_mask'] | sup_test_g.ndata['val_mask']), as_tuple=True)[0]
    
    unsup_train_nid = th.nonzero(unsup_train_g.ndata['train_mask'], as_tuple=True)[0]
    unsup_val_nid = th.nonzero(unsup_val_g.ndata['val_mask'], as_tuple=True)[0]
    unsup_test_nid = th.nonzero(~(unsup_test_g.ndata['train_mask'] | unsup_test_g.ndata['val_mask']), as_tuple=True)[0]
    
    
    
    # Create PyTorch DataLoaders for constructing blocks
    sampler = dgl.dataloading.MultiLayerNeighborSampler([15,10])
    
    sup_dataloader = dgl.dataloading.NodeDataLoader(
        sup_train_g,
        sup_train_nid,
        sampler,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False)
    
    unsup_dataloader = dgl.dataloading.NodeDataLoader(
        unsup_train_g,
        unsup_train_nid,
        sampler,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False)
    
    
    # Define model and optimizer
    model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
    loss_fcn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
    avg = 0
    iter_tput = []
    for epoch in range(args.num_epochs):
        tic = time.time()
        
#         if epoch == 0: # debugging
#             continue

        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        tic_step = time.time()
        for step, dl in enumerate(zip(sup_dataloader,unsup_dataloader)):
            #extract inputs from both data loaders
            (sup_input_nodes, sup_seeds, sup_blocks) = dl[0]
            (unsup_input_nodes, unsup_seeds, unsup_blocks) = dl[0]
            
            #test that each batch of nodes is the same
            assert th.equal(sup_input_nodes,unsup_input_nodes)
            assert th.equal(sup_seeds, unsup_seeds)
            
            
            # Load the input features as well as output labels
            #batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
            sup_blocks = [block.int() for block in sup_blocks]
            sup_batch_inputs = sup_blocks[0].srcdata['features']
            sup_batch_labels = sup_blocks[-1].dstdata['label']
            
            unsup_blocks = [block.int() for block in unsup_blocks]
            unsup_batch_inputs = unsup_blocks[0].srcdata['features']
            unsup_batch_labels = unsup_blocks[-1].dstdata['label']

            
            
            # Compute loss for each graph and add in additional term
            sup_batch_pred = model(sup_blocks, sup_batch_inputs)
            sup_loss = loss_fcn(sup_batch_pred, sup_batch_labels)
            
            unsup_batch_pred = model(unsup_blocks, unsup_batch_inputs)
            unsup_loss = loss_fcn(unsup_batch_pred, unsup_batch_labels)
            
            additional_term = th.dist(sup_batch_pred, unsup_batch_pred,2)/args.batch_size
            
            
            
            #final loss and backprop
            loss = sup_loss + unsup_loss + additional_term
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            iter_tput.append(len(sup_seeds) / (time.time() - tic_step))
            if step % args.log_every == 0:
                acc = compute_acc(sup_batch_pred, sup_batch_labels)
                print('Epoch {:05d} | Step {:09d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f}'.format(
                    epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:])))
                with open('./logs/log_file.txt', 'a') as f:
                    f.write('Epoch {:05d} | Step {:09d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f}\n'.format(
                    epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:])))
            tic_step = time.time()

        toc = time.time()
        print('Epoch Time(s): {:.4f}'.format(toc - tic))
        
        with open('./logs/log_file.txt', 'a') as f:
            f.write('Epoch Time(s): {:.4f}\n\n'.format(toc - tic))
            f.write('________________________________________________\n\n\n')
        
        if epoch >= 5:
            avg += toc - tic
        if epoch % args.eval_every == 0 and epoch != 0:
            eval_acc = evaluate(model, unsup_val_g, unsup_val_g.ndata['features'], unsup_val_g.ndata['label'], unsup_val_nid, args.batch_size)
            print('Eval Acc {:.4f}'.format(eval_acc))
            test_acc = evaluate(model, unsup_test_g, unsup_test_g.ndata['features'], unsup_test_g.ndata['label'], unsup_test_nid, args.batch_size)
            print('Test Acc: {:.4f}'.format(test_acc))
            
            with open('./logs/log_file.txt', 'a') as f:
                f.write('________________________________________________\n\n\n')
                f.write('Eval Acc {:.4f}\n'.format(eval_acc))
                f.write('Test Acc: {:.4f}\n'.format(test_acc))
                f.write('________________________________________________\n\n\n')
                
                
            
        th.save(model.state_dict(), f'./mini_batch_models/model{epoch}')

    #print('Avg epoch time: {}'.format(avg / (epoch - 4)))


In [None]:
class arg_holder():
    def __init__(self, num_epochs=20, num_hidden=256, num_layers=2, batch_size=1024, log_every=100, eval_every=1, lr=0.003, dropout=0.5):
        self.num_epochs = num_epochs
        self.num_hidden = num_hidden
        self.num_layers= num_layers
        self.batch_size = batch_size
        self.log_every = log_every
        self.eval_every = eval_every
        self.lr = lr
        self.dropout = dropout
        
        
# argparser = argparse.ArgumentParser("mini-batch training")
# argparser.add_argument('--num-epochs', type=int, default=20)
# argparser.add_argument('--num-hidden', type=int, default=256)
# argparser.add_argument('--num-layers', type=int, default=2)
# argparser.add_argument('--batch-size', type=int, default=1000)
# argparser.add_argument('--log-every', type=int, default=4)
# argparser.add_argument('--eval-every', type=int, default=4)
# argparser.add_argument('--lr', type=float, default=0.003)
# argparser.add_argument('--dropout', type=float, default=0.5)
# args = argparser.parse_args()

args = arg_holder()

#get dimension of embedded node features and number of classes
in_feats = sup_graph.ndata['features'].shape[1]
n_classes = len(set(unsup_graph.ndata['label'].tolist()))

sup_data = in_feats, n_classes, sup_train_graph, sup_val_graph, sup_test_graph
unsup_data = in_feats, n_classes, unsup_train_graph, unsup_val_graph, unsup_test_graph

run(args, sup_data, unsup_data)

Epoch 00000 | Step 000000000 | Loss 9.1108 | Train Acc 0.0088 | Speed (samples/sec) nan
Epoch 00000 | Step 000000100 | Loss 7.9164 | Train Acc 0.5928 | Speed (samples/sec) 1695.4224
Epoch 00000 | Step 000000200 | Loss 7.8910 | Train Acc 0.6055 | Speed (samples/sec) 1723.2751
Epoch 00000 | Step 000000300 | Loss 7.9281 | Train Acc 0.5869 | Speed (samples/sec) 1734.3549


## Test Stuff

In [41]:
# Unpack data - sup and unsup
sup_in_feats, sup_n_classes, sup_train_g, sup_val_g, sup_test_g = sup_data
unsup_in_feats, unsup_n_classes, unsup_train_g, unsup_val_g, unsup_test_g = unsup_data



#get node IDs for train/val/test sets
sup_train_nid = th.nonzero(sup_train_g.ndata['train_mask'], as_tuple=True)[0]
sup_val_nid = th.nonzero(sup_val_g.ndata['val_mask'], as_tuple=True)[0]
sup_test_nid = th.nonzero(~(sup_test_g.ndata['train_mask'] | sup_test_g.ndata['val_mask']), as_tuple=True)[0]

unsup_train_nid = th.nonzero(unsup_train_g.ndata['train_mask'], as_tuple=True)[0]
unsup_val_nid = th.nonzero(unsup_val_g.ndata['val_mask'], as_tuple=True)[0]
unsup_test_nid = th.nonzero(~(unsup_test_g.ndata['train_mask'] | unsup_test_g.ndata['val_mask']), as_tuple=True)[0]



# Create PyTorch DataLoaders for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler([15,10])

sup_dataloader = dgl.dataloading.NodeDataLoader(
    sup_train_g,
    sup_train_nid,
    sampler,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False)

unsup_dataloader = dgl.dataloading.NodeDataLoader(
    unsup_train_g,
    unsup_train_nid,
    sampler,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False)


In [None]:
# Define model and optimizer
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
loss_fcn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)
for step, dl in enumerate(zip(sup_dataloader,unsup_dataloader)):
            #extract inputs from both data loaders
            (sup_input_nodes, sup_seeds, sup_blocks) = dl[0]
            (unsup_input_nodes, unsup_seeds, unsup_blocks) = dl[0]
            
            assert th.equal(sup_input_nodes,unsup_input_nodes)
            assert th.equal(sup_seeds, unsup_seeds)
            print(sup_input_nodes, '\n', sup_seeds, '\n', sup_blocks)
            print()
            print()
            print(unsup_input_nodes,'\n', unsup_seeds, '\n', unsup_blocks)
            print()
            print('___________________________________________________')
            
            # Load the input features as well as output labels
            #batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
            sup_blocks = [block.int() for block in sup_blocks]
            sup_batch_inputs = sup_blocks[0].srcdata['features']
            sup_batch_labels = sup_blocks[-1].dstdata['label']
            
            unsup_blocks = [block.int() for block in unsup_blocks]
            unsup_batch_inputs = unsup_blocks[0].srcdata['features']
            unsup_batch_labels = unsup_blocks[-1].dstdata['label']
            
            # Compute loss for each graph and add in additional term
            sup_batch_pred = model(sup_blocks, sup_batch_inputs) #pass in our graph and features
            sup_loss = loss_fcn(sup_batch_pred, sup_batch_labels) #compute loss from here
            
            unsup_batch_pred = model(unsup_blocks, unsup_batch_inputs)
            unsup_loss = loss_fcn(unsup_batch_pred, unsup_batch_labels)
            
            additional_term = th.dist(sup_batch_pred, unsup_batch_pred,2)/args.batch_size
            

In [103]:
sup_batch_inputs.shape

torch.Size([65326, 512])

In [104]:
sup_blocks

[Block(num_src_nodes=65326, num_dst_nodes=8620, num_edges=92996),
 Block(num_src_nodes=8620, num_dst_nodes=1024, num_edges=8319)]

In [85]:
sup_seeds

tensor([282331, 164362, 664126,  ..., 784736, 284825, 279173])

In [83]:
unsup_seeds.shape

torch.Size([1024])

In [93]:
sup_blocks[0].int()

Block(num_src_nodes=65326, num_dst_nodes=8620, num_edges=92996)

In [94]:
unsup_blocks[0].int()

Block(num_src_nodes=65326, num_dst_nodes=8620, num_edges=92996)

In [96]:
sup_blocks[0].srcdata['features'].shape

torch.Size([65326, 512])

In [98]:
sup_blocks[-1].dstdata['label'].shape

torch.Size([1024])