# GraphSage Model Training
Now that we've constructed the graphs, we can train on a simple 2-layer GraphSage network.  The procedure will be as follows:
- Initialize our network
- Send in mini-batches from both training graphs and save both of their outputs
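- Compute loss as follows: (supervised_cross_entropy + unsupervised_cross_entropy) / 2 + L2_distance(softmax(supervised_output), softmax(unsupervised_output)) / batch_size --> the cross-entropy terms take raw logits; only the distance term is computed post-softmax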
Evaluation will be done by adding one node at a time from our validation nodes to our unsupervised graph.  From there, we will extract its two-hop subgraph and pass this through our network to predict the class.

In [2]:
import numpy as np
import pandas as pd
import torch as th
import dgl
import scipy
import networkx as nx
from progressbar import progressbar
import time
import random
from tqdm import tqdm_notebook as tqdm

import pickle

from dgl.data.utils import save_graphs, load_graphs, split_dataset

import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.utils import io


Using backend: pytorch


## Load Data

In [4]:
# Read back the graphs that were computed and saved earlier.
# The first stored graph is used as the unsupervised graph, the second as the supervised one.
glist, label_dict = load_graphs("new_train_graphs.bin")
unsup_graph = glist[0]
sup_graph = glist[1]

# Read back the pickled validation dictionary.
with open('final_validation_data.pickle', 'rb') as handle:
    validation_dict = pickle.load(handle)

In [10]:
from collections import Counter

# Gather the 'label' field of every validation entry, then list the
# (label, count) pairs from most to least frequent.
lab_holder = [entry['label'] for entry in validation_dict.values()]
counts = Counter(lab_holder)
sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

[(8, 4823),
 (91, 2393),
 (0, 1917),
 (88, 1688),
 (84, 1371),
 (2, 1307),
 (3, 1290),
 (82, 1129),
 (83, 1037),
 (85, 951),
 (76, 853),
 (90, 791),
 (79, 761),
 (80, 747),
 (87, 691),
 (71, 676),
 (86, 655),
 (89, 650),
 (70, 650),
 (73, 641),
 (81, 640),
 (60, 631),
 (1, 549),
 (45, 548),
 (65, 546),
 (72, 543),
 (46, 494),
 (75, 449),
 (67, 435),
 (51, 424),
 (66, 413),
 (50, 391),
 (69, 390),
 (77, 386),
 (74, 382),
 (64, 356),
 (62, 338),
 (56, 337),
 (59, 332),
 (78, 324),
 (58, 321),
 (61, 298),
 (57, 280),
 (23, 270),
 (54, 267),
 (63, 251),
 (39, 242),
 (47, 208),
 (36, 195),
 (38, 194),
 (41, 194),
 (52, 184),
 (68, 183),
 (22, 177),
 (44, 172),
 (25, 150),
 (26, 149),
 (35, 141),
 (27, 134),
 (29, 133),
 (14, 130),
 (33, 117),
 (37, 112),
 (24, 107),
 (49, 98),
 (28, 92),
 (43, 92),
 (13, 85),
 (15, 79),
 (16, 75),
 (30, 74),
 (18, 74),
 (17, 73),
 (9, 73),
 (48, 67),
 (20, 62),
 (42, 60),
 (31, 57),
 (4, 51),
 (6, 49),
 (34, 48),
 (21, 45),
 (32, 45),
 (7, 42),
 (12, 35),
 

In [3]:
# Replace each training graph with its self-loop-augmented version.
sup_graph, unsup_graph = (g.add_self_loop() for g in (sup_graph, unsup_graph))

## Define Model

In [4]:
class StochasticTwoLayerGCN(nn.Module):
    """Two stacked SAGEConv layers ('pool' aggregator), each followed by a ReLU.

    `forward` expects `blocks` to be indexable with at least two entries:
    `blocks[0]` is given to the first layer and `blocks[1]` to the second.
    """

    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        sage_layer = dgl.nn.SAGEConv
        self.conv1 = sage_layer(in_features, hidden_features, aggregator_type='pool')
        self.conv2 = sage_layer(hidden_features, out_features, aggregator_type='pool')

    def forward(self, blocks, x):
        # first layer + non-linearity
        hidden = self.conv1(blocks[0], x)
        hidden = F.relu(hidden)
        # second layer + non-linearity
        out = self.conv2(blocks[1], hidden)
        return F.relu(out)
    
    
class SAGE(nn.Module):
    """Two SAGEConv layers ('pool' aggregator) followed by a linear classifier.

    Args:
        in_feats: size of the input node features.
        hid_feats1: output size of the first SAGEConv layer.
        hid_feats2: output size of the second SAGEConv layer.
        out_feats: number of outputs of the final linear layer (one per class).
        dropout: dropout probability applied after each SAGEConv + ReLU.

    `forward(blocks, inputs)` passes `blocks[0]` to the first layer and
    `blocks[1]` to the second, and returns the raw (unnormalised) logits of
    the linear layer.
    """

    def __init__(self, in_feats, hid_feats1, hid_feats2, out_feats, dropout=0.3):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats1, aggregator_type='pool')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats1, out_feats=hid_feats2, aggregator_type='pool')
        self.dense = nn.Linear(in_features=hid_feats2, out_features=out_feats)
        self.dropout = nn.Dropout(dropout)

    def forward(self, blocks, inputs):
        # inputs are features of nodes
        h = inputs

        h = self.conv1(blocks[0], h)
        h = F.relu(h)
        h = self.dropout(h)

        h = self.conv2(blocks[1], h)
        h = F.relu(h)
        h = self.dropout(h)

        # Return the dense layer's output untouched:
        #  - no softmax, because F.cross_entropy applies log-softmax itself;
        #  - no ReLU either: clamping logits at zero means a class whose logit
        #    is negative receives no gradient and can never be pushed back up.
        # The layer/parameter names are unchanged, so existing state_dicts
        # still load.
        return self.dense(h)

## Validation Setup
- Load in precomputed subgraphs necessary to do evaluation --> will greatly speed up evaluation
- Write code to evaluate validation nodes one subgraph at a time

In [6]:
def validation_evaluation(subgraph_holder, model, top_k=5):
    """Score `model` on precomputed validation subgraphs, one at a time.

    Args:
        subgraph_holder: iterable of precomputed entries; element [2] of each
            entry is the list of blocks fed to the model. Features are read
            from `blocks[0].srcdata['features']` and the label from
            `blocks[-1].dstdata['labels']`.
            NOTE(review): each entry is expected to have exactly one output
            node, since `.item()` is called on the label and the prediction.
        model: callable as `model(blocks, features)`, returning one row of
            class scores per output node. It is switched to eval mode and is
            left in eval mode on return.
        top_k: how many of the highest-scoring classes count as a hit for the
            second accuracy (default 5). Clamped to the number of classes.

    Returns:
        Tuple `(top_1_accuracy, top_k_accuracy)` as floats. Both are NaN when
        there is nothing to evaluate.
    """
    undefined = (float('nan'), float('nan'))

    # A sized, empty input has no accuracy to report; skip the progress bar.
    if hasattr(subgraph_holder, '__len__') and len(subgraph_holder) == 0:
        return undefined

    val_1_correct = []  # per node: was the top-scoring class the true label?
    val_5_correct = []  # per node: was the true label among the top_k classes?

    # Eval mode and no_grad only need to be entered once, not per subgraph.
    model.eval()
    with th.no_grad():
        for subgraph in progressbar(subgraph_holder):
            temp_blocks = subgraph[2]

            # extract our features and label to pass through the model and evaluate
            temp_input_features = temp_blocks[0].srcdata['features']
            true_label = temp_blocks[-1].dstdata['labels'].item()

            temp_outputs = model(temp_blocks, temp_input_features)
            _, temp_indices = th.max(temp_outputs, dim=1)

            # topk raises if k exceeds the number of classes, so clamp it
            k = min(top_k, temp_outputs.shape[1])
            _, top_k_indices = th.topk(temp_outputs, k, dim=1)

            val_1_correct.append(temp_indices.item() == true_label)
            val_5_correct.append(true_label in top_k_indices[0].tolist())

    # An unsized iterable (e.g. a generator) may also turn out to be empty.
    if not val_1_correct:
        return undefined

    val_1_accuracy = sum(val_1_correct) / len(val_1_correct)
    val_5_accuracy = sum(val_5_correct) / len(val_5_correct)
    return (val_1_accuracy, val_5_accuracy)

## Minibatch Training Setup
- set up our dataloaders to properly sample the same nodes from both graphs
- if I pass only 1024 node IDs to both dataloaders and set the batch size to 1024 --> both are guaranteed to sample the same nodes
    - this requires splitting the input nodes into batches of 1024 IDs that I can feed in one at a time
- pass through our model in minibatches and backprop loss

In [7]:
# Number of node IDs handed to the dataloaders at a time
BATCH_SIZE = 1024

# Take every node ID of the unsupervised graph and put them in random order
train_nids = unsup_graph.nodes().tolist()
random.shuffle(train_nids)

# Cut the shuffled IDs into consecutive chunks of BATCH_SIZE (the last chunk may be shorter).
# Each chunk is later passed to both dataloaders so that the same nodes are sampled from each graph.
node_batches = [
    train_nids[start:start + BATCH_SIZE]
    for start in range(0, len(train_nids), BATCH_SIZE)
]

In [8]:
import os  # used only in this cell, to create the output directories

EPOCHS = 20
BATCH_SIZE = 1024  # nominal batch size; per-step sizes below come from the batch itself
EVAL_EVERY = 1  # evaluate every n epochs
RECORD_EVERY = 30  # record every n minibatch results
LOG_PATH = 'logfiles/minibatch_log_file.txt'
MODEL_DIR = 'minibatch_models'

# Create the output locations up front: otherwise a missing model directory
# is only discovered at th.save, after a full epoch of training + validation.
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

#set up model parameters
in_features = unsup_graph.ndata['features'].shape[1]
# how many unique classes = number of outputs
# NOTE(review): assumes labels are contiguous 0..C-1; a gap would give
# cross_entropy an out-of-range target -- confirm against the data.
out_features = len(set(unsup_graph.ndata['labels'].tolist()))
hidden_features_1 = 256
hidden_features_2 = 128

#load in all the subgraphs we need for evaluation
subgraph_holder = th.load('val_subgraphs/full_subgraph_list')

model = SAGE(in_features, hidden_features_1, hidden_features_2, out_features, 0.3)
opt = th.optim.Adam(model.parameters())

# One sampler serves every step and both graphs, so build it once.
# Per the DGL docs the fan-outs are listed per GNN layer: up to 15 neighbors
# per node for the first layer's block and up to 10 for the second layer's.
sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10])

for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()  # validation at the end of the previous epoch left the model in eval mode

    # batch_nids (not train_nids) so the full shuffled ID list is not overwritten
    for step, batch_nids in enumerate(progressbar(node_batches)):
        n_batch = len(batch_nids)

        # One dataloader per graph, each fed the same node IDs. batch_size is
        # the number of IDs, so the single batch drawn below covers all of them.
        sup_dataloader = dgl.dataloading.NodeDataLoader(
            sup_graph, batch_nids, sampler,
            batch_size=n_batch,
            shuffle=True,
            drop_last=False,
            num_workers=0)
        unsup_dataloader = dgl.dataloading.NodeDataLoader(
            unsup_graph, batch_nids, sampler,
            batch_size=n_batch,
            shuffle=True,
            drop_last=False,
            num_workers=0)

        #generate graph dependency for each batch of nodes for both graphs
        sup_input_nodes, sup_output_nodes, sup_blocks = next(iter(sup_dataloader))
        unsup_input_nodes, unsup_output_nodes, unsup_blocks = next(iter(unsup_dataloader))

        #make sure the same nodes were sampled
        assert sorted(sup_output_nodes.tolist()) == sorted(unsup_output_nodes.tolist())

        #extract our features and labels for supervised/unsupervised graphs
        sup_input_features = sup_blocks[0].srcdata['features']
        sup_output_labels = sup_blocks[-1].dstdata['labels']
        unsup_input_features = unsup_blocks[0].srcdata['features']
        unsup_output_labels = unsup_blocks[-1].dstdata['labels']

        #pass each subgraph through our model to get our predictions
        sup_output_predictions = model(sup_blocks, sup_input_features)
        unsup_output_predictions = model(unsup_blocks, unsup_input_features)

        #get the training accuracy for both graphs
        _, sup_indices = th.max(sup_output_predictions, dim=1)
        sup_correct = th.sum(sup_indices == sup_output_labels)
        sup_train_accuracy = sup_correct.item() * 1.0 / len(sup_output_labels)

        _, unsup_indices = th.max(unsup_output_predictions, dim=1)
        unsup_correct = th.sum(unsup_indices == unsup_output_labels)
        unsup_train_accuracy = unsup_correct.item() * 1.0 / len(unsup_output_labels)

        #reorder results so vectors correspond to the same nodes in both graphs --> necessary to do the Euclidean distance later
        #(each dataloader shuffles independently, so the row order differs between the two outputs)
        sup_ids = sup_blocks[1].dstdata['_ID']
        unsup_ids = unsup_blocks[1].dstdata['_ID']
        _, sorted_sup_indices = sup_ids.sort()  # how to reorder the supervised predictions/labels
        _, sorted_unsup_indices = unsup_ids.sort()  # how to reorder the unsupervised predictions/labels

        #reorder our labels and output predictions according to their corresponding node IDs
        sup_output_predictions = sup_output_predictions[sorted_sup_indices]
        unsup_output_predictions = unsup_output_predictions[sorted_unsup_indices]
        sup_output_labels = sup_output_labels[sorted_sup_indices]
        unsup_output_labels = unsup_output_labels[sorted_unsup_indices]

        #check to make sure the labels match up --> if the nodes are in the same order then the labels should be too
        assert th.equal(sup_output_labels, unsup_output_labels)

        #compute the loss for both results, calculate the distance between their softmax outputs, then combine these values
        sup_loss = F.cross_entropy(sup_output_predictions, sup_output_labels)
        unsup_loss = F.cross_entropy(unsup_output_predictions, unsup_output_labels)
        # Scale by the actual number of nodes in this batch: the last batch is
        # usually shorter, and the cross-entropy terms already average over
        # the real batch.
        distance_term = th.dist(
            F.softmax(sup_output_predictions, dim=1),
            F.softmax(unsup_output_predictions, dim=1)) / n_batch
        loss = (sup_loss + unsup_loss) / 2 + distance_term

        #backprop through network to update weights
        opt.zero_grad()
        loss.backward()
        opt.step()

        #record statistics for mini batches
        if step % RECORD_EVERY == 0:
            with io.capture_output() as captured:  # suppress weird printing from files
                with open(LOG_PATH, 'a') as f:
                    f.write('Epoch {:04d} | Step {:05d} | Total Loss {:.4f} | Sup Loss {:.4f} | Unsup Loss {:.4f} | Distance Term {:.4f} | Sup Train Acc {:.4f} | Unsup Train Acc {:.4f}\n'.format(
                        epoch, step, loss.item(), sup_loss.item(), unsup_loss.item(),
                        distance_term.item(), sup_train_accuracy, unsup_train_accuracy))

    with io.capture_output() as captured:  # suppress weird printing from files
        #record epoch time
        end_time = time.time()
        with open(LOG_PATH, 'a') as f:
            f.write(f'\nEpoch Time: {end_time-start_time} seconds\n\n')

        #evaluate on validation set and record results
        if epoch % EVAL_EVERY == 0:
            val_1_accuracy, val_5_accuracy = validation_evaluation(subgraph_holder, model)

            with open(LOG_PATH, 'a') as f:
                f.write('\n\n\n________________________________________________\n')
                f.write('VALIDATION SET EVALUATION\n')
                f.write(f'Validation 1 Accuracy: {val_1_accuracy}\nValidation 5 Accuracy: {val_5_accuracy}\n')
                f.write('\n________________________________________________\n\n\n')

        #Save model
        th.save(model.state_dict(), f'{MODEL_DIR}/model{epoch}')
    



100% (391 of 391) |######################| Elapsed Time: 0:30:16 Time:  0:30:16
100% (40000 of 40000) |##################| Elapsed Time: 0:02:13 Time:  0:02:13
100% (391 of 391) |######################| Elapsed Time: 0:29:56 Time:  0:29:56
100% (391 of 391) |######################| Elapsed Time: 0:29:35 Time:  0:29:35
100% (391 of 391) |######################| Elapsed Time: 0:29:31 Time:  0:29:31
100% (391 of 391) |######################| Elapsed Time: 0:29:33 Time:  0:29:33
100% (391 of 391) |######################| Elapsed Time: 0:29:13 Time:  0:29:13
100% (40000 of 40000) |##################| Elapsed Time: 0:02:13 Time:  0:02:13
100% (391 of 391) |######################| Elapsed Time: 0:29:10 Time:  0:29:10
100% (391 of 391) |######################| Elapsed Time: 0:29:07 Time:  0:29:07
100% (391 of 391) |######################| Elapsed Time: 0:29:02 Time:  0:29:02
100% (391 of 391) |######################| Elapsed Time: 0:28:53 Time:  0:28:53
100% (391 of 391) |#####################