In [17]:
%load_ext snakeviz

### E2E benchmarks currently: 
```
69_619.04 samples/sec  DGL CUDA
17_911 samples/sec  DGL CPU
20387.86 samples/sec cuGRAPH 
(We should be able to 2x this when etypes can be directly set in cugraph Graph)
```

## Setup Memory Pool

In [2]:
import sys
import rmm

# Switch RMM to a pooled allocator so device allocations are served from a
# pre-reserved pool that starts at 5e9 and may grow up to 22e9
# (presumably bytes -- confirm against the RMM documentation).
rmm.reinitialize(pool_allocator=True,initial_pool_size=5e+9, maximum_pool_size=22e+9)

# OGBN-MAG Dataset Training

In [3]:
from ogb.nodeproppred import DglNodePropPredDataset, Evaluator
from dgl import AddReverse, Compose, ToSimple
import dgl

import torch as th
import torch.nn.functional as F

from tqdm import tqdm
import itertools
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
### Local Imports from this folder
from logger import Logger

## Step 1.a:  Load the Graph

In [5]:
def load_dgl_graph(paper_dim_to_load=10, root='/datasets/vjawa/gnn/'):
    """Load the ogbn-mag heterograph and trim the "paper" feature width.

    Parameters
    ----------
    paper_dim_to_load : int, default 10
        Number of leading columns of the "paper" node feature ``feat`` to keep.
        Keeping fewer columns reduces memory for non-distributed runs.
    root : str, default '/datasets/vjawa/gnn/'
        Directory passed to ``DglNodePropPredDataset`` as ``root``. The default
        is the location the notebook was originally written against; pass a
        different path to run it on another machine.

    Returns
    -------
    tuple
        ``(g, labels, num_classes, split_idx)`` where ``g`` is the graph after
        ``Compose([ToSimple(), AddReverse()])`` has been applied, ``labels`` is
        the flattened "paper" label tensor, ``num_classes`` comes from the
        dataset and ``split_idx`` is the result of ``dataset.get_idx_split()``.
    """
    dataset = DglNodePropPredDataset(name="ogbn-mag", root=root)
    split_idx = dataset.get_idx_split()
    # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
    g, labels = dataset[0]

    # The node data has to be cleared and re-attached in order to replace the
    # "paper" features with their truncated version.
    ndata = {k: v for k, v in g.ndata.items()}
    g.ndata.clear()
    ndata['feat']['paper'] = ndata['feat']['paper'][:, :paper_dim_to_load]
    g.ndata.update(ndata)

    labels = labels["paper"].flatten()
    transform = Compose([ToSimple(), AddReverse()])
    g = transform(g)
    print("Loaded graph: {}".format(g))
    return g, labels, dataset.num_classes, split_idx


In [6]:
# Number of "paper" feature columns to keep, and the device everything runs on.
load_d_feature = 10
device = 'cuda'

# Load the graph and check that the feature truncation actually took effect.
g, labels, num_classes, split_idx = load_dgl_graph(load_d_feature)
assert g.ndata['feat']['paper'].shape[1] == load_d_feature


# Move the graph to the target device; `labels` and `split_idx` are not moved here.
g = g.to(device)

Loaded graph: Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'affiliated_with', 'institution'): 1043998, ('author', 'writes', 'paper'): 7145660, ('field_of_study', 'rev_has_topic', 'paper'): 7505078, ('institution', 'rev_affiliated_with', 'author'): 1043998, ('paper', 'cites', 'paper'): 10832542, ('paper', 'has_topic', 'field_of_study'): 7505078, ('paper', 'rev_writes', 'author'): 7145660},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('institution', 'author', 'rev_affiliated_with'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic'), ('paper', 'author', 'rev_writes'), ('field_of_study', 'paper', 'rev_has_topic')])


## Step 1.b Convert Graph from DGL Graph to cugraph

In [7]:
### Local Imports from this folder
from dgl_cugraph_conversion_utils import add_edges, add_nodes, add_ndata

In [8]:
def graphstore_from_hetrograph(gs, g):
    """Copy nodes, edges and node data from the DGL heterograph ``g`` into ``gs``.

    The per-node-type node counts of ``g`` are collected once and passed to each
    of the conversion helpers, which are run in the order nodes -> edges ->
    node data. A progress message is printed after each stage.
    """
    node_counts = {}
    for ntype in g.ntypes:
        node_counts[ntype] = g.num_nodes(ntype)

    stages = (
        (add_nodes, "Added Nodes"),
        (add_edges, "Added Edges"),
        (add_ndata, "Added ndata"),
    )
    for populate, message in stages:
        populate(gs, g, node_counts)
        print(message)
    # TODO: edge data (add_edata) is not copied yet.

# Build a CuGraphStorage that uses the same id dtype as the DGL graph and
# populate it from `g`.
gs = dgl.contrib.cugraph.CuGraphStorage(idtype=g.idtype)
graphstore_from_hetrograph(gs, g)

Added Nodes
Added Edges
Added ndata


#### Verify Similar Structure

In [9]:
## Testing Similar Structure
from dgl_cugraph_conversion_utils import assert_same_num_nodes, assert_same_num_edges

# A bare `==` expression in the middle of a cell is evaluated and thrown away,
# so a mismatch would go unnoticed; assert it instead.
assert gs.num_edges(etype='affiliated_with') == g.num_edges(etype='affiliated_with'), (
    "edge count for etype 'affiliated_with' differs between the graph store and the DGL graph"
)
assert_same_num_edges(gs, g)
assert_same_num_nodes(gs, g)

### Update g to graphstore

In [10]:
# From here on `g` refers to the CuGraphStorage rather than the DGL graph.
# NOTE: this cell is not idempotent -- `gs` is deleted, so running it a second
# time without re-running the conversion cell raises NameError.
g = gs
del gs

## Step 1.c Initiate Sampler and Train Loader

In [11]:
#### Run only on the first 100_000 indices of each split to test the pipeline
n_samples = 100_000

# For every split, truncate each node type's index tensor and move it to `device`.
subset_split_idx = {
    split_name: {
        ntype: idx[:n_samples].to(device)
        for ntype, idx in split_idx[split_name].items()
    }
    for split_name in ('train', 'valid', 'test')
}

# Two-layer neighbour sampling with fan-outs 25 and 20; the "paper" `feat`
# node feature is requested for prefetching.
sampler = dgl.dataloading.MultiLayerNeighborSampler(
    [25, 20],
    prefetch_node_feats={'paper': ['feat']},
)

# The loader iterates over the truncated training indices (not the full `split_idx`).
train_loader = dgl.dataloading.DataLoader(
    g,
    subset_split_idx["train"],
    sampler,
    batch_size=8192,
    shuffle=True,
    num_workers=0,
    device=device,
)

## Step 2. Initiate a Model

### Training Hyperparameters

In [12]:
from model import  rel_graph_embed,extract_embed
from model import EntityClassify

In [13]:
# Number of independent training runs; the Logger is constructed with that count.
nruns = 2
logger = Logger(nruns)

In [14]:
# Fetch the "feat" entry of "paper" node 0 from the graph store only to read
# the feature width (second dimension of the fetched tensor).
first_index = torch.IntTensor([0]).to(device)
feat_shape = g.get_node_storage(key='feat', ntype='paper').fetch(first_index, device=device).shape[1]

In [15]:
# The embedding width is set equal to the raw "paper" feature width, and both
# the embedding layer and the classifier are placed on `device`.
embedding_shape = feat_shape
embed_layer = rel_graph_embed(g, embedding_shape).to(device)
model = EntityClassify(g, embedding_shape, num_classes).to(device)

# Report the parameter counts of the two modules.
print(
    f"Number of embedding parameters: {sum(p.numel() for p in embed_layer.parameters())}"
)
print(
    f"Number of model parameters: {sum(p.numel() for p in model.parameters())}"
)

Number of embedding parameters: 12033540
Number of model parameters: 254388


## Benchmark Training Loop
```
Total Time = 6.8 s
Forward pass= 1.25 s (18% time)
DataLoader/Sampling = 5.52 s (80% time) 

In Data Loading : 
4.9 s in sample_neighbours 
3.94s _get_edgeid_type_d (This should go away soon) 
``` 

In [28]:
# def sampling_behavior_benchmark(g, seed_nodes,labels, node_embed, train_loader):
#     category = "paper"
#     all_params = itertools.chain(model.parameters(), embed_layer.parameters())
#     optimizer = th.optim.Adam(all_params, lr=0.01)

        
#     for input_nodes, seeds, blocks in train_loader:
#         seeds = seeds[category]
#         emb = extract_embed(node_embed, input_nodes)
#         feat = blocks[0].srcdata['feat']['paper']
#         # Add the batch's raw "paper" features
#         emb.update(
#             {"paper": feat}
#             #{"paper": g.ndata["feat"]["paper"][input_nodes["paper"]]}
#         )

#         emb = {k: e.to(device) for k, e in emb.items()}
#         lbl = labels[seeds].to(device)
#         optimizer.zero_grad()
#         logits = model(emb, blocks)[category]

#         y_hat = logits.log_softmax(dim=-1)
#         loss = F.nll_loss(y_hat, lbl)
#     return None
# %snakeviz sampling_behavior_benchmark(g, subset_split_idx['train'],labels,embed_layer, train_loader)

## Model Training

In [29]:
def train(
    g,
    model,
    node_embed,
    optimizer,
    train_loader,
    split_idx,
    labels,
    logger,
    device,
    run,
    num_epochs=5,
):
    """Train ``model`` and ``node_embed`` on "paper" nodes, then evaluate once.

    Parameters
    ----------
    g : graph object forwarded unchanged to ``test``.
    model : module called as ``model(emb, blocks)``; its "paper" output is used.
    node_embed : embedding module handed to ``extract_embed``.
    optimizer : optimizer stepped once per mini-batch.
    train_loader : iterable yielding ``(input_nodes, seeds, blocks)``.
    split_idx : mapping such that ``split_idx["train"]["paper"]`` holds the
        training indices; its length is used as the progress-bar total and as
        the loss normaliser.
    labels : tensor of "paper" labels, indexed by the seed node ids.
    logger : object exposing ``add_result(run, result)``.
    device : device the blocks, embeddings and labels are moved to.
    run : zero-based run index used for logging.
    num_epochs : int, default 5
        Number of passes over ``train_loader``. Must be at least 1.

    Returns
    -------
    The ``logger`` that was passed in, after the result has been recorded.

    Raises
    ------
    ValueError
        If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    print("start training...")
    category = "paper"
    num_train = split_idx["train"][category].shape[0]

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # The context manager closes the bar even if a batch raises.
        with tqdm(total=num_train) as pbar:
            pbar.set_description(f"Epoch {epoch:02d}")

            for input_nodes, seeds, blocks in train_loader:
                blocks = [blk.to(device) for blk in blocks]
                # we only predict the nodes with type "category"
                seeds = seeds[category]
                batch_size = seeds.shape[0]

                emb = extract_embed(node_embed, input_nodes)

                # Add the batch's raw "paper" features, read from the first
                # block's source data.
                feat = blocks[0].srcdata['feat']['paper']
                emb.update({"paper": feat})

                emb = {k: e.to(device) for k, e in emb.items()}
                # Index on the labels' own device: the seeds may live on the
                # GPU while the labels are on the CPU, and recent PyTorch
                # versions refuse cross-device index tensors.
                lbl = labels[seeds.to(labels.device)].to(device)

                optimizer.zero_grad()
                logits = model(emb, blocks)[category]

                y_hat = logits.log_softmax(dim=-1)
                loss = F.nll_loss(y_hat, lbl)
                loss.backward()
                optimizer.step()

                total_loss += loss.item() * batch_size
                pbar.update(batch_size)

        loss = total_loss / num_train

    # Evaluation runs once, after the last epoch; `epoch` and `loss` below
    # therefore refer to the final epoch.
    result = test(g, model, node_embed, labels, device, split_idx)
    logger.add_result(run, result)
    train_acc, valid_acc, test_acc = result
    print(
        f"Run: {run + 1:02d}, "
        f"Epoch: {epoch +1 :02d}, "
        f"Loss: {loss:.4f}, "
        f"Train: {100 * train_acc:.2f}%, "
        f"Valid: {100 * valid_acc:.2f}%, "
        f"Test: {100 * test_acc:.2f}%"
    )

    return logger


@th.no_grad()
def test(g, model, node_embed, y_true, device, split_idx):
    """Predict a class for every "paper" node and score each split.

    Parameters
    ----------
    g : graph object given to the DGL ``DataLoader``.
    model : module called as ``model(emb, blocks)``; its "paper" output is used.
    node_embed : embedding module handed to ``extract_embed``.
    y_true : tensor of "paper" labels.
    device : device used for sampling and the forward pass.
    split_idx : mapping with "train", "valid" and "test" entries, each holding
        the "paper" indices of that split.

    Returns
    -------
    tuple
        ``(train_acc, valid_acc, test_acc)`` as reported by the OGB evaluator.
    """
    model.eval()
    category = "paper"
    evaluator = Evaluator(name="ogbn-mag")

    #TODO: Fix memory issues  (VJAWA)
    # 2 GNN layers
    # Possible memory leak
    # sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2,  prefetch_node_feats={'paper':['feat']})
    sampler = dgl.dataloading.MultiLayerNeighborSampler([25, 20], prefetch_node_feats={'paper':['feat']})

    loader = dgl.dataloading.DataLoader(
        g,
        {"paper": th.arange(g.num_nodes("paper")).to(device)},
        sampler,
        batch_size=16384,
        shuffle=False,
        num_workers=0,
        device = device,
    )

    y_hats = []

    # The context manager closes the bar even if a batch raises.
    with tqdm(total=y_true.size(0)) as pbar:
        pbar.set_description("Inference")

        for input_nodes, seeds, blocks in loader:
            blocks = [blk.to(device) for blk in blocks]
            # we only predict the nodes with type "category"
            seeds = seeds[category]
            batch_size = seeds.shape[0]

            emb = extract_embed(node_embed, input_nodes)
            # Use the batch's raw "paper" features from the first block.
            feat = blocks[0].srcdata['feat']['paper']
            emb.update({"paper": feat})
            emb = {k: e.to(device) for k, e in emb.items()}

            logits = model(emb, blocks)[category]
            y_hat = logits.log_softmax(dim=-1).argmax(dim=1, keepdim=True)
            # Predictions are collected on the CPU.
            y_hats.append(y_hat.cpu())

            pbar.update(batch_size)

            # Drop per-batch references before the next iteration.
            del input_nodes, seeds, blocks
            del feat, emb

    y_pred = th.cat(y_hats, dim=0)
    y_pred_rows = y_pred.shape[0]

    # Only score the rows for which predictions exist, and keep the labels on
    # the CPU next to the predictions.
    y_true = th.unsqueeze(y_true[:y_pred_rows], 1).cpu()

    def _split_accuracy(split_name):
        # The split indices may live on the GPU; recent PyTorch versions refuse
        # to index a CPU tensor with them, so bring them to the CPU first.
        idx = th.as_tensor(split_idx[split_name][category]).cpu()
        return evaluator.eval(
            {
                "y_true": y_true[idx],
                "y_pred": y_pred[idx],
            }
        )["acc"]

    train_acc = _split_accuracy("train")
    valid_acc = _split_accuracy("valid")
    test_acc = _split_accuracy("test")

    return train_acc, valid_acc, test_acc

In [30]:
# Sanity check: after the `g = gs` rebinding, `g` should be the cuGraph-backed
# storage rather than a DGL graph.
type(g)

dgl.contrib.cugraph.cugraph_storage.CuGraphStorage

In [26]:
# Each run starts from freshly reset parameters and a new optimizer, so runs
# are independent of each other.
for run in range(nruns):
    embed_layer.reset_parameters()
    model.reset_parameters()

    # optimizer
    # A single Adam instance updates both the classifier and the embeddings.
    all_params = itertools.chain(
        model.parameters(), embed_layer.parameters()
    )
    optimizer = th.optim.Adam(all_params, lr=0.01)

    # Training and evaluation use the truncated splits, matching `train_loader`.
    logger = train(
        g,
        model,
        embed_layer,
        optimizer,
        train_loader,
        subset_split_idx,
        #TODO: Change to split_idx,
        labels,
        logger,
        device,
        run,
    )
    logger.print_statistics(run)
# Summary over all runs.
print("Final performance: ")
logger.print_statistics()

start training...


Epoch 00: 100%|██████████| 100000/100000 [00:05<00:00, 19603.93it/s]
Epoch 01: 100%|██████████| 100000/100000 [00:04<00:00, 20359.54it/s]
Epoch 02: 100%|██████████| 100000/100000 [00:05<00:00, 19754.76it/s]
Epoch 03: 100%|██████████| 100000/100000 [00:04<00:00, 20591.25it/s]
Epoch 04: 100%|██████████| 100000/100000 [00:04<00:00, 20061.01it/s]
Inference: 100%|██████████| 736389/736389 [00:16<00:00, 44165.42it/s]


Run: 01, Epoch: 05, Loss: 1.3529, Train: 83.33%, Valid: 30.59%, Test: 30.54%
Run 01:
Highest Train: 83.33
Highest Valid: 30.59
  Final Train: 83.33
Final Test: 30.54
start training...


Epoch 00: 100%|██████████| 100000/100000 [00:05<00:00, 19414.25it/s]
Epoch 01: 100%|██████████| 100000/100000 [00:05<00:00, 19177.87it/s]
Epoch 02: 100%|██████████| 100000/100000 [00:05<00:00, 19336.42it/s]
Epoch 03: 100%|██████████| 100000/100000 [00:04<00:00, 20605.24it/s]
Epoch 04: 100%|██████████| 100000/100000 [00:04<00:00, 20020.21it/s]
Inference: 100%|██████████| 736389/736389 [00:16<00:00, 44958.68it/s]

Run: 02, Epoch: 05, Loss: 1.4598, Train: 79.96%, Valid: 30.50%, Test: 30.73%
Run 02:
Highest Train: 79.96
Highest Valid: 30.50
  Final Train: 79.96
Final Test: 30.73
Final performance: 
All runs:
Highest Train: 81.64 ± 2.38
Highest Valid: 30.54 ± 0.06
  Final Train: 81.64 ± 2.38
   Final Test: 30.64 ± 0.13



