In [1]:
# Reload imported modules automatically before each cell is executed,
# so edits to the library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Provides the `%tensorboard` magic used later in this notebook.
%load_ext tensorboard

In [2]:
from random import random

from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
from argparse import Namespace
from torch import nn
from datetime import datetime

Using backend: pytorch


# Prepare parameters and options

The full list of available options can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer. The trainer script can be found in `SourceCodeTools/scripts/train.py`.

There are a lot of parameters. Ones that might be of interest are marked with `***`.

In [3]:
# Options for dataset creation and training. The same names are used as command
# line arguments by the CLI trainer (see `SourceCodeTools/models/training_options.py`).
# Options most likely to need adjusting are marked with `***`.
args = Namespace(

    # dataset parameters
    data_path="large_graph",             # *** path to the dataset directory (passed to SourceGraphDataset)
    model_output_dir="large_graph",      # *** directory to save checkpoints and training data
    use_node_types=False,                # node types currently not supported
    use_edge_types=True,                 # whether to use edge types
    filter_edges=None,                   # None or list of edge type names
    self_loops=False,                    # whether to use self loops
    train_frac=0.8,                      # *** fraction of nodes to use for training
    tokenizer="sentencepiece_bpe.model", # *** path to sentencepiece model
    random_seed=42,                      # random seed for splitting dataset into train, test, validation
    min_count_for_objectives=5,          # *** minimum frequency of targets
    no_global_edges=False,               # remove global edges
    remove_reverse=False,                # remove reverse edges
    custom_reverse=None,                 # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)

    # training parameters
    batch_size=1024,                     # *** batch size
    num_per_neigh=10,                    # number of dependencies to sample for each node
    neg_sampling_factor=1,               # *** number of negative samples for each positive sample
    epochs=10,                           # *** number of epochs
    node_emb_size=100,                   # *** dimensionality of node embeddings
    elem_emb_size=100,                   # *** dimensionality of target embeddings (for node name prediction)
    pretraining_phase=0,                 # number of epochs for pretraining
    use_layer_scheduling=False,          # currently not supported
    schedule_layers_every=1,             # currently not supported
    embedding_table_size=200000,         # *** embedding table size for subwords
    save_checkpoints=False,              # set to False if checkpoints are not needed
    save_each_epoch=False,               # save each epoch, useful in case of studying model behavior
    measure_scores=True,                 # *** measure ranking scores during evaluation
    dilate_scores=200,                   # downsampling factor for measuring scores to make evaluation faster
    objectives="node_clf",               # type of objective
    early_stopping=False,                # whether to do early stopping
    early_stopping_tolerance=1e-5,
    force_w2v_ns=True,                   # negative sampling strategy
    metric="inner_prod",                 # do not change for now
    nn_index="brute",                    # do not change for now
    gpu=-1,                              # gpuid
    # The remaining options are not described in this notebook;
    # see `SourceCodeTools/models/training_options.py` for their meaning.
    use_gcn_checkpoint=True,
    use_att_checkpoint=True,
    use_gru_checkpoint=True,
    restore_state=False,
    pretrained=None
)

# Parameters of the graph model itself (passed to `training_procedure` as `model_params`).
model_parameters = {
    'h_dim': args.node_emb_size,         # *** must match node embedding dimensionality, so it is derived from it
    'num_bases': 10,                     # number of bases for computing parameter weights for different edge types
    'dropout': 0.2,                      # ***
    'use_self_loop': True,               #
    'activation': torch.tanh,            # ***
    'lr': 1e-3                           # ***
}

# Create Dataset

In [4]:
# Build the graph dataset from the options collected in `args`.
# Keyword arguments are grouped by purpose; all values come straight from `args`.
dataset = SourceGraphDataset(
    args.data_path,
    # tokenizer and target filtering
    tokenizer_path=args.tokenizer,         # path to sentencepiece tokenizer
    min_count_for_objectives=args.min_count_for_objectives,
    # train / test / validation split
    train_frac=args.train_frac,
    random_seed=args.random_seed,
    # node and edge types
    use_node_types=args.use_node_types,
    use_edge_types=args.use_edge_types,
    filter=args.filter_edges,
    # graph structure adjustments
    self_loops=args.self_loops,
    no_global_edges=args.no_global_edges,
    remove_reverse=args.remove_reverse,
    custom_reverse=args.custom_reverse,
)



# Declare target loading function (labels)

In [5]:
def load_type_prediction():
    """Load type-annotation labels for the nodes of the current dataset.

    Reads ``type_annotations.json.bz2`` from the directory ``args.data_path``,
    keeps only the rows whose ``src`` is the id of a node in ``dataset.nodes``,
    reduces every annotation in ``dst`` to a bare type name and passes the
    result through ``filter_dst_by_freq`` with ``args.min_count_for_objectives``.

    Takes no arguments so that it can be handed to an objective as a
    label-loading callback.

    Returns:
        A pandas DataFrame with exactly two columns: ``src`` and ``dst``.
    """
    from os.path import join

    def normalize_type_name(annotation):
        # Drop surrounding quotes, then generic parameters ("List[int]" -> "List"),
        # then the module qualification ("typing.List" -> "List").
        return annotation.strip("\"").strip("'").split("[")[0].split(".")[-1]

    # Follow the configured dataset location instead of a hard-coded directory,
    # so labels always come from the same graph the dataset was built from.
    annotations_path = join(args.data_path, "type_annotations.json.bz2")

    type_ann = unpersist(annotations_path).query(
        "src in @node_ids", local_dict={"node_ids": dataset.nodes["id"]}
    )

    # `assign` builds a new frame; writing a column into the result of `query`
    # can trigger pandas' SettingWithCopyWarning.
    type_ann = type_ann.assign(dst=type_ann["dst"].apply(normalize_type_name))
    type_ann = filter_dst_by_freq(type_ann, args.min_count_for_objectives)

    return type_ann[["src", "dst"]]

# Define objectives

Currently, objectives are available for node classification (`NodeClassifierObjective`) and for name-based node embedding training (`SubwordEmbedderObjective`).

![](examples/figures/img1.png)


One or several objectives could be used

In [6]:
class Trainer(SamplingMultitaskTrainer):
    """Trainer configured with a single node type classification objective."""

    def create_objectives(self, dataset, tokenizer_path):
        """Populate ``self.objectives`` with the objectives to train on.

        Only the "NodeTypeClassifier" objective is active. A second objective
        ("TypeAnnPrediction") is kept below as a commented-out example of how
        further objectives can be added.
        """
        self.objectives = nn.ModuleList()
        trainer_params = self.trainer_params

        node_type_objective = NodeClassifierObjective(
            "NodeTypeClassifier",
            self.graph_model, self.node_embedder, dataset.nodes,
            dataset.load_node_classes,                  # label loader provided by the dataset
            self.device, self.sampling_neighbourhood_size, self.batch_size,
            tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size,
            masker=dataset.create_node_clf_masker(),    # only needed for node type classification
            measure_scores=trainer_params["measure_scores"],
            dilate_scores=trainer_params["dilate_scores"]
        )
        self.objectives.append(node_type_objective)

        # Example of an additional objective (disabled): type annotation prediction
        # with labels loaded by the user-defined function `load_type_prediction`.
        #
        # self.objectives.append(
        #     NodeClassifierObjective(
        #         "TypeAnnPrediction",
        #         self.graph_model, self.node_embedder, dataset.nodes,
        #         load_type_prediction,                   # user-defined label loader
        #         self.device, self.sampling_neighbourhood_size, self.batch_size,
        #         tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size,
        #         masker=None,                            # masker is not needed here
        #         measure_scores=self.trainer_params["measure_scores"],
        #         dilate_scores=self.trainer_params["dilate_scores"]
        #     )
        # )

In [7]:
# Watch the output directory configured in `args.model_output_dir` rather than a
# hard-coded copy of its value (IPython expands `{...}` in line magics).
# NOTE(review): assumes training logs are written under `model_output_dir` - confirm.
%tensorboard --logdir "{args.model_output_dir}"

In [None]:
# Name the run after the model and the current timestamp, then resolve the
# base path for its outputs from the options in `args`.
run_name = get_name(RGGAN, str(datetime.now()))
model_base_path = get_model_base(args, run_name)

# Start training with the custom `Trainer` defined above.
training_procedure(
    dataset,
    model_name=RGGAN,
    model_params=model_parameters,
    args=args,
    model_base_path=model_base_path,
    trainer=Trainer,
)

Number of nodes 324218


Epoch 0: 100%|████████████████████████████████| 243/243 [01:56<00:00,  2.09it/s]
Epoch 1:   0%|▏                                 | 1/243 [00:00<00:43,  5.54it/s]

Epoch: 0, Time: 130 s
{'Accuracy/test/NodeTypeClassifier_': 0.3872578845046083,
 'Accuracy/train/NodeTypeClassifier_': 0.4505928853754941,
 'Accuracy/train_avg/NodeTypeClassifier': 0.24216100467232712,
 'Accuracy/val/NodeTypeClassifier_': 0.3865718884881872,
 'Loss/test/NodeTypeClassifier_': 2.430450024143342,
 'Loss/train/NodeTypeClassifier_': 1.7834914922714233,
 'Loss/train_avg/NodeTypeClassifier': 2.880048046877355,
 'Loss/val/NodeTypeClassifier_': 2.4204529446940266,
 'acc@1/test/NodeTypeClassifier_': 0.3330078125,
 'acc@1/val/NodeTypeClassifier_': 0.3564453125,
 'acc@10/test/NodeTypeClassifier_': 0.685546875,
 'acc@10/val/NodeTypeClassifier_': 0.7041015625,
 'acc@3/test/NodeTypeClassifier_': 0.52734375,
 'acc@3/val/NodeTypeClassifier_': 0.5419921875,
 'acc@5/test/NodeTypeClassifier_': 0.5849609375,
 'acc@5/val/NodeTypeClassifier_': 0.60546875,
 'ndcg@1/test/NodeTypeClassifier_': 0.3330078125,
 'ndcg@1/val/NodeTypeClassifier_': 0.3564453125,
 'ndcg@10/test/NodeTypeClassifier_': 0.

Epoch 1: 100%|████████████████████████████████| 243/243 [02:22<00:00,  1.71it/s]
Epoch 2:   0%|▏                                 | 1/243 [00:00<00:42,  5.70it/s]

Epoch: 1, Time: 155 s
{'Accuracy/test/NodeTypeClassifier_': 0.6683143721198157,
 'Accuracy/train/NodeTypeClassifier_': 0.6719367588932806,
 'Accuracy/train_avg/NodeTypeClassifier': 0.4488894645427707,
 'Accuracy/val/NodeTypeClassifier_': 0.6736592268665758,
 'Loss/test/NodeTypeClassifier_': 1.2975636951385006,
 'Loss/train/NodeTypeClassifier_': 1.2128385305404663,
 'Loss/train_avg/NodeTypeClassifier': 2.06057782992414,
 'Loss/val/NodeTypeClassifier_': 1.2764978331904258,
 'acc@1/test/NodeTypeClassifier_': 0.6044921875,
 'acc@1/val/NodeTypeClassifier_': 0.6171875,
 'acc@10/test/NodeTypeClassifier_': 0.7880859375,
 'acc@10/val/NodeTypeClassifier_': 0.7939453125,
 'acc@3/test/NodeTypeClassifier_': 0.689453125,
 'acc@3/val/NodeTypeClassifier_': 0.7099609375,
 'acc@5/test/NodeTypeClassifier_': 0.712890625,
 'acc@5/val/NodeTypeClassifier_': 0.7294921875,
 'ndcg@1/test/NodeTypeClassifier_': 0.6044921875,
 'ndcg@1/val/NodeTypeClassifier_': 0.6171875,
 'ndcg@10/test/NodeTypeClassifier_': 0.6880

Epoch 2:  16%|█████▎                           | 39/243 [00:15<01:29,  2.28it/s]