In [1]:
# Load the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the TensorBoard extension; it provides the `%tensorboard` magic used further down.
%load_ext tensorboard

In [2]:
from random import random

from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \
    SubgraphClassifierObjective, SubgraphEmbeddingObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
from argparse import Namespace
from torch import nn
from datetime import datetime
from os.path import join
from functools import partial

Using backend: pytorch


# Prepare parameters and options

The full list of available options can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer; the trainer script can be found in `SourceCodeTools/scripts/train.py`.

There are a lot of parameters. Ones that might be of interest are marked with `***`.

In [3]:
# Path to the sentencepiece tokenizer model.
tokenizer_path = "sentencepiece_bpe.model"

# Dataset directory; the three files below are located inside it.
data_path = "variable_misuse_graph_2_percent_misuse_edges"
# NOTE(review): not referenced by any other visible cell — confirm whether it
# should be passed to the dataset configuration.
subgraph_partition = join(data_path, "partition.json.bz2")
# Read by `load_edge_prediction` to obtain the (src, dst) pairs.
edges_to_predict = join(data_path, "misuse_edges.json.bz2")
# NOTE(review): not referenced by any other visible cell.
filecontent_path = join(data_path, "common_filecontent.json.bz2")

In [4]:
# Assemble the training configuration. Parameters that are most likely to be
# of interest are marked with `***`.
config = get_config(
    # tokenizer
    tokenizer_path=tokenizer_path,       # *** path to sentencepiece model (defined in the paths cell)

    # dataset parameters
    data_path=data_path,                 # *** path to dataset
    use_node_types=False,                # node types currently not supported
    use_edge_types=True,                 # whether to use edge types
    filter_edges=None,                   # None or list of edge type names
    self_loops=False,                    # whether to use self loops
    train_frac=0.8,                      # *** fraction of nodes to use for training
    random_seed=42,                      # random seed for splitting dataset into train test validation
    min_count_for_objectives=5,          # *** minimum frequency of targets
    no_global_edges=False,               # remove global edges
    remove_reverse=False,                # remove reverse edges
    custom_reverse=None,                 # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)

    # training parameters
    model_output_dir=data_path,          # *** directory to save checkpoints and training data
    batch_size=128,                      # ***
    sampling_neighbourhood_size=10,      # number of dependencies to sample for each node
    neg_sampling_factor=1,               # *** number of negative samples for each positive sample
    epochs=10,                           # *** number of epochs
    elem_emb_size=100,                   # *** dimensionality of target embeddings (for node name prediction)
    pretraining_phase=0,                 # number of epochs for pretraining
    embedding_table_size=200000,         # *** embedding table size for subwords
    save_checkpoints=False,              # set to False if checkpoints are not needed
    save_each_epoch=False,               # save each epoch, useful in case of studying model behavior
    measure_scores=True,                 # *** measure ranking scores during evaluation
    dilate_scores=200,                   # downsampling factor for measuring scores to make evaluation faster
    objectives="node_clf",               # type of objective
    force_w2v_ns=True,                   # negative sampling strategy
    gpu=-1,                              # gpuid
    restore_state=False,
    pretrained=None,

    # model parameters
    node_emb_size=100,                   # *** dimensionality of node embeddings
    h_dim=100,                           # *** should match to node dimensionality
    num_bases=10,                        # number of bases for computing parameter weights for different edge types
    dropout=0.2,                         # ***
    use_self_loop=True,                  #
    activation="tanh",                   # ***
    learning_rate=1e-3,                  # ***
    use_gcn_checkpoint=True,
    use_att_checkpoint=True,
    use_gru_checkpoint=True
)

In [5]:
# save_config(config, "config_var_misuse_edge_pred.yaml")

In [6]:
# config = load_config("config_var_misuse_edge_pred.yaml")

In [7]:
config

{'DATASET': {'data_path': 'variable_misuse_graph_2_percent_misuse_edges',
  'train_frac': 0.8,
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'self_loops': False,
  'use_node_types': False,
  'use_edge_types': True,
  'no_global_edges': False,
  'remove_reverse': False,
  'custom_reverse': None,
  'restricted_id_pool': None,
  'random_seed': 42,
  'subgraph_id_column': 'mentioned_in',
  'subgraph_partition': None},
 'TRAINING': {'model_output_dir': 'variable_misuse_graph_2_percent_misuse_edges',
  'pretrained': None,
  'pretraining_phase': 0,
  'sampling_neighbourhood_size': 10,
  'neg_sampling_factor': 1,
  'use_layer_scheduling': False,
  'schedule_layers_every': 10,
  'elem_emb_size': 100,
  'embedding_table_size': 200000,
  'epochs': 10,
  'batch_size': 128,
  'learning_rate': 0.001,
  'objectives': 'node_clf',
  'save_each_epoch': False,
  'save_checkpoints': False,
  'early_stopping': False,
  'early_stopping_tolerance': 20,
  'force_w2v_ns': True,
  'use_ns_groups': 

# Create Dataset

In [11]:
# Build the dataset from the DATASET options combined with the TOKENIZER
# options; if a key appears in both, the TOKENIZER value is the one used.
dataset = SourceGraphDataset(**dict(config["DATASET"], **config["TOKENIZER"]))



# Declare target loading function (labels)

In [12]:
def load_edge_prediction():
    """Load the edges that are used as prediction targets.

    Reads the table stored at the module-level path `edges_to_predict` with
    `unpersist` and keeps only the endpoint columns.

    Returns:
        The loaded table restricted to its "src" and "dst" columns.
    """
    misuse_edges = unpersist(edges_to_predict)
    return misuse_edges[["src", "dst"]]

In [13]:
load_edge_prediction()

Unnamed: 0,src,dst
0,103431,42742
1,60434,57118
2,121179,104211
3,180808,268065
4,142647,218764
...,...,...
5608,175993,88220
5609,129356,47702
5610,250001,114541
5611,12201,147130


# Define objectives

Currently, objectives are available for node classification (`EdgePrediction`) and for name-based node embedding training (`SubwordEmbedderObjective`).

![](examples/figures/img1.png)


One or several objectives could be used

In [16]:
from SourceCodeTools.models.graph.train.objectives.GraphLinkObjective import GraphLinkObjective

class Trainer(SamplingMultitaskTrainer):
    """Multitask trainer that registers a single `GraphLinkObjective`."""

    def create_objectives(self, dataset, tokenizer_path):
        """Reset `self.objectives` and register the "VarMisuseEdgePred" objective.

        Args:
            dataset: object whose `nodes` attribute is handed to the objective.
            tokenizer_path: forwarded to the objective as `tokenizer_path`.
        """
        self.objectives = nn.ModuleList()

        link_objective = GraphLinkObjective(
            "VarMisuseEdgePred",
            self.graph_model,
            self.node_embedder,
            dataset.nodes,
            load_edge_prediction,  # target loader defined in this notebook
            self.device,
            self.sampling_neighbourhood_size,
            self.batch_size,
            tokenizer_path=tokenizer_path,
            target_emb_size=self.elem_emb_size,
            masker=None,  # no masker is used for this objective
            measure_scores=self.trainer_params["measure_scores"],
            dilate_scores=self.trainer_params["dilate_scores"],
        )
        self.objectives.append(link_objective)

In [10]:
# NOTE(review): the log directory "large_graph" differs from `model_output_dir`
# set in the config (the dataset directory) — confirm it points at the training logs.
%tensorboard --logdir "large_graph"

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.

In [18]:
# training_procedure(
#     dataset, 
#     model_name=RGGAN, 
#     model_params=config["MODEL"],
#     trainer_params=config["TRAINING"],
#     model_base_path=get_model_base(config["TRAINING"], get_name(RGGAN, str(datetime.now()))),
#     tokenizer_path=config["TOKENIZER"]["tokenizer_path"],
#     trainer=Trainer
# )

# target is currently sampled incorrectly