In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [1]:
from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import \
    SubgraphClassifierObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.code.data.file_utils import unpersist

from torch import nn
from datetime import datetime
from os.path import join

Using backend: pytorch


# Prepare parameters and options

Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are ment to be used as arguments for cli trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.

For the task of subgraph classification the important options are:
- `subgraph_partition` is path to subgraph-based train/val/test sets. Storead as Dataframe with subgraph id and partition mask
- `subgraph_id_column` is a column is `common_edges` file that stores subgraph id.
- For variable misuse task (same will apply to authorship attribution) subgraphs are created for individual functions (files for SCAA). The label is stored in `common_filecontent`.

In [2]:
tokenizer_path = "sentencepiece_bpe.model"

data_path = "variable_misuse_graph_2_percent_balanced/with_ast"
subgraph_partition = join(data_path, "partition.json.bz2")
filecontent_path = join(data_path, "common_filecontent.json.bz2")

In [3]:
unpersist(subgraph_partition)

Unnamed: 0,id,train_mask,val_mask,test_mask
0,9237,True,False,False
1,14573,False,True,False
2,16495,False,False,True
3,19094,False,False,True
4,5945,True,False,False
...,...,...,...,...
22909,11964,True,False,False
22910,21575,False,False,True
22911,5390,True,False,False
22912,860,True,False,False


In [4]:
unpersist(join(data_path, "common_edges.json.bz2"), nrows=10)

Unnamed: 0,id,type,source_node_id,target_node_id,file_id,mentioned_in,offset_start,offset_end
0,0,subword,933540,599621,9237,,,
1,1,arg,599621,211512,9237,263126.0,,
2,2,arg_rev,211512,599621,9237,263126.0,,
3,3,args,211512,238149,9237,263126.0,35.0,39.0
4,4,args_rev,238149,211512,9237,263126.0,,
...,...,...,...,...,...,...,...,...
345,3643493,subword,557063,450612,1060,,,
346,3643494,subword,301286,450612,1060,,,
347,3643495,targets,450612,490773,1060,220570.0,2849.0,2863.0
348,3643496,targets_rev,490773,450612,1060,220570.0,,


In [5]:
config = get_config(
    # tokenizer
    tokenizer_path=tokenizer_path, # *** path to sentencepiece model

    # dataset parameters
    data_path=data_path,             # *** path to node type
    use_node_types=False,                # node types currently not supported
    use_edge_types=True,                 # whether to use edge types
    filter_edges=None,                   # None or list of edge type names
    self_loops=False,                    # whether to use self loops
    train_frac=0.8,                      # *** fraction of nodes to use for training
    random_seed=42,                      # random seed for splitting dataset int o train test validation
    min_count_for_objectives=5,          # *** minimum frequency of targets
    no_global_edges=False,               # remove global edges
    remove_reverse=False,                # remove reverse edges
    custom_reverse=None,                 # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)
    partition=subgraph_partition,  # partition into train/test/val

    # training parameters
    model_output_dir=data_path,      # *** directory to save checkpoints and training data
    batch_size=8,                     # ***
    sampling_neighbourhood_size=10,      # number of dependencies to sample for each node
    neg_sampling_factor=1,               # *** number of negative samples for each positive sample
    epochs=10,                           # *** number of epochs
    elem_emb_size=100,                   # *** dimensionality of target embeddings (for node name prediction)
    pretraining_phase=0,                 # number of epochs for pretraining
    embedding_table_size=200000,         # *** embedding table size for subwords
    save_checkpoints=False,              # set to False if checkpoints are not needed
    save_each_epoch=False,               # save each epoch, useful in case of studying model behavior
    measure_scores=True,                 # *** measure ranking scores during evaluation
    dilate_scores=200,                   # downsampling factor for measuring scores to make evaluation faster
    objectives="subgraph_clf",               # type of objective
    force_w2v_ns=True,                   # negative sampling strategy
    gpu=-1,                              # gpuid
    restore_state=False,
    pretrained=None,

    # model parameters
    node_emb_size=100,                   # *** dimensionality of node embeddings
    h_dim=100,                           # *** should match to node dimensionality
    num_bases=10,                        # number of bases for computing parmetwer weights for different edge types
    dropout=0.2,                         # ***
    use_self_loop=True,                  #
    activation="tanh",                   # ***
    learning_rate=1e-3,                  # ***
    use_gcn_checkpoint=True,
    use_att_checkpoint=True,
    use_gru_checkpoint=True
)

In [6]:
# save_config(config, "var_misuse_tiny.yaml")

In [7]:
config

{'DATASET': {'data_path': 'variable_misuse_graph_2_percent_balanced/with_ast',
  'train_frac': 0.8,
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'self_loops': False,
  'use_node_types': False,
  'use_edge_types': True,
  'no_global_edges': False,
  'remove_reverse': False,
  'custom_reverse': None,
  'restricted_id_pool': None,
  'random_seed': 42,
  'subgraph_id_column': 'mentioned_in',
  'subgraph_partition': None,
  'partition': 'variable_misuse_graph_2_percent_balanced/with_ast/partition.json.bz2'},
 'TRAINING': {'model': 'RGCN',
  'model_output_dir': 'variable_misuse_graph_2_percent_balanced/with_ast',
  'pretrained': None,
  'pretraining_phase': 0,
  'sampling_neighbourhood_size': 10,
  'neg_sampling_factor': 1,
  'use_layer_scheduling': False,
  'schedule_layers_every': 10,
  'elem_emb_size': 100,
  'embedding_table_size': 200000,
  'epochs': 10,
  'batch_size': 8,
  'learning_rate': 0.001,
  'objectives': 'subgraph_clf',
  'save_each_epoch': False,
  'save_checkpo

# Create Dataset

In [8]:
dataset = SourceGraphDataset(
    **{**config["DATASET"], **config["TOKENIZER"]}
)
ntypes, etypes = dataset.get_graph_types()
config["TRAINING"]['ntypes'] = ntypes
config["TRAINING"]['etypes'] = etypes

# Declare target loading function (labels)

In [9]:
def load_labels():
    filecontent = unpersist(filecontent_path)
    return filecontent[["id", "label"]].rename({"id": "src", "label": "dst"}, axis=1)

# Declare objective and train

In [10]:
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import ClassifierTargetMapper


class Trainer(SamplingMultitaskTrainer):
    def create_objectives(self, dataset, tokenizer_path):
        self.objectives = nn.ModuleList()

        self.objectives.append(
            self._create_subgraph_objective(
                objective_name="VariableMisuseSubgraphClassifierObjective",
                objective_class=SubgraphClassifierObjective,
                dataset=dataset,
                tokenizer_path=tokenizer_path,
                labels_fn=load_labels,
                label_loader_class=ClassifierTargetMapper,
                label_loader_params={"emb_size": None, "tokenizer_path": None, "use_ns_groups": False}
            )
        )

# Run training

In [11]:
# %tensorboard --logdir data_path

In [None]:
training_procedure(
    dataset,
    model_name=RGGAN,
    model_params=config["MODEL"],
    trainer_params=config["TRAINING"],
    model_base_path=get_model_base(config["TRAINING"], get_name(RGGAN, str(datetime.now()))),
    trainer=Trainer
)