In [1]:
# Reload edited project modules automatically, so changes to SourceCodeTools
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Enable the `%tensorboard` magic used further down in the notebook.
%load_ext tensorboard

In [2]:
from random import random

from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
from argparse import Namespace
from torch import nn
from datetime import datetime

Using backend: pytorch


# Prepare parameters and options

The full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer. The trainer script can be found in `SourceCodeTools/scripts/train.py`.

There are a lot of parameters. Ones that might be of interest are marked with `***`.

In [5]:
# Read the dataset / model / training options from the YAML file next to the notebook.
config = load_config("config.yaml")

In [6]:
# Show the loaded options (sections: DATASET, MODEL, TRAINING).
config

{'DATASET': {'custom_reverse': None,
  'data_path': 'large_graph',
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'no_global_edges': False,
  'random_seed': 42,
  'remove_reverse': False,
  'restricted_id_pool': None,
  'self_loops': False,
  'train_frac': 0.8,
  'use_edge_types': True,
  'use_node_types': False},
 'MODEL': {'activation': 'tanh',
  'dropout': 0.2,
  'h_dim': 100,
  'n_layers': 5,
  'node_emb_size': 100,
  'num_bases': 10,
  'use_att_checkpoint': True,
  'use_gcn_checkpoint': True,
  'use_gru_checkpoint': True,
  'use_self_loop': True},
 'TRAINING': {'batch_size': 1024,
  'dilate_scores': 200,
  'early_stopping': False,
  'early_stopping_tolerance': 20,
  'elem_emb_size': 100,
  'embedding_table_size': 200000,
  'epochs': 10,
  'external_dataset': None,
  'force_w2v_ns': True,
  'gpu': -1,
  'learning_rate': 0.001,
  'measure_scores': True,
  'metric': 'inner_prod',
  'model_output_dir': 'large_graph',
  'neg_sampling_factor': 1,
  'nn_index': 'brute',
  'ob

# Create Dataset

In [7]:
# The DATASET section of the config is forwarded as keyword arguments.
dataset = SourceGraphDataset(**config["DATASET"])



# Declare target loading function (labels)

In [8]:
def load_type_prediction(type_ann_path=None):
    """Load the labels for the type annotation prediction objective.

    Reads the type annotation table, keeps only the rows whose `src` is the id
    of a node in `dataset.nodes`, reduces every annotation in `dst` to a short
    type name, and filters the labels with `filter_dst_by_freq` using
    `config["DATASET"]["min_count_for_objectives"]`.

    Relies on the notebook-level globals `dataset` and `config`.

    Parameters
    ----------
    type_ann_path : str, optional
        Location of the annotation file. When omitted, the file
        `type_annotations.json.bz2` inside `config["DATASET"]["data_path"]`
        is used, so the labels follow the dataset location instead of a
        hard-coded directory.

    Returns
    -------
    The filtered annotation table restricted to the columns `src` and `dst`
    (presumably a pandas DataFrame, as returned by `unpersist`).
    """
    from os.path import join

    if type_ann_path is None:
        type_ann_path = join(config["DATASET"]["data_path"], "type_annotations.json.bz2")

    def normalize_type_name(annotation):
        # Drop surrounding quotes, generic parameters and the module prefix,
        # e.g. '"typing.List[int]"' -> 'List'.
        return annotation.strip("\"").strip("'").split("[")[0].split(".")[-1]

    node_ids = dataset.nodes["id"]

    type_ann = unpersist(type_ann_path)
    # Keep only annotations attached to nodes that are present in the dataset.
    type_ann = type_ann.loc[type_ann["src"].isin(node_ids)]
    # `assign` builds a new table instead of writing into a filtered selection,
    # which avoids pandas' SettingWithCopyWarning.
    type_ann = type_ann.assign(dst=type_ann["dst"].apply(normalize_type_name))

    type_ann = filter_dst_by_freq(type_ann, config["DATASET"]["min_count_for_objectives"])
    return type_ann[["src", "dst"]]

# Define objectives

Currently, objectives are available for node classification (`NodeClassifierObjective`) and for name-based node embedding training (`SubwordEmbedderObjective`).

![](examples/figures/img1.png)


One or several objectives could be used

In [9]:
class Trainer(SamplingMultitaskTrainer):
    """Multitask trainer configured with a single objective, `TypeAnnPrediction`."""

    def create_objectives(self, dataset, tokenizer_path):
        """Populate `self.objectives` with the objectives to train.

        :param dataset: dataset object; its `nodes` attribute is handed to the objective
        :param tokenizer_path: forwarded to the objective as `tokenizer_path`
        """
        self.objectives = nn.ModuleList()

        # Optional second objective (node type classification), kept disabled.
        # It needs a label loading function and, unlike the objective below, a masker:
        #
        # self.objectives.append(
        #     NodeClassifierObjective(
        #         "NodeTypeClassifier",
        #         self.graph_model, self.node_embedder, dataset.nodes,
        #         dataset.load_node_classes,                  # need to define this function
        #         self.device, self.sampling_neighbourhood_size, self.batch_size,
        #         tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size,
        #         masker=dataset.create_node_clf_masker(),    # needed only for node type classification
        #         measure_scores=self.trainer_params["measure_scores"],
        #         dilate_scores=self.trainer_params["dilate_scores"]
        #     )
        # )

        type_ann_objective = NodeClassifierObjective(
            "TypeAnnPrediction",
            self.graph_model,
            self.node_embedder,
            dataset.nodes,
            load_type_prediction,  # label loader declared earlier in the notebook
            self.device,
            self.sampling_neighbourhood_size,
            self.batch_size,
            tokenizer_path=tokenizer_path,
            target_emb_size=self.elem_emb_size,
            masker=None,  # no masker is needed for this objective
            measure_scores=self.trainer_params["measure_scores"],
            dilate_scores=self.trainer_params["dilate_scores"],
        )
        self.objectives.append(type_ann_objective)

In [10]:
# Open TensorBoard inside the notebook, reading logs from the "large_graph" directory.
%tensorboard --logdir "large_graph"

ERROR: Failed to launch TensorBoard (exited with -9).

In [None]:
# The run name is built by `get_name` from the model class and the current
# timestamp; `get_model_base` turns it into the `model_base_path` argument.
run_name = get_name(RGGAN, str(datetime.now()))
model_base_path = get_model_base(config["TRAINING"], run_name)

training_procedure(
    dataset,
    model_name=RGGAN,
    model_params=config["MODEL"],
    trainer_params=config["TRAINING"],
    model_base_path=model_base_path,
    trainer=Trainer,
)

Number of nodes 324218


