# Variable misuse detection

VarMisuse at node level

In [1]:
#!g1.1
# autoreload mode 2: project modules edited on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Registers the `%tensorboard` magic for this notebook.
%load_ext tensorboard

In [2]:
#!g1.1
# %pip install dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html

In [3]:
#!g1.1
from random import random

from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
from torch import nn
from datetime import datetime

# Prepare parameters and options

The full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer. The trainer script can be found in `SourceCodeTools/scripts/train.py`.

There are a lot of parameters. Ones that might be of interest are marked with `***`.

In [4]:
#!g1.1
# Build the experiment configuration. Options marked with `***` are the ones
# most likely to need adjusting for a new experiment.
config = get_config(
    # tokenizer
    tokenizer_path="sentencepiece_bpe.model", # *** path to sentencepiece model
    
    # dataset parameters
    data_path="v2",             # *** path to the dataset (presumably the dataset directory; original note said "path to node type" — TODO confirm)
    use_node_types=False,                # node types currently not supported
    use_edge_types=False,                 # whether to use edge types
    filter_edges=None,                   # None or list of edge type names
    self_loops=False,                    # whether to use self loops
    train_frac=0.8,                      # *** fraction of nodes to use for training
    random_seed=42,                      # random seed for splitting dataset into train/test/validation
    min_count_for_objectives=5,          # *** minimum frequency of targets
    no_global_edges=False,               # remove global edges
    remove_reverse=False,                # remove reverse edges
    custom_reverse=None,                 # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)
    
    # training parameters
    model_output_dir="v2",      # *** directory to save checkpoints and training data
    batch_size=256,                     # *** 
    sampling_neighbourhood_size=10,      # number of dependencies to sample for each node
    neg_sampling_factor=1,               # *** number of negative samples for each positive sample
    epochs=2,                           # *** number of epochs
    elem_emb_size=30,                   # *** dimensionality of target embeddings (for node name prediction)
    pretraining_phase=0,                 # number of epochs for pretraining
    embedding_table_size=200000,         # *** embedding table size for subwords
    save_checkpoints=False,              # set to False if checkpoints are not needed
    save_each_epoch=False,               # save each epoch, useful in case of studying model behavior
    measure_scores=True,                 # *** measure ranking scores during evaluation
    dilate_scores=1,                   # downsampling factor for measuring scores to make evaluation faster
    objectives="node_clf",               # type of objective
    force_w2v_ns=True,                   # negative sampling strategy
    gpu=0,                              # gpuid
    restore_state=False,
    pretrained=None,
    
    # model parameters
    node_emb_size=30,                   # *** dimensionality of node embeddings
    h_dim=30,                           # *** should match the node embedding dimensionality
    n_layers=3,
    num_bases=10,                        # number of bases for computing parameter weights for different edge types
    dropout=0.3,                         # *** 
    use_self_loop=True,                  #
    activation="tanh",                   # *** 
    learning_rate=3e-3,                  # *** 
    use_gcn_checkpoint=True,
    use_att_checkpoint=True,
    use_gru_checkpoint=True
)

In [5]:
#!g1.1
# Write the configuration to YAML and read it straight back, so the rest of
# the notebook uses exactly what was stored in `config.yaml`.
save_config(config, "config.yaml")
config = load_config("config.yaml")
# Last expression: display the loaded configuration as the cell output.
config

{'DATASET': {'custom_reverse': None,
  'data_path': 'v2',
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'no_global_edges': False,
  'random_seed': 42,
  'remove_reverse': False,
  'restricted_id_pool': None,
  'self_loops': False,
  'subgraph_id_column': 'mentioned_in',
  'subgraph_partition': None,
  'train_frac': 0.8,
  'use_edge_types': False,
  'use_node_types': False},
 'MODEL': {'activation': 'tanh',
  'dropout': 0.3,
  'h_dim': 30,
  'n_layers': 3,
  'node_emb_size': 30,
  'num_bases': 10,
  'use_att_checkpoint': True,
  'use_gcn_checkpoint': True,
  'use_gru_checkpoint': True,
  'use_self_loop': True},
 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'},
 'TRAINING': {'batch_size': 256,
  'dilate_scores': 1,
  'early_stopping': False,
  'early_stopping_tolerance': 20,
  'elem_emb_size': 30,
  'embedding_table_size': 200000,
  'epochs': 2,
  'external_dataset': None,
  'force_w2v_ns': True,
  'gpu': 0,
  'learning_rate': 0.003,
  'measure_scores': True,
  'm

# Create Dataset

In [6]:
#!g1.1
# The dataset is constructed from the DATASET options plus the TOKENIZER
# options; on a key clash the TOKENIZER value would win.
dataset = SourceGraphDataset(**dict(config["DATASET"], **config["TOKENIZER"]))



# Declare target loading function (labels)

In [7]:
#!g1.1
# Unfiltered variable-misuse label table; its `dst` column is inspected in the next cell.
l = unpersist("v2/misuse_labels.json.bz2")

In [8]:
#!g1.1
# Class balance of the labels. `np.unique` returns the distinct values in
# sorted order, so with the two labels seen in this notebook ("correct",
# "misused") index 1 is presumably "misused" — i.e. the displayed number is
# the fraction of misused variables. Verify if the label set ever changes.
val, counts = np.unique(l.dst, return_counts=True)
counts[1]/counts.sum()

0.24393915469235222

In [9]:
#!g1.1
def load_var_misuse():
    """Load the variable-misuse labels restricted to nodes of the current dataset.

    Reads the label table from ``v2/misuse_labels.json.bz2`` and keeps only the
    rows whose ``src`` value occurs among the ``id`` values of the notebook-level
    ``dataset.nodes`` table.

    Returns:
        The filtered label table; surviving rows keep their original index.
    """
    labels = unpersist("v2/misuse_labels.json.bz2")

    # Keep only labels whose source node is present in the dataset's node table.
    known_node_ids = dataset.nodes["id"]
    return labels[labels["src"].isin(known_node_ids)]

In [10]:
#!g1.1
# Preview the table returned by the label-loading callback.
load_var_misuse()

Unnamed: 0,src,dst
0,877853,correct
1,40094,correct
2,587425,correct
3,599966,correct
4,940591,misused
...,...,...
94990,557629,correct
94991,153025,correct
94992,521666,misused
94993,946785,correct


# Define objectives

Currently, the available objectives are node classification (`NodeClassifierObjective`) and name-based node embedding training (`SubwordEmbedderObjective`).

![](./figures/img1.png)


One or several objectives can be used.

In [23]:
#!g1.1
class Trainer(SamplingMultitaskTrainer):
    """Trainer configured with a single objective: node-level "VarMisuse" classification."""

    def create_objectives(self, dataset, tokenizer_path):
        """Fill ``self.objectives`` with one ``NodeClassifierObjective``.

        ``dataset`` supplies the node table (``dataset.nodes``) and
        ``tokenizer_path`` is forwarded to the objective unchanged.
        """
        self.objectives = nn.ModuleList()

        var_misuse_objective = NodeClassifierObjective(
            "VarMisuse",
            self.graph_model,
            self.node_embedder,
            dataset.nodes,
            load_var_misuse,  # label-loading function defined in this notebook
            self.device,
            self.sampling_neighbourhood_size,
            self.batch_size,
            tokenizer_path=tokenizer_path,
            target_emb_size=self.elem_emb_size,
            masker=None,  # no masker is used for this objective
            measure_scores=False,  # fixed here; the config's `measure_scores` is not consulted
            dilate_scores=self.trainer_params["dilate_scores"],
        )
        self.objectives.append(var_misuse_objective)

In [24]:
#!g1.1
# %tensorboard --logdir 'large_graph'

In [25]:
#!g1.1
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective

In [26]:
#!g1.1
# Train an RGGAN model using the custom `Trainer` class defined above in this notebook.
# The model base path is derived from the training options and a run name
# built from the model class and the current timestamp.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
training_procedure(
    dataset, 
    model_name=RGGAN, 
    model_params=config["MODEL"],
    trainer_params=config["TRAINING"],
    model_base_path=get_model_base(config["TRAINING"], get_name(RGGAN, str(datetime.now()))),
    tokenizer_path=config["TOKENIZER"]["tokenizer_path"],
    trainer=Trainer
)

Epoch 0: 100%|██████████| 298/298 [00:14<00:00, 20.85it/s]
Epoch 1: 100%|██████████| 298/298 [00:13<00:00, 21.64it/s]


Number of nodes 1021041
Opening File to save!!
Opening File to save!!
Epoch: 0, Time: 16 s
{'Accuracy/test/VarMisuse_': 0.7920769631959148,
 'Accuracy/train/VarMisuse_': 0.75,
 'Accuracy/train_avg/VarMisuse': 0.7785234899328859,
 'Accuracy/val/VarMisuse_': 0.7872720021157521,
 'Loss/test/VarMisuse_': 0.39307325836774465,
 'Loss/train/VarMisuse_': 0.44256123900413513,
 'Loss/train_avg/VarMisuse': 0.45258815616569265,
 'Loss/val/VarMisuse_': 0.3960964220601159}
Opening File to save!!
Opening File to save!!
Epoch: 1, Time: 15 s
{'Accuracy/test/VarMisuse_': 0.804255958867854,
 'Accuracy/train/VarMisuse_': 0.75,
 'Accuracy/train_avg/VarMisuse': 0.7930867239932886,
 'Accuracy/val/VarMisuse_': 0.8026367987305487,
 'Loss/test/VarMisuse_': 0.38428062442186717,
 'Loss/train/VarMisuse_': 0.4430818557739258,
 'Loss/train_avg/VarMisuse': 0.38926971678765826,
 'Loss/val/VarMisuse_': 0.38546452087325017}
Opening File to save!!
Opening File to save!!
Final eval: Loss/val/VarMisuse_final: 0.38546452087

(<__main__.Trainer at 0x7f495e0365b0>,
 {'Loss/val/VarMisuse_final': 0.38546452087325017,
  'Accuracy/val/VarMisuse_final': 0.8026367987305487,
  'Loss/test/VarMisuse_final': 0.38428062442186717,
  'Accuracy/test/VarMisuse_final': 0.804255958867854})

In [14]:
#!g1.1
!nvidia-smi

Wed May 25 13:56:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:8C:00.0 Off |                    0 |
| N/A   34C    P0    36W / 300W |   1447MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
#!g1.1
