# Variable misuse detection

VarMisuse at node level

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

In [2]:
#!g1.1
# %pip install dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html

In [3]:
#!g1.1
from random import random

from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
import pandas as pd
from torch import nn
from datetime import datetime

Using backend: pytorch


# Prepare parameters and options

The full list of available options can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer; the trainer script is `SourceCodeTools/scripts/train.py`.

There are a lot of parameters. Ones that might be of interest are marked with `***`.

In [4]:
#!g1.1
# Build the experiment configuration from flat keyword arguments.
# The config dump printed further down shows them grouped into
# DATASET / MODEL / TOKENIZER / TRAINING sections.
config = get_config(
    # tokenizer
    tokenizer_path="sentencepiece_bpe.model", # *** path to sentencepiece model
    
    # dataset parameters
    data_path="10_percent_v1",             # *** path to the dataset directory
    use_node_types=False,                # node types currently not supported
    use_edge_types=False,                 # whether to use edge types
    filter_edges=None,                   # None or list of edge type names
    self_loops=False,                    # whether to use self loops
    train_frac=0.8,                      # *** fraction of nodes to use for training
    random_seed=42,                      # random seed for splitting the dataset into train/test/validation
    min_count_for_objectives=5,          # *** minimum frequency of targets
    no_global_edges=False,               # remove global edges
    remove_reverse=False,                # remove reverse edges
    custom_reverse=None,                 # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)
    
    # training parameters
    model_output_dir="10_percent_v1",      # *** directory to save checkpoints and training data
    batch_size=256,                     # ***
    sampling_neighbourhood_size=10,      # number of dependencies to sample for each node
    neg_sampling_factor=1,               # *** number of negative samples for each positive sample
    epochs=10,                           # *** number of epochs
    elem_emb_size=300,                   # *** dimensionality of target embeddings (for node name prediction)
    pretraining_phase=0,                 # number of epochs for pretraining
    embedding_table_size=200000,         # *** embedding table size for subwords
    save_checkpoints=False,              # set to False if checkpoints are not needed
    save_each_epoch=False,               # save each epoch, useful in case of studying model behavior
    measure_scores=True,                 # *** measure ranking scores during evaluation
    dilate_scores=1,                   # downsampling factor for measuring scores to make evaluation faster
    objectives="node_clf",               # type of objective
    force_w2v_ns=True,                   # negative sampling strategy
    gpu=-1,                              # GPU id (NOTE(review): -1 presumably selects CPU - confirm)
    restore_state=False,
    pretrained=None,
    
    # model parameters
    node_emb_size=300,                   # *** dimensionality of node embeddings
    h_dim=300,                           # *** should match the node embedding dimensionality (`node_emb_size`)
    n_layers=3,
    num_bases=10,                        # number of bases for computing parameter weights for different edge types
    dropout=0.1,                         # ***
    use_self_loop=True,                  #
    activation="tanh",                   # ***
    learning_rate=3e-3,                  # ***
    use_gcn_checkpoint=True,
    use_att_checkpoint=True,
    use_gru_checkpoint=True
)

In [5]:
#!g1.1
# Write the configuration to disk and read it back, so the `config` used by
# the following cells is exactly what is stored in config.yaml.
save_config(config, "config.yaml")
config = load_config("config.yaml")
config

{'DATASET': {'custom_reverse': None,
  'data_path': '10_percent_v1',
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'no_global_edges': False,
  'random_seed': 42,
  'remove_reverse': False,
  'restricted_id_pool': None,
  'self_loops': False,
  'subgraph_id_column': 'mentioned_in',
  'subgraph_partition': None,
  'train_frac': 0.8,
  'use_edge_types': False,
  'use_node_types': False},
 'MODEL': {'activation': 'tanh',
  'dropout': 0.1,
  'h_dim': 300,
  'n_layers': 3,
  'node_emb_size': 300,
  'num_bases': 10,
  'use_att_checkpoint': True,
  'use_gcn_checkpoint': True,
  'use_gru_checkpoint': True,
  'use_self_loop': True},
 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'},
 'TRAINING': {'batch_size': 256,
  'dilate_scores': 1,
  'early_stopping': False,
  'early_stopping_tolerance': 20,
  'elem_emb_size': 300,
  'embedding_table_size': 200000,
  'epochs': 10,
  'external_dataset': None,
  'force_w2v_ns': True,
  'gpu': -1,
  'learning_rate': 0.003,
  'measure_sco

# Create Dataset

In [6]:
#!g1.1
# The dataset constructor receives the DATASET and TOKENIZER config sections
# merged into a single set of keyword arguments.
dataset = SourceGraphDataset(
    **{**config["DATASET"], **config["TOKENIZER"]},
)



# Declare target loading function (labels)

In [7]:
#!g1.1
# Inspect the class balance of the variable-misuse labels.
l = unpersist("10_percent_v1/misuse_labels.json.bz2")

# np.unique returns the distinct `dst` values in sorted order together with
# their counts, so counts[1] belongs to the second label in that order.
# NOTE(review): presumably the labels are 'correct'/'misused', which would make
# this the share of 'misused' targets - confirm.
val, counts = np.unique(l.dst, return_counts=True)
counts[1]/counts.sum()

0.2404469820085427

In [8]:
#!g1.1
def load_var_misuse(labels_path="10_percent_v1/misuse_labels.json.bz2"):
    """Load the variable-misuse labels for nodes that exist in `dataset`.

    The function is handed to `NodeClassifierObjective` as the label-loading
    callback, so it must remain callable without arguments.

    Parameters
    ----------
    labels_path : str, optional
        Location of the persisted label table. Defaults to the labels shipped
        with the `10_percent_v1` dataset.

    Returns
    -------
    pandas.DataFrame
        The rows of the label table whose `src` value is one of the ids in
        `dataset.nodes["id"]`.
    """
    var_misuse_labels = unpersist(labels_path)

    # Keep only labels whose source node is part of the loaded graph.
    in_dataset = var_misuse_labels["src"].isin(dataset.nodes["id"])
    return var_misuse_labels[in_dataset]

In [9]:
#!g1.1
load_var_misuse()

Unnamed: 0,src,dst
0,1634563,correct
1,1337362,misused
2,2562091,misused
3,2245942,correct
4,3082435,correct
...,...,...
486726,4616618,misused
486727,2500294,correct
486728,3084766,correct
486729,770790,misused


# Define objectives

Currently supported objectives are node classification (`NodeClassifierObjective`) and name-based node embedding training (`SubwordEmbedderObjective`).

![](./figures/img1.png)


## Create objective 
One or several objectives could be used

In [10]:
#!g1.1
class Trainer(SamplingMultitaskTrainer):
    """Multitask trainer set up with one objective: node-level VarMisuse classification."""

    def create_objectives(self, dataset, tokenizer_path):
        """Fill `self.objectives` with the VarMisuse node classifier."""
        self.objectives = nn.ModuleList()

        var_misuse_objective = NodeClassifierObjective(
            "VarMisuse",
            self.graph_model,
            self.node_embedder,
            dataset.nodes,
            load_var_misuse,  # label loader; must be defined before training starts
            self.device,
            self.sampling_neighbourhood_size,
            self.batch_size,
            tokenizer_path=tokenizer_path,
            target_emb_size=self.elem_emb_size,
            masker=None,  # masker is not needed here
            # Ranking scores are switched off for this objective, whatever
            # `measure_scores` is set to in the training config.
            measure_scores=False,
            dilate_scores=self.trainer_params["dilate_scores"],
        )
        self.objectives.append(var_misuse_objective)

## Launch Tensorboard

In [11]:
#!g1.1
# %tensorboard --logdir './10_percent_v1/'

## Start model training

In [12]:
#!g1.1
# Train RGGAN on the dataset using the custom `Trainer` defined above.
# The current timestamp is passed to `get_name` together with the model class;
# NOTE(review): presumably this gives every run its own output location under
# the configured model directory - confirm.
training_procedure(
    dataset, 
    model_name=RGGAN, 
    model_params=config["MODEL"],
    trainer_params=config["TRAINING"],
    model_base_path=get_model_base(config["TRAINING"], get_name(RGGAN, str(datetime.now()))),
    tokenizer_path=config["TOKENIZER"]["tokenizer_path"],
    trainer=Trainer
)

Number of nodes 4900346


Epoch 0: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [07:41<00:00,  3.30it/s]


Opening File to save!!
Opening File to save!!


Epoch 1:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 0, Time: 515 s
{'Accuracy/test/VarMisuse_': 0.849166584372074,
 'Accuracy/train/VarMisuse_': 0.8214285714285714,
 'Accuracy/train_avg/VarMisuse': 0.8351883159575468,
 'Accuracy/val/VarMisuse_': 0.8482774504497002,
 'Loss/test/VarMisuse_': 0.31054820974046987,
 'Loss/train/VarMisuse_': 0.32327550649642944,
 'Loss/train_avg/VarMisuse': 0.3353489155462899,
 'Loss/val/VarMisuse_': 0.31167498670126265,
 'grad_norm/train/_': 0.28784561613207976}


Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [07:37<00:00,  3.32it/s]


Opening File to save!!
Opening File to save!!


Epoch 2:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 1, Time: 499 s
{'Accuracy/test/VarMisuse_': 0.850885855571161,
 'Accuracy/train/VarMisuse_': 0.9166666666666666,
 'Accuracy/train_avg/VarMisuse': 0.853631622561911,
 'Accuracy/val/VarMisuse_': 0.849963305712858,
 'Loss/test/VarMisuse_': 0.3081789495578657,
 'Loss/train/VarMisuse_': 0.28199440240859985,
 'Loss/train_avg/VarMisuse': 0.3033150188688539,
 'Loss/val/VarMisuse_': 0.30808815493395453,
 'grad_norm/train/_': 0.31404309280087617}


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:27<00:00,  3.93it/s]


Opening File to save!!
Opening File to save!!


Epoch 3:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 2, Time: 441 s
{'Accuracy/test/VarMisuse_': 0.845756387889162,
 'Accuracy/train/VarMisuse_': 0.9047619047619048,
 'Accuracy/train_avg/VarMisuse': 0.8677823467486929,
 'Accuracy/val/VarMisuse_': 0.8445796552298468,
 'Loss/test/VarMisuse_': 0.3352269813573609,
 'Loss/train/VarMisuse_': 0.22899100184440613,
 'Loss/train_avg/VarMisuse': 0.2810926473430706,
 'Loss/val/VarMisuse_': 0.33292400931057176,
 'grad_norm/train/_': 0.5263930057634547}


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:23<00:00,  3.96it/s]


Opening File to save!!
Opening File to save!!


Epoch 4:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 3, Time: 420 s
{'Accuracy/test/VarMisuse_': 0.8390500643726592,
 'Accuracy/train/VarMisuse_': 0.9285714285714286,
 'Accuracy/train_avg/VarMisuse': 0.8828169026486333,
 'Accuracy/val/VarMisuse_': 0.8400537662391738,
 'Loss/test/VarMisuse_': 0.36056279895516735,
 'Loss/train/VarMisuse_': 0.1832854449748993,
 'Loss/train_avg/VarMisuse': 0.25598591251603714,
 'Loss/val/VarMisuse_': 0.3575709232374241,
 'grad_norm/train/_': 0.5498301819572224}


Epoch 4: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:02<00:00,  4.19it/s]


Opening File to save!!
Opening File to save!!


Epoch 5:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 4, Time: 404 s
{'Accuracy/test/VarMisuse_': 0.8313799797372425,
 'Accuracy/train/VarMisuse_': 0.9404761904761905,
 'Accuracy/train_avg/VarMisuse': 0.8956092874362105,
 'Accuracy/val/VarMisuse_': 0.8328609052298468,
 'Loss/test/VarMisuse_': 0.38122305553406477,
 'Loss/train/VarMisuse_': 0.17492328584194183,
 'Loss/train_avg/VarMisuse': 0.2326112230168135,
 'Loss/val/VarMisuse_': 0.3772315884891309,
 'grad_norm/train/_': 0.7618695246928449}


Epoch 5: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:28<00:00,  3.92it/s]


Opening File to save!!
Opening File to save!!


Epoch 6:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 5, Time: 436 s
{'Accuracy/test/VarMisuse_': 0.8276339755969101,
 'Accuracy/train/VarMisuse_': 0.9404761904761905,
 'Accuracy/train_avg/VarMisuse': 0.9045595052205628,
 'Accuracy/val/VarMisuse_': 0.828813603431046,
 'Loss/test/VarMisuse_': 0.4141941415145993,
 'Loss/train/VarMisuse_': 0.1749030351638794,
 'Loss/train_avg/VarMisuse': 0.2162189068153922,
 'Loss/val/VarMisuse_': 0.408546244627551,
 'grad_norm/train/_': 0.620248970182573}


Epoch 6: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [08:07<00:00,  3.12it/s]


Opening File to save!!
Opening File to save!!


Epoch 7:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 6, Time: 526 s
{'Accuracy/test/VarMisuse_': 0.8242489246839888,
 'Accuracy/train/VarMisuse_': 0.9761904761904762,
 'Accuracy/train_avg/VarMisuse': 0.9105617828574558,
 'Accuracy/val/VarMisuse_': 0.8259225620419719,
 'Loss/test/VarMisuse_': 0.4382587702323993,
 'Loss/train/VarMisuse_': 0.1313932240009308,
 'Loss/train_avg/VarMisuse': 0.2026422576027274,
 'Loss/val/VarMisuse_': 0.4350411052766599,
 'grad_norm/train/_': 0.4331069972583549}


Epoch 7: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:24<00:00,  3.95it/s]


Opening File to save!!
Opening File to save!!


Epoch 8:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 7, Time: 423 s
{'Accuracy/test/VarMisuse_': 0.8144604400749064,
 'Accuracy/train/VarMisuse_': 0.9523809523809523,
 'Accuracy/train_avg/VarMisuse': 0.9155567100278639,
 'Accuracy/val/VarMisuse_': 0.8177194370419719,
 'Loss/test/VarMisuse_': 0.4710912912463148,
 'Loss/train/VarMisuse_': 0.14331893622875214,
 'Loss/train_avg/VarMisuse': 0.1926372874182827,
 'Loss/val/VarMisuse_': 0.463167677898156,
 'grad_norm/train/_': 0.6025824826692471}


Epoch 8: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:22<00:00,  3.98it/s]


Opening File to save!!
Opening File to save!!


Epoch 9:   0%|                                                                                | 0/1521 [00:00<?, ?it/s]

Epoch: 8, Time: 425 s
{'Accuracy/test/VarMisuse_': 0.8139543283298222,
 'Accuracy/train/VarMisuse_': 0.9285714285714286,
 'Accuracy/train_avg/VarMisuse': 0.9180810175401521,
 'Accuracy/val/VarMisuse_': 0.8182334173051299,
 'Loss/test/VarMisuse_': 0.47456890701626736,
 'Loss/train/VarMisuse_': 0.14123332500457764,
 'Loss/train_avg/VarMisuse': 0.1857979893733372,
 'Loss/val/VarMisuse_': 0.4677698033420663,
 'grad_norm/train/_': 0.5555798830879733}


Epoch 9: 100%|█████████████████████████████████████████████████████████████████████| 1521/1521 [06:10<00:00,  4.10it/s]


Opening File to save!!
Opening File to save!!
Epoch: 9, Time: 409 s
{'Accuracy/test/VarMisuse_': 0.8141781239027388,
 'Accuracy/train/VarMisuse_': 0.9523809523809523,
 'Accuracy/train_avg/VarMisuse': 0.922046581000908,
 'Accuracy/val/VarMisuse_': 0.8169147651565624,
 'Loss/test/VarMisuse_': 0.5076867346651852,
 'Loss/train/VarMisuse_': 0.13256163895130157,
 'Loss/train_avg/VarMisuse': 0.17815953777719687,
 'Loss/val/VarMisuse_': 0.49786863938758247,
 'grad_norm/train/_': 0.411874831994544}
Opening File to save!!
Opening File to save!!
Final eval: Loss/val/VarMisuse_final: 0.49786863938758247, Accuracy/val/VarMisuse_final: 0.8169147651565624, Loss/test/VarMisuse_final: 0.5076867346651852, Accuracy/test/VarMisuse_final: 0.8141781239027388


(<__main__.Trainer at 0x1e5d398a2b0>,
 {'Loss/val/VarMisuse_final': 0.49786863938758247,
  'Accuracy/val/VarMisuse_final': 0.8169147651565624,
  'Loss/test/VarMisuse_final': 0.5076867346651852,
  'Accuracy/test/VarMisuse_final': 0.8141781239027388})

## Evaluation 

Calculate accuracy <br>
Method classification using node classification


In [13]:
#!g1.1
from SourceCodeTools.code.common import read_edges
# Lookup tables for the evaluation: file id -> raw file label, and
# node id -> id of the file the node's edge belongs to.
common_edges = read_edges('10_percent_v1/common_edges.json.bz2', True)
file_id2label = dict(unpersist("10_percent_v1/common_filecontent.json.bz2")[["id", "label"]].values)

node_id2_fileId = {}
for edge_chunk in common_edges:
    chunk_file_ids = edge_chunk["file_id"]
    # Sources are registered before targets. A node that shows up more than
    # once keeps the file id that was written last, because update() overwrites.
    for endpoint_column in ("source_node_id", "target_node_id"):
        node_id2_fileId.update(zip(edge_chunk[endpoint_column], chunk_file_ids))
    
# sink = open("node_fieid_label_map_new.csv", "w")
# sink.write("node_id,file_id,true_label\n")
# for node_id, file_id in node_id2_fileId.items():
#     sink.write(f"{node_id},{file_id},{file_id2label[file_id]}\n")
# sink.close()

In [14]:
#!g1.1
def calc_acc(targets=('test', 'val'), node_to_file=None, file_to_label=None,
             pred_path_template='{target}_temp.txt'):
    """Report file-level accuracy derived from node-level predictions.

    For each split in `targets` a tab-separated, header-less prediction file
    with two columns (node id, predicted label) is read. A file is predicted
    as 'misused' when at least one of its nodes is predicted 'misused',
    otherwise as 'correct'; that prediction is compared with the file's true
    label. Label distributions and the accuracy are printed for each split.

    Parameters
    ----------
    targets : iterable of str, optional
        Names of the splits to evaluate.
    node_to_file : dict, optional
        Maps node id -> file id. Defaults to the notebook-level
        `node_id2_fileId`.
    file_to_label : dict, optional
        Maps file id -> raw label ('Variable misuse' or 'Correct'). Defaults
        to the notebook-level `file_id2label`.
    pred_path_template : str, optional
        Format string with a `{target}` placeholder that gives the path of
        the prediction file of a split.

    Returns
    -------
    dict
        Maps each split name to its file-level accuracy.

    Raises
    ------
    KeyError
        If a file id from `node_to_file` is absent from `file_to_label`.
    ValueError
        If a predicted node id is absent from `node_to_file`, or if the label
        of its file is not one of the known raw labels.
    """
    if node_to_file is None:
        node_to_file = node_id2_fileId
    if file_to_label is None:
        file_to_label = file_id2label

    label_map = {'Variable misuse': 'misused', 'Correct': 'correct'}
    # Unknown raw labels become None here; they are rejected below, but only
    # if they are actually needed for a predicted node.
    node_to_true_label = {
        node_id: label_map.get(file_to_label[file_id])
        for node_id, file_id in node_to_file.items()
    }

    accuracies = {}
    for target in targets:
        pred_df = pd.read_csv(pred_path_template.format(target=target), delimiter="\t", header=None)
        pred_df.columns = ['id', 'pred_label']

        pred_df["file_id"] = pred_df["id"].map(node_to_file)
        unknown_nodes = pred_df.loc[pred_df["file_id"].isna(), "id"]
        if len(unknown_nodes) > 0:
            raise ValueError(
                f"{len(unknown_nodes)} predicted node id(s) of '{target}' have no file mapping, "
                f"e.g. {unknown_nodes.head(10).tolist()}"
            )

        pred_df["true_label"] = pred_df["id"].map(node_to_true_label)
        unlabeled_files = pred_df.loc[pred_df["true_label"].isna(), "file_id"]
        if len(unlabeled_files) > 0:
            # Without this check such rows would silently count as errors and
            # would not even show up in value_counts().
            raise ValueError(
                f"files of '{target}' carry a label outside {sorted(label_map)}, "
                f"e.g. file ids {unlabeled_files.drop_duplicates().head(10).tolist()}"
            )

        print(pred_df['true_label'].value_counts())
        print(pred_df['pred_label'].value_counts())

        # Aggregate node predictions per file: one 'misused' node flags the file.
        pred_is_misused = pred_df["pred_label"].eq("misused").groupby(pred_df["file_id"]).any()
        # All nodes of a file share its label, so the first one is representative.
        true_is_misused = pred_df.groupby("file_id")["true_label"].first().eq("misused")

        n_files = len(pred_is_misused)
        n_correct = int((pred_is_misused == true_is_misused).sum())
        accuracy = n_correct / n_files

        print(f'{target} accuracy {accuracy}')
        accuracies[target] = accuracy

    return accuracies

In [16]:
#!g1.1
# %%time
calc_acc()

misused    48985
Name: true_label, dtype: int64
correct    35677
misused    13308
Name: pred_label, dtype: int64
test accuracy 0.39213840560893276
misused    48542
Name: true_label, dtype: int64
correct    35315
misused    13227
Name: pred_label, dtype: int64
val accuracy 0.3904659707995968


In [None]:
#!g1.1
