In [1]:
# Reload imported modules automatically before running code, so edits to local sources are picked up.
%load_ext autoreload
%autoreload 2
# Load the TensorBoard notebook extension.
%load_ext tensorboard

# Task description

To train a variable misuse detection model one needs to implement an NLP labeling model.

For example, for a function containing a misuse
```
def _eq(l1, l2):\n    return (set(l1) == set(l1))
```
the misuse character span is (44, 46). To detect this with NLP methods, the code is tokenized and a label is generated for every token
```
[def, _, eq, (, l, 1, ",", l, 2, ):, \n, \t, return, (, set, (, l1, ), ==, set, (, l1, ), ), ]
[O  , O, O , O, O, O,  O , O, O, O , O , O ,    O  , O, O  , O, O , O, O , O  , O, M , O, O, O]
```
The goal is to train an NLP model that predicts those labels correctly. In this project, the BILUO labeling scheme is used.

# Goal

The goals of this project are:
1. Verify the dataset: make sure that encoded batches are correct (misuse spans are correct). You can sample the dataset and make sure that the number of errors is less than a certain threshold.
2. Train variable misuse detection model (with finetuning and without)
3. Verify [scoring function](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/entity/type_prediction.py#L71)
4. Conduct a series of experiments to measure model performance
5. Analyze errors

# Why use this example?

The basic functionality necessary to train an NLP labeler is:
1. Loading data (implemented in this example)
2. Tokenization, preparing labels (implemented in [`PythonBatcher.prepare_sent`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L123))
3. Data encoding for use with ML models (implemented in [`PythonBatcher.create_batches_with_mask`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L206))
4. Batching (implemented in [`PythonBatcher.format_batch`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L256))
5. Model training (partially implemented in [`CodeBertModelTrainer2.train_model`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/codebert/codebert_train.py#L148) and extended here)
6. Tensorboard tracking (implemented in `CodeBertModelTrainer2`)

# Install libraries

1. See [installation steps](https://github.com/VitalyRomanov/method-embedding#installing-python-libraries).

2. Install transformers
```bash
pip install transformers
```

In [2]:
import os
from os.path import join
from argparse import Namespace
from SourceCodeTools.nlp.codebert.codebert_train import CodeBertModelTrainer2, test_step, train_step_finetune, CodebertHybridModel, batch_to_torch
from SourceCodeTools.nlp.entity.type_prediction import scorer
from transformers import RobertaTokenizer, RobertaModel

import json
import torch
from time import time
from copy import copy
from datetime import datetime
import pickle

from tqdm import tqdm

# Definitions

## Reading data

In [3]:
def read_data(dataset_path, partition):
    """
    Read one dataset partition stored as JSON records (one JSON object per line).

    :param dataset_path: folder that contains `var_misuse_seq_{partition}.json`
    :param partition: one of "train", "val", "test"
    :return: list of `(text, entry)` tuples, where `text` is the value of the
        record's "text" field and `entry` is a dict with the remaining fields
    :raises AssertionError: if `partition` is not one of the supported names
    """
    assert partition in {"train", "val", "test"}
    data_path = join(dataset_path, f"var_misuse_seq_{partition}.json")

    data = []
    # The context manager guarantees the file is closed even if a record fails to parse.
    with open(data_path, "r") as source:
        for line in source:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            entry = json.loads(line)

            # "text" becomes the first tuple element; everything else stays in `entry`.
            text = entry.pop("text")
            data.append((text, entry))

    return data

## Training procedure

In [4]:
class VariableMisuseDetector(CodeBertModelTrainer2):
    """
    Trainer for a CodeBERT-based variable misuse detector (token labeling).

    Adds to `CodeBertModelTrainer2` a training loop (`train`) and an entry point
    (`train_model`) that builds the model, trains it, and writes checkpoints and
    run metadata into a fresh trial folder.
    """

    def get_trial_dir(self):
        """
        Define folder name format for storing checkpoints.

        Only the timestamp is sanitized (":" -> "-", " " -> "_"); `self.output_dir`
        is used verbatim so that output paths containing spaces or colons
        (e.g. Windows drive letters) are not altered.
        """
        timestamp = str(datetime.now()).replace(":", "-").replace(" ", "_")
        # The folder prefix (typo included) is kept as-is so existing trial folders keep the same naming.
        return os.path.join(self.output_dir, "codebert_var_mususe" + timestamp)

    def train(
            self, model, train_batches, test_batches, epochs, report_every=10, scorer=None, learning_rate=0.01,
            learning_rate_decay=1., finetune=False, summary_writer=None, save_ckpt_fn=None, no_localization=False
    ):
        """
        Run the training loop and evaluate on `test_batches` after every epoch.

        :param model: model passed to `train_step_finetune` / `test_step`
        :param train_batches: sized iterable of training batches
        :param test_batches: sized iterable of evaluation batches
        :param epochs: number of epochs
        :param report_every: accepted for interface compatibility, not used in this loop
        :param scorer: scoring callable forwarded to the train/test steps
        :param learning_rate: initial learning rate for AdamW
        :param learning_rate_decay: gamma for `ExponentialLR` (1.0 means no decay)
        :param finetune: if True, finetuning is enabled once more than 20% of the epochs are done
        :param summary_writer: writer used for TensorBoard scalars; falls back to
            `self.summary_writer` when None
        :param save_ckpt_fn: optional callable invoked whenever the mean test F1 improves
        :param no_localization: accepted for interface compatibility, not used in this loop
        :return: `(train_losses, train_f1s, test_losses, test_f1s)`, lists with one
            per-epoch average each
        """
        # all training options are specified [here](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/entity/type_prediction.py#L256)

        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learning_rate_decay)  # there is no learning rate decay by default

        # Honor an explicitly passed writer; otherwise use the one stored on the trainer.
        writer = summary_writer if summary_writer is not None else self.summary_writer

        # metric history is stored here
        train_losses = []
        test_losses = []
        train_f1s = []
        test_f1s = []

        num_train_batches = len(train_batches)
        num_test_batches = len(test_batches)

        best_f1 = 0.

        for e in range(epochs):
            losses = []
            ps = []
            rs = []
            f1s = []

            start = time()
            model.train()

            for ind, batch in enumerate(tqdm(train_batches)):
                batch_to_torch(batch, self.device)  # inspect the content of `batch`

                loss, p, r, f1 = train_step_finetune(
                    model=model, optimizer=optimizer, token_ids=batch['tok_ids'],
                    prefix=None, suffix=None, graph_ids=None,  # keep this None
                    labels=batch['tags'], lengths=batch['lens'],
                    extra_mask=None,  # Keep this None
                    scorer=scorer,
                    finetune=finetune and e / epochs > 0.2,  # finetuning starts after 20% of training is complete
                    vocab_mapping=self.vocab_mapping
                )
                losses.append(loss.cpu().item())
                ps.append(p)
                rs.append(r)
                f1s.append(f1)

                train_step = e * num_train_batches + ind
                writer.add_scalar("Loss/Train", loss, global_step=train_step)
                writer.add_scalar("Precision/Train", p, global_step=train_step)
                writer.add_scalar("Recall/Train", r, global_step=train_step)
                writer.add_scalar("F1/Train", f1, global_step=train_step)

            test_alosses = []
            test_aps = []
            test_ars = []
            test_af1s = []

            model.eval()

            for ind, batch in enumerate(test_batches):
                batch_to_torch(batch, self.device)

                test_loss, test_p, test_r, test_f1 = test_step(
                    model=model, token_ids=batch['tok_ids'],
                    prefix=None, suffix=None, graph_ids=None,  # keep this None
                    labels=batch['tags'], lengths=batch['lens'],
                    extra_mask=None,  # keep this None
                    scorer=scorer, vocab_mapping=self.vocab_mapping
                )

                eval_step = e * num_test_batches + ind
                writer.add_scalar("Loss/Test", test_loss, global_step=eval_step)
                writer.add_scalar("Precision/Test", test_p, global_step=eval_step)
                writer.add_scalar("Recall/Test", test_r, global_step=eval_step)
                writer.add_scalar("F1/Test", test_f1, global_step=eval_step)
                test_alosses.append(test_loss.cpu().item())
                test_aps.append(test_p)
                test_ars.append(test_r)
                test_af1s.append(test_f1)

            epoch_time = time() - start

            # Per-epoch averages over batches.
            train_losses.append(float(sum(losses) / len(losses)))
            train_f1s.append(float(sum(f1s) / len(f1s)))
            test_losses.append(float(sum(test_alosses) / len(test_alosses)))
            test_f1s.append(float(sum(test_af1s) / len(test_af1s)))

            print(
                f"Epoch: {e}, {epoch_time: .2f} s, Train Loss: {train_losses[-1]: .4f}, Train P: {sum(ps) / len(ps): .4f}, Train R: {sum(rs) / len(rs): .4f}, Train F1: {sum(f1s) / len(f1s): .4f}, "
                f"Test loss: {test_losses[-1]: .4f}, Test P: {sum(test_aps) / len(test_aps): .4f}, Test R: {sum(test_ars) / len(test_ars): .4f}, Test F1: {test_f1s[-1]: .4f}")

            # Checkpoint only on a strict improvement of the mean test F1.
            if save_ckpt_fn is not None and float(test_f1s[-1]) > best_f1:
                save_ckpt_fn()
                best_f1 = float(test_f1s[-1])

            # The `epoch` argument of `step` is deprecated in PyTorch; call it once per epoch without arguments.
            scheduler.step()

        return train_losses, train_f1s, test_losses, test_f1s

    def train_model(self):
        """
        Build the CodeBERT-based model, train it, and store the results.

        Creates a new trial folder (see `get_trial_dir`) and writes into it:
        `tag_types.pkl` (the train batcher's `tagmap`), `checkpoint` (the model, saved
        whenever test F1 improves) and `params.json` (metric history and hyperparameters).
        """
        model_params = copy(self.model_params)

        print(f"\n\n{model_params}")
        lr = model_params.pop("learning_rate")
        lr_decay = model_params.pop("learning_rate_decay")
        suffix_prefix_buckets = model_params.pop("suffix_prefix_buckets")  # used for another model, ignore

        # Graph embeddings belong to another model and are not loaded here: the batchers and the
        # model below always receive `graph_emb=None`. (The previous lookup went through
        # `load_pkl_emb`, which is not imported in this notebook.)

        train_batcher, test_batcher = self.get_dataloaders(word_emb=None, graph_emb=None, suffix_prefix_buckets=suffix_prefix_buckets)

        codebert_model = RobertaModel.from_pretrained("microsoft/codebert-base")

        # definition of CodebertHybridModel is at https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/codebert/codebert_train.py#L21
        model = CodebertHybridModel(
            codebert_model, graph_emb=None, padding_idx=0, num_classes=train_batcher.num_classes(),
            no_graph=self.no_graph
        )

        if self.use_cuda:
            model.cuda()

        trial_dir = self.get_trial_dir()  # create directory for saving checkpoints
        os.makedirs(trial_dir)  # also creates missing parent folders; still fails if the trial folder exists
        self.create_summary_writer(trial_dir)

        with open(os.path.join(trial_dir, "tag_types.pkl"), "wb") as tagmap_sink:
            pickle.dump(train_batcher.tagmap, tagmap_sink)

        def save_ckpt_fn():
            # Overwrites the single "checkpoint" file, so only the best model so far is kept.
            checkpoint_path = os.path.join(trial_dir, "checkpoint")
            with open(checkpoint_path, 'wb') as checkpoint_sink:
                torch.save(model, checkpoint_sink)

        train_losses, train_f1, test_losses, test_f1 = self.train(
            model=model, train_batches=train_batcher, test_batches=test_batcher,
            epochs=self.epochs, learning_rate=lr,
            scorer=lambda pred, true: scorer(pred, true, train_batcher.tagmap, no_localization=self.no_localization),  # need to verify scoring function
            learning_rate_decay=lr_decay, finetune=self.finetune, save_ckpt_fn=save_ckpt_fn,
            no_localization=self.no_localization
        )

        metadata = {
            "train_losses": train_losses,
            "train_f1": train_f1,
            "test_losses": test_losses,
            "test_f1": test_f1,
            "learning_rate": lr,
            "learning_rate_decay": lr_decay,
            "epochs": self.epochs,
            "suffix_prefix_buckets": suffix_prefix_buckets,
            "seq_len": self.seq_len,
            "batch_size": self.batch_size,
            "no_localization": self.no_localization
        }

        print("Maximum f1:", max(test_f1))

        # Remaining (non-popped) model parameters are stored alongside the run metadata.
        metadata.update(model_params)

        with open(os.path.join(trial_dir, "params.json"), "w") as metadata_sink:
            metadata_sink.write(json.dumps(metadata, indent=4))

# Execution

All training options are specified [here](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/entity/type_prediction.py#L256)
Option names are added to `args` below.

In [5]:
# Folder with the dataset partitions; it is also reused below as the checkpoint output folder.
dataset_path = "variable_misuse_graph_2_percent_balanced/with_ast"

args = Namespace(
    learning_rate=1e-3,          # NOTE(review): the trainer cell below passes its own learning rate in `params` and does not read this value
    max_seq_len=512,             # default for BERT
    random_seed=42,              # NOTE(review): not passed to the trainer anywhere in this notebook
    epochs=100,
    gpu=-1,                      # set this to GPU id to use gpu
    batch_size=8,                # higher value increases memory consumption
    finetune=True,               # set this flag to enable finetuning
    no_localization=False,       # whether to solve variable misuse with, or without localization

    # do not change items below
    no_graph=True,               # used for another model
    model_output=dataset_path,   # where to store checkpoints
    graph_emb_path=None,         # used for another model
    word_emb_path=None,          # used for another model
    trials=1,                    # setting > 1 repeats training, used to accumulate statistics
)

In [6]:
# Training uses the "train" partition; the "val" partition serves as the evaluation set
# (it is stored under the name `test_data`).
train_data = read_data(dataset_path, partition="train")
test_data = read_data(dataset_path, partition="val")

In [7]:
# Show the first evaluation record: a `(text, fields)` tuple as built by `read_data`.
test_data[0]  # ignore `replacements`

("def __init__(self, pwm_pin=None, pwm_freq=50, min_ms=0.5, max_ms=2.4):\n    assert (pwm_freq > 0), ('pwm_freq must be positive, given: %s' % str(pwm_freq))\n    assert (min_ms > 0), ('0 min_ms must be positive, given: %s' % str(min_ms))\n    assert (max_ms > 0), ('max_ms must be positive, given: %s' % str(max_ms))\n    self.pwm_freq = pwm_freq\n    self.min_ms = min_ms\n    self.max_ms = max_ms\n    self.pwm_pin = None\n    if pwm_pin:\n        self.attach(pwm_pin)\n    self.angle = None",
 {'replacements': [[452, 459, 870497],
   [360, 366, 914621],
   [226, 232, 914621],
   [411, 415, 501975],
   [304, 310, 904667],
   [333, 341, 451541],
   [94, 95, 501975],
   [13, 17, 193236],
   [465, 469, 557220],
   [317, 321, 557220],
   [385, 391, 904667],
   [478, 482, 501975],
   [140, 143, 65883],
   [58, 64, 120250],
   [144, 152, 451541],
   [371, 375, 557220],
   [256, 257, 501975],
   [46, 52, 950218],
   [19, 26, 545410],
   [247, 253, 904667],
   [99, 137, 501975],
   [423, 430, 87

In [8]:
# Show a record whose `entities` field carries a 'misuse' character span (see the output below).
test_data[100]

('def _prepare_verified_images(self, verify_image_url):\n    verify_image_url._verified_images = self._verify_images(self._find_images(), verify_image_url)\n    print(self._verified_images)',
 {'replacements': [[135, 151, 459565],
   [163, 167, 19943],
   [94, 98, 19943],
   [157, 162, 940393],
   [35, 51, 893745],
   [58, 74, 459565],
   [114, 118, 19943],
   [29, 33, 187548]],
  'entities': [[58, 74, 'misuse']]})

In [9]:
# Hyperparameters popped by `train_model` are passed through `params`.
# NOTE(review): the learning rate used for training is the 1e-4 given here;
# `args.learning_rate` is not read by this call.
trainer = VariableMisuseDetector(
    train_data,
    test_data,
    params={
        "learning_rate": 1e-4,
        "learning_rate_decay": 0.99,
        "suffix_prefix_buckets": 1,
    },
    graph_emb_path=args.graph_emb_path,
    word_emb_path=args.word_emb_path,
    output_dir=args.model_output,
    epochs=args.epochs,
    batch_size=args.batch_size,
    gpu_id=args.gpu,
    finetune=args.finetune,
    trials=args.trials,
    seq_len=args.max_seq_len,
    no_localization=args.no_localization,
    no_graph=args.no_graph,
)

In [None]:
# Run training; checkpoints, `tag_types.pkl` and `params.json` are written to a new trial folder under `args.model_output`.
trainer.train_model()



{'learning_rate': 0.0001, 'learning_rate_decay': 0.99, 'suffix_prefix_buckets': 1}
