In [1]:
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

# Task description

To train a variable misuse detection model one needs to implement an NLP labeling model.

For example, for a function containing misuse
```
def _eq(l1, l2):\n    return (set(l1) == set(l1))
```
the misuse character span is (44, 46). To detect it with NLP methods, the code is tokenized and a label is generated for every token:
```
[def, _, eq, (, l, 1, ",", l, 2, ):, \n, \t, return, (, set, (, l1, ), ==, set, (, l1, ), )]
[O  , O, O , O, O, O,  O , O, O, O , O , O ,    O  , O, O  , O, O , O, O , O  , O, M , O, O]
```
The goal is to train an NLP model that predicts those labels correctly. In this project, the BILUO labeling scheme is used.

# Goal

The goals of this project are:
1. Verify the dataset: make sure that the encoded batches are correct (i.e. the misuse spans are correct). You can sample the dataset and make sure that the number of errors is below a certain threshold.
2. Train a variable misuse detection model (with and without fine-tuning)
3. Verify [scoring function](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/entity/type_prediction.py#L71)
4. Conduct a series of experiments to identify performance
5. Analyze errors

# Why use this example?

The basic functionality necessary to train an NLP labeler is:
1. Loading data (implemented in this example)
2. Tokenization, preparing labels (implemented in [`PythonBatcher.prepare_sent`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L123))
3. Data encoding for use with ML models (implemented in [`PythonBatcher.create_batches_with_mask`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L206))
4. Batching (implemented in [`PythonBatcher.format_batch`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/batchers/PythonBatcher.py#L256))
5. Model training (partially implemented in [`CodeBertModelTrainer2.train_model`](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/codebert/codebert_train.py#L148) and extended here)
6. Tensorboard tracking (implemented in `CodeBertModelTrainer2`)

# Install libraries

1. See [installation steps](https://github.com/VitalyRomanov/method-embedding#installing-python-libraries).

2. Install transformers
```bash
pip install transformers
```

In [140]:
import os
import json
import torch
import logging
import pickle

from argparse import Namespace
from copy import copy
from datetime import datetime
from pathlib import Path
from os.path import join
from collections import defaultdict

import numpy as np
from transformers import RobertaModel
from tqdm import tqdm

from SourceCodeTools.nlp.codebert.codebert_train import CodeBertModelTrainer
from SourceCodeTools.nlp.batchers.PythonBatcher import Batcher, PythonBatcher
from SourceCodeTools.code.data.cubert_python_benchmarks.data_iterators import DataIterator
from SourceCodeTools.nlp.entity.entity_scores import entity_scorer


# Definitions

## Reading Data

In [141]:
def read_data_fn_clf(dataset_path, partition):
    """
    Lazily read one partition of the dataset stored as JSON records (one per line).

    The file `var_misuse_seq_{partition}.json` inside `dataset_path` is read line by
    line. Every non-blank line must hold a JSON object with at least the keys
    `text` and `entities`.

    :param dataset_path: directory that contains the partition files
    :param partition: one of "train", "val", "test"
    :return: generator of `(text, entry)` pairs; `text` is the value removed from the
        record, `entry` is the remainder of the record extended with the key
        `category` ("misuse" when `entities` is non-empty, "correct" otherwise)
    :raises AssertionError: on first iteration, if `partition` is not a known name
    """
    assert partition in {"train", "val", "test"}
    data_path = join(dataset_path, f"var_misuse_seq_{partition}.json")

    # The context manager guarantees that the file handle is released once the
    # generator is exhausted or closed, instead of relying on garbage collection.
    with open(data_path, "r") as source:
        for line in source:
            if not line.strip():
                # Blank lines carry no record and would make json.loads fail
                continue

            entry = json.loads(line)

            text = entry.pop("text")

            entry["category"] = "misuse" if len(entry["entities"]) > 0 else "correct"

            yield (text, entry)


class DataIteratorFnClf(DataIterator):
    """
    Iterable over one dataset partition that yields the `(text, entry)` records
    produced by `read_data_fn_clf`.
    """

    def __init__(self, *args, **kwargs):
        # No state of its own: every argument is forwarded to `DataIterator`
        super().__init__(*args, **kwargs)

    def __iter__(self):
        """
        Start a fresh pass over the partition.

        Reads `self._data_path` and `self._partition_name` (presumably set by
        `DataIterator.__init__` - confirm against the base class).
        """
        yield from read_data_fn_clf(self._data_path, self._partition_name)

## Model

In [159]:
from SourceCodeTools.mltools.torch import to_numpy
import torch.nn as nn

class CodebertHybridModelFnClf(nn.Module):
    """
    Sequence-level classifier on top of a CodeBERT encoder.

    The encoder's `pooler_output` is passed through a dense layer with ReLU, dropout
    and a final linear layer that produces `num_classes` logits per input sequence.

    NOTE(review): with `no_graph=False` a graph embedding table is created and the
    input size of `fc1` is increased by the graph embedding size, yet `forward`
    never reads `graph_ids`, so `fc1` still receives only the encoder output.
    Confirm whether the graph branch is meant to be usable with this model.
    """

    def __init__(
            self, codebert_model, graph_emb, graph_padding_idx, num_classes, dense_hidden=100, dropout=0.1,
            bert_emb_size=768, no_graph=False
    ):
        """
        :param codebert_model: encoder, called as `codebert_model(input_ids=..., attention_mask=...)`;
            its result must expose `pooler_output`
        :param graph_emb: 2-D numpy array of pretrained graph embeddings; read only when `no_graph` is False
        :param graph_padding_idx: index of the padding row of the graph embedding table;
            read only when `no_graph` is False
        :param num_classes: number of output logits
        :param dense_hidden: width of the hidden dense layer
        :param dropout: dropout probability applied after the hidden layer
        :param bert_emb_size: size of the vector returned by the encoder
        :param no_graph: when True, no graph embedding table is created
        """
        super().__init__()

        self.codebert_model = codebert_model
        self.use_graph = not no_graph

        graph_emb_dim = 0
        if self.use_graph:
            graph_emb_dim = graph_emb.shape[1]
            self.graph_emb = nn.Embedding(
                num_embeddings=graph_padding_idx + 1,  # padding id is usually not a real embedding
                embedding_dim=graph_emb_dim,
                padding_idx=graph_padding_idx
            )

            # Pretrained rows followed by a single all-zero row; the shape assert below
            # ensures that this last row sits exactly at `graph_padding_idx`
            zero_row = np.zeros((1, graph_emb_dim))
            table = np.concatenate([graph_emb, zero_row], axis=0)
            frozen_table = torch.nn.Parameter(torch.from_numpy(table).float())
            assert self.graph_emb.weight.shape == frozen_table.shape
            self.graph_emb.weight = frozen_table
            self.graph_emb.weight.requires_grad = False

        self.fc1 = nn.Linear(bert_emb_size + graph_emb_dim, dense_hidden)
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_hidden, num_classes)

        self.loss_f = nn.CrossEntropyLoss(reduction="mean")

    def _pooled_encoding(self, token_ids, mask):
        """Run the encoder and return its `pooler_output`."""
        return self.codebert_model(input_ids=token_ids, attention_mask=mask).pooler_output

    def forward(self, token_ids, graph_ids, mask, finetune=False):
        """
        Compute class logits for a batch.

        :param token_ids: passed to the encoder as `input_ids`
        :param graph_ids: accepted for interface compatibility, not used
        :param mask: passed to the encoder as `attention_mask`
        :param finetune: when False the encoder runs under `torch.no_grad()`, so no
            gradient reaches its parameters; the dense layers are never wrapped
        :return: output of the final linear layer
        """
        if not finetune:
            with torch.no_grad():
                pooled = self._pooled_encoding(token_ids, mask)
        else:
            pooled = self._pooled_encoding(token_ids, mask)

        hidden = self.drop(torch.relu(self.fc1(pooled)))
        return self.fc2(hidden)

    def loss(self, logits, labels, mask, class_weights=None, extra_mask=None):
        """
        Mean cross-entropy between `logits` and `labels`.

        `mask`, `class_weights` and `extra_mask` are accepted but ignored.
        """
        return self.loss_f(logits, labels)

    def score(self, logits, labels, mask, scorer=None, extra_mask=None):
        """
        Compute the accuracy of the arg-max predictions.

        `mask`, `scorer` and `extra_mask` are accepted but ignored.

        :return: dict with "Accuracy" (python float) and "Prediction" (tensor of predicted class ids)
        """
        predicted = logits.argmax(-1)
        num_correct = (predicted == labels).sum()
        accuracy = num_correct / len(labels)

        return {"Accuracy": accuracy.cpu().item(), "Prediction": predicted}

## Training procedure

In [160]:
class VariableMisuseDetector(CodeBertModelTrainer):
    """
    Trainer for `CodebertHybridModelFnClf`: a classifier that assigns one label to
    every input sequence, fed with batches produced by `PythonBatcher`.

    Everything not defined here (`train`, `make_step`, `get_dataloaders`, checkpoint
    and summary helpers, configuration attributes such as `self.epochs`) comes from
    `CodeBertModelTrainer`.
    """

    def set_batcher_class(self):
        """Select the batcher class (stored as a class, not an instance)."""
        self.batcher = PythonBatcher

    def set_model_class(self):
        """Select the model class (stored as a class, instantiated in `get_model`)."""
        self.model = CodebertHybridModelFnClf

    @property
    def classes_for(self):
        """Value passed as `how` to `train_batcher.num_classes` when sizing the output layer."""
        return "labels"

    @property
    def best_score_metric(self):
        """
        Name of the score used as the main metric; matches the "Accuracy" key
        returned by `CodebertHybridModelFnClf.score`.
        """
        return "Accuracy"

    @classmethod
    def _format_batch(cls, batch, device):
        """
        Convert the known fields of `batch` to tensors of a fixed type, in place.

        Keys that are absent from `batch` are skipped, keys that are not listed
        below are left untouched.

        :param batch: mapping from field name to array-like data; modified in place
        :param device: device the created tensors are moved to
        """
        key_types = {
            'tok_ids': torch.LongTensor,
            'tags': torch.LongTensor,
            'hide_mask': torch.BoolTensor,
            'no_loc_mask': torch.BoolTensor,
            'lens': torch.LongTensor,
            'graph_ids': torch.LongTensor,
            'label': torch.LongTensor
        }
        for key, tf in key_types.items():
            if key in batch:
                batch[key] = tf(batch[key]).to(device)

    def get_training_dir(self):
        """
        Return the output directory of the current trial.

        The timestamp is generated on the first call and cached on the instance, so
        repeated calls return the same path.
        """
        if not hasattr(self, "_timestamp"):
            # ":" and " " are replaced to keep the directory name portable
            self._timestamp = str(datetime.now()).replace(":", "-").replace(" ", "_")
        return Path(self.trainer_params["model_output"]).joinpath("codebert_var_mususe_fn_clf" + self._timestamp)
    
    def get_model(self, *args, **kwargs):
        """
        Instantiate the classifier around a pretrained `microsoft/codebert-base` encoder.

        Only `graph_embedder`, `graph_padding_idx` and `num_classes` are read from
        `kwargs`; positional arguments and the remaining keywords are ignored.
        When `self.ckpt_path` is set, weights are restored from `<ckpt_path>/checkpoint`.

        :return: the model, moved to GPU when `self.use_cuda` is true
        """
        codebert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
        model = self.model(
            codebert_model, graph_emb=kwargs["graph_embedder"],
            graph_padding_idx=kwargs["graph_padding_idx"],
            num_classes=kwargs["num_classes"],
            no_graph=self.no_graph
        )
        if self.use_cuda:
            model.cuda()

        if self.ckpt_path is not None:
            ckpt_path = os.path.join(self.ckpt_path, "checkpoint")
            model = self.load_checkpoint(model, ckpt_path)
        return model

    def iterate_batches(self, model, batches, epoch, num_train_batches, train_scores, scorer, train=True):
        """
        Run one pass over `batches`, calling `self.make_step` for every batch.

        Every score returned by `make_step` (plus "batch_size") is written to the
        summary under `<score>/Train` or `<score>/Test` and collected per score name.

        :param model: model passed to `make_step`
        :param batches: iterable of batch dicts
        :param epoch: index of the current epoch; used for the progress bar label, for the
            summary step and for the fine-tuning gate
        :param num_train_batches: multiplier used to compute the global summary step
        :param train_scores: list extended with the collected scores
        :param scorer: scoring callable forwarded to `make_step`
        :param train: forwarded to `make_step`; also selects the Train/Test summary tag
        :return: `num_train_batches`, unchanged
        """
        scores_for_averaging = defaultdict(list)

        # NOTE(review): `batch_count` is incremented but never read, while the method
        # returns its own `num_train_batches` argument - confirm which value the
        # caller expects.
        batch_count = 0

        for ind, batch in enumerate(tqdm(batches, desc=f"Epoch {epoch}")):
            self._format_batch(batch, self.device)
            # Can get original tokens by calling
            # batches.get_record_with_id(batch["id"][0])
            scores = self.make_step(
                model=model, optimizer=self.optimizer, token_ids=batch['tok_ids'],
                prefix=batch['prefix'], suffix=batch['suffix'],
                graph_ids=batch['graph_ids'] if 'graph_ids' in batch else None,
                labels=batch['label'], lengths=batch['lens'],
                extra_mask=batch['no_loc_mask'] if self.no_localization else batch['hide_mask'],
                # class_weights=batch['class_weights'],
                # The encoder is fine-tuned only when finetuning is enabled AND the
                # ratio epoch / self.epochs exceeds 0.6
                scorer=scorer, finetune=self.finetune and epoch / self.epochs > 0.6,
                vocab_mapping=self.vocab_mapping,
                train=train
            )

            batch_count += 1

            scores["batch_size"] = batch['tok_ids'].shape[0]
            for score, value in scores.items():
                self._write_to_summary(f"{score}/{'Train' if train else 'Test'}", value, epoch * num_train_batches + ind)
                scores_for_averaging[score].append(value)
            # NOTE(review): the same dict object is appended once per batch, so
            # `train_scores` receives repeated references to it - confirm this is
            # what the base trainer expects.
            train_scores.append(scores_for_averaging)

        return num_train_batches
    
    def train_model(self):
        """
        Run a complete training trial.

        Creates the trial directory, stores the parameters, builds the model, calls
        `self.train` and finally writes the collected scores to `train_data.json`
        and the batcher's tag map to `tag_types.pkl` inside the trial directory.
        """

        graph_emb = self._load_grap_embs()
        word_emb = self._load_word_embs()

        train_batcher, test_batcher = self.get_dataloaders(
            word_emb, graph_emb, self.suffix_prefix_buckets, cache_dir=Path(self.data_path).joinpath("__cache__")
        )

        trial_dir = self.get_training_dir()
        trial_dir.mkdir(parents=True, exist_ok=True)
        logging.info(f"Running trial: {str(trial_dir)}")
        self._create_summary_writer(trial_dir)

        self.save_params(
            trial_dir, {
                "MODEL_PARAMS": self.model_params,
                "TRAINER_PARAMS": self.trainer_params,
                # `self.model` and `self.batcher` hold classes, so their names are read
                # with `__name__`; `__class__.__name__` would record the metaclass ("type")
                "model_class": self.model.__name__,
                "batcher_class": self.batcher.__name__
            }
        )

        model = self.get_model(
            tok_embedder=word_emb, graph_embedder=graph_emb, train_embeddings=self.finetune,
            suffix_prefix_buckets=self.suffix_prefix_buckets,
            num_classes=train_batcher.num_classes(how=self.classes_for), seq_len=self.seq_len, no_graph=self.no_graph,
            graph_padding_idx=None,
            **self.model_params
        )

        def save_ckpt_fn():
            checkpoint_path = os.path.join(trial_dir, "checkpoint")
            self.save_checkpoint(model, checkpoint_path)

        train_scores, test_scores, train_average_scores, test_average_scores = self.train(
            model=model, train_batches=train_batcher, test_batches=test_batcher, epochs=self.epochs,
            learning_rate=self.learning_rate,
            scorer=lambda pred, true: entity_scorer(pred, true, train_batcher.tagmap,
                                                    no_localization=self.no_localization),
            learning_rate_decay=self.learning_rate_decay, finetune=self.finetune, save_ckpt_fn=save_ckpt_fn,
            no_localization=self.no_localization
        )

        metadata = {
            "train_scores": train_scores,
            "test_scores": test_scores,
            "train_average_scores": train_average_scores,
            "test_average_scores": test_average_scores,
        }

        with open(os.path.join(trial_dir, "train_data.json"), "w") as metadata_sink:
            metadata_sink.write(json.dumps(metadata, indent=4))

        # Context manager so that the pickle file is flushed and closed deterministically
        with open(os.path.join(trial_dir, "tag_types.pkl"), "wb") as tagmap_sink:
            pickle.dump(train_batcher.tagmap, tagmap_sink)

# Execution

All training options are specified [here](https://github.com/VitalyRomanov/method-embedding/blob/e995477db13a13875cca54c37d4d29f63b0c8e93/SourceCodeTools/nlp/entity/type_prediction.py#L256).
Option names are added to `args` below.

In [161]:
# Directory with the dataset partitions; also used as the output location below
dataset_path = "variable_misuse_graph_2_percent_balanced/with_ast"

# Training options, later handed to the trainer as a plain dict
args = Namespace(
    learning_rate=1e-6,
    learning_rate_decay=0.99,
    max_seq_len=512,                    # default for BERT
    random_seed=42,
    epochs=2,
    gpu=-1,                             # set this to GPU id to use gpu
    batch_size=8,                       # higher value increases memory consumption
    # Set this flag to enable finetuning.
    # NOTE(review): `VariableMisuseDetector.iterate_batches` fine-tunes only while
    # `epoch / epochs > 0.6`; assuming epochs are numbered from 0, `epochs=2` never
    # satisfies this condition - confirm the intended schedule.
    finetune=True,
    no_localization=False,              # whether to solve variable misuse with, or without localization

    # do not change items below
    data_path=dataset_path,
    no_graph=True,                      # used for another model
    model_output=dataset_path,          # where to store checkpoints
    graph_emb_path=None,                # used for another model
    word_emb_path=None,                 # used for another model
    trials=1,                           # setting > 1 repeats training, used to accumulate statistics
    suffix_prefix_buckets=1,
    mask_unlabeled_declarations=False,
    ckpt_path=None,
)

In [162]:
# Iterators over the training partition and the validation partition ("val"),
# the latter being used as test data during training
train_data, test_data = (
    DataIteratorFnClf(dataset_path, partition_name) for partition_name in ("train", "val")
)

In [163]:
# test_data[0]  # ignore `replacements`

In [164]:
# test_data[100]

In [165]:
# The trainer receives its own shallow copy of the options, so later edits of
# `args` do not affect it
trainer = VariableMisuseDetector(
    train_data,
    test_data,
    model_params=dict(),
    trainer_params=dict(vars(args)),
)

In [166]:
# Runs the whole trial; `train_model` writes `train_data.json` and `tag_types.pkl`
# into a timestamped directory below `model_output`
trainer.train_model()

Scanning data: 100%|██████████| 13970/13970 [00:00<00:00, 17620.38it/s]
Scanning data: 100%|██████████| 1462/1462 [00:00<00:00, 21613.11it/s]
Epoch 0:  39%|███▉      | 607/1552 [11:57<18:37,  1.18s/it]


In [167]:
# test_data = DataIteratorFnClf(dataset_path, "val")
# trainer.apply_model(test_data, "/Users/LTV/dev/method-embeddings/examples/variable_misuse_graph_2_percent_balanced/with_ast/codebert_var_mususe_fn_clf2022-11-01_14-46-38.717526")