In [1]:
# Reload imported modules automatically, so edits to project code are picked up
# without restarting the kernel (mode 2 of the IPython autoreload extension).
%load_ext autoreload
%autoreload 2
# Make the %tensorboard magic available in this notebook.
%load_ext tensorboard

In [1]:
import os
from os.path import join
from argparse import Namespace
from SourceCodeTools.nlp.codebert.codebert_train import CodeBertModelTrainer2, test_step, train_step_finetune, CodebertHybridModel, batch_to_torch
from SourceCodeTools.nlp.entity.type_prediction import scorer
from transformers import RobertaTokenizer, RobertaModel

import json
import torch
from time import time
from copy import copy
from datetime import datetime
import pickle

from tqdm import tqdm

In [2]:
def read_data(dataset_path, partition):
    """Load one partition of the variable-misuse dataset from a JSON-lines file.

    The file ``var_misuse_seq_{partition}.json`` inside ``dataset_path`` is read
    line by line; every non-blank line must be a JSON object with a ``"text"`` key.

    :param dataset_path: directory that contains the partition files
    :param partition: partition name inserted into the file name (e.g. ``"train"``)
    :return: list of ``(text, entry)`` tuples, where ``entry`` is the decoded JSON
        object with the ``"text"`` key removed
    :raises FileNotFoundError: if the partition file does not exist
    :raises KeyError: if a record has no ``"text"`` key
    """
    data_path = join(dataset_path, f"var_misuse_seq_{partition}.json")

    data = []
    # The context manager closes the file deterministically; JSON is UTF-8 by
    # specification, so the encoding is stated instead of left to the platform.
    with open(data_path, "r", encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                # Tolerate blank lines (e.g. trailing newlines) instead of failing to decode them.
                continue
            entry = json.loads(line)

            text = entry.pop("text")
            data.append((text, entry))

    return data

In [8]:
class VariableMisuseDetector(CodeBertModelTrainer2):
    """Trainer for the variable-misuse task built on ``CodeBertModelTrainer2``.

    Overrides the trial-directory naming, the training loop and the model
    construction. Only token ids are passed to the model: graph ids, prefixes,
    suffixes and the extra mask are always ``None``.
    """

    def get_trial_dir(self):
        """Return the path of a timestamped trial directory inside ``self.output_dir``.

        Only the timestamp is sanitised (``:`` -> ``-``, space -> ``_``). The output
        directory itself is left untouched, so a base path containing a space or a
        drive colon still points to the existing location.
        """
        timestamp = str(datetime.now()).replace(":", "-").replace(" ", "_")
        # The "mususe" spelling is kept so directory names stay compatible with earlier runs.
        return os.path.join(self.output_dir, "codebert_var_mususe" + timestamp)

    def train(
            self, model, train_batches, test_batches, epochs, report_every=10, scorer=None, learning_rate=0.01,
            learning_rate_decay=1., finetune=False, summary_writer=None, save_ckpt_fn=None, no_localization=False
    ):
        """Run the train/evaluate loop and return the per-epoch averages.

        :param model: model to optimise; all of its parameters are given to AdamW
        :param train_batches: sized iterable of training batches; each batch is moved
            to ``self.device`` with ``batch_to_torch`` and indexed by ``'tok_ids'``,
            ``'tags'`` and ``'lens'``
        :param test_batches: sized iterable of evaluation batches with the same keys
        :param epochs: number of passes over ``train_batches``
        :param report_every: accepted for interface compatibility; not used here
        :param scorer: callable forwarded to ``train_step_finetune`` / ``test_step``
        :param learning_rate: initial AdamW learning rate
        :param learning_rate_decay: ``gamma`` of the exponential scheduler, applied once per epoch
        :param finetune: forwarded to ``train_step_finetune`` as ``True`` only once more
            than 60% of the epochs have elapsed
        :param summary_writer: writer used for scalar logging; defaults to ``self.summary_writer``
        :param save_ckpt_fn: optional zero-argument callable invoked whenever the mean
            test F1 of an epoch exceeds the best value seen so far
        :param no_localization: accepted for interface compatibility; not used here
        :return: ``(train_losses, train_f1s, test_losses, test_f1s)``, one float per epoch
        """
        # Honour an explicitly supplied writer; previously the argument was silently ignored.
        writer = summary_writer if summary_writer is not None else self.summary_writer

        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learning_rate_decay)

        train_losses = []
        test_losses = []
        train_f1s = []
        test_f1s = []

        num_train_batches = len(train_batches)
        num_test_batches = len(test_batches)

        best_f1 = 0.

        for e in range(epochs):
            losses = []
            ps = []
            rs = []
            f1s = []

            start = time()
            model.train()

            for ind, batch in enumerate(tqdm(train_batches)):
                batch_to_torch(batch, self.device)
                loss, p, r, f1 = train_step_finetune(
                    model=model, optimizer=optimizer, token_ids=batch['tok_ids'],
                    prefix=None, suffix=None, graph_ids=None,
                    labels=batch['tags'], lengths=batch['lens'],
                    extra_mask=None,
                    scorer=scorer, finetune=finetune and e / epochs > 0.6,
                    vocab_mapping=self.vocab_mapping
                )
                # Convert once and reuse the Python float for both bookkeeping and logging.
                loss_value = loss.cpu().item()
                losses.append(loss_value)
                ps.append(p)
                rs.append(r)
                f1s.append(f1)

                step = e * num_train_batches + ind
                writer.add_scalar("Loss/Train", loss_value, global_step=step)
                writer.add_scalar("Precision/Train", p, global_step=step)
                writer.add_scalar("Recall/Train", r, global_step=step)
                writer.add_scalar("F1/Train", f1, global_step=step)

            test_alosses = []
            test_aps = []
            test_ars = []
            test_af1s = []

            model.eval()

            for ind, batch in enumerate(test_batches):
                batch_to_torch(batch, self.device)
                test_loss, test_p, test_r, test_f1 = test_step(
                    model=model, token_ids=batch['tok_ids'],
                    prefix=None, suffix=None, graph_ids=None,
                    labels=batch['tags'], lengths=batch['lens'],
                    extra_mask=None,
                    scorer=scorer, vocab_mapping=self.vocab_mapping
                )
                test_loss_value = test_loss.cpu().item()

                step = e * num_test_batches + ind
                writer.add_scalar("Loss/Test", test_loss_value, global_step=step)
                writer.add_scalar("Precision/Test", test_p, global_step=step)
                writer.add_scalar("Recall/Test", test_r, global_step=step)
                writer.add_scalar("F1/Test", test_f1, global_step=step)
                test_alosses.append(test_loss_value)
                test_aps.append(test_p)
                test_ars.append(test_r)
                test_af1s.append(test_f1)

            epoch_time = time() - start

            train_losses.append(float(sum(losses) / len(losses)))
            train_f1s.append(float(sum(f1s) / len(f1s)))
            test_losses.append(float(sum(test_alosses) / len(test_alosses)))
            test_f1s.append(float(sum(test_af1s) / len(test_af1s)))

            print(
                f"Epoch: {e}, {epoch_time: .2f} s, Train Loss: {train_losses[-1]: .4f}, Train P: {sum(ps) / len(ps): .4f}, Train R: {sum(rs) / len(rs): .4f}, Train F1: {sum(f1s) / len(f1s): .4f}, "
                f"Test loss: {test_losses[-1]: .4f}, Test P: {sum(test_aps) / len(test_aps): .4f}, Test R: {sum(test_ars) / len(test_ars): .4f}, Test F1: {test_f1s[-1]: .4f}")

            # Checkpoint only when the epoch-level test F1 improves on the best one so far.
            if save_ckpt_fn is not None and float(test_f1s[-1]) > best_f1:
                save_ckpt_fn()
                best_f1 = float(test_f1s[-1])

            # The ``epoch`` argument of ``step`` is deprecated, and passing the current
            # epoch index delayed the exponential decay by one epoch.
            scheduler.step()

        return train_losses, train_f1s, test_losses, test_f1s

    def train_model(self):
        """Build the CodeBERT model, train it and persist the artefacts of the trial.

        Files written into the trial directory: ``tag_types.pkl`` (the batcher's tag
        map), ``checkpoint`` (the model, saved each time the test F1 improves) and
        ``params.json`` (per-epoch metrics together with the hyper-parameters).
        """
        import warnings

        model_params = copy(self.model_params)

        print(f"\n\n{model_params}")
        lr = model_params.pop("learning_rate")
        lr_decay = model_params.pop("learning_rate_decay")
        suffix_prefix_buckets = model_params.pop("suffix_prefix_buckets")

        # Graph embeddings are never handed to the dataloaders or the model below, so
        # they are not loaded. The previous unconditional load relied on a helper that
        # is not imported in this notebook and failed whenever a path was configured.
        if self.graph_emb_path is not None:
            warnings.warn("graph_emb_path is set, but graph embeddings are not loaded by train_model")

        train_batcher, test_batcher = self.get_dataloaders(word_emb=None, graph_emb=None, suffix_prefix_buckets=suffix_prefix_buckets)

        codebert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
        model = CodebertHybridModel(
            codebert_model, graph_emb=None, padding_idx=0, num_classes=train_batcher.num_classes(),
            no_graph=self.no_graph
        )

        if self.use_cuda:
            model.cuda()

        trial_dir = self.get_trial_dir()
        os.mkdir(trial_dir)
        self.create_summary_writer(trial_dir)

        # Context managers make sure the files are flushed and closed.
        with open(os.path.join(trial_dir, "tag_types.pkl"), "wb") as tagmap_sink:
            pickle.dump(train_batcher.tagmap, tagmap_sink)

        def save_ckpt_fn():
            # Overwrites the previous checkpoint, so only the best model is kept.
            checkpoint_path = os.path.join(trial_dir, "checkpoint")
            with open(checkpoint_path, 'wb') as checkpoint_sink:
                torch.save(model, checkpoint_sink)

        train_losses, train_f1, test_losses, test_f1 = self.train(
            model=model, train_batches=train_batcher, test_batches=test_batcher,
            epochs=self.epochs, learning_rate=lr,
            scorer=lambda pred, true: scorer(pred, true, train_batcher.tagmap, no_localization=self.no_localization),
            learning_rate_decay=lr_decay, finetune=self.finetune, save_ckpt_fn=save_ckpt_fn,
            no_localization=self.no_localization
        )

        metadata = {
            "train_losses": train_losses,
            "train_f1": train_f1,
            "test_losses": test_losses,
            "test_f1": test_f1,
            "learning_rate": lr,
            "learning_rate_decay": lr_decay,
            "epochs": self.epochs,
            "suffix_prefix_buckets": suffix_prefix_buckets,
            "seq_len": self.seq_len,
            "batch_size": self.batch_size,
            "no_localization": self.no_localization
        }

        print("Maximum f1:", max(test_f1))

        # Any remaining model parameters are recorded next to the metrics.
        metadata.update(model_params)

        with open(os.path.join(trial_dir, "params.json"), "w") as metadata_sink:
            metadata_sink.write(json.dumps(metadata, indent=4))

        


In [4]:
# Directory with the variable-misuse dataset; it is also used as the output
# directory for trained models ("model_output" below). Set the
# VAR_MISUSE_DATASET_PATH environment variable to run the notebook on another
# machine without editing this cell; the original location remains the default.
dataset_path = os.environ.get(
    "VAR_MISUSE_DATASET_PATH",
    "/Users/LTV/Downloads/NitroShare/variable_misuse_graph_2_percent_misuse_edges",
)

# Run configuration, built directly as a Namespace.
# NOTE(review): "learning_rate" and "random_seed" are not passed to the trainer in
# the visible cells (the trainer receives its own learning rate through `params`).
args = Namespace(
    learning_rate=1e-3,
    max_seq_len=512,
    random_seed=42,
    epochs=100,
    gpu=-1,
    # do not change items below
    batch_size=8,
    no_graph=True,
    model_output=dataset_path,
    no_localization=False,
    graph_emb_path=None,
    word_emb_path=None,
    finetune=False,
    trials=1,
)

In [5]:
# Load the "train" partition for training and the "val" partition for evaluation,
# in that order; each is a list of (text, entry) tuples produced by read_data.
train_data, test_data = (
    read_data(dataset_path, split_name) for split_name in ("train", "val")
)

In [9]:
# Build the trainer. Optimisation hyper-parameters travel in `params`;
# everything else is taken from the run configuration in `args`.
trainer = VariableMisuseDetector(
    train_data,
    test_data,
    params={
        "learning_rate": 1e-4,
        "learning_rate_decay": 0.99,
        "suffix_prefix_buckets": 1,
    },
    graph_emb_path=args.graph_emb_path,
    word_emb_path=args.word_emb_path,
    output_dir=args.model_output,
    epochs=args.epochs,
    batch_size=args.batch_size,
    gpu_id=args.gpu,
    finetune=args.finetune,
    trials=args.trials,
    seq_len=args.max_seq_len,
    no_localization=args.no_localization,
    no_graph=args.no_graph,
)


In [None]:
# Run the full training: creates a timestamped trial directory under the output
# directory and writes the tag map, the best checkpoint and params.json into it.
trainer.train_model()