# Setup

## Files

In [2]:
# Working directory for this notebook's artefacts; the example calls further down
# pass it to main() as --output_dir.
TEST_BERT_DIR = "/notebooks/TestBERT"
# Root folder of the raw corpus; load_and_save_dataset globs "<DATA_DIR>/src" and
# "<DATA_DIR>/test" underneath it.
DATA_DIR = "/datasets"

## Env
Go [here](https://docs.neptune.ai/setup/installation) to find out how to set up your own Neptune project for experiment monitoring.

In [3]:
import os

# Turn off the Hugging Face tokenizers' own thread pool (presumably to avoid its
# fork warning when datasets.map is run with num_proc > 1 further down).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Neptune credentials. Fill a value in here only if it is not already exported in
# the environment: a blank entry keeps whatever the kernel inherited instead of
# overwriting it with an empty string. Avoid committing real tokens to the notebook.
_neptune_overrides = {
    "NEPTUNE_API_TOKEN": "",
    "NEPTUNE_PROJECT": "",
}
for _name, _value in _neptune_overrides.items():
    os.environ[_name] = _value or os.environ.get(_name, "")

## Packages

In [4]:
# Use the %pip magic rather than a "!" shell escape so the packages are installed
# into the environment of the running kernel, not whichever pip is first on PATH.
%pip install -qqq -r requirements.txt

# TestBERT Hugging Face

In [5]:
import math
import torch
import random
import argparse
import numpy as np
import neptune
from itertools import product
from torch.utils.data import DataLoader, RandomSampler

## Preprocessing Utils

In [6]:
# adapted from https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1
def remove_doc_strings(src):
    """Return ``src`` re-rendered from its AST with all docstrings removed.

    The source is parsed, the leading string-literal statement of every module,
    class, function and async function is dropped, and the tree is turned back
    into source text. Because the text is regenerated from the AST, comments and
    the original formatting are not preserved.

    Args:
        src: Python source code as a string.

    Returns:
        The regenerated source without docstrings, or ``None`` when ``src``
        cannot be parsed.
    """
    import ast

    try:
        # Prefer astunparse when it is installed so the generated text keeps the
        # exact formatting this notebook has always produced.
        import astunparse
        unparse = astunparse.unparse
    except ImportError:
        # Standard-library equivalent (Python 3.9+).
        unparse = ast.unparse

    try:
        parsed = ast.parse(src)
    except (SyntaxError, ValueError):
        # ValueError covers inputs such as embedded null bytes on older Pythons.
        return None

    for node in ast.walk(parsed):
        # only modules, classes and (async) functions can carry a docstring
        if not isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef, ast.Module)):
            continue

        if not node.body:
            continue

        first = node.body[0]
        if not isinstance(first, ast.Expr):
            continue

        # ast.Str is deprecated (and removed in recent Python releases); string
        # literals are ast.Constant nodes holding a str value.
        value = first.value
        if not (isinstance(value, ast.Constant) and isinstance(value.value, str)):
            continue

        node.body = node.body[1:]

        # A def/class that contained nothing but its docstring would otherwise be
        # rendered without a body, i.e. as invalid Python.
        if not node.body and not isinstance(node, ast.Module):
            node.body = [ast.Pass()]

    return unparse(parsed)

In [7]:
def load_and_save_dataset(args):
    """Build, or reload from disk, the dataset of (source chunk, test chunk) pairs.

    If ``<args.output_dir>/datasets`` already exists it is loaded and returned
    as-is. Otherwise every file under ``<root>/src`` and ``<root>/test`` is read
    as one document, stripped of docstrings (unparsable files are dropped), cut
    into chunks of at most ``args.max_length`` tokens, and each source chunk is
    paired with one randomly chosen chunk of the matching ``*_test.py`` file.
    Source chunks without a matching test chunk are discarded. The result is
    saved to ``<args.output_dir>/datasets``.

    ``<root>`` is ``args.example_dir`` when given, else the notebook-level
    ``DATA_DIR``. Note that the on-disk cache is not keyed by that root.

    Args:
        args: Parsed options; reads ``output_dir``, ``model_name_or_path``,
            ``max_length`` and (optionally) ``example_dir``.

    Returns:
        A dataset with the string columns ``source`` and ``target``.
    """
    # adapted from https://huggingface.co/docs/datasets/process
    from collections import defaultdict

    from datasets import load_dataset, load_from_disk

    def chunk_examples(examples, tokenizer):
        """Split each document of a batch into pieces of <= args.max_length tokens."""
        texts = []
        file_names = []

        for file_name, text in zip(examples["filename"], examples["text"]):
            tokens = tokenizer.tokenize(text)

            new_texts = [
                tokenizer.convert_tokens_to_string(tokens[i:i + args.max_length])
                for i in range(0, len(tokens), args.max_length)
            ]

            texts += new_texts
            # every chunk remembers which file it came from
            file_names += [file_name] * len(new_texts)

        return {"text": texts, "filename": file_names}

    def format_docs(ds, tokenizer):
        """Strip docstrings, split into chunks and make sure a filename column exists."""
        if "filename" not in ds.features.keys():
            # NOTE(review): assumes download_checksums is populated and its keys are
            # in the same order as the rows — confirm for the installed datasets version.
            ds = ds.add_column(name="filename", column=list(ds.info.download_checksums.keys()))

        # remove_doc_strings returns None for unparsable files; the filter drops them
        ds = ds.map(
            lambda example: {"text": remove_doc_strings(f"{example['text']}")}
        ).filter(lambda example: example["text"])

        ds = ds.map(
            lambda batch: chunk_examples(batch, tokenizer),
            batched=True,
            remove_columns=ds.column_names,
        )

        return ds

    def index_chunks_by_filename(ds):
        """Map each filename to the list of its chunk texts, in dataset order."""
        index = defaultdict(list)
        for file_name, text in zip(ds["filename"], ds["text"]):
            index[file_name].append(text)
        return dict(index)

    DATASET_PATH = os.path.join(args.output_dir, "datasets")

    if os.path.exists(DATASET_PATH):
        return load_from_disk(DATASET_PATH)

    from transformers import AutoTokenizer

    # Load the tokenizer once instead of once per mapped batch.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Honour --example_dir when it is supplied; fall back to the notebook constant.
    data_root = getattr(args, "example_dir", None) or DATA_DIR

    src = load_dataset("text", data_files=os.path.join(f"{data_root}/src", "**"), sample_by="document", split="train")
    test = load_dataset("text", data_files=os.path.join(f"{data_root}/test", "**"), sample_by="document", split="train")

    src = format_docs(src, tokenizer)
    test = format_docs(test, tokenizer)

    # One pass over the test chunks replaces a full Dataset.filter per source chunk.
    test_chunks = index_chunks_by_filename(test)

    def merge_src_test(example):
        t_file = example["filename"].replace("/src", "/test").replace(".py", "_test.py")

        # several chunks can belong to the same test file, so we pick a random one;
        # this should help simulate the importance of different parts of a source and test file
        candidates = test_chunks.get(t_file)

        # no candidates when the test file is missing or could not be parsed earlier
        t_text = random.choice(candidates) if candidates else None

        return {"source": example["text"], "target": t_text}

    src_test = src.map(
        merge_src_test,
        remove_columns=["text", "filename"],
        num_proc=4
    ).filter(lambda example: example["target"])

    src_test.save_to_disk(DATASET_PATH)

    return src_test

In [8]:
def prep_dataset(dataset, tokenizer, test_size=0.2):
    """Tokenize the source/target pairs and split them into train and test sets.

    Both sides are padded to the tokenizer's maximum length and truncated. In the
    labels, padding positions are replaced by -100 so that they are ignored by the
    cross-entropy loss instead of training the model to emit padding.

    Args:
        dataset: Dataset with string columns ``source`` and ``target``.
        tokenizer: Callable tokenizer returning at least ``input_ids`` (and
            normally ``attention_mask``) for a single string.
        test_size: Fraction (or count) of examples for the test split, passed on
            to ``train_test_split``.

    Returns:
        The result of ``dataset.train_test_split``: the tokenized ``train`` and
        ``test`` splits, each with the encoder inputs plus a ``labels`` column.
    """

    def tokenize(example):
        inputs = tokenizer(
            example["source"],
            padding="max_length",
            truncation=True
        )
        targets = tokenizer(
            example["target"],
            padding="max_length",
            truncation=True
        )

        target_ids = targets["input_ids"]
        if "attention_mask" in targets:
            keep = targets["attention_mask"]
        else:
            # fall back to comparing against the pad id when no mask is returned
            keep = [int(token_id != tokenizer.pad_token_id) for token_id in target_ids]

        # -100 is the index the loss ignores
        labels = [token_id if flag else -100 for token_id, flag in zip(target_ids, keep)]

        return {**inputs, "labels": labels}

    dataset = dataset.map(
        tokenize,
        num_proc=4,
        remove_columns=["source", "target"]
    )

    dataset = dataset.train_test_split(test_size=test_size)

    return dataset

In [9]:
def load_tokenizer(args, dataset):
    """Return the project tokenizer, creating and saving it on first use.

    When ``<args.output_dir>/tokenizer.json`` is missing, the pretrained
    tokenizer named by ``args.model_name_or_path`` is loaded, a new tokenizer is
    trained on every column of ``dataset`` (with the pretrained model's vocabulary
    size), the new tokenizer's vocabulary is added to the pretrained tokenizer,
    and the latter is saved to ``args.output_dir``. In every case the function
    returns a ``RobertaTokenizerFast`` built from that ``tokenizer.json`` with
    ``model_max_length=args.max_length``.
    """
    from itertools import chain

    from transformers import RobertaTokenizerFast, AutoConfig

    tokenizer_file = os.path.join(args.output_dir, "tokenizer.json")

    def iter_corpus():
        # all values of the first column, then the second, and so on
        columns = [dataset[name] for name in dataset.column_names]
        yield from chain.from_iterable(columns)

    if os.path.exists(tokenizer_file):
        return RobertaTokenizerFast(tokenizer_file=tokenizer_file, model_max_length=args.max_length)

    base_tokenizer = RobertaTokenizerFast.from_pretrained(
        args.model_name_or_path,
        model_max_length=args.max_length
    )

    corpus = iter_corpus()
    vocab_size = AutoConfig.from_pretrained(args.model_name_or_path).vocab_size
    corpus_tokenizer = base_tokenizer.train_new_from_iterator(corpus, vocab_size)

    # extend the pretrained vocabulary with the tokens learnt from the corpus
    base_tokenizer.add_tokens(list(corpus_tokenizer.vocab.keys()))
    base_tokenizer.save_pretrained(args.output_dir)

    return RobertaTokenizerFast(tokenizer_file=tokenizer_file, model_max_length=args.max_length)

In [10]:
def init_new_embeddings(model, side, tokenizer, strategy="avg"):
    """Resize one side's token embeddings and initialise the newly added rows.

    The embedding matrix of ``model.encoder`` (``side == "encoder"``) or
    ``model.decoder`` (any other value) is resized to ``len(tokenizer)``. The rows
    added by the resize are then overwritten with samples from a multivariate
    normal whose mean is the mean of the pre-existing rows and whose covariance is
    1e-5 times their covariance.

    Args:
        model: Encoder-decoder model exposing ``encoder`` and ``decoder``; the
            number of pre-existing tokens is taken from the pretrained config of
            ``model.encoder.name_or_path``.
        side: ``"encoder"`` or anything else for the decoder.
        tokenizer: Tokenizer whose length defines the new vocabulary size.
        strategy: Currently unused; kept for interface compatibility.
    """
    # based on https://nlp.stanford.edu//~johnhew//vocab-expansion.html

    from transformers import AutoConfig

    num_tokens = len(tokenizer)
    num_new_tokens = num_tokens - AutoConfig.from_pretrained(model.encoder.name_or_path).vocab_size

    if side == "encoder":
        model.encoder.resize_token_embeddings(num_tokens)
        weight_key = 'encoder.embeddings.word_embeddings.weight'
    else:
        model.decoder.resize_token_embeddings(num_tokens)
        weight_key = 'decoder.roberta.embeddings.word_embeddings.weight'

    if num_new_tokens <= 0:
        # Nothing was added. Slicing with [:-0] would select an empty matrix and
        # turn the statistics below into NaNs.
        return

    params = model.state_dict()
    embeddings = params[weight_key]

    with torch.no_grad():
        # compute the statistics in float32 regardless of the model's dtype
        pre_expansion_embeddings = embeddings[:-num_new_tokens, :].to(torch.float32)
        mu = torch.mean(pre_expansion_embeddings, dim=0)
        n = pre_expansion_embeddings.size()[0]
        centered = pre_expansion_embeddings - mu
        sigma = (centered.T @ centered) / n
        dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5 * sigma)

        # one batched draw of shape (num_new_tokens, hidden) instead of a Python loop
        new_embeddings = dist.sample((num_new_tokens,))
        embeddings[-num_new_tokens:, :] = new_embeddings.to(embeddings.dtype)

    params[weight_key] = embeddings

    model.load_state_dict(params)

In [11]:
# from https://huggingface.co/docs/transformers/v4.18.0/en/performance#faster-optimizer
def get_optimizer(model, training_args):
    """Create an 8-bit Adam optimizer (bitsandbytes) for ``model``.

    Parameters are split into two groups: those returned by
    ``get_parameter_names(model, [nn.LayerNorm])`` whose name does not contain
    "bias" receive ``training_args.weight_decay``; all remaining parameters get
    a weight decay of 0.0.

    Args:
        model: Model whose parameters will be optimised.
        training_args: Object providing ``weight_decay``, ``adam_beta1``,
            ``adam_beta2``, ``adam_epsilon`` and ``learning_rate``.

    Returns:
        A ``bnb.optim.Adam8bit`` instance.
    """
    import bitsandbytes as bnb
    from torch import nn
    from transformers.trainer_pt_utils import get_parameter_names

    # a set keeps the membership test below O(1) per parameter
    decay_parameters = {
        name for name in get_parameter_names(model, [nn.LayerNorm]) if "bias" not in name
    }

    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if name in decay_parameters:
            decay_params.append(param)
        else:
            no_decay_params.append(param)

    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": training_args.weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    return bnb.optim.Adam8bit(
        optimizer_grouped_parameters,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
        lr=training_args.learning_rate,
    )

## Parameters

In [12]:
# Hugging Face model id used for both the encoder and the decoder.
MODEL_VARIANT = "microsoft/codebert-base-mlm"
# 3 epochs are recommended for fine-tuning by the BERT paper, but the CodeBERT example uses 10:
# https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text
max_epochs = 5

## Runner

In [24]:
def _build_arg_parser():
    """Create the argument parser that defines every option ``main`` accepts."""
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type: e.g. roberta")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base" )
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory. Contains any cached files or outputs")

    ## Other parameters
    parser.add_argument("--example_dir", default=None, type=str,
                        help="The example directory. Contains source and test .py files")
    parser.add_argument("--max_length", default=512, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_epochs", default=-1, type=int,
                        help="")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training and evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_beams", default=10, type=int,
                        help="The number of beams for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--subset_size', type=int, default=-1,
                        help="Size of subset of dataset to use if not training on entirety")
    parser.add_argument('--report_to', type=str, default="none",
                        help="Where to log training data to")
    # parsed as float: a value given on the command line used to arrive as a string
    parser.add_argument('--warmup_ratio', type=float, default=0.05,
                        help="Warmup ratio for linear scheduler; default based on 5 out of 90 epochs in original paper: https://arxiv.org/abs/1706.02677")

    return parser


def main(args=None):
    """Build the dataset, tokenizer and encoder-decoder model, then train or predict.

    Steps, in order: parse the options; seed the RNGs; load (or build) the paired
    dataset; load (or build) the tokenizer; tokenize and split the dataset; create
    an encoder-decoder model from ``--model_name_or_path`` and initialise the
    embeddings of the added tokens; configure a ``Seq2SeqTrainer`` evaluated with
    BERTScore F1 on 1/50th of the test split. With ``--do_train`` the trainer is
    run, resuming from the newest checkpoint in ``<output_dir>/trainer`` if there
    is one; otherwise a prediction for the first training example is printed.

    NOTE(review): the model is always rebuilt from the pretrained weights, so a
    run without ``--do_train`` does not appear to load a trained checkpoint —
    confirm whether that is intended.

    Args:
        args: List of command-line style strings. ``None`` or an empty list
            makes argparse read ``sys.argv`` instead.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(args if args else None)

    # Seed python/numpy/torch before anything random happens (pairing of test
    # chunks, train/test split, sampling of new embeddings).
    from transformers import set_seed

    set_seed(args.seed)

    # Dataset

    # first load dataset
    dataset = load_and_save_dataset(args)

    # use dataset to initialise tokenizer
    tokenizer = load_tokenizer(args, dataset)
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token

    # use tokenizer to transform and prep the dataset for training
    dataset = prep_dataset(dataset, tokenizer)

    # CodeBERT model config

    from transformers import EncoderDecoderModel, AutoModelForSeq2SeqLM

    ENCODER_DECODER_PATH = os.path.join(args.output_dir, "encoder_decoder")

    encoder_decoder = EncoderDecoderModel.from_encoder_decoder_pretrained(args.model_name_or_path, args.model_name_or_path, tie_encoder_decoder=True)

    # alter model encoder decoder embeddings using tokenizer

    for side in ["encoder", "decoder"]:
        init_new_embeddings(encoder_decoder, side, tokenizer)

    encoder_decoder.save_pretrained(ENCODER_DECODER_PATH)

    # reload through the seq2seq auto class so the generation-related ids are set
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ENCODER_DECODER_PATH,
        pad_token_id=tokenizer.pad_token_id,
        decoder_start_token_id=tokenizer.bos_token_id,
    )

    model.save_pretrained(args.output_dir)

    # Trainer
    # args

    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig

    gen_kwargs = {
        'pad_token_id':tokenizer.pad_token_id,
        'decoder_start_token_id':tokenizer.bos_token_id,
        'max_new_tokens':args.max_length,
        'min_new_tokens':args.max_length//2,
        # beam-search multinomial sampling strategy
        'do_sample':True,
        'num_beams':args.num_beams,
        'early_stopping':True,
    }

    generation_config = GenerationConfig(**gen_kwargs)

    gradient_accumulation_steps = args.batch_size
    # never let the per-device batch size fall to 0 when --batch_size < 4
    per_device_batch_size = max(1, gradient_accumulation_steps // 4)

    trainer_dir = os.path.join(args.output_dir, "trainer")

    training_args = Seq2SeqTrainingArguments(
        fp16=True,
        fp16_full_eval=True,
        seed=args.seed,
        save_total_limit=3,
        report_to=args.report_to,
        load_best_model_at_end=True,
        metric_for_best_model="eval_bert",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        adam_epsilon=args.adam_epsilon,
        num_train_epochs=args.max_epochs,
        learning_rate=args.learning_rate,
        lr_scheduler_type="linear",
        warmup_ratio=args.warmup_ratio,
        # NOTE(review): presumably without effect, because an optimizer instance
        # is handed to the trainer below — confirm.
        optim="adafactor",
        output_dir=f"{args.output_dir}/trainer",
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=True,
        predict_with_generate=True,
        generation_num_beams=args.num_beams,
        generation_config=generation_config,
    )

    # Data Collator

    from transformers import DataCollatorForSeq2Seq

    # NOTE(review): this collator is built but never passed to the trainer, so the
    # trainer's default collator is what actually runs — confirm which is intended.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        max_length=args.max_length,
        padding="max_length",
        label_pad_token_id=model.config.pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

    # metrics
    import evaluate
    metric = evaluate.load("bertscore")

    def decode_sequences(sequences):
        """Decode token ids to text for the demo print-out in predict mode."""
        return tokenizer.convert_tokens_to_string(tokenizer.batch_decode(sequences, skip_special_tokens=True))

    # adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py

    def postprocess_text(preds, labels):
        """Strip whitespace; wrap every label in a list (one reference per prediction)."""
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(eval_preds, output_dir=args.output_dir):
        """Return the mean BERTScore F1 of the decoded predictions as ``eval_bert``."""
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100s used for padding as we can't decode them
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels, model_type="roberta-large")

        return {"eval_bert": np.mean(result["f1"]).item()}

    # training

    # re-seed right before sampling so the evaluation subset depends only on the seed
    random.seed(a=args.seed)

    eval_ds = dataset["test"]
    # 1/50th of the test split, but at least one example when the split is not empty
    eval_subset_size = min(len(eval_ds), max(1, len(eval_ds) // 50))
    eval_subset = eval_ds.select(random.sample(range(0, len(eval_ds)), eval_subset_size))

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=eval_subset,
        compute_metrics=compute_metrics,
        optimizers=(get_optimizer(model, training_args), None)
    )

    if args.do_train:
        from transformers.trainer_utils import get_last_checkpoint

        # Resume only when a real checkpoint folder exists; a missing directory or
        # one holding only logs means training starts from scratch.
        last_checkpoint = get_last_checkpoint(trainer_dir) if os.path.isdir(trainer_dir) else None

        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:

        inputs = dataset["train"][0]

        text = decode_sequences(inputs["input_ids"])

        preds, _, _ = trainer.predict([inputs])

        preds = decode_sequences(preds)

        print(f"Input: {text}")

        print(f"Predictions: {preds}")

## Train

In [14]:
# main([
#     "--model_type", "roberta", 
#     "--model_name_or_path", MODEL_VARIANT,
#     "--example_dir", f"{DATA_DIR}",
#     "--output_dir", f"{TEST_BERT_DIR}",
#     "--max_epochs", f"{max_epochs}",
#     "--report_to", "neptune",
#     "--do_train"
# ])

## Predict

In [25]:
# main(["--model_type", "roberta", "--model_name_or_path", MODEL_VARIANT, "--output_dir", f"{TEST_BERT_DIR}"])

 

Loading cached processed dataset at /notebooks/TestBERT/datasets/cache-e8a9d47c83606666.arrow


 

Loading cached processed dataset at /notebooks/TestBERT/datasets/cache-ed329f0c9ca20592.arrow


 

Loading cached processed dataset at /notebooks/TestBERT/datasets/cache-7b586e7b0668e2ca.arrow


 

Loading cached processed dataset at /notebooks/TestBERT/datasets/cache-d460c2d3bd1949d1.arrow
Loading cached split indices for dataset at /notebooks/TestBERT/datasets/cache-7d5b287785bdc275.arrow and /notebooks/TestBERT/datasets/cache-23d4ea0bb717758c.arrow
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Input: <s>url_segment(connector_id).delete().go()

    def delete_consent(self, consent_id):
        return self.start().uri('/api/consent').url_segment(consent_id).delete().go()

    def delete_email_template(self, email_template_id):
        return self.start().uri('/api/email/template').url_segment(email_template_id).delete().go()

    def delete_entity(self, entity_id):
        return self.start().uri('/api/entity').url_segment(entity_id).delete().go()

    def delete_entity_grant(self, entity_id, recipient_entity_id=None, user_id=None):
        return self.start().uri('/api/entity').url_segment(entity_id).url_segment('grant').url_parameter('recipientEntityId', self.convert_true_false(recipient_entity_id)).url_parameter('userId', self.convert_true_false(user_id)).delete().go()

    def delete_entity_type(self, entity_type_id):
        return self.start().uri('/api/entity/type').url_segment(entity_type_id).delete().go()

    def delete_entity_type_permission(self, entity_type_id, pe

# Module

In [None]:
# if __name__ == "__main__":
#     main()