# Imports
## Packages

In [7]:
!pip install transformers &> /dev/null

In [8]:
from transformers import RobertaModel, RobertaTokenizer, EncoderDecoderModel, get_linear_schedule_with_warmup

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import BucketIterator
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

from tqdm.notebook import tqdm


import numpy as np


import json
import ast
import re
from time import time



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data

In [2]:
!cp "/content/drive/MyDrive/Sourcery/codenet_data.zip" "./"
!unzip codenet_data.zip &> /dev/null

# Preprocessing

## Create Inputs and Outputs raw

In [9]:
with open("train/python_train_0.jsonl") as f:
    jsonl_content = f.readlines()

train_jsons = [json.loads(json_line) for json_line in jsonl_content]


with open("valid/python_valid_0.jsonl") as f:
    jsonl_content = f.readlines()
#divide valid size by 10
jsonl_content = jsonl_content[:int(len(jsonl_content)/10)]
val_jsons = [json.loads(json_line) for json_line in jsonl_content]

In [None]:
def get_func_and_name(data):
    try:
        node = ast.parse(data).body[0]
        function_name = node.name
        function = data
        docstring = ast.get_docstring(node)
        #remove docstring
        if docstring is not None:
            function = re.sub(r'\"\"\"(.*)\"\"\"',"",function,count=1,flags=re.DOTALL)
        #remove function name
        function = re.sub(function_name,"<mask>",function,count=1)
        return function,function_name
    except:
        return None

In [None]:
training_pairs_raw = [get_func_and_name(line["code"]) for line in train_jsons if get_func_and_name(line["code"]) is not None]

training_inputs_raw = [x for (x,y) in training_pairs_raw]
training_labels_raw = [y for (x,y) in training_pairs_raw]

val_pairs_raw = [get_func_and_name(line["code"]) for line in val_jsons if get_func_and_name(line["code"]) is not None]

val_inputs_raw = [x for (x,y) in training_pairs_raw]
val_labels_raw = [y for (x,y) in training_pairs_raw]

## Tokenize

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")

PAD_token = tokenizer.pad_token_id
EOS_token = tokenizer.eos_token_id
BOS_token = tokenizer.bos_token_id

training_inputs = tokenizer.batch_encode_plus(training_inputs_raw)["input_ids"]
training_labels = tokenizer.batch_encode_plus(training_labels_raw)["input_ids"]

val_inputs = tokenizer.batch_encode_plus(val_inputs_raw)["input_ids"]
val_labels = tokenizer.batch_encode_plus(val_labels_raw)["input_ids"]


#Remove underscore tokens from inputs and labels, and truncate up to max model length
underscore_token = tokenizer.get_vocab()["_"]

training_inputs = [[token for token in input if token != underscore_token][:tokenizer.model_max_length] for input in training_inputs]
training_labels = [[token for token in input if token != underscore_token] for input in training_labels]

val_inputs = [[token for token in input if token != underscore_token][:tokenizer.model_max_length] for input in val_inputs]
val_labels = [[token for token in input if token != underscore_token] for input in val_labels]



training_pairs = list(zip(training_inputs,training_labels))
validation_pairs = list(zip(val_inputs,val_labels))

## Set Config

In [16]:
class Config:
    """
    Stores useful information
    Many scripts uses a config instance. See utils.create_session for its initialization
    """
    
    model_type = "CodeBERTa" #CodeBERTa, RNNEncoder or CNNEncoder
    model_name = "huggingface/CodeBERTa-small-v1"
    use_attention = False # use attention for RNNEncoder model
    tie_embeddings = False # do tie embeddings between encoders and decoders (for RNNEncoder and CNNEncoder models)

    dataset_size = "small"
    data_folder = "data"

    print_every_k_batch = 256
    batch_size = 8
    learning_rate = 1e-5
    embedding_dim = 128
    hidden_size = 128
    epochs = 10
    weight_decay = 0
    drop_rate = .1
    tf_ratio = .5  #teacher forcing ration
    max_grad_norm = 2. #max norm for gradient clipping

    path_result = "/content/drive/MyDrive/Sourcery"
    #resume = None
    resume = "/content/drive/MyDrive/Sourcery/checkpoint_small.pth"
    num_return_sequences = 5 #number of sequences to return when making prediction with the models
    max_output_seq_len = 8
    max_input_len = 1000
    bos_token_id = 0
    pad_token_id = 1
    eos_token_id = 2
    mask_token_id = 3

    def __init__(self, args={}):
        for attr in dir(self):
            if not attr.startswith('__') and hasattr(args, attr):
                setattr(self, attr, getattr(args, attr))

    def __repr__(self):
        return json.dumps(vars(self), sort_keys=True, indent=4)
config = Config()

## Utility functions

In [4]:

def pretty_time(t):
    """
    Tranforms time t in seconds into a pretty string
    """
    return f"{int(t//60)}m{int(t%60)}s"
    
def now():
    """
    Current date as a string
    """
    return datetime.now().strftime('%y-%m-%d_%Hh%Mm%Ss')

def save_json(path_result, name, x):
    """
    Saves x into path_result with the given name
    """
    with open(os.path.join(path_result, f'{name}.json'), 'w') as f:
        json.dump(x, f, indent=4)


## Build Dataloaders

In [5]:
class FunctionNamingDataset(Dataset):
    def __init__(self,data_pairs,inputs_raw):
        self.pairs = data_pairs
        self.inputs_raw = inputs_raw
        self.n_examples = len(self.pairs)
    
    def __len__(self):
        r"""When used `len` return the number of examples.
        """

        return self.n_examples


    def __getitem__(self, item):
        r"""Given an index return a pair of input output
        """
        input,output = self.pairs[item]
        input_raw = self.inputs_raw[item]
        return (input,output,len(input),len(output),input_raw)

NameError: ignored

In [6]:
train_dataset = FunctionNamingDataset(training_pairs,training_inputs_raw)
val_dataset = FunctionNamingDataset(validation_pairs,val_inputs_raw)


train_batch_size = config.batch_size
valid_batch_size = config.batch_size

train_dataloader,val_dataloader = BucketIterator.splits(
    
                        # Datasets for iterator to draw data from
                        (train_dataset,val_dataset),

                        # Tuple of train and validation batch sizes.
                        batch_sizes=(train_batch_size,valid_batch_size),

                        # Device to load batches on.
                        device=device, 

                        # Function to use for sorting examples.
                        sort_key=lambda x: x[2],


                        # Repeat the iterator for multiple epochs.
                        repeat=True, 

                        # Sort all examples in data using `sort_key`.
                        sort=False, 

                        # Shuffle data on each epoch run.
                        shuffle=True,

                        # Use `sort_key` to sort examples in each batch.
                        sort_within_batch=True,
                        )

NameError: ignored

# Training

## Model Class

In [12]:
bcolors = {
    'RESULTS': '\033[95m',
    'HEADER': '\033[94m',
    'SUCCESS': '\033[92m',
    'WARNING': '\033[93m',
    'FAIL': '\033[91m',
    'ENDC': '\033[0m',
    'INFO': '\033[1m',
    'UNDERLINE': '\033[4m'
}

def printc(log, color='HEADER'):
    """
    Prints logs with color according to the dict bcolors
    """
    print(f"{bcolors[color]}{log}{bcolors['ENDC']}")

In [13]:
class Seq2SeqModelInterface(torch.nn.Module):
    def __init__(self,config,device):
        """
        PyTorch Seq2SeqModel interface
        Every model has to inherit from Seq2SeqModelInterface so training and testing run correctly

        At least the methods defined below and which raise NotImplementedError must be implemented
        - self.optimizer
        - self.scheduler
        """
        super(Seq2SeqModelInterface, self).__init__()
        self.device = device
        self.config = config
        
    def initialize_scheduler(self, total_steps=0):
        """
        Creates a scheduler for a given otimizer
        """
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                        num_warmup_steps=2, # Default value
                                                        num_training_steps=total_steps)
    def resume(self, config):
        """
        Resumes with a given checkpoint. Loads the saved parameters, optimizer and scheduler.
        """
        printc(f"Resuming with model at {config.resume}...", "INFO")
        path_checkpoint = config.resume
        assert os.path.isfile(path_checkpoint), 'Error: no checkpoint found!'
        checkpoint = torch.load(path_checkpoint, map_location=self.device)
        
        self.load_state_dict(checkpoint['model'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.scheduler.load_state_dict(checkpoint['scheduler'])

    def step(self, batch):
        """
        Args:
            batch: data from the data loaders (see training.py)
        Output:
            loss (tensor): PyTorch loss
            outputs (batch_size,seq_len,vocab_size): model outputs (raw predictions without softmax)
        
        Examples::
            >>> batch = next(iter(train_loader))
            >>> loss, outputs = model.step(batch)
        """
        raise NotImplementedError

    def forward(self,*args, **kwargs):
        """
        PyTorch nn.Module forward
        It is specific to the model, and the args have no specific format
        """
        raise NotImplementedError

    def evaluate(self, batch, num_sequences=1):
        """
        Args:
            batch: data from the data loaders (similar to training data)
            num_sequences: the number of sequences to output 
        Output:
            top_seqences(batch_size,num_sequences,max_output_seq_len): The top num_sequences predictions
            top_lengths(batch_size,num_sequences): The actual lengths of the top num_sequences predictions
            target_sequences(batch_size,batch_tgt_max_seq_len): The target sequences corresponding to the predicted ones for metrics computation
            target_lengths(batch_size): The actual lengths of the top target sequences
            decoded_sequences List[List[string] * num_sequences]*batch_size: The top num_sequences predictions decoded (as strings)
            outputs_probability (batch_size,num_sequences,max_output_seq_len - 1, vocab_size): model outputs passed through a softmax to turn into probabilities
        
        Examples::
            >>> batch = next(iter(eval_loader))
            >>> (top_seqences,top_lengths,target_sequences, target_lengths, 
                decoded_sequences,outputs_probability) = model.evaluate(batch,num_sequences=num_output_sequences)
        """
        raise NotImplementedError

    def single_inference(self, function_string, num_sequences=1):
        """
        Args:
            function_string: raw text data (i.e a function extracted using ast)
            num_sequences: the number of sequences to output 
        Output:
            decoded_sequence [num_sequences]: The top num_sequences predictions decoded (as strings)
            sequence_scores [num_sequences]: Probability for each sequence
        
        Examples:
            >>> decoded_sequences,sequence_scores = model.single_inference(function_string,num_sequences=num_output_sequences)
        """
        raise NotImplementedError

In [22]:
def set_dropout(model, drop_rate=0.1):
    for _, child in model.named_children():
        if isinstance(child, torch.nn.Dropout):
            child.p = drop_rate
        set_dropout(child, drop_rate=drop_rate)

class CodeBERTaEncoderDecoder(Seq2SeqModelInterface):
    def __init__(self,config,device):
        """
        RoBERTa to RoBERTa encoder-decoder using HuggingFace pretrained CodeBERTa models trained on the CodeNet challenge dataset on LM tasks.
        Elements in training batch for this model should be tuples (inputs,labels,inputs_lengths,labels_lengths). 
        Inputs and labels do not need any padding.
        """
        super(CodeBERTaEncoderDecoder, self).__init__(config,device)
        self.config = config
        assert config.model_type == "CodeBERTa", 'Error: Wrong model type!'

        self.model_name = config.model_name
        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(self.model_name, self.model_name).to(self.device)
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)

        self.max_output_seq_len = config.max_output_seq_len
        self.learning_rate = config.learning_rate
        self.max_grad_norm = config.max_grad_norm

        self.optimizer = Adam(self.model.parameters(), lr = self.learning_rate, weight_decay=config.weight_decay)
        self.scheduler = ReduceLROnPlateau(self.optimizer)

        if config.resume:
            self.resume(config)
        printc("Successfully loaded\n", "SUCCESS")

        self.drop_rate = config.drop_rate
        if self.drop_rate:
            set_dropout(self.model, drop_rate=self.drop_rate)
            print(f"Dropout rate set to {self.drop_rate}")
    
    def get_models_inputs_from_pair_batch(self,batch):
        batch_size = len(batch)
        unzipped = list(zip(*batch))
        inputs,targets,inputs_lengths,targets_lengths,inputs_raw = unzipped[0],unzipped[1],unzipped[2],unzipped[3],unzipped[4]

        PAD_token = self.tokenizer.pad_token_id

        #Build input tensor and pad
        inputs_lengths_tensor = torch.LongTensor(inputs_lengths)
        inputs_tensor = torch.ones(batch_size,inputs_lengths_tensor.max()).long() * PAD_token
        for idx, (seq, seqlen) in enumerate(zip(inputs, inputs_lengths_tensor)):
            inputs_tensor[idx,:seqlen] = torch.LongTensor(seq)

        inputs_attention_mask = (inputs_tensor != PAD_token) * 1

        #Build target tensor and pad
        targets_lengths_tensor = torch.LongTensor(targets_lengths)
        targets_tensor = torch.ones(batch_size,targets_lengths_tensor.max()).long() * PAD_token

        for idx, (seq, seqlen) in enumerate(zip(targets, targets_lengths_tensor)):
            targets_tensor[idx,:seqlen] = torch.LongTensor(seq)

        targets_attention_mask = (targets_tensor != PAD_token) * 1

        return (inputs_tensor, targets_tensor,targets_lengths_tensor,inputs_attention_mask,targets_attention_mask,inputs_raw)

    def step(self, batch):
        """
        Args:
            batch: a batch of training data in the form described above in init
        Output:
            loss (tensor): PyTorch loss
            outputs (batch_size,seq_len,vocab_size): model outputs (predictions or something else)
        """
        #Unpack batch data
        src_seqs,tgt_seqs,tgt_lens,src_mask,tgt_mask,_ = self.get_models_inputs_from_pair_batch(batch)
        
        src_seqs = src_seqs.to(self.device)
        tgt_seqs = tgt_seqs.to(self.device)

        tgt_lens = tgt_lens.to(self.device)

        src_mask = src_mask.to(self.device)
        tgt_mask = tgt_mask.to(self.device)
        # -------------------------------------
        # Training mode (enable dropout)
        # -------------------------------------
        self.model.train()    

        loss,outputs = self.forward(src_seqs,tgt_seqs,src_mask,tgt_mask)

        # -------------------------------------
        # Backward and optimize
        # -------------------------------------
        # Backward to get gradients w.r.t parameters in model.
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(),max_norm=self.max_grad_norm)
        
        # Update parameters with optimizer
        self.optimizer.step()

        return loss,outputs
            

    def forward(self,src_seqs,tgt_seqs,src_mask,tgt_mask):
        output = self.model(input_ids=src_seqs,decoder_input_ids=tgt_seqs,labels=tgt_seqs,encoder_attention_mask=src_mask,decoder_attention_mask=tgt_mask)
        return output.loss,output.logits

    def evaluate(self, eval_batch, max_seq_len=None,num_return_sequences=None,num_beams=5):
        """
        Args:
            eval_batch: batch data in the same form as train data (described in init)
            num_sequences: the number of sequences to output 
            max_seq_len: Maximum output sequence length
            num_beams: Number of beams for beam search. 
        Output:
            top_seqences(batch_size,num_sequences,max_seq_len): The top num_sequences predictions
            top_lengths(batch_size,num_sequences): The actual lengths of the top num_sequences predictions
            target_sequences(batch_size,batch_tgt_max_seq_len): The target sequences corresponding to the predicted ones for metrics computation
            target_lengths(batch_size): The actual lengths of the top target sequences
            decoded_sequences List[List[string] * num_sequences]*batch_size: The top num_sequences predictions decoded (as strings)
            outputs_probability (batch_size,num_sequences,max_seq_len - 1, vocab_size): model outputs passed through a softmax to turn into probabilities
        """
        if max_seq_len is None:
            max_seq_len = self.config.max_output_seq_len
        if num_return_sequences is None:
            num_return_sequences = self.config.num_return_sequences
        with torch.no_grad():
            batch_size = len(eval_batch)

            #Unpack batch data
            src_seqs,tgt_seqs,tgt_lens,_,_,inputs_raw = self.get_models_inputs_from_pair_batch(eval_batch)

            src_seqs = src_seqs.to(self.device)
            tgt_seqs = tgt_seqs.to(self.device)
            tgt_lens = tgt_lens.to(self.device)


            # -------------------------------------
            # Eval mode mode (disable dropout)
            # -------------------------------------
            self.model.eval()

            # -------------------------------------
            # Forward model
            # -------------------------------------
            start_beam_search = time()
            beam_output = self.model.generate(
                                src_seqs, 
                                max_length=self.max_output_seq_len, 
                                num_beams=num_beams, 
                                num_return_sequences=num_return_sequences, 
                                early_stopping=True,
                                output_scores = True,
                                return_dict_in_generate=True,
                                no_repeat_ngram_size = 1,
                                eos_token_id = self.tokenizer.eos_token_id,
                                pad_token_id = self.tokenizer.eos_token_id
                            )
            beam_search_time = time() - start_beam_search
            #top_sequence = (batch_size,num_sequences,max_seq_len)
            #top_length = (batch_size,num_sequences)
            top_sequence = beam_output["sequences"].view(batch_size,num_return_sequences,beam_output["sequences"].size(1))
            # non zero values mask
            eos_mask = top_sequence == self.tokenizer.eos_token_id

            # operations on the mask to find first EOS_token in the rows
            mask_max_values, eos_index = torch.max(eos_mask, dim=2)
            # Actual length is one more than the index
            top_length = eos_index + 1

            # if the max-mask is zero, there is no pad index in the row, the length is the length of the sequence
            top_length[mask_max_values == 0] = top_sequence.size(2)

            #get output probabilites
            outputs = torch.stack(beam_output['scores']).transpose(0,1).view(batch_size,num_return_sequences,beam_output["sequences"].size(1) - 1,self.tokenizer.vocab_size)
            output_prob = torch.nn.functional.softmax(outputs,dim=3)
            #decode sequences and add _
            to_decode_full_batch = []
            for i in range(batch_size):
                to_decode_single_batch = []
                for j in range(num_return_sequences):
                    top_sequence_to_decode = [self.tokenizer.convert_tokens_to_ids("_")] * (len(top_sequence[i][j]) * 2 - 1)
                    top_sequence_to_decode[0::2] = top_sequence[i][j]
                    to_decode_single_batch.append(top_sequence_to_decode)
                to_decode_full_batch.append(to_decode_single_batch)
            

            #decode sequences
            decoded_sequences = [self.tokenizer.batch_decode(to_decode_full_batch[i],skip_special_tokens=True) for i in range(batch_size)]

            del outputs,src_seqs,eos_index,eos_mask,mask_max_values,beam_output

        return top_sequence,top_length,tgt_seqs,tgt_lens,output_prob,decoded_sequences,inputs_raw

    def single_inference(self, function_string,num_return_sequences=None):
        """
        Args:
            function_string: raw text data (i.e a function extracted using ast)
            num_sequences: the number of sequences to output 
        Output:
            decoded_sequence [num_sequences]: The top num_sequences predictions decoded (as strings)
            sequence_scores [num_sequences]: Probability for each sequence
        """
        raise NotImplementedError

## Load Model

In [23]:
model = CodeBERTaEncoderDecoder(config,device)

Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['roberta.encoder.layer.0.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.1.crossattention.self.qu

[92mSuccessfully loaded
[0m
Dropout rate set to 0.1


## Evaluation functions

In [18]:
def get_acc_and_f1_values(tgt_seqs,pred_seqs,tgt_lens,pred_lens):
    batch_size = tgt_seqs.size(0)
    #get numpy arrays
    tgt_seqs = tgt_seqs.cpu().data.numpy()
    pred_seqs = pred_seqs.cpu().data.numpy()
    tgt_lens = tgt_lens.cpu().data.numpy()
    pred_lens = pred_lens.cpu().data.numpy().astype(int)
    
    #metrics to compute
    precision = 0
    recall = 0
    acc = 0
    #loop: for each prediction, different pred_len and tgt_len make vectorized computation impossible
    for i in range(batch_size):
        tgt = tgt_seqs[i,1:tgt_lens[i]-1]
        pred = pred_seqs[i,1:pred_lens[i]-1]

        tp = float((np.isin(pred,tgt)*1).sum())
        fp = float((np.isin(pred,tgt,invert=True)*1).sum())
        fn = float((np.isin(tgt,pred,invert=True)*1).sum())

        #Precision
        if (tp + fp != 0.): precision += tp/(tp + fp)
        #Recall
        if (tp + fn != 0.): recall += tp/(tp + fn)
        #Acc
        acc += (fp==0. and fn==0.) * 1.

    #average values
    precision /= batch_size
    recall /= batch_size
    acc /= batch_size
    
    if precision + recall != 0.:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.

    return acc,f1,precision,recall

def get_topK_metrics(tgt_seqs,topk_sequence,tgt_lens,topk_length):
    k = topk_length.size(1)
    batch_size = tgt_seqs.size(0)
    #get numpy arrays
    tgt_seqs = tgt_seqs.cpu().data.numpy()    
    topk_sequence = topk_sequence.cpu().data.numpy()
    topk_length = topk_length.cpu().data.numpy()
    
    #metrics to compute
    top1_f1 = 0
    top1_acc = 0
    topK_acc = 0
    topK_f1 = 0
    #loop: for each prediction, different pred_len and tgt_len make vectorized computation impossible
    for i in range(batch_size):
        tgt = tgt_seqs[i,1:tgt_lens[i].item()-1]
        best_acc = 0
        best_f1 = 0
        for j in range(k):
            pred = topk_sequence[i,j,1:topk_length[i,j]-1]

            tp = float((np.isin(pred,tgt)*1).sum())
            fp = float((np.isin(pred,tgt,invert=True)*1).sum())
            fn = float((np.isin(tgt,pred,invert=True)*1).sum())

            #Precision
            if (tp + fp != 0.): precision = tp/(tp + fp)
            else: precision = 0
            #Recall
            if (tp + fn != 0.): recall = tp/(tp + fn)
            else: recall = 0
            #Acc
            acc = (fp==0. and fn==0.) * 1.
            #F1
            if precision + recall != 0.:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0.
            
            #record top1 value
            if j==0:
                top1_acc += acc
                top1_f1 += f1

            #keep best of K values
            if f1>best_f1:
                best_f1 = f1
            if acc>best_acc:
                best_acc = acc

        #add best values to topK metrics
        topK_acc += best_acc
        topK_f1 += best_f1
            

    #average values
    top1_acc /= batch_size
    top1_f1 /= batch_size
    topK_acc /= batch_size
    topK_f1 /= batch_size
    
    return top1_acc,top1_f1,topK_acc,topK_f1

def evaluate_full_dataset(val_dataloader,model):
    val_dataloader.create_batches()
    total_top1_acc = 0
    total_top1_f1 = 0
    total_topK_acc = 0
    total_topK_f1 = 0
    nb_eval = len(val_dataloader)
    for batch in tqdm(val_dataloader.batches):
        topk_sequence,topk_length,tgt_seqs,tgt_lens,output_prob,decoded_sequences,_ = model.evaluate(batch)

        top1_acc,top1_f1,topK_acc,topK_f1 = get_topK_metrics(tgt_seqs,topk_sequence,tgt_lens,topk_length)

        total_top1_acc += top1_acc
        total_top1_f1 += top1_f1
        total_topK_acc += topK_acc
        total_topK_f1 += topK_f1

    #avg values
    total_top1_acc /= nb_eval
    total_top1_f1 /= nb_eval
    total_topK_acc /= nb_eval
    total_topK_f1 /= nb_eval

    
    return total_top1_acc,total_top1_f1,total_topK_acc,total_topK_f1

## Training loop

In [20]:
def train_and_validate(model, train_dataloader, val_dataloader, device, config):
    """
    train a model on the given data as loaders.
    Inputs: please refer bellow, to the argparse arguments.
    """
    printc("\n----- STARTING TRAINING -----")

    losses = []
    val_top1_accuracies = []
    val_top1_f1_scores = []
    val_top5_accuracies = []
    val_top5_f1_scores = []
    best_topK_f1_score = 0

    n_samples = config.print_every_k_batch * config.batch_size
    model.initialize_scheduler(len(train_dataloader.dataset))
    for epoch in range(config.epochs):
        print("> EPOCH", epoch)
        model.train()
        epoch_loss, k_batch_loss = 0, 0
        epoch_start_time, k_batch_start_time = time(), time()
        train_dataloader.create_batches()
        #Training loop
        for i, batch in enumerate(train_dataloader.batches):

            loss, outputs = model.step(batch)

            epoch_loss += loss.item()
            k_batch_loss += loss.item()

            if (i+1) % config.print_every_k_batch == 0:
                average_loss = k_batch_loss / n_samples
                print(f'    [{i+1-config.print_every_k_batch}-{i+1}]  -  Average loss: {average_loss:.3f}  -  Time elapsed: {pretty_time(time()-k_batch_start_time)}')
                k_batch_loss = 0
                k_batch_start_time = time()

            
        #End of epoch
        printc("-----  Ended Train Epoch ---- Start of validation metrics computation  -----\n")
        val_top1_acc,val_top1_f1,val_topK_acc,val_topK_f1= evaluate_full_dataset(val_dataloader,model)
        print('\n' + '='*100)
        print('Training log:')
        print('- Epoch: {}/{}'.format(epoch, config.epochs))
        print('- Train loss: {}'.format(epoch_loss/len(train_dataloader.dataset)))
        print('- Val Top-1 Accuracy: {}'.format(val_top1_acc))
        print('- Val Top-1 F1 Score: {}'.format(val_top1_f1))
        print('- Val Top-K Accuracy: {}'.format(val_topK_acc))
        print('- Val Top-K F1 Score: {}'.format(val_topK_f1))
        print('='*100 + '\n')
        if best_topK_f1_score < val_topK_f1:
            best_topK_f1_score = val_topK_f1
            #run_number is set when doing hyper parameter optimization
            checkpoint_path = os.path.join(config.path_result,'checkpoint_small.pth')
            checkpoint = {
                'model': model.state_dict(),
                'epoch': epoch,
                'best_topK_f1_score': best_topK_f1_score,
                'tokenizer': model.tokenizer,
                'optimizer': model.optimizer.state_dict(),
                'scheduler': model.scheduler.state_dict()
            }
            torch.save(checkpoint,checkpoint_path)
            
            print('\n' + '='*100)
            print('Saved checkpoint to "{}".'.format(checkpoint_path))
            print('Best top 5 F1-score value: ', best_topK_f1_score)
            print('='*100 + '\n')

        losses.append(epoch_loss/len(train_dataloader))
        val_top1_accuracies.append(val_top1_acc)
        val_top1_f1_scores.append(val_top1_f1)
        val_top5_accuracies.append(val_topK_acc)
        val_top5_f1_scores.append(val_topK_f1)
        
        model.scheduler.step()

    
    printc("-----  Ended Training  -----\n")

    print("Saving losses...")
    save_json(config.path_result, "losses", { "train": losses })
    print("Saving validation metrics")
    save_json(config.path_result, "eval_metrics", { "acc_1": val_top1_accuracies, "f1_score_1": val_top1_f1_scores,
                                            "acc_5":  val_top5_accuracies,"f1_score_5":  val_top5_f1_scores})
    epochs_realized = len(losses)
    #plot loss
    plt.plot(range(1, epochs_realized+1), losses)
    plt.legend(["Train loss"])
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Loss evolution")
    plt.savefig(os.path.join(config.path_result, "loss.png"))
    plt.close()

    #plot eval data
    plt.plot(range(1, epochs_realized+1), val_top5_accuracies)
    plt.legend(["Evaluation Top5 F1-score"])
    plt.xlabel("Epoch")
    plt.ylabel("Top5 F1-score")
    plt.title("Top5 F1-score Evolution")
    plt.savefig(os.path.join(config.path_result, "eval_f1_score.png"))
    plt.close()
    print("[DONE]")

    return best_topK_f1_score

## Run Training

In [None]:
#model.load_state_dict(torch.load("/content/drive/MyDrive/Sourcery/codeBERTa.pt"))
train_and_validate(model, train_dataloader,val_dataloader,device=device, config=config)

[94m
----- STARTING TRAINING -----[0m
> EPOCH 0
    [0-256]  -  Average loss: 1.948  -  Time elapsed: 2m17s
    [256-512]  -  Average loss: 1.943  -  Time elapsed: 2m15s
    [512-768]  -  Average loss: 1.939  -  Time elapsed: 2m15s
    [768-1024]  -  Average loss: 1.943  -  Time elapsed: 2m17s
    [1024-1280]  -  Average loss: 1.940  -  Time elapsed: 2m17s
    [1280-1536]  -  Average loss: 1.945  -  Time elapsed: 2m12s
    [1536-1792]  -  Average loss: 1.942  -  Time elapsed: 2m15s
    [1792-2048]  -  Average loss: 1.947  -  Time elapsed: 2m13s
    [2048-2304]  -  Average loss: 1.953  -  Time elapsed: 2m14s
    [2304-2560]  -  Average loss: 1.930  -  Time elapsed: 2m15s
    [2560-2816]  -  Average loss: 1.946  -  Time elapsed: 2m16s
    [2816-3072]  -  Average loss: 1.965  -  Time elapsed: 2m16s
    [3072-3328]  -  Average loss: 1.947  -  Time elapsed: 2m11s
    [3328-3584]  -  Average loss: 1.937  -  Time elapsed: 2m15s
[94m-----  Ended Train Epoch ---- Start of validation metrics 

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o


Training log:
- Epoch: 0/10
- Train loss: 1.9457816792193139
- Val Top-1 Accuracy: 0.0
- Val Top-1 F1 Score: 0.00028263740233315895
- Val Top-K Accuracy: 0.0
- Val Top-K F1 Score: 0.0005926497219531646


Saved checkpoint to "/content/drive/MyDrive/Sourcery/checkpoint_small.pth".
Best top 5 F1-score value:  0.0005926497219531646

> EPOCH 1
    [0-256]  -  Average loss: 0.575  -  Time elapsed: 2m16s
    [256-512]  -  Average loss: 0.421  -  Time elapsed: 2m18s
    [512-768]  -  Average loss: 0.373  -  Time elapsed: 2m16s
    [768-1024]  -  Average loss: 0.359  -  Time elapsed: 2m20s
    [1024-1280]  -  Average loss: 0.333  -  Time elapsed: 2m16s
    [1280-1536]  -  Average loss: 0.320  -  Time elapsed: 2m13s
    [1536-1792]  -  Average loss: 0.310  -  Time elapsed: 2m17s
    [1792-2048]  -  Average loss: 0.295  -  Time elapsed: 2m17s
    [2048-2304]  -  Average loss: 0.292  -  Time elapsed: 2m16s
    [2304-2560]  -  Average loss: 0.283  -  Time elapsed: 2m15s
    [2560-2816]  -  Average

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o


Training log:
- Epoch: 1/10
- Train loss: 0.32921557010339864
- Val Top-1 Accuracy: 0.21040165465705898
- Val Top-1 F1 Score: 0.4603530610337378
- Val Top-K Accuracy: 0.31888844408860423
- Val Top-K F1 Score: 0.5915607149798081


Saved checkpoint to "/content/drive/MyDrive/Sourcery/checkpoint_small.pth".
Best top 5 F1-score value:  0.5915607149798081

> EPOCH 2
    [0-256]  -  Average loss: 0.255  -  Time elapsed: 2m18s
    [256-512]  -  Average loss: 0.249  -  Time elapsed: 2m15s
    [512-768]  -  Average loss: 0.251  -  Time elapsed: 2m12s
    [768-1024]  -  Average loss: 0.237  -  Time elapsed: 2m20s
    [1024-1280]  -  Average loss: 0.237  -  Time elapsed: 2m13s
    [1280-1536]  -  Average loss: 0.245  -  Time elapsed: 2m16s
    [1536-1792]  -  Average loss: 0.232  -  Time elapsed: 2m16s
    [1792-2048]  -  Average loss: 0.224  -  Time elapsed: 2m19s
    [2048-2304]  -  Average loss: 0.237  -  Time elapsed: 2m17s
    [2304-2560]  -  Average loss: 0.231  -  Time elapsed: 2m14s
    

In [55]:
#Small test on single eval batch
start_time = time()
val_dataloader.create_batches()
for batch in tqdm(val_dataloader.batches):
    topk_sequence,topk_length,tgt_seqs,tgt_lens,output_prob,decoded_sequences,inputs_raw = model.evaluate(batch)
    decoded_labels = model.tokenizer.batch_decode(tgt_seqs,skip_special_tokens=False)
    for i in range(len(decoded_sequences)):
        print(inputs_raw[i])
        print(decoded_sequences[i])
        print(decoded_labels[i])
    break
print(time() - start_time)

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


RuntimeError: ignored