In [14]:
import logging
import os
import torch
import transformers
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

In [26]:
# Sequence-length limits read as module-level globals by
# convert_examples_to_features.
# Encoder side: the question+code tokens are truncated to this value minus 2,
# wrapped in CLS/SEP, and the ids/mask are then padded up to this length.
max_source_length = 256
# Decoder side: the answer tokens get the same treatment with this limit.
max_target_length = 128

In [27]:
# Root logging setup: timestamp, level and logger name prefix every message,
# and anything at INFO or above is emitted.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)

# Logger shared by the cells below.
logger = logging.getLogger(__name__)

In [28]:
class Example:
    """One raw question/code/answer record used for training or testing.

    Attributes:
        idx: Identifier assigned by whoever builds the example.
        source_question: Question text.
        source_code: Code snippet text.
        target: Answer text.
    """

    def __init__(self, idx, source_question, source_code, target):
        # Plain value holder: every argument is stored unchanged.
        self.idx = idx
        self.source_question = source_question
        self.source_code = source_code
        self.target = target

In [30]:
def read_examples(filename, stage):
    """Read parallel code/question/answer files into a list of ``Example``.

    Despite its name, ``filename`` is a directory. The function opens
    ``<filename>/<stage>.code``, ``<filename>/<stage>.question`` and
    ``<filename>/<stage>.answer`` as UTF-8 text and pairs them line by line.

    Args:
        filename: Directory holding the three files,
            e.g. ``$data_dir/$lang/train/``.
        stage: File stem shared by the three files, e.g. ``"train"``.

    Returns:
        A list of ``Example`` objects whose ``idx`` counts from 0 in file
        order; each text field has surrounding whitespace stripped.

    Note:
        When the files have different numbers of lines, reading stops at the
        shortest file. A warning is logged in that case, because a length
        mismatch usually means the parallel files are out of alignment.
    """
    # Function-scope import so this cell does not depend on the import cell
    # being edited.
    from itertools import zip_longest

    codefile = os.path.join(filename, stage + ".code")
    quesfile = os.path.join(filename, stage + ".question")
    ansfile = os.path.join(filename, stage + ".answer")

    # Unique sentinel: a real line read from a text file can never be this object.
    exhausted = object()
    examples = []
    with open(codefile, encoding="utf-8") as code_f, \
            open(ansfile, encoding="utf-8") as ans_f, \
            open(quesfile, encoding="utf-8") as ques_f:
        triples = zip_longest(code_f, ques_f, ans_f, fillvalue=exhausted)
        for idx, (codeline, quesline, ansline) in enumerate(triples):
            if (codeline is exhausted
                    or quesline is exhausted
                    or ansline is exhausted):
                # At least one file ended before the others: keep what was
                # read so far, but make the mismatch visible.
                logger.warning(
                    "Line counts differ between %s, %s and %s; "
                    "stopped after %d aligned examples.",
                    codefile, quesfile, ansfile, idx,
                )
                break
            examples.append(
                Example(
                    idx=idx,
                    source_question=quesline.strip(),
                    source_code=codeline.strip(),
                    target=ansline.strip(),
                )
            )
    return examples

In [31]:
class InputFeatures:
    """Model-ready features derived from a single example.

    Attributes:
        example_id: Identifier of the example these features came from.
        source_ids: Token ids of the source sequence.
        target_ids: Token ids of the target sequence.
        source_mask: Mask that goes with ``source_ids``.
        target_mask: Mask that goes with ``target_ids``.
    """

    def __init__(self, example_id, source_ids, target_ids, source_mask, target_mask):
        # Plain value holder: every argument is stored unchanged.
        self.example_id = example_id
        self.source_ids = source_ids
        self.target_ids = target_ids
        self.source_mask = source_mask
        self.target_mask = target_mask

In [20]:
def convert_examples_to_features(examples, tokenizer,stage=None):
    """Encode examples as padded, fixed-length id/mask lists.

    Source side: the question and code are tokenized separately and joined
    with the tokenizer's separator token. The result is cut to
    ``max_source_length - 2`` tokens, wrapped in CLS ... SEP, converted to
    ids, and right-padded with the pad id up to ``max_source_length``.
    Target side: the answer is handled the same way against
    ``max_target_length``; when ``stage == "test"`` the literal text
    ``"None"`` is tokenized instead of the example's answer.

    Masks hold 1 for real tokens and 0 for padding. The first five examples
    are logged at INFO level when ``stage == 'train'``.

    Args:
        examples: Iterable of objects exposing ``idx``, ``source_question``,
            ``source_code`` and ``target``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        stage: ``"test"`` switches to the placeholder target, ``'train'``
            turns on the sample logging; other values do neither.

    Returns:
        A list of ``InputFeatures`` whose ``example_id`` is the position of
        the example within ``examples``.
    """
    def _printable(tokens):
        # Replace U+0120 with "_" so the token list is easier to read in logs.
        return [tok.replace('\u0120','_') for tok in tokens]

    features = []
    for position, ex in enumerate(examples):
        # ---- encoder input: question <sep> code ----
        question_part = tokenizer.tokenize(ex.source_question)
        code_part = tokenizer.tokenize(ex.source_code)
        joined = question_part + [tokenizer.sep_token] + code_part
        # Two positions are held back for the wrapping CLS and SEP tokens.
        clipped = joined[:max_source_length - 2]
        source_tokens = [tokenizer.cls_token] + clipped + [tokenizer.sep_token]
        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
        source_pad = max_source_length - len(source_ids)
        source_mask = [1] * len(source_tokens) + [0] * source_pad
        source_ids.extend([tokenizer.pad_token_id] * source_pad)

        # ---- decoder target: answer, or a placeholder at test time ----
        answer_part = (
            tokenizer.tokenize("None")
            if stage == "test"
            else tokenizer.tokenize(ex.target)[:max_target_length - 2]
        )
        target_tokens = [tokenizer.cls_token] + answer_part + [tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_pad = max_target_length - len(target_ids)
        target_mask = [1] * len(target_ids) + [0] * target_pad
        target_ids.extend([tokenizer.pad_token_id] * target_pad)

        # Log a handful of encoded training samples for a quick sanity check.
        if stage == 'train' and position < 5:
            logger.info("*** Example ***")
            logger.info(f"idx: {ex.idx}")

            logger.info(f"source_tokens: {_printable(source_tokens)}")
            logger.info(f"source_ids: {' '.join(str(i) for i in source_ids)}")
            logger.info(f"source_mask: {' '.join(str(m) for m in source_mask)}")

            logger.info(f"target_tokens: {_printable(target_tokens)}")
            logger.info(f"target_ids: {' '.join(str(i) for i in target_ids)}")
            logger.info(f"target_mask: {' '.join(str(m) for m in target_mask)}")

        features.append(
            InputFeatures(position, source_ids, target_ids, source_mask, target_mask)
        )
    return features

In [32]:
# Instantiate the RoBERTa tokenizer from the 'microsoft/codebert-base'
# pretrained checkpoint; it is passed to convert_examples_to_features to
# tokenize questions, code and answers.
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [None]:
# Build the training dataset: read the raw examples, encode them, and stack
# each feature field into one long tensor with a row per example.
# NOTE(review): the data directory is a hard-coded absolute local path; point
# it at your own copy of the data before running.
train_examples = read_examples('D:/NLP/codeQA/train', 'train')
train_features = convert_examples_to_features(train_examples, tokenizer,stage='train')
all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long)
all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long)
# `TensorDataset` is never imported in this notebook, so the bare name raised
# NameError on a fresh kernel; reach it through the already-imported `torch`.
train_data = torch.utils.data.TensorDataset(
    all_source_ids, all_source_mask, all_target_ids, all_target_mask
)

In [None]:
# Bare expression: shows the dataset object's repr as this cell's output.
train_data