### Import packages

In [120]:
"Fine-tuning BertMasked Model with labeled dataset"
from __future__ import absolute_import, division, print_function
import argparse
import logging
import os
import random
import csv
from IPython.display import clear_output
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import trange
import shutil
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               Path.home() / '.pytorch_pretrained_bert'))
# from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers import BertForMaskedLM
from transformers import BertTokenizer
# Module-level logger used by the feature-conversion and augmentation functions.
logger = logging.getLogger(__name__)
# os.path.abspath('') resolves to the current working directory; run_aug joins
# the data and output directories onto this base path.
absFilePath = os.path.abspath('')
# Clear any cell output produced so far (e.g. import-time messages).
clear_output()

### Class InputExample

In [107]:
class InputExample(object):
    """Plain container for one raw (untokenized) example of a text dataset."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
            guid: Unique id for the example.
            text_a: string. Untokenized text of the first sequence. This is
                the only text required for single-sequence tasks.
            text_b: (Optional) string. Untokenized text of the second
                sequence; only needed for sequence-pair tasks.
            label: (Optional) string. Label of the example. Expected for
                train and dev examples, may be omitted for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

### Class AugProcessor

In [108]:
class AugProcessor():
    """Processor for dataset to be augmented.

    Reads ``train.csv`` / ``dev.csv`` from a data directory and wraps every
    data row in an ``InputExample``. The first CSV line is treated as a header
    and dropped; the first column of a row is used as the text and the last
    column as the label.
    """

    def get_train_examples(self, data_dir):
        """Return the examples read from ``<data_dir>/train.csv``."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``<data_dir>/dev.csv``."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "dev.csv")), "dev")

    @staticmethod
    def get_labels(name):
        """Return the list of label strings for task ``name``.

        Add your dataset here.

        Raises:
            ValueError: if no label list is registered for ``name`` (instead of
                silently returning ``None`` and failing later in the caller).
        """
        if name in ['toxic']:
            return ["0", "1"]
        raise ValueError("No label list registered for task: %s" % (name))

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets.

        Args:
            lines: iterable of ``(line_number, row)`` pairs as produced by
                ``_read_csv``; ``row`` is the list of CSV fields.
            set_type: prefix used to build each example's guid
                (e.g. ``"train"`` gives ``"train-0"``, ``"train-1"``, ...).

        Returns:
            A list of ``InputExample`` objects with ``text_b`` set to ``None``.
        """
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            row = line[1]  # line is (csv line number, list of fields)
            examples.append(
                InputExample(guid, row[0], None, row[-1]))
        return examples

    @staticmethod
    def _read_csv(input_file, quotechar='"', max_lines=16):
        """Reads a comma separated value file.

        Args:
            input_file: path of the CSV file (read as UTF-8).
            quotechar: quote character handed to ``csv.reader``.
            max_lines: maximum number of CSV lines to read, *including* the
                header line. The default of 16 keeps the previous behaviour
                (header + at most 15 data rows). Pass ``None`` to read the
                whole file.

        Returns:
            A list of ``(line_number, row)`` tuples for the data rows, where
            ``line_number`` is the 0-based position in the file (so the first
            data row has number 1). The header line is dropped. An empty file
            yields an empty list.
        """
        # newline='' is the form recommended by the csv module so that
        # newlines embedded in quoted fields are interpreted correctly.
        with open(input_file, "r", encoding='UTF-8', newline='') as f:
            reader = csv.reader(
                f,
                delimiter=",",
                quotechar=quotechar,
                doublequote=True,
                skipinitialspace=False,
                )

            lines = []
            for line in enumerate(reader):
                if max_lines is not None and line[0] >= max_lines:
                    break
                lines.append(line)

        # Drop the header line; slicing is safe even when nothing was read.
        return lines[1:]

### Convert_examples_to_features

In [109]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert examples into padded, randomly masked ``InputFeatures``.

    For every example the text is tokenized, truncated to
    ``max_seq_length - 2`` tokens and wrapped in ``[CLS]`` / ``[SEP]``. A random
    subset of the non-special positions is then replaced by ``[MASK]``.

    Args:
        examples: iterable of objects exposing ``guid``, ``text_a`` and
            ``label`` attributes.
        label_list: list of label strings; a label's position in the list
            becomes its integer id.
        max_seq_length: fixed length every id list is zero-padded to.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        A list with one ``InputFeatures`` per example, holding:
            init_ids: ids of the original (unmasked) tokens, padded.
            input_ids: ids of the tokens after masking, padded.
            input_mask: 1 for real tokens, 0 for padding.
            segment_ids: the example's label id repeated for every real
                token, 0 for padding.
            masked_lm_labels: 0/1 flags of length ``max_seq_length``; 1 marks
                a position that was replaced by ``[MASK]``.
            token_idx: list of the masked positions.

    Raises:
        KeyError: if an example's label (also after ``strip()``) is not in
            ``label_list``.
    """
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    masked_lm_prob = 0.15
    # Fixed seed so that the chosen mask positions are reproducible.
    rng = random.Random(123)
    max_predictions_per_seq = 20

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        # Leave room for the [CLS] and [SEP] markers.
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]

        raw_label = example.label
        try:
            label_id = label_map[raw_label]
        except KeyError:
            # Labels read from CSV may carry surrounding whitespace.
            label_id = label_map[raw_label.strip()]

        # The label id is stored in the segment-id slot of every real token.
        segment_ids = [label_id] * len(tokens)
        masked_lm_labels = [0] * max_seq_length

        # Candidate positions are all tokens except the special markers;
        # shuffle them so the masked positions are picked at random.
        cand_indexes = [i for (i, token) in enumerate(tokens)
                        if token not in ("[CLS]", "[SEP]")]
        rng.shuffle(cand_indexes)
        logger.debug("cand_indexes: %s", cand_indexes)

        output_tokens = list(tokens)  # still contains '[CLS]' and '[SEP]'
        # Aim for masked_lm_prob of the tokens: at least 1 and at most
        # max_predictions_per_seq positions.
        num_to_predict = min(max_predictions_per_seq,
                             max(1, int(round(len(tokens) * masked_lm_prob))))

        masked_lms_pos = []
        for index in cand_indexes:
            if len(masked_lms_pos) >= num_to_predict:
                break
            # Each visited candidate is masked with probability 0.8;
            # otherwise it is left untouched and the next one is tried.
            if rng.random() < 0.8:
                output_tokens[index] = "[MASK]"
                masked_lms_pos.append(index)
                masked_lm_labels[index] = 1  # flag this position as masked

        init_ids = tokenizer.convert_tokens_to_ids(tokens)  # original tokens
        token_idx = masked_lms_pos
        input_ids = tokenizer.convert_tokens_to_ids(output_tokens)  # masked tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        init_ids += padding
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(init_ids) == max_seq_length
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))

        features.append(
                InputFeatures(init_ids=init_ids,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              masked_lm_labels=masked_lm_labels,
                              token_idx=token_idx))
    return features

### Class InputFeatures

In [110]:
class InputFeatures(object):
    """Holds the numeric features computed for one example."""

    def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels, token_idx):
        """Keep every argument as an attribute of the same name."""
        self.init_ids, self.input_ids = init_ids, input_ids
        self.input_mask, self.segment_ids = input_mask, segment_ids
        self.masked_lm_labels, self.token_idx = masked_lm_labels, token_idx

### Remove_wordpiece

In [111]:
def remove_wordpiece(str):
    """Join a WordPiece token list back into a plain sentence.

    Walking from the end of the list towards index 1, ``[PAD]`` tokens are
    dropped and every ``##``-prefixed continuation piece is glued onto the
    token before it. The first and the last remaining token (normally
    ``[CLS]`` and ``[SEP]``) are then stripped and the rest is joined with
    single spaces.

    Args:
        str: list of token strings (the parameter name shadows the builtin;
            it is kept for backward compatibility). The list itself is not
            modified.

    Returns:
        The reconstructed sentence; an empty string for lists shorter than
        three tokens.
    """
    # Work on a copy so the caller's list is left untouched.
    tokens = list(str)
    for i in range(len(tokens) - 1, 0, -1):
        piece = tokens[i]
        if piece == '[PAD]':
            # Delete by position: list.remove() would drop the first equal
            # token, which is not necessarily the one at index i.
            del tokens[i]
        elif piece.startswith('##'):
            tokens[i - 1] += piece[2:]
            del tokens[i]
    return " ".join(tokens[1:-1])

### run_aug

In [115]:
def run_aug(train_number, args, save_every_epoch=False):
    """Augment the training set by letting a masked LM refill masked tokens.

    The training examples of ``args.task_name`` are converted to features with
    a random subset of positions flagged for masking. For every batch those
    positions are replaced by ``[MASK]``, the model predicts them, the top-1
    prediction is written back, and the resulting sentence together with its
    label id is appended as one row to
    ``<output_dir>/augmentation_trainn_<train_number>.csv``.

    Args:
        train_number: value inserted into the output file name so that
            successive runs write to different files.
        args: namespace providing ``task_name``, ``data_dir``, ``output_dir``,
            ``seed``, ``bert_model``, ``do_lower_case``, ``max_seq_length``,
            ``train_batch_size`` and ``num_train_epochs``. ``data_dir`` and
            ``output_dir`` are rewritten in place to the resolved
            ``<cwd>/<dir>/<task_name>`` paths.
        save_every_epoch: accepted for interface compatibility; currently
            not used.

    Raises:
        ValueError: if ``args.task_name`` has no registered processor.
    """
    # Augment the dataset with your own choice of Processor
    processors = {
        "toxic": AugProcessor
    }

    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(absFilePath, args.data_dir, task_name)
    print('data_dir', args.data_dir)
    args.output_dir = os.path.join(absFilePath, args.output_dir, task_name)
    print('output_dir', args.output_dir)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)
    print('label_list', label_list)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = processor.get_train_examples(args.data_dir)
    # Only used for the log message below.
    num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)
    # The model is only used for inference here, so switch to eval mode once.
    model.eval()

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    clear_output()

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    save_train_path = os.path.join(args.output_dir, "augmentation_trainn_{}.csv".format(train_number))

    for _epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        # Append mode: every epoch (and every re-run with the same
        # train_number) adds rows to the same file. The with-block makes sure
        # the file is flushed and closed; newline='' is what the csv module
        # expects for files handed to csv.writer.
        with open(save_train_path, 'a', encoding='UTF-8', newline='') as save_train_file:
            csv_writer = csv.writer(save_train_file, delimiter=',')
            for batch in train_dataloader:
                batch = tuple(t.to(device) for t in batch)
                init_ids, _, input_mask, segment_ids, mask_flags = batch

                # Positions flagged with 1 are the ones to mask. The tensor
                # may live on the GPU, so copy it to the CPU before handing
                # it to numpy.
                masked_idx = [np.nonzero(row)[0].tolist()
                              for row in mask_flags.cpu().numpy()]

                # Each `ids` is a row view of init_ids, so this masks the
                # batch tensor in place.
                for ids, idx in zip(init_ids, masked_idx):
                    ids[idx] = MASK_id

                # Inference only: no gradients needed.
                with torch.no_grad():
                    predictions = model(init_ids, input_mask)

                for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions[0], segment_ids):
                    # Replace every masked position by its top-1 prediction.
                    _, indice = torch.topk(torch.softmax(preds[idx], -1), 1)
                    ids[idx] = torch.squeeze(indice, 1)
                    pred_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
                    pred_str = remove_wordpiece(pred_str)
                    # seg[0] is the label id stored in the segment-id slot.
                    csv_writer.writerow([pred_str, seg[0].item()])
          

In [119]:
train_number = 0 ## run counter; it is inserted into the output file name so successive augmentation runs write to different files

### main

In [117]:

def main(train_number):
    """Parse the command-line options and run one augmentation pass.

    Unknown command-line arguments are ignored (``parse_known_args``), so
    extra arguments present in ``sys.argv`` do not abort parsing.

    Args:
        train_number: run counter forwarded to ``run_aug``; it ends up in the
            name of the output CSV file.
    """
    parser = argparse.ArgumentParser()

    # All options have defaults, so none of them has to be given explicitly.
    parser.add_argument("--data_dir", default="datasets", type=str,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default="aug_data", type=str,
                        help="The output dir for augmented dataset")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="The path of pretrained bert model.")
    parser.add_argument("--task_name", default="toxic", type=str,
                        help="The name of the task to train.")
    parser.add_argument("--max_seq_length", default=30, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    # --do_lower_case defaults to True and is a store_true flag, so on its own
    # it can never be switched off; this companion flag makes that possible.
    parser.add_argument("--no_lower_case", dest="do_lower_case", action='store_false',
                        help="Set this flag if you are using a cased model.")
    parser.add_argument("--train_batch_size", default=4, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=1, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    args, _unknown = parser.parse_known_args()
    run_aug(train_number, args, save_every_epoch=True)

In [1]:
# Run one augmentation pass. The cells defining `main` and `train_number`
# must have been executed first, otherwise this call raises a NameError.
main(train_number)

# Advance the run counter so the next execution of this cell uses a new value.
train_number += 1


NameError: name 'main' is not defined