# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [1]:
from transformers import BertTokenizer, BertForPreTraining
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 資料處理

## 取出資料集

In [2]:
with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')
text[:3]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

## NSP 前置準備

### 以句號分割段落

In [3]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

print("text:") 
print(text[14])
print("------------------")
print("bag")
print(bag[14:19])

text:
From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining. I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious. He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved. I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man. He had als

### 將分割好的語句進行分裝
```sentence_a```: 代表前句  
```sentence_b```: 代表後句  
```label```: 標記在同一個 index 之下， ```sentence_a``` 和 ```snetnece_b``` 是不是連句的

In [4]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [5]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

0
 I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious
---
 He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved

0
 And I observed that he had overcome all passion for boys; and he considered himself no more than any other citizen; and he released his friends from all obligation to sup with him or to attend him of necessity when he went abroad, and those who had failed to accompany him, by reason of any urgent circumstances, always found him the same
---
 I observed too his habit of careful inquiry in all 

### Tokenization

In [6]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [7]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
inputs['input_ids']

tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
        [  101,  1998,  1045,  ...,     0,     0,     0],
        [  101,  1045,  4067,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]])

### 把 NSP 標籤掛上去

In [9]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [10]:
inputs.next_sentence_label[:10]

tensor([[0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1]])

## MLM 前置準備

### 複製一份 ```input_ids```，把它掛到 ```inputs``` 上，表示 **原始句子對**

In [11]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

### BERT MLM 中，一段輸入句子對會有 15% 的 token 要被遮罩
這邊以 ```torch.rand()``` 實現，把 15% 的位子標記成 ```True```，代表要被遮罩

In [13]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_arr)
inputs['mask_arr'] = mask_arr

tensor([[False,  True, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False]])


### 把剛剛被標記 True 的 index 取出

In [14]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
    
selection[:2]

### 實作 Masking 的部分
在這些要被遮罩的 token 中，又分成以下狀況：
> 80% 以 **```[MASK]```** 遮罩  
> 10% 以 **"隨機 token"** 取代 (不含特殊 token)  
> 10% **維持原樣**  

In [16]:
import copy

rand = copy.deepcopy(selection)

for row in range(len(rand)):
    for col in range(len(rand[row])):
        rand[row][col] = random.random()

vocab_size = len(tokenizer.vocab)
vocab = tokenizer.get_vocab()
special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

for i in range(inputs.input_ids.shape[0]):
    for j in range(len(selection[i])):
        if rand[i][j] < 0.10:
            continue
        elif rand[i][j] < 0.20:
            rand_num = vocab['[CLS]']
            while rand_num in special_tokens:
                rand_num = random.randint(1, vocab_size)
            inputs.input_ids[i, selection[i][j]] = rand_num
        else:
            inputs.input_ids[i, selection[i][j]] = 103

In [17]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels', 'mask_arr'])

In [18]:
inputs.input_ids

tensor([[  101,   103,  5159,  ...,     0,     0,     0],
        [  101,   103,  1045,  ...,     0,     0,     0],
        [  101,  1045,  4067,  ...,     0,     0,     0],
        ...,
        [  101,  3459,   103,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,   103,  ...,     0,     0,     0]])

In [19]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [20]:
dataset = OurDataset(inputs)

In [21]:
loader = torch.utils.data.DataLoader(dataset, batch_size=6, shuffle=True)

In [22]:
from transformers import AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr = 5e-5)



In [23]:
from tqdm import tqdm  # for our progress bar

epochs = 2
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    mask_nums = 0
    mlm_correct = 0
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        mask_arr = batch['mask_arr'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        
        prediction_logits = outputs.prediction_logits[mask_arr]
        predicted_ids = prediction_logits.argmax(-1)
        
        mask_nums += len(predicted_ids)
        mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
        mlm_acc = mlm_correct / mask_nums

        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(), mlm_acc=mlm_acc)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 53/53 [00:23<00:00,  2.30it/s, loss=1.24, mlm_acc=0.218] 
Epoch 1: 100%|██████████| 53/53 [00:20<00:00,  2.60it/s, loss=0.606, mlm_acc=0.316]


In [24]:
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_tokenizer")

('saved_tokenizer\\tokenizer_config.json',
 'saved_tokenizer\\special_tokens_map.json',
 'saved_tokenizer\\vocab.txt',
 'saved_tokenizer\\added_tokens.json')