# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [26]:
from transformers import BertTokenizer, BertForPreTraining
import torch
import pandas as pd

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 資料處理

## 取出資料集

In [27]:
df = pd.read_csv("IMDB Dataset.csv")
text = []
for review in df["review"]:
    text.append(review)

text[:3]

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

## NSP 前置準備

### 以句號分割段落

In [37]:
bag = []
for sentence in text:
    for s_str in sentence.split('.<br /><br />'):
        if '. ' in s_str:
            bag.extend(s_str.split('. '))
        elif '!' in s_str:
            bag.extend(s_str.split('!'))
        elif '?' in s_str:
            bag.extend(s_str.split('?'))
        else:
            bag.append(s_str)
bag_size = len(bag)

print("text:") 
print(text[0])
print("------------------")
print("bag")
print(bag[0:5])

text:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to t

### 將分割好的語句進行分裝
```sentence_a```: 代表前句  
```sentence_b```: 代表後句  
```label```: 標記在同一個 index 之下， ```sentence_a``` 和 ```snetnece_b``` 是不是連句的

In [38]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = []
    for s_str in paragraph.split('. '):
        if '.<br /><br />' in s_str:
            sentences.extend(s_str.split('.<br /><br />'))
        elif '!' in s_str:
            sentences.extend(s_str.split('!'))
        elif '?' in s_str:
            sentences.extend(s_str.split('?'))
        else:
            sentences.append(s_str)
            
    num_sentences = len(sentences)

    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [39]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

0
It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary
---
It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda

1
A wonderful little production
---

0
The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer)
---
While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love



### Tokenization

In [40]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [41]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [42]:
inputs['input_ids']

tensor([[ 101, 2009, 2003,  ...,    0,    0,    0],
        [ 101, 1037, 6919,  ...,    0,    0,    0],
        [ 101, 1996, 5436,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2572,  ...,    0,    0,    0],
        [ 101, 2522, 8022,  ...,    0,    0,    0],
        [ 101, 2053, 2028,  ...,    0,    0,    0]])

### 把 NSP 標籤掛上去

In [9]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [10]:
inputs.next_sentence_label[:10]

tensor([[1],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1]])

## MLM 前置準備

### 複製一份 ```input_ids```，把它掛到 ```inputs``` 上，表示 **原始句子對**

In [11]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

### BERT MLM 中，一段輸入句子對會有 15% 的 token 要被遮罩
這邊以 ```torch.rand()``` 實現，把 15% 的位子標記成 ```True```，代表要被遮罩

In [20]:
mask_ava = (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_ava[0])

tensor([False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [21]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_arr)
inputs['mask_arr'] = mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True, False, False],
        ...,
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])


In [22]:
print(mask_arr[0])
print(mask_ava[0])

tensor([False, False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False,  True, False,
        False, False, False, False, False,  True, False, False, False, False,
         True, False, False,  True, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [27]:
a = torch.tensor([[1, 2, 3], [5, 6, 7], [8, 9, 10]])
b = torch.tensor([[True, False, False], [True, True, True], [False, True, True]])

# 找到 b 中为 True 的位置索引
true_indices = torch.where(b)
print(true_indices)

# 随机选取 true_indices 中 30% 的位置索引
num_samples = int(len(true_indices[0]) * 0.5)
sample_indices = torch.randperm(len(true_indices[0]))[:num_samples]

# 获取对应位置的值
sample_values = a[true_indices[0][sample_indices], true_indices[1][sample_indices]]

#print(sample_values)

(tensor([0, 1, 1, 1, 2, 2]), tensor([0, 0, 1, 2, 1, 2]))


### 把剛剛被標記 True 的 index 取出

In [14]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
    
selection[:2]

### 實作 Masking 的部分
在這些要被遮罩的 token 中，又分成以下狀況：
> 80% 以 **```[MASK]```** 遮罩  
> 10% 以 **"隨機 token"** 取代 (不含特殊 token)  
> 10% **維持原樣**  

In [16]:
import copy

rand = copy.deepcopy(selection)

for row in range(len(rand)):
    for col in range(len(rand[row])):
        rand[row][col] = random.random()

vocab_size = len(tokenizer.vocab)
vocab = tokenizer.get_vocab()
special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

for i in range(inputs.input_ids.shape[0]):
    for j in range(len(selection[i])):
        if rand[i][j] < 0.10:
            continue
        elif rand[i][j] < 0.20:
            rand_num = vocab['[CLS]']
            while rand_num in special_tokens:
                rand_num = random.randint(1, vocab_size)
            inputs.input_ids[i, selection[i][j]] = rand_num
        else:
            inputs.input_ids[i, selection[i][j]] = 103

In [17]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels', 'mask_arr'])

In [18]:
inputs.input_ids

tensor([[  101,   103,  5159,  ...,     0,     0,     0],
        [  101,   103,  1045,  ...,     0,     0,     0],
        [  101,  1045,  4067,  ...,     0,     0,     0],
        ...,
        [  101,  3459,   103,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,   103,  ...,     0,     0,     0]])

## 設置 Dataloader

In [19]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [20]:
dataset = OurDataset(inputs)

In [21]:
loader = torch.utils.data.DataLoader(dataset, batch_size=6, shuffle=True)

# 訓練模型

In [22]:
from transformers import AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr = 5e-5)



In [23]:
from tqdm import tqdm  # for our progress bar

epochs = 2
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    mask_nums = 0
    mlm_correct = 0
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        mask_arr = batch['mask_arr'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        
        prediction_logits = outputs.prediction_logits[mask_arr]
        predicted_ids = prediction_logits.argmax(-1)
        
        mask_nums += len(predicted_ids)
        mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
        mlm_acc = mlm_correct / mask_nums

        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(), mlm_acc=mlm_acc)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 53/53 [00:23<00:00,  2.30it/s, loss=1.24, mlm_acc=0.218] 
Epoch 1: 100%|██████████| 53/53 [00:20<00:00,  2.60it/s, loss=0.606, mlm_acc=0.316]


# 儲存模型

In [24]:
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_tokenizer")

('saved_tokenizer\\tokenizer_config.json',
 'saved_tokenizer\\special_tokens_map.json',
 'saved_tokenizer\\vocab.txt',
 'saved_tokenizer\\added_tokens.json')