# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [1]:
from transformers import BertTokenizer, BertForPreTraining
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput, BERT_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from torch.nn import CrossEntropyLoss
from typing import List, Optional, Tuple, Union
from tqdm import tqdm
import pandas as pd
import torch

In [2]:
class MyBertForPreTrainingOutput(BertForPreTrainingOutput):
    def __init__(self, loss=None, prediction_logits=None, seq_relationship_logits=None, hidden_states=None, attentions=None, mlm_loss=None, nsp_loss=None):
        super().__init__(loss=loss, prediction_logits=prediction_logits, seq_relationship_logits=seq_relationship_logits, hidden_states=hidden_states, attentions=attentions)
        self.mlm_loss = mlm_loss
        self.nsp_loss = nsp_loss

In [3]:
class MyBertForPreTraining(BertForPreTraining):
    def __init__(self, config):
        super().__init__(config)
    # @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @replace_return_docstrings(output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
                Used to hide legacy arguments that have been deprecated.
        Returns:
        Example:
        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MyBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            mlm_loss = masked_lm_loss,
            nsp_loss = next_sentence_loss,
        )

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = MyBertForPreTraining.from_pretrained('bert-base-cased')

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 取出資料集

In [5]:
datapath = 'bbc-text.csv'
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [6]:
text = []
for review in df["text"]:
    text.append(review)
text[:3]

['tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to hig

# NSP 前置準備

## 以句號分割段落

In [7]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

print("text:") 
print(text[14])
print("------------------")
print("bag")
print(bag[14:19])

text:
moya emotional after davis cup win carlos moya described spain s davis cup victory as the highlight of his career after he beat andy roddick to end the usa s challenge in seville.  moya made up for missing spain s 2000 victory through injury by beating roddick 6-2 7-6 (7-1) 7-6 (7-5) to give the hosts an unassailable 3-1 lead.  i have woken up so many nights dreaming of this day   said moya.  all my energy has been focused on today.  what i have lived today i do not think i will live again.  spain s only other davis cup title came two years ago in valencia  when they beat australia. and moya  nicknamed charly  admitted:  the davis cup is my dream and i was a bit nervous at the outset.  some people have said that i am obsessed but i think that it is better this way. it helps me reach my goals if i am obsessed.  it s really incredible - to get the winning point is really something.  spanish captain jordi arrese said:  charly played a great game. it was his opportunity and he hasn t

In [8]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [9]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [10]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [11]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [12]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_arr)

tensor([[False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])


In [13]:
inputs['mask_arr'] = mask_arr

In [14]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [15]:
import copy

rand = copy.deepcopy(selection)

for row in range(len(rand)):
    for col in range(len(rand[row])):
        rand[row][col] = random.random()

vocab_size = len(tokenizer.vocab)
vocab = tokenizer.get_vocab()
special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

for i in range(inputs.input_ids.shape[0]):
    for j in range(len(selection[i])):
        if rand[i][j] < 0.10:
            continue
        elif rand[i][j] < 0.20:
            rand_num = vocab['[CLS]']
            while rand_num in special_tokens:
                rand_num = random.randint(1, vocab_size)
            inputs.input_ids[i, selection[i][j]] = rand_num
        else:
            inputs.input_ids[i, selection[i][j]] = 103

In [16]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [17]:
dataset = OurDataset(inputs)

In [18]:
loader = torch.utils.data.DataLoader(dataset, batch_size=6, shuffle=True)

In [19]:
from transformers import AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr = 5e-5)



In [20]:
import os
record = {"mask_percent": None,
          "mlm_acc_each_epoch": [], 
          "mlm_loss_each_epoch": []}

if os.path.isfile("record.csv"):
    rec = pd.read_csv("record.csv")
else:
    rec = pd.DataFrame()

In [21]:
epochs = 10
acc_each_epoch = []
loss_each_epoch = []
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    mask_nums = 0
    mlm_correct = 0
    nsp_nums = 0
    nsp_correct = 0
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        mask_arr = batch['mask_arr'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        
        prediction_logits = outputs.prediction_logits[mask_arr]
        predicted_ids = prediction_logits.argmax(-1)
        
        seq_relationship_logits = outputs.seq_relationship_logits
        predicted_labels = torch.argmax(seq_relationship_logits, dim=1)
        predicted_label = predicted_labels

        mask_nums += len(predicted_ids)
        mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
        mlm_acc = mlm_correct / mask_nums
        nsp_nums += len(predicted_label)
        nsp_correct += predicted_label.eq(torch.squeeze(next_sentence_label)).sum().item()
        
        # extract loss
        loss = outputs.loss
        mlm_loss = outputs.mlm_loss.item()
        nsp_loss = outputs.nsp_loss.item()
        mlm_acc = mlm_correct / mask_nums
        nsp_acc = nsp_correct / nsp_nums
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(Total_loss='{:.4f}'.format(loss.item()), MLM_Accuracy='{:.4f}'.format(mlm_acc), NSP_Accuracy='{:.4f}'.format(nsp_acc), \
                          MLM_loss='{:.4f}'.format(mlm_loss), NSP_loss='{:.4f}'.format(nsp_loss))
    acc_each_epoch.append(mlm_acc)
    loss_each_epoch.append(mlm_loss)

record["mask_percent"] = 15
record["mlm_acc_each_epoch"].append(acc_each_epoch)     
record["mlm_loss_each_epoch"].append(loss_each_epoch)
rec = rec.append(record, ignore_index=True)
rec.to_csv("record.csv", index = None)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:09<00:00,  2.87it/s, MLM_Accuracy=0.3324, MLM_loss=0.0780, NSP_Accuracy=0.8378, NSP_loss=0.3537, Total_loss=0.4317]
Epoch 1: 100%|██████████| 371/371 [02:08<00:00,  2.89it/s, MLM_Accuracy=0.4067, MLM_loss=0.0680, NSP_Accuracy=0.9420, NSP_loss=0.0197, Total_loss=0.0877]
Epoch 2: 100%|██████████| 371/371 [02:08<00:00,  2.89it/s, MLM_Accuracy=0.4238, MLM_loss=0.0422, NSP_Accuracy=0.9771, NSP_loss=0.0116, Total_loss=0.0537]
Epoch 3: 100%|██████████| 371/371 [02:08<00:00,  2.88it/s, MLM_Accuracy=0.4279, MLM_loss=0.0391, NSP_Accuracy=0.9739, NSP_loss=0.1750, Total_loss=0.2141]
Epoch 4: 100%|██████████| 371/371 [02:08<00:00,  2.88it/s, MLM_Accuracy=0.4508, MLM_loss=0.0568, NSP_Accuracy=0.9834, NSP_loss=0.0096, Total_loss=0.0665]
Epoch 5: 100%|██████████| 371/371 [02:08<00:00,  2.88it/s, MLM_Accuracy=0.4672, MLM_loss=0.0374, NSP_Accuracy=0.9816, NSP_loss=0.0608, Total_loss=0.0983

In [22]:
model.save_pretrained("saved_model_mask15")
tokenizer.save_pretrained("saved_tokenizer_mask15")

('saved_tokenizer_mask15\\tokenizer_config.json',
 'saved_tokenizer_mask15\\special_tokens_map.json',
 'saved_tokenizer_mask15\\vocab.txt',
 'saved_tokenizer_mask15\\added_tokens.json')