# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [1]:
from transformers import BertTokenizer, BertForPreTraining, AdamW
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput, BertPreTrainingHeads, BertConfig, BERT_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from transformers.models.albert.modeling_albert import AlbertSOPHead
from torch.nn import CrossEntropyLoss
from typing import List, Optional, Tuple, Union
from tqdm import tqdm
import pandas as pd
import torch
import random
import copy
import os

In [2]:
class MyBertForPreTrainingOutput(BertForPreTrainingOutput):
    def __init__(self, loss=None, prediction_logits=None, seq_relationship_logits=None, hidden_states=None, attentions=None, mlm_loss=None, nsp_loss=None):
        super().__init__(loss=loss, prediction_logits=prediction_logits, seq_relationship_logits=seq_relationship_logits, hidden_states=hidden_states, attentions=attentions)
        self.mlm_loss = mlm_loss
        self.nsp_loss = nsp_loss

In [3]:
class MyAlbertSOPHead(torch.nn.Module):
    def __init__(self, config: BertConfig):
        super().__init__()  

        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size , config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits

In [4]:
class BertPretrainingHeadsWithSOP(BertPreTrainingHeads):
    def __init__(self, config):
        super().__init__(config)
        self.seq_relationship = MyAlbertSOPHead(config)

In [5]:
class MyBertForPreTraining(BertForPreTraining):
    def __init__(self, config, nspTask = "NSP"):
        super().__init__(config)
        if nspTask == "SOP":
            self.cls = BertPretrainingHeadsWithSOP(config)
            
    # @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @replace_return_docstrings(output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
                Used to hide legacy arguments that have been deprecated.
        Returns:
        Example:
        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MyBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            mlm_loss = masked_lm_loss,
            nsp_loss = next_sentence_loss,
        )

# 取出資料集

In [6]:
class getData():
    def __init__(self, modelType, datapath, nspTask = "NSP"):
        self.datapath = datapath
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.nspTask = nspTask
        self.text = self.toText()
        self.sentence_a = []
        self.sentence_b = []
        self.label = []
        self.inputs = None
        self.nspPrepare()
        self.inputs['labels'] = self.inputs.input_ids.detach().clone()
    
    def toText(self):
        df = pd.read_csv(self.datapath)
        text = []
        for review in df["text"]:
            text.append(review)
        
        return text
    
    def nspPrepare(self):
        bag = [item for sentence in self.text for item in sentence.split('. ') if item != '']
        bag_size = len(bag)

        if self.nspTask == "NSP":
            self.nspData(bag, bag_size)
        elif self.nspTask == "SOP":
            self.sopData()

        self.inputs = self.tokenizer(self.sentence_a, self.sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')
        self.inputs['next_sentence_label'] = torch.LongTensor([self.label]).T
    
    def nspData(self, bag, bag_size):
        for paragraph in self.text:
            sentences = [
                sentence for sentence in paragraph.split('.') if sentence != ''
            ]
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(sentences[start+1])
                    self.label.append(0)
                else:
                    index = random.randint(0, bag_size-1)
                    # this is NotNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(bag[index])
                    self.label.append(1)
    
    def sopData(self):
        for paragraph in self.text:
            sentences = [
                sentence for sentence in paragraph.split('.') if sentence != ''
            ]
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(sentences[start+1])
                    self.label.append(0)
                else:
                    # this is NotNextSentence
                    self.sentence_a.append(sentences[start+1])
                    self.sentence_b.append(sentences[start])
                    self.label.append(1)
    
    def returnInput(self):
        return self.inputs

In [7]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [8]:
class trainModel():
    def __init__(self, modelType, inputs, batch_size, epoch, acc_goal_each_epoch, masking_method = "propose", saveModelName = "", saveCSV = True, nspTask = "NSP"):
        self.model = MyBertForPreTraining.from_pretrained(modelType)
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.inputs = inputs
        self.batch_size = batch_size
        self.epoch = epoch
        self.acc_goal_each_epoch = acc_goal_each_epoch  # 每個 epoch 的 MLM 正確率基準
        self.masking_method = masking_method
        self.saveModelName = saveModelName
        self.saveCSV = saveCSV
        self.loader = torch.utils.data.DataLoader(OurDataset(self.inputs), \
                                             batch_size=self.batch_size, shuffle=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        self.model.train()
        self.optim = AdamW(self.model.parameters(), lr = 5e-5)
        
        if os.path.isfile("record_mask_grow.csv"):
            self.rec = pd.read_csv("record_mask_grow.csv")
        else:
            self.rec = pd.DataFrame({"mlm_acc_each_epoch":[], "mlm_loss_each_epoch":[], 'Mask_Percent_each_epoch':[]})
            
        self.training()
        self.save_model(self.saveModelName)
    
    def mlmPrepare(self, input_sentences, maskPercentNow):
        rand = torch.rand(input_sentences.shape)
        # create mask array
        mask_arr = (rand < maskPercentNow * 0.01) * (input_sentences != 101) * \
                (input_sentences != 102) * (input_sentences != 0)
        
        selection = []

        for i in range(input_sentences.shape[0]):
            selection.append(
                torch.flatten(mask_arr[i].nonzero()).tolist()
            )

        rand_mask_type = copy.deepcopy(selection)

        for row in range(len(rand_mask_type)):
            for col in range(len(rand_mask_type[row])):
                rand_mask_type[row][col] = random.random()

        vocab_size = len(self.tokenizer.vocab)
        vocab = self.tokenizer.get_vocab()
        special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

        for i in range(input_sentences.shape[0]):
            for j in range(len(selection[i])):
                if rand_mask_type[i][j] < 0.10:
                    continue
                elif rand_mask_type[i][j] < 0.20:
                    rand_num = vocab['[CLS]']
                    while rand_num in special_tokens:
                        rand_num = random.randint(1, vocab_size)
                    input_sentences[i, selection[i][j]] = rand_num
                else:
                    input_sentences[i, selection[i][j]] = 103
        
        return input_sentences, mask_arr

    def training(self):
        acc_each_epoch = []
        loss_each_epoch = []
        Mask_Percent_each_epoch = []
        stay = 0
        percent_now = 6

        for epoch in range(self.epoch):
            # setup loop with TQDM and dataloader
            mask_nums = 0
            mlm_correct = 0
            nsp_nums = 0
            nsp_correct = 0
            loop = tqdm(self.loader, leave=True)

            for batch in loop:
                input_sentences, mask_arr = self.mlmPrepare(copy.deepcopy(batch["input_ids"]), percent_now)

                # initialize calculated gradients (from prev step)
                self.optim.zero_grad()
                # pull all tensor batches required for training
                input_ids = input_sentences.to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                next_sentence_label = batch['next_sentence_label'].to(self.device)
                labels = batch['labels'].to(self.device)
                # process
                outputs = self.model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                next_sentence_label=next_sentence_label,
                                labels=labels)
                
                prediction_logits = outputs.prediction_logits[mask_arr]
                predicted_ids = prediction_logits.argmax(-1)
                
                seq_relationship_logits = outputs.seq_relationship_logits
                predicted_labels = torch.argmax(seq_relationship_logits, dim=1)
                predicted_label = predicted_labels

                mask_nums += len(predicted_ids)
                mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
                nsp_nums += len(predicted_label)
                nsp_correct += predicted_label.eq(torch.squeeze(next_sentence_label)).sum().item()
                
                # extract loss
                loss = outputs.loss
                mlm_loss = outputs.mlm_loss.item()
                nsp_loss = outputs.nsp_loss.item()
                mlm_acc = mlm_correct / mask_nums
                nsp_acc = nsp_correct / nsp_nums
                # calculate loss for every parameter that needs grad update
                loss.backward()
                # update parameters
                self.optim.step()
                # print relevant info to progress bar
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(Total_loss='{:.4f}'.format(loss.item()), MLM_Accuracy='{:.4f}'.format(mlm_acc), NSP_Accuracy='{:.4f}'.format(nsp_acc), \
                                MLM_loss='{:.4f}'.format(mlm_loss), NSP_loss='{:.4f}'.format(nsp_loss), Mask_Percent=percent_now)

            acc_each_epoch.append(mlm_acc)
            loss_each_epoch.append(mlm_loss)
            Mask_Percent_each_epoch.append(percent_now)

            if self.masking_method == "DMLM":
                percent_now += 1
            elif self.masking_method == "propose":
                if (mlm_acc >= self.acc_goal_each_epoch[epoch] * 0.01) or stay >= 2:
                    stay = 0
                    percent_now = 6 + epoch + 1
                else:
                    stay += 1


        if self.saveCSV:
            
            new_rec = pd.concat([self.rec, pd.DataFrame(pd.DataFrame({'mlm_acc_each_epoch': [acc_each_epoch], 'mlm_loss_each_epoch': [loss_each_epoch], 'Mask_Percent_each_epoch': [Mask_Percent_each_epoch]}))], ignore_index=True)
            new_rec.to_csv("record_mask_grow.csv", index = False)
        torch.cuda.empty_cache()
    
    def save_model(self, model_name):
        self.model.save_pretrained(model_name)

In [9]:
datapath = 'bbc-text.csv'
modelType = 'bert-base-cased'
epoch = 10
batch_size = 6
epoch_acc = [33.7, 42.1, 44.2, 45.7, 47.3, 49.0, 50.6, 51.9, 53.8 , 55.6]

In [10]:
mask_dyn_grow1_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = mask_dyn_grow1_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_mask_dyn_grow1")
mask_dyn_grow1_input = None
mask_dyn_grow1 = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:21<00:00,  2.63it/s, MLM_Accuracy=0.3309, MLM_loss=0.0464, Mask_Percent=6, NSP_Accuracy=0.8319, NSP_loss=0.0246, Total_loss=0.0710]
Epoch 1: 100%|██████████| 371/371 [02:11<00:00,  2.82it/s, MLM_Accuracy=0.4085, MLM_loss=0.0367, Mask_Percent=7, NSP_Accuracy=0.9344, NSP_loss=0.0134, Total_loss=0.0500]
Epoch 2: 100%|██████████| 371/371 [02:11<00:00,  2.83it/s, MLM_Accuracy=0.4072, MLM_loss=0.0406, Mask_Percent=8, NSP_Accuracy=0.9519, NSP_loss=0.2928, Total_loss=0.3334]
Epoch 3: 100%|██████████| 371/371 [02:12<00:00,  2.81it/s, MLM_Accuracy=0.4277, MLM_loss=0.0487, Mask_Percent=9, NSP_Accuracy=0.9596, NSP_

In [10]:
mask_dyn_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn = trainModel(modelType = modelType, inputs = mask_dyn_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn_input = None
mask_dyn = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:16<00:00,  2.73it/s, MLM_Accuracy=0.3264, MLM_loss=0.0539, Mask_Percent=6, NSP_Accuracy=0.8458, NSP_loss=0.2495, Total_loss=0.3035]
Epoch 1: 100%|██████████| 371/371 [02:15<00:00,  2.74it/s, MLM_Accuracy=0.4121, MLM_loss=0.0348, Mask_Percent=6, NSP_Accuracy=0.9393, NSP_loss=0.0330, Total_loss=0.0678]
Epoch 2: 100%|██████████| 371/371 [02:16<00:00,  2.71it/s, MLM_Accuracy=0.4100, MLM_loss=0.0234, Mask_Percent=6, NSP_Accuracy=0.9672, NSP_loss=0.0084, Total_loss=0.0318]
Epoch 3: 100%|██████████| 371/371 [02:18<00:00,  2.67it/s, MLM_Accuracy=0.4349, MLM_loss=0.0558, Mask_Percent=9, NSP_Accuracy=0.9721, NSP_

In [11]:
mask_dyn_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn = trainModel(modelType = modelType, inputs = mask_dyn_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn_input = None
mask_dyn = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:17<00:00,  2.69it/s, MLM_Accuracy=0.3401, MLM_loss=0.0421, Mask_Percent=6, NSP_Accuracy=0.8319, NSP_loss=0.5053, Total_loss=0.5474]
Epoch 1: 100%|██████████| 371/371 [02:18<00:00,  2.68it/s, MLM_Accuracy=0.4154, MLM_loss=0.0242, Mask_Percent=7, NSP_Accuracy=0.9515, NSP_loss=0.1555, Total_loss=0.1797]
Epoch 2: 100%|██████████| 371/371 [02:17<00:00,  2.69it/s, MLM_Accuracy=0.4337, MLM_loss=0.0264, Mask_Percent=7, NSP_Accuracy=0.9613, NSP_loss=0.1169, Total_loss=0.1434]
Epoch 3: 100%|██████████| 371/371 [02:18<00:00,  2.68it/s, MLM_Accuracy=0.4349, MLM_loss=0.0347, Mask_Percent=7, NSP_Accuracy=0.9726, NSP_

In [12]:
mask_dyn_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn = trainModel(modelType = modelType, inputs = mask_dyn_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn_input = None
mask_dyn = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:15<00:00,  2.74it/s, MLM_Accuracy=0.3401, MLM_loss=0.0770, Mask_Percent=6, NSP_Accuracy=0.8463, NSP_loss=0.2719, Total_loss=0.3489]
Epoch 1: 100%|██████████| 371/371 [02:16<00:00,  2.72it/s, MLM_Accuracy=0.4102, MLM_loss=0.0361, Mask_Percent=7, NSP_Accuracy=0.9375, NSP_loss=0.0480, Total_loss=0.0841]
Epoch 2: 100%|██████████| 371/371 [02:16<00:00,  2.72it/s, MLM_Accuracy=0.4227, MLM_loss=0.0197, Mask_Percent=7, NSP_Accuracy=0.9604, NSP_loss=0.2295, Total_loss=0.2492]
Epoch 3: 100%|██████████| 371/371 [02:16<00:00,  2.72it/s, MLM_Accuracy=0.4423, MLM_loss=0.0255, Mask_Percent=7, NSP_Accuracy=0.9694, NSP_

In [13]:
mask_dyn_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn = trainModel(modelType = modelType, inputs = mask_dyn_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn_input = None
mask_dyn = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:15<00:00,  2.74it/s, MLM_Accuracy=0.3199, MLM_loss=0.0272, Mask_Percent=6, NSP_Accuracy=0.8310, NSP_loss=0.2244, Total_loss=0.2516]
Epoch 1: 100%|██████████| 371/371 [02:15<00:00,  2.73it/s, MLM_Accuracy=0.4123, MLM_loss=0.0177, Mask_Percent=6, NSP_Accuracy=0.9317, NSP_loss=0.0256, Total_loss=0.0433]
Epoch 2: 100%|██████████| 371/371 [02:16<00:00,  2.72it/s, MLM_Accuracy=0.4206, MLM_loss=0.0237, Mask_Percent=6, NSP_Accuracy=0.9663, NSP_loss=0.0535, Total_loss=0.0773]
Epoch 3: 100%|██████████| 371/371 [02:16<00:00,  2.72it/s, MLM_Accuracy=0.4188, MLM_loss=0.0222, Mask_Percent=9, NSP_Accuracy=0.9690, NSP_

In [10]:
mask_dyn_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
mask_dyn = trainModel(modelType = modelType, inputs = mask_dyn_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn_input = None
mask_dyn = None

Some weights of MyBertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:15<00:00,  2.73it/s, MLM_Accuracy=0.3481, MLM_loss=0.0439, Mask_Percent=6, NSP_Accuracy=0.8288, NSP_loss=1.3025, Total_loss=1.3463]
Epoch 1: 100%|██████████| 371/371 [02:17<00:00,  2.70it/s, MLM_Accuracy=0.4150, MLM_loss=0.0277, Mask_Percent=7, NSP_Accuracy=0.9371, NSP_loss=0.3044, Total_loss=0.3321]
Epoch 2: 100%|██████████| 371/371 [02:18<00:00,  2.68it/s, MLM_Accuracy=0.4103, MLM_loss=0.0228, Mask_Percent=7, NSP_Accuracy=0.9393, NSP_loss=0.0585, Total_loss=0.0813]
Epoch 3: 100%|██████████| 371/371 [02:19<00:00,  2.66it/s, MLM_Accuracy=0.4285, MLM_loss=0.0242, Mask_Percent=7, NSP_Accuracy=0.9690, NSP_