# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from transformers import BertTokenizer, BertForPreTraining, AdamW
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput, BertPreTrainingHeads, BertConfig, BERT_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from transformers.models.albert.modeling_albert import AlbertSOPHead
from transformers.utils import ModelOutput
from transformers.utils.doc import add_start_docstrings_to_model_forward, replace_return_docstrings
from torch.nn import CrossEntropyLoss
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from tqdm import tqdm
import pandas as pd
import torch
import random
import copy

In [3]:
class MyBertForPreTrainingOutput(BertForPreTrainingOutput):
    """
    Output type of [`MyBertForPreTraining`].
    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        mlm_loss (`float`):
            MLM loss.
        nsp_loss (`float`):
            NSP loss.
    """
    def __init__(self, loss=None, prediction_logits=None, seq_relationship_logits=None, hidden_states=None, attentions=None, mlm_loss=None, nsp_loss=None):
        super().__init__(loss=loss, prediction_logits=prediction_logits, seq_relationship_logits=seq_relationship_logits, hidden_states=hidden_states, attentions=attentions)
        self.mlm_loss = mlm_loss
        self.nsp_loss = nsp_loss

In [4]:
class MyAlbertSOPHead(torch.nn.Module):
    def __init__(self, config: BertConfig):
        super().__init__()  

        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size , config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits

In [5]:
class BertPretrainingHeadsWithSOP(BertPreTrainingHeads):
    def __init__(self, config):
        super().__init__(config)
        self.seq_relationship = MyAlbertSOPHead(config)

In [6]:
class MyBertForPreTraining(BertForPreTraining):
    def __init__(self, config, nspTask = "NSP"):
        super().__init__(config)
        if nspTask == "SOP":
            self.cls = BertPretrainingHeadsWithSOP(config)
            
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
                Used to hide legacy arguments that have been deprecated.
        Returns:
        Example:
        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MyBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            mlm_loss = masked_lm_loss,
            nsp_loss = next_sentence_loss,
        )

# 取出資料集

In [7]:
class getData():
    def __init__(self, modelType, datapath, nspTask = "NSP"):
        self.datapath = datapath
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.nspTask = nspTask
        self.text = self.toText()
        self.sentence_a = []
        self.sentence_b = []
        self.label = []
        self.inputs = None
        self.nspPrepare()
        self.inputs['labels'] = self.inputs.input_ids.detach().clone()
    
    def toText(self):
        df = pd.read_csv(self.datapath)
        text = []
        for review in df["text"]:
            text.append(review)
        
        return text
    
    def nspPrepare(self):
        bag = [item for sentence in self.text for item in sentence.split('. ') if item != '']
        bag_size = len(bag)

        if self.nspTask == "NSP":
            self.nspData(bag, bag_size)
        elif self.nspTask == "SOP":
            self.sopData()

        self.inputs = self.tokenizer(self.sentence_a, self.sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')
        self.inputs['next_sentence_label'] = torch.LongTensor([self.label]).T
    
    def nspData(self, bag, bag_size):
        for paragraph in self.text:
            sentences = [
                sentence for sentence in paragraph.split('.') if sentence != ''
            ]
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(sentences[start+1])
                    self.label.append(0)
                else:
                    index = random.randint(0, bag_size-1)
                    # this is NotNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(bag[index])
                    self.label.append(1)
    
    def sopData(self):
        for paragraph in self.text:
            sentences = [
                sentence for sentence in paragraph.split('.') if sentence != ''
            ]
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    self.sentence_a.append(sentences[start])
                    self.sentence_b.append(sentences[start+1])
                    self.label.append(0)
                else:
                    # this is NotNextSentence
                    self.sentence_a.append(sentences[start+1])
                    self.sentence_b.append(sentences[start])
                    self.label.append(1)
    
    def returnInput(self):
        return self.inputs

In [8]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [9]:
class trainModel():
    def __init__(self, modelType, inputs, batch_size, epoch, acc_goal_each_epoch, masking_method = "purpose", saveModelName = "", saveCSV = True, nspTask = "NSP"):
        self.model = MyBertForPreTraining.from_pretrained(modelType)
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.inputs = inputs
        self.batch_size = batch_size
        self.epoch = epoch
        self.acc_goal_each_epoch = acc_goal_each_epoch  # 每個 epoch 的 MLM 正確率基準
        self.masking_method = masking_method
        self.saveModelName = saveModelName
        self.saveCSV = saveCSV
        self.loader = torch.utils.data.DataLoader(OurDataset(self.inputs), \
                                             batch_size=self.batch_size, shuffle=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        self.model.train()
        self.optim = AdamW(self.model.parameters(), lr = 5e-5)
        self.last_acc = 0.0
        
        if os.path.isfile("record_mask_grow_move.csv"):
            self.rec = pd.read_csv("record_mask_grow_move.csv")
        else:
            self.rec = pd.DataFrame({"mlm_acc_each_epoch":[], "mlm_loss_each_epoch":[], 'Mask_Percent_each_epoch':[]})
            
        self.training()
        # self.save_model(self.saveModelName)
    
    def mlmPrepare(self, input_sentences, maskPercentNow, mask_ori, mask_avai):
        # create mask array
        mask_arr = torch.full(mask_ori.shape, False)
        
        for i in range(len(mask_ori)):
            num_to_mask = round(len(torch.where(mask_ori[i])[0]) * (maskPercentNow * 0.01))
            avai_can_mask = torch.where(mask_avai[i])
            avai_can_mask_len = len(avai_can_mask[0])

            if num_to_mask <= avai_can_mask_len:
                mask_index = torch.randperm(avai_can_mask_len)[:num_to_mask]
                mask_arr[i, avai_can_mask[0][mask_index]] = True
                mask_avai[i] = mask_avai[i] ^ mask_arr[i]
            else:
                mask_index = torch.randperm(avai_can_mask_len)[:avai_can_mask_len]
                num_to_mask -= avai_can_mask_len
                mask_arr[i, avai_can_mask[0][mask_index]] = True
                
                set_mask_index = set(avai_can_mask[0][mask_index].numpy())

                mask_avai[i] = mask_avai[i] ^ mask_ori[i]
                avai_can_mask = torch.where(mask_avai[i])
                avai_can_mask_len = len(avai_can_mask[0])
                new_index = torch.randperm(avai_can_mask_len)[:num_to_mask]
                
                set_new_index = set(avai_can_mask[0][new_index].numpy())

                intersection = set_mask_index.intersection(set_new_index)
                while len(intersection) > 0:
                    new_index = torch.randperm(avai_can_mask_len)[:num_to_mask]
                    set_new_index = set(avai_can_mask[0][new_index].numpy())
                    intersection = set_mask_index.intersection(set_new_index)

                mask_arr[i, avai_can_mask[0][new_index]] = True
                mask_avai[i] = mask_avai[i] ^ mask_arr[i]

        selection = []

        for i in range(input_sentences.shape[0]):
            selection.append(
                torch.flatten(mask_arr[i].nonzero()).tolist()
            )

        rand_mask_type = copy.deepcopy(selection)

        for row in range(len(rand_mask_type)):
            for col in range(len(rand_mask_type[row])):
                rand_mask_type[row][col] = random.random()

        vocab_size = len(self.tokenizer.vocab)
        vocab = self.tokenizer.get_vocab()
        special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

        for i in range(input_sentences.shape[0]):
            for j in range(len(selection[i])):
                if rand_mask_type[i][j] < 0.10:
                    continue
                elif rand_mask_type[i][j] < 0.20:
                    rand_num = vocab['[CLS]']
                    while rand_num in special_tokens:
                        rand_num = random.randint(1, vocab_size)
                    input_sentences[i, selection[i][j]] = rand_num
                else:
                    input_sentences[i, selection[i][j]] = 103
        
        return input_sentences, mask_arr

    def training(self):
        acc_each_epoch = []
        loss_each_epoch = []
        Mask_Percent_each_epoch = []
        stay = 0
        percent_now = 6
        masking_position = {"avai_pos":[], "available":[]}

        for epoch in range(self.epoch):
            # setup loop with TQDM and dataloader
            mask_nums = 0
            mlm_correct = 0
            nsp_nums = 0
            nsp_correct = 0
            loop = tqdm(self.loader, leave=True)

            for batch_index, batch in enumerate(loop):
                if epoch == 0:
                    can_mask = (batch["input_ids"] != 101) * (batch["input_ids"] != 102) * (batch["input_ids"] != 0)
                    masking_position["avai_pos"].append((can_mask).detach().clone())
                    masking_position["available"].append((can_mask).detach().clone())
                
                input_sentences, mask_arr = self.mlmPrepare(batch["input_ids"].detach().clone(), percent_now, \
                                                            masking_position["avai_pos"][batch_index], masking_position["available"][batch_index])

                # initialize calculated gradients (from prev step)
                self.optim.zero_grad()
                # pull all tensor batches required for training
                input_ids = input_sentences.to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                next_sentence_label = batch['next_sentence_label'].to(self.device)
                labels = batch['labels'].to(self.device)
                # process
                outputs = self.model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                next_sentence_label=next_sentence_label,
                                labels=labels)
                
                prediction_logits = outputs.prediction_logits[mask_arr]
                predicted_ids = prediction_logits.argmax(-1)
                
                seq_relationship_logits = outputs.seq_relationship_logits
                predicted_labels = torch.argmax(seq_relationship_logits, dim=1)
                predicted_label = predicted_labels

                mask_nums += len(predicted_ids)
                mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
                nsp_nums += len(predicted_label)
                nsp_correct += predicted_label.eq(torch.squeeze(next_sentence_label)).sum().item()
                
                # extract loss
                loss = outputs.loss
                mlm_loss = outputs.mlm_loss.item()
                nsp_loss = outputs.nsp_loss.item()
                mlm_acc = mlm_correct / mask_nums
                nsp_acc = nsp_correct / nsp_nums
                # calculate loss for every parameter that needs grad update
                loss.backward()
                # update parameters
                self.optim.step()
                # print relevant info to progress bar
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(Total_loss='{:.4f}'.format(loss.item()), MLM_Accuracy='{:.4f}'.format(mlm_acc), NSP_Accuracy='{:.4f}'.format(nsp_acc), \
                                MLM_loss='{:.4f}'.format(mlm_loss), NSP_loss='{:.4f}'.format(nsp_loss), Mask_Percent=percent_now)
            
            acc_each_epoch.append(mlm_acc)
            loss_each_epoch.append(mlm_loss)
            Mask_Percent_each_epoch.append(percent_now)

            if self.masking_method == "DMLM":
                percent_now += 1
            elif self.masking_method == "purpose":
                if (mlm_acc >= self.acc_goal_each_epoch[epoch] * 0.01) or stay >= 2:
                    stay = 0
                    percent_now = 6 + epoch + 1
                else:
                    stay += 1
            elif self.masking_method == "adaptive":
                if mlm_acc > self.last_acc:
                    percent_now += 1
                else:
                    percent_now -= 1
                self.last_acc = mlm_acc
            
            if epoch % 5 == 4:
                self.save_model(self.saveModelName + "_epoch" + str(epoch + 1))


        if self.saveCSV:
            
            new_rec = pd.concat([self.rec, pd.DataFrame(pd.DataFrame({'mlm_acc_each_epoch': [acc_each_epoch], 'mlm_loss_each_epoch': [loss_each_epoch], 'Mask_Percent_each_epoch': [Mask_Percent_each_epoch]}))], ignore_index=True)
            new_rec.to_csv("record_mask_grow_move.csv", index = False)
        torch.cuda.empty_cache()
    
    def save_model(self, model_name):
        self.model.save_pretrained(model_name)

In [10]:
datapath = 'bbc-text.csv'
modelType = 'bert-base-cased'
epoch = 10
batch_size = 6
nsp_input = getData(modelType = modelType, datapath = datapath, nspTask = "NSP")
epoch_acc = [34.0, 42.1, 44.2, 45.7, 47.3, 49.7, 50.8, 52.8, 53.8 , 55.6]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [11]:
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_movemask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:55<00:00,  2.11it/s, MLM_Accuracy=0.3424, MLM_loss=0.0631, Mask_Percent=6, NSP_Accuracy=0.8400, NSP_loss=0.4383, Total_loss=0.5014]
Epoch 1: 100%|██████████| 371/371 [03:02<00:00,  2.03it/s, MLM_Accuracy=0.4911, MLM_loss=0.0307, Mask_Percent=7, NSP_Accuracy=0.9411, NSP_loss=0.3854, Total_loss=0.4161]
Epoch 2: 100%|██████████| 371/371 [03:06<00:00,  1.99it/s, MLM_Accuracy=0.4972, MLM_loss=0.0423, Mask_Percent=8, NSP_Accuracy=0.9645, NSP_loss=0.0698, Total_loss=0.1122]
Epoch 3: 100%|██████████| 371/371 [03:06<00:00,  1.99it/s, MLM_Accuracy=0.4963, MLM_loss=0.0462, Mask_Percent=9, NSP_Accuracy=0.9744, NSP_loss=0.9217, Total_loss=0.9679]
Epoch 4: 100%|██████████| 371/371 [03:06<00:00,  1.99it/s, MLM_Accuracy=0.5131, MLM_loss=0.0252, Mask_Percent=10, NSP_Accuracy=0.9771, NSP_loss=0.1771, Total_loss=0.2023]
Epoch 5: 100%|██████████| 371/371 [02:57<00:00,  2.09it/s, MLM_Accuracy

In [12]:
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_movemask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.3568, MLM_loss=0.0459, Mask_Percent=6, NSP_Accuracy=0.8387, NSP_loss=0.6125, Total_loss=0.6584]
Epoch 1: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.4906, MLM_loss=0.0457, Mask_Percent=7, NSP_Accuracy=0.9452, NSP_loss=0.0662, Total_loss=0.1119]
Epoch 2: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.5027, MLM_loss=0.0314, Mask_Percent=8, NSP_Accuracy=0.9694, NSP_loss=0.0120, Total_loss=0.0434]
Epoch 3: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.5043, MLM_loss=0.0369, Mask_Percent=9, NSP_Accuracy=0.9780, NSP_loss=0.0054, Total_loss=0.0423]
Epoch 4: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.5102, MLM_loss=0.0294, Mask_Percent=10, NSP_Accuracy=0.9757, NSP_loss=0.0361, Total_loss=0.0654]
Epoch 5: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy

In [13]:
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_movemask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.3464, MLM_loss=0.0415, Mask_Percent=6, NSP_Accuracy=0.8422, NSP_loss=1.4112, Total_loss=1.4527]
Epoch 1: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.5054, MLM_loss=0.0445, Mask_Percent=7, NSP_Accuracy=0.9398, NSP_loss=0.1197, Total_loss=0.1642]
Epoch 2: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.4941, MLM_loss=0.0306, Mask_Percent=8, NSP_Accuracy=0.9658, NSP_loss=0.0123, Total_loss=0.0429]
Epoch 3: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.5060, MLM_loss=0.0330, Mask_Percent=9, NSP_Accuracy=0.9699, NSP_loss=0.0070, Total_loss=0.0400]
Epoch 4: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy=0.5131, MLM_loss=0.0311, Mask_Percent=10, NSP_Accuracy=0.9730, NSP_loss=0.0347, Total_loss=0.0658]
Epoch 5: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy

In [None]:
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_movemask_dyn_grow1")
mask_dyn_grow1 = None

In [None]:
mask_dyn_grow1 = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_movemask_dyn_grow1")
mask_dyn_grow1 = None

In [14]:
mask_dyn = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_movemask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.3430, MLM_loss=0.0416, Mask_Percent=6, NSP_Accuracy=0.8360, NSP_loss=0.1164, Total_loss=0.1580]
Epoch 1: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.4993, MLM_loss=0.0256, Mask_Percent=7, NSP_Accuracy=0.9447, NSP_loss=0.0244, Total_loss=0.0500]
Epoch 2: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.4988, MLM_loss=0.0284, Mask_Percent=8, NSP_Accuracy=0.9690, NSP_loss=0.0066, Total_loss=0.0350]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.5046, MLM_loss=0.0486, Mask_Percent=9, NSP_Accuracy=0.9631, NSP_loss=0.4834, Total_loss=0.5320]
Epoch 4: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.5082, MLM_loss=0.0436, Mask_Percent=10, NSP_Accuracy=0.9762, NSP_loss=0.0019, Total_loss=0.0455]
Epoch 5: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy

In [15]:
mask_dyn = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_movemask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:49<00:00,  2.20it/s, MLM_Accuracy=0.3378, MLM_loss=0.0519, Mask_Percent=6, NSP_Accuracy=0.8382, NSP_loss=0.3627, Total_loss=0.4146]
Epoch 1: 100%|██████████| 371/371 [02:46<00:00,  2.22it/s, MLM_Accuracy=0.4833, MLM_loss=0.0204, Mask_Percent=6, NSP_Accuracy=0.9348, NSP_loss=0.5544, Total_loss=0.5747]
Epoch 2: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.4914, MLM_loss=0.0408, Mask_Percent=8, NSP_Accuracy=0.9645, NSP_loss=0.3295, Total_loss=0.3704]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.5046, MLM_loss=0.0398, Mask_Percent=9, NSP_Accuracy=0.9753, NSP_loss=0.0241, Total_loss=0.0639]
Epoch 4: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.5015, MLM_loss=0.0283, Mask_Percent=10, NSP_Accuracy=0.9667, NSP_loss=0.0017, Total_loss=0.0300]
Epoch 5: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy

In [16]:
mask_dyn = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_movemask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:46<00:00,  2.23it/s, MLM_Accuracy=0.3773, MLM_loss=0.0478, Mask_Percent=6, NSP_Accuracy=0.8301, NSP_loss=0.5959, Total_loss=0.6437]
Epoch 1: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.4997, MLM_loss=0.0444, Mask_Percent=7, NSP_Accuracy=0.9398, NSP_loss=0.0573, Total_loss=0.1016]
Epoch 2: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.4994, MLM_loss=0.0326, Mask_Percent=8, NSP_Accuracy=0.9717, NSP_loss=0.0470, Total_loss=0.0796]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4947, MLM_loss=0.0255, Mask_Percent=9, NSP_Accuracy=0.9784, NSP_loss=0.0001, Total_loss=0.0256]
Epoch 4: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.5013, MLM_loss=0.0247, Mask_Percent=10, NSP_Accuracy=0.9627, NSP_loss=0.0016, Total_loss=0.0264]
Epoch 5: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy

In [None]:
mask_dyn = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_movemask_grow")
mask_dyn = None

In [None]:
mask_dyn = trainModel(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_movemask_grow")
mask_dyn = None

# --------------------------------

In [11]:
class trainModel_V3():
    def __init__(self, modelType, inputs, batch_size, epoch, acc_goal_each_epoch, masking_method = "purpose", saveModelName = "", saveCSV = True, nspTask = "NSP"):
        self.model = MyBertForPreTraining.from_pretrained(modelType)
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.inputs = inputs
        self.batch_size = batch_size
        self.epoch = epoch
        self.acc_goal_each_epoch = acc_goal_each_epoch  # 每個 epoch 的 MLM 正確率基準
        self.masking_method = masking_method
        self.saveModelName = saveModelName
        self.saveCSV = saveCSV
        self.loader = torch.utils.data.DataLoader(OurDataset(self.inputs), \
                                             batch_size=self.batch_size, shuffle=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        self.model.train()
        self.optim = AdamW(self.model.parameters(), lr = 5e-5)
        self.last_acc = 0.0
        
        if os.path.isfile("record_mask_grow_move.csv"):
            self.rec = pd.read_csv("record_mask_grow_move.csv")
        else:
            self.rec = pd.DataFrame({"mlm_acc_each_epoch":[], "mlm_loss_each_epoch":[], 'Mask_Percent_each_epoch':[]})
            
        self.training()
        self.save_model(self.saveModelName)
    
    def mlmPrepare(self, input_sentences, maskPercentNow):
        rand = torch.rand(input_sentences.shape)
        # create mask array
        mask_arr = (rand < maskPercentNow * 0.01) * (input_sentences != 101) * \
                (input_sentences != 102) * (input_sentences != 0)
        
        selection = []

        for i in range(input_sentences.shape[0]):
            selection.append(
                torch.flatten(mask_arr[i].nonzero()).tolist()
            )

        rand_mask_type = copy.deepcopy(selection)

        for row in range(len(rand_mask_type)):
            for col in range(len(rand_mask_type[row])):
                rand_mask_type[row][col] = random.random()

        vocab_size = len(self.tokenizer.vocab)
        vocab = self.tokenizer.get_vocab()
        special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

        for i in range(input_sentences.shape[0]):
            for j in range(len(selection[i])):
                if rand_mask_type[i][j] < 0.10:
                    continue
                elif rand_mask_type[i][j] < 0.20:
                    rand_num = vocab['[CLS]']
                    while rand_num in special_tokens:
                        rand_num = random.randint(1, vocab_size)
                    input_sentences[i, selection[i][j]] = rand_num
                else:
                    input_sentences[i, selection[i][j]] = 103
        
        return input_sentences, mask_arr

    def training(self):
        acc_each_epoch = []
        loss_each_epoch = []
        Mask_Percent_each_epoch = []
        stay = 0
        percent_now = 6

        for epoch in range(self.epoch):
            # setup loop with TQDM and dataloader
            mask_nums = 0
            mlm_correct = 0
            nsp_nums = 0
            nsp_correct = 0
            loop = tqdm(self.loader, leave=True)

            for batch in loop:
                input_sentences, mask_arr = self.mlmPrepare(batch["input_ids"].detach().clone(), percent_now)

                # initialize calculated gradients (from prev step)
                self.optim.zero_grad()
                # pull all tensor batches required for training
                input_ids = input_sentences.to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                next_sentence_label = batch['next_sentence_label'].to(self.device)
                labels = batch['labels'].to(self.device)
                # process
                outputs = self.model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                next_sentence_label=next_sentence_label,
                                labels=labels)
                
                prediction_logits = outputs.prediction_logits[mask_arr]
                predicted_ids = prediction_logits.argmax(-1)
                
                seq_relationship_logits = outputs.seq_relationship_logits
                predicted_labels = torch.argmax(seq_relationship_logits, dim=1)
                predicted_label = predicted_labels

                mask_nums += len(predicted_ids)
                mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
                nsp_nums += len(predicted_label)
                nsp_correct += predicted_label.eq(torch.squeeze(next_sentence_label)).sum().item()
                
                # extract loss
                loss = outputs.loss
                mlm_loss = outputs.mlm_loss.item()
                nsp_loss = outputs.nsp_loss.item()
                mlm_acc = mlm_correct / mask_nums
                nsp_acc = nsp_correct / nsp_nums
                # calculate loss for every parameter that needs grad update
                loss.backward()
                # update parameters
                self.optim.step()
                
                # print relevant info to progress bar
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(Total_loss='{:.4f}'.format(loss.item()), MLM_Accuracy='{:.4f}'.format(mlm_acc), NSP_Accuracy='{:.4f}'.format(nsp_acc), \
                                MLM_loss='{:.4f}'.format(mlm_loss), NSP_loss='{:.4f}'.format(nsp_loss), Mask_Percent=percent_now)

            acc_each_epoch.append(mlm_acc)
            loss_each_epoch.append(mlm_loss)
            Mask_Percent_each_epoch.append(percent_now)

            if self.masking_method == "DMLM":
                percent_now += 1
            elif self.masking_method == "purpose":
                if (mlm_acc >= self.acc_goal_each_epoch[epoch] * 0.01) or stay >= 2:
                    stay = 0
                    percent_now = 6 + epoch + 1
                else:
                    stay += 1
            elif self.masking_method == "adaptive":
                if mlm_acc > self.last_acc:
                    percent_now += 1
                else:
                    percent_now -= 1
                self.last_acc = mlm_acc
            
            if epoch % 5 == 4:
                self.save_model(self.saveModelName + "_epoch" + str(epoch + 1))

        if self.saveCSV:
            
            new_rec = pd.concat([self.rec, pd.DataFrame(pd.DataFrame({'mlm_acc_each_epoch': [acc_each_epoch], 'mlm_loss_each_epoch': [loss_each_epoch], 'Mask_Percent_each_epoch': [Mask_Percent_each_epoch]}))], ignore_index=True)
            new_rec.to_csv("record_mask_grow_move.csv", index = False)
        torch.cuda.empty_cache()
    
    def save_model(self, model_name):
        self.model.save_pretrained(model_name)

In [18]:
mask_dyn_grow1 = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_mask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.3447, MLM_loss=0.0351, Mask_Percent=6, NSP_Accuracy=0.8557, NSP_loss=0.1295, Total_loss=0.1646]
Epoch 1: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.4122, MLM_loss=0.0476, Mask_Percent=7, NSP_Accuracy=0.9344, NSP_loss=0.3681, Total_loss=0.4157]
Epoch 2: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.4185, MLM_loss=0.0178, Mask_Percent=8, NSP_Accuracy=0.9613, NSP_loss=0.2208, Total_loss=0.2386]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.21it/s, MLM_Accuracy=0.4196, MLM_loss=0.0409, Mask_Percent=9, NSP_Accuracy=0.9658, NSP_loss=0.5859, Total_loss=0.6268]
Epoch 4: 100%|██████████| 371/371 [02:46<00:00,  2.22it/s, MLM_Accuracy=0.4263, MLM_loss=0.0317, Mask_Percent=10, NSP_Accuracy=0.9784, NSP_loss=0.0150, Total_loss=0.0467]
Epoch 5: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy

In [19]:
mask_dyn_grow1 = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_mask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.3344, MLM_loss=0.0391, Mask_Percent=6, NSP_Accuracy=0.8413, NSP_loss=0.8922, Total_loss=0.9313]
Epoch 1: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=0.4216, MLM_loss=0.0415, Mask_Percent=7, NSP_Accuracy=0.9344, NSP_loss=0.2081, Total_loss=0.2496]
Epoch 2: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.4110, MLM_loss=0.0353, Mask_Percent=8, NSP_Accuracy=0.9654, NSP_loss=0.0823, Total_loss=0.1177]
Epoch 3: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=0.4201, MLM_loss=0.0253, Mask_Percent=9, NSP_Accuracy=0.9784, NSP_loss=0.0042, Total_loss=0.0295]
Epoch 4: 100%|██████████| 371/371 [02:49<00:00,  2.18it/s, MLM_Accuracy=0.4234, MLM_loss=0.0584, Mask_Percent=10, NSP_Accuracy=0.9721, NSP_loss=0.0191, Total_loss=0.0775]
Epoch 5: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy

In [20]:
mask_dyn_grow1 = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "DMLM", saveModelName = "saved_model/saved_model_mask_dyn_grow1")
mask_dyn_grow1 = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:46<00:00,  2.23it/s, MLM_Accuracy=0.3403, MLM_loss=0.0451, Mask_Percent=6, NSP_Accuracy=0.8418, NSP_loss=0.3970, Total_loss=0.4421]
Epoch 1: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4083, MLM_loss=0.0159, Mask_Percent=7, NSP_Accuracy=0.9281, NSP_loss=0.1757, Total_loss=0.1916]
Epoch 2: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4199, MLM_loss=0.0425, Mask_Percent=8, NSP_Accuracy=0.9654, NSP_loss=0.0947, Total_loss=0.1372]
Epoch 3: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.4291, MLM_loss=0.0323, Mask_Percent=9, NSP_Accuracy=0.9735, NSP_loss=0.0018, Total_loss=0.0341]
Epoch 4: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.4202, MLM_loss=0.0206, Mask_Percent=10, NSP_Accuracy=0.9703, NSP_loss=0.0501, Total_loss=0.0707]
Epoch 5: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy

In [21]:
mask_dyn = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.3170, MLM_loss=0.0381, Mask_Percent=6, NSP_Accuracy=0.8355, NSP_loss=0.6524, Total_loss=0.6905]
Epoch 1: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4061, MLM_loss=0.0344, Mask_Percent=6, NSP_Accuracy=0.9398, NSP_loss=0.0004, Total_loss=0.0349]
Epoch 2: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.4172, MLM_loss=0.0584, Mask_Percent=6, NSP_Accuracy=0.9681, NSP_loss=0.0079, Total_loss=0.0663]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4000, MLM_loss=0.0256, Mask_Percent=9, NSP_Accuracy=0.9672, NSP_loss=0.7689, Total_loss=0.7945]
Epoch 4: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4248, MLM_loss=0.0278, Mask_Percent=9, NSP_Accuracy=0.9703, NSP_loss=0.0187, Total_loss=0.0465]
Epoch 5: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=

In [12]:
mask_dyn = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:40<00:00,  2.31it/s, MLM_Accuracy=0.3387, MLM_loss=0.0385, Mask_Percent=6, NSP_Accuracy=0.8324, NSP_loss=0.0635, Total_loss=0.1020]
Epoch 1: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.4092, MLM_loss=0.0268, Mask_Percent=6, NSP_Accuracy=0.9438, NSP_loss=0.0195, Total_loss=0.0463]
Epoch 2: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy=0.4272, MLM_loss=0.0187, Mask_Percent=6, NSP_Accuracy=0.9596, NSP_loss=0.0602, Total_loss=0.0789]
Epoch 3: 100%|██████████| 371/371 [02:49<00:00,  2.18it/s, MLM_Accuracy=0.4263, MLM_loss=0.0396, Mask_Percent=9, NSP_Accuracy=0.9631, NSP_loss=0.0941, Total_loss=0.1337]
Epoch 4: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy=0.4203, MLM_loss=0.0251, Mask_Percent=9, NSP_Accuracy=0.9766, NSP_loss=0.4379, Total_loss=0.4630]
Epoch 5: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=

In [12]:
mask_dyn = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, saveModelName = "saved_model/saved_model_mask_grow")
mask_dyn = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:54<00:00,  2.12it/s, MLM_Accuracy=0.3284, MLM_loss=0.0584, Mask_Percent=6, NSP_Accuracy=0.8261, NSP_loss=0.3186, Total_loss=0.3770]
Epoch 1: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy=0.4120, MLM_loss=0.0280, Mask_Percent=6, NSP_Accuracy=0.9213, NSP_loss=0.4587, Total_loss=0.4867]
Epoch 2: 100%|██████████| 371/371 [02:53<00:00,  2.14it/s, MLM_Accuracy=0.4254, MLM_loss=0.0229, Mask_Percent=6, NSP_Accuracy=0.9591, NSP_loss=0.0001, Total_loss=0.0229]
Epoch 3: 100%|██████████| 371/371 [02:54<00:00,  2.12it/s, MLM_Accuracy=0.4205, MLM_loss=0.0483, Mask_Percent=9, NSP_Accuracy=0.9694, NSP_loss=0.0245, Total_loss=0.0728]
Epoch 4: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=0.4277, MLM_loss=0.0248, Mask_Percent=9, NSP_Accuracy=0.9847, NSP_loss=0.0092, Total_loss=0.0340]
Epoch 5: 100%|██████████| 371/371 [02:50<00:00,  2.17it/s, MLM_Accuracy=

In [12]:
adp = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "adaptive", saveModelName = "saved_model/saved_model_mask_adaptive")
adp = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.3339, MLM_loss=0.0421, Mask_Percent=6, NSP_Accuracy=0.8274, NSP_loss=0.1160, Total_loss=0.1580]
Epoch 1: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.4124, MLM_loss=0.0314, Mask_Percent=7, NSP_Accuracy=0.9321, NSP_loss=0.0280, Total_loss=0.0594]
Epoch 2: 100%|██████████| 371/371 [02:51<00:00,  2.16it/s, MLM_Accuracy=0.4202, MLM_loss=0.0401, Mask_Percent=8, NSP_Accuracy=0.9649, NSP_loss=0.0473, Total_loss=0.0873]
Epoch 3: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=0.4130, MLM_loss=0.0570, Mask_Percent=9, NSP_Accuracy=0.9654, NSP_loss=0.0435, Total_loss=0.1005]
Epoch 4: 100%|██████████| 371/371 [02:51<00:00,  2.17it/s, MLM_Accuracy=0.4279, MLM_loss=0.0536, Mask_Percent=8, NSP_Accuracy=0.9762, NSP_loss=0.0053, Total_loss=0.0589]
Epoch 5: 100%|██████████| 371/371 [02:53<00:00,  2.14it/s, MLM_Accuracy=

In [12]:
adp = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "adaptive", saveModelName = "saved_model/saved_model_mask_adaptive")
adp = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:53<00:00,  2.13it/s, MLM_Accuracy=0.3524, MLM_loss=0.0436, Mask_Percent=6, NSP_Accuracy=0.8288, NSP_loss=0.0983, Total_loss=0.1419]
Epoch 1: 100%|██████████| 371/371 [03:01<00:00,  2.04it/s, MLM_Accuracy=0.4078, MLM_loss=0.0435, Mask_Percent=7, NSP_Accuracy=0.9204, NSP_loss=0.1667, Total_loss=0.2102]
Epoch 2: 100%|██████████| 371/371 [03:04<00:00,  2.02it/s, MLM_Accuracy=0.4199, MLM_loss=0.0284, Mask_Percent=8, NSP_Accuracy=0.9609, NSP_loss=0.0079, Total_loss=0.0363]
Epoch 3: 100%|██████████| 371/371 [02:41<00:00,  2.29it/s, MLM_Accuracy=0.4269, MLM_loss=0.0252, Mask_Percent=9, NSP_Accuracy=0.9681, NSP_loss=0.0142, Total_loss=0.0395]
Epoch 4: 100%|██████████| 371/371 [02:38<00:00,  2.34it/s, MLM_Accuracy=0.4165, MLM_loss=0.0286, Mask_Percent=10, NSP_Accuracy=0.9739, NSP_loss=0.2421, Total_loss=0.2708]
Epoch 5: 100%|██████████| 371/371 [02:39<00:00,  2.33it/s, MLM_Accuracy

In [13]:
adp = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "adaptive", saveModelName = "saved_model/saved_model_mask_adaptive")
adp = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:39<00:00,  2.32it/s, MLM_Accuracy=0.3235, MLM_loss=0.0437, Mask_Percent=6, NSP_Accuracy=0.8310, NSP_loss=0.3136, Total_loss=0.3573]
Epoch 1: 100%|██████████| 371/371 [02:39<00:00,  2.32it/s, MLM_Accuracy=0.4146, MLM_loss=0.0143, Mask_Percent=7, NSP_Accuracy=0.9263, NSP_loss=0.2761, Total_loss=0.2904]
Epoch 2: 100%|██████████| 371/371 [02:40<00:00,  2.32it/s, MLM_Accuracy=0.4148, MLM_loss=0.0225, Mask_Percent=8, NSP_Accuracy=0.9685, NSP_loss=0.0296, Total_loss=0.0521]
Epoch 3: 100%|██████████| 371/371 [02:36<00:00,  2.37it/s, MLM_Accuracy=0.4127, MLM_loss=0.0470, Mask_Percent=9, NSP_Accuracy=0.9582, NSP_loss=0.0402, Total_loss=0.0872]
Epoch 4: 100%|██████████| 371/371 [02:36<00:00,  2.36it/s, MLM_Accuracy=0.4425, MLM_loss=0.0272, Mask_Percent=8, NSP_Accuracy=0.9825, NSP_loss=0.0238, Total_loss=0.0510]
Epoch 5: 100%|██████████| 371/371 [02:39<00:00,  2.33it/s, MLM_Accuracy=

In [14]:
adp = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "adaptive", saveModelName = "saved_model/saved_model_mask_adaptive")
adp = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.3300, MLM_loss=0.0365, Mask_Percent=6, NSP_Accuracy=0.8216, NSP_loss=0.4495, Total_loss=0.4860]
Epoch 1: 100%|██████████| 371/371 [02:47<00:00,  2.21it/s, MLM_Accuracy=0.4103, MLM_loss=0.0097, Mask_Percent=7, NSP_Accuracy=0.9384, NSP_loss=0.8086, Total_loss=0.8182]
Epoch 2: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy=0.4179, MLM_loss=0.0205, Mask_Percent=8, NSP_Accuracy=0.9551, NSP_loss=0.0043, Total_loss=0.0248]
Epoch 3: 100%|██████████| 371/371 [02:46<00:00,  2.23it/s, MLM_Accuracy=0.4310, MLM_loss=0.0221, Mask_Percent=9, NSP_Accuracy=0.9708, NSP_loss=0.1444, Total_loss=0.1665]
Epoch 4: 100%|██████████| 371/371 [02:46<00:00,  2.22it/s, MLM_Accuracy=0.4296, MLM_loss=0.0434, Mask_Percent=10, NSP_Accuracy=0.9753, NSP_loss=0.1067, Total_loss=0.1501]
Epoch 5: 100%|██████████| 371/371 [02:47<00:00,  2.22it/s, MLM_Accuracy

In [15]:
adp = trainModel_V3(modelType = modelType, inputs = nsp_input.returnInput(), batch_size = batch_size, epoch = epoch, acc_goal_each_epoch = epoch_acc, masking_method = "adaptive", saveModelName = "saved_model/saved_model_mask_adaptive")
adp = None

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.3253, MLM_loss=0.0530, Mask_Percent=6, NSP_Accuracy=0.8301, NSP_loss=0.9729, Total_loss=1.0259]
Epoch 1: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.4152, MLM_loss=0.0616, Mask_Percent=7, NSP_Accuracy=0.9317, NSP_loss=0.7327, Total_loss=0.7943]
Epoch 2: 100%|██████████| 371/371 [02:49<00:00,  2.19it/s, MLM_Accuracy=0.4329, MLM_loss=0.0441, Mask_Percent=8, NSP_Accuracy=0.9694, NSP_loss=0.0017, Total_loss=0.0458]
Epoch 3: 100%|██████████| 371/371 [02:48<00:00,  2.20it/s, MLM_Accuracy=0.4323, MLM_loss=0.0169, Mask_Percent=9, NSP_Accuracy=0.9757, NSP_loss=0.0077, Total_loss=0.0246]
Epoch 4: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=0.4295, MLM_loss=0.0423, Mask_Percent=8, NSP_Accuracy=0.9703, NSP_loss=0.0232, Total_loss=0.0655]
Epoch 5: 100%|██████████| 371/371 [02:50<00:00,  2.18it/s, MLM_Accuracy=

In [16]:
nsp_input = None