# 決定 Tokenizer 與使用 BertForPretraining 來做 BERT 預訓練

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from transformers import BertTokenizer, BertForPreTraining, AdamW
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput, BertPreTrainingHeads, BertConfig, BERT_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from transformers.models.albert.modeling_albert import AlbertSOPHead
from transformers.utils import ModelOutput
from transformers.utils.doc import add_start_docstrings_to_model_forward, replace_return_docstrings
from torch.nn import CrossEntropyLoss
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from tqdm import tqdm
import pandas as pd
import torch
import random
import copy

In [3]:
class MyBertForPreTrainingOutput(BertForPreTrainingOutput):
    """
    Output type of [`MyBertForPreTraining`].
    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        mlm_loss (`float`):
            MLM loss.
        nsp_loss (`float`):
            NSP loss.
    """
    def __init__(self, loss=None, prediction_logits=None, seq_relationship_logits=None, hidden_states=None, attentions=None, mlm_loss=None, nsp_loss=None):
        super().__init__(loss=loss, prediction_logits=prediction_logits, seq_relationship_logits=seq_relationship_logits, hidden_states=hidden_states, attentions=attentions)
        self.mlm_loss = mlm_loss
        self.nsp_loss = nsp_loss

In [4]:
class MyAlbertSOPHead(torch.nn.Module):
    def __init__(self, config: BertConfig):
        super().__init__()  

        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size , config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits

In [5]:
class BertPretrainingHeadsWithSOP(BertPreTrainingHeads):
    def __init__(self, config):
        super().__init__(config)
        self.seq_relationship = MyAlbertSOPHead(config)

In [6]:
class MyBertForPreTraining(BertForPreTraining):
    def __init__(self, config, nspTask = "NSP"):
        super().__init__(config)
        if nspTask == "SOP":
            self.cls = BertPretrainingHeadsWithSOP(config)
            
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
                Used to hide legacy arguments that have been deprecated.
        Returns:
        Example:
        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MyBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            mlm_loss = masked_lm_loss,
            nsp_loss = next_sentence_loss,
        )

# 取出資料集

In [7]:
class getData():
    def __init__(self, modelType, datapath, maskPercent, nspTask = "NSP"):
        self.datapath = datapath
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.maskPercent = maskPercent
        self.nspTask = nspTask
        self.text = self.toText()
        self.inputs = None
        self.nspPrepare()
        self.mlmPrepare()
    
    def toText(self):
        df = pd.read_csv("IMDB Dataset.csv")
        text = []
        for review in df["review"]:
            text.append(review)
        
        return text
    
    def nspPrepare(self):
        bag = []
        for sentence in self.text:
            for s_str in sentence.split('.<br /><br />'):
                if '. ' in s_str:
                    bag.extend(s_str.split('. '))
                elif '!' in s_str:
                    bag.extend(s_str.split('!'))
                elif '?' in s_str:
                    bag.extend(s_str.split('?'))
                else:
                    bag.append(s_str)
        bag_size = len(bag)

        if self.nspTask == "NSP":
            (sentence_a, sentence_b, label) = self.nspData(bag, bag_size)
        elif self.nspTask == "SOP":
            (sentence_a, sentence_b, label) = self.sopData()

        self.inputs = self.tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')
        self.inputs['next_sentence_label'] = torch.LongTensor([label]).T
    
    def nspData(self, bag, bag_size):
        sentence_a = []
        sentence_b = []
        label = []
        for paragraph in self.text:
            sentences = []
            for s_str in paragraph.split('.  '):
                if '!' in s_str:
                    sentences.extend(s_str.split('!'))
                elif '?' in s_str:
                    sentences.extend(s_str.split('?'))
                elif ';' in s_str:
                    sentences.extend(s_str.split(';'))
                else:
                    sentences.append(s_str)
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    sentence_a.append(sentences[start])
                    sentence_b.append(sentences[start+1])
                    label.append(0)
                else:
                    index = random.randint(0, bag_size-1)
                    # this is NotNextSentence
                    sentence_a.append(sentences[start])
                    sentence_b.append(bag[index])
                    label.append(1)
        
        return (sentence_a, sentence_b, label)
    
    def sopData(self):
        sentence_a = []
        sentence_b = []
        label = []
        for paragraph in self.text:
            sentences = [
                sentence for sentence in paragraph.split('.') if sentence != ''
            ]
            num_sentences = len(sentences)
            if num_sentences > 1:
                start = random.randint(0, num_sentences-2)
                # 50/50 whether is IsNextSentence or NotNextSentence
                if random.random() >= 0.5:
                    # this is IsNextSentence
                    sentence_a.append(sentences[start])
                    sentence_b.append(sentences[start+1])
                    label.append(0)
                else:
                    # this is NotNextSentence
                    sentence_a.append(sentences[start+1])
                    sentence_b.append(sentences[start])
                    label.append(1)
        
        return (sentence_a, sentence_b, label)

    def mlmPrepare(self):
        self.inputs['labels'] = self.inputs.input_ids.detach().clone()
        rand = torch.rand(self.inputs.input_ids.shape)
        # create mask array
        mask_arr = (rand < self.maskPercent * 0.01) * (self.inputs.input_ids != 101) * \
                (self.inputs.input_ids != 102) * (self.inputs.input_ids != 0)
        self.inputs['mask_arr'] = mask_arr
        
        selection = []

        for i in range(self.inputs.input_ids.shape[0]):
            selection.append(
                torch.flatten(mask_arr[i].nonzero()).tolist()
            )

        rand_mask_type = copy.deepcopy(selection)

        for row in range(len(rand_mask_type)):
            for col in range(len(rand_mask_type[row])):
                rand_mask_type[row][col] = random.random()

        vocab_size = len(self.tokenizer.vocab)
        vocab = self.tokenizer.get_vocab()
        special_tokens = [vocab['[CLS]'], vocab['[SEP]'], vocab['[MASK]'], vocab['[UNK]'],  vocab['[PAD]']]

        for i in range(self.inputs.input_ids.shape[0]):
            for j in range(len(selection[i])):
                if rand_mask_type[i][j] < 0.10:
                    continue
                elif rand_mask_type[i][j] < 0.20:
                    rand_num = vocab['[CLS]']
                    while rand_num in special_tokens:
                        rand_num = random.randint(1, vocab_size)
                    self.inputs.input_ids[i, selection[i][j]] = rand_num
                else:
                    self.inputs.input_ids[i, selection[i][j]] = 103
    
    def returnInput(self):
        return self.inputs

In [8]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [9]:
class trainModel():
    def __init__(self, modelType, inputs, batch_size, epoch, maskPercent, saveModelName, saveCSV = True, nspTask = "NSP"):
        # configuration = BertConfig.from_pretrained(modelType)
        self.model = MyBertForPreTraining.from_pretrained(modelType)
        self.tokenizer = BertTokenizer.from_pretrained(modelType)
        self.inputs = inputs
        self.batch_size = batch_size
        self.epoch = epoch
        self.maskPercent = maskPercent
        self.saveModelName = saveModelName
        self.saveCSV = saveCSV
        self.loader = torch.utils.data.DataLoader(OurDataset(self.inputs), \
                                             batch_size=self.batch_size, shuffle=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        self.model.train()
        self.optim = AdamW(self.model.parameters(), lr = 5e-5)
        self.record = {"mask_percent": None,
                  "mlm_acc_each_epoch": [], 
                  "mlm_loss_each_epoch": []}
        
        if os.path.isfile("record.csv"):
            self.rec = pd.read_csv("record.csv")
        else:
            self.rec = pd.DataFrame()
            
        self.training()
        self.save_model(self.saveModelName)
    
    def training(self):
        acc_each_epoch = []
        loss_each_epoch = []
        for epoch in range(self.epoch):
            # setup loop with TQDM and dataloader
            mask_nums = 0
            mlm_correct = 0
            nsp_nums = 0
            nsp_correct = 0
            loop = tqdm(self.loader, leave=True)
            for batch in loop:
                # initialize calculated gradients (from prev step)
                self.optim.zero_grad()
                # pull all tensor batches required for training
                input_ids = batch['input_ids'].to(self.device)
                token_type_ids = batch['token_type_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                next_sentence_label = batch['next_sentence_label'].to(self.device)
                labels = batch['labels'].to(self.device)
                mask_arr = batch['mask_arr'].to(self.device)
                # process
                outputs = self.model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                next_sentence_label=next_sentence_label,
                                labels=labels)
                
                prediction_logits = outputs.prediction_logits[mask_arr]
                predicted_ids = prediction_logits.argmax(-1)
                
                seq_relationship_logits = outputs.seq_relationship_logits
                predicted_labels = torch.argmax(seq_relationship_logits, dim=1)
                predicted_label = predicted_labels

                mask_nums += len(predicted_ids)
                mlm_correct += torch.eq(predicted_ids, labels[mask_arr]).sum().item()
                nsp_nums += len(predicted_label)
                nsp_correct += predicted_label.eq(torch.squeeze(next_sentence_label)).sum().item()
                
                # extract loss
                loss = outputs.loss
                mlm_loss = outputs.mlm_loss.item()
                nsp_loss = outputs.nsp_loss.item()
                mlm_acc = mlm_correct / mask_nums
                nsp_acc = nsp_correct / nsp_nums
                # calculate loss for every parameter that needs grad update
                loss.backward()
                # update parameters
                self.optim.step()
                # print relevant info to progress bar
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(Total_loss='{:.4f}'.format(loss.item()), MLM_Accuracy='{:.4f}'.format(mlm_acc), NSP_Accuracy='{:.4f}'.format(nsp_acc), \
                                MLM_loss='{:.4f}'.format(mlm_loss), NSP_loss='{:.4f}'.format(nsp_loss))
            acc_each_epoch.append(mlm_acc)
            loss_each_epoch.append(mlm_loss)

        if self.saveCSV:
            self.record["mask_percent"] = self.maskPercent
            self.record["mlm_acc_each_epoch"].append(acc_each_epoch)
            self.record["mlm_loss_each_epoch"].append(loss_each_epoch)
            new_rec = self.rec.append(self.record, ignore_index=True)
            new_rec.to_csv("record.csv", index = None)
        torch.cuda.empty_cache()
    
    def save_model(self, maskPercent):
        self.model.save_pretrained(maskPercent)

In [10]:
datapath = "IMDB Dataset.csv"
modelType = 'bert-base-uncased'
epoch = 10
batch_size = 8

In [11]:
mask15_input = getData(modelType = modelType, datapath = datapath, maskPercent = 15, nspTask = "NSP")
mask15 = trainModel(modelType = modelType, inputs = mask15_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 15, saveModelName = "saved_model/saved_model_mask15")
mask15_input = None
mask15 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████

In [11]:
mask14_input = getData(modelType = modelType, datapath = datapath, maskPercent = 14, nspTask = "NSP")
mask14 = trainModel(modelType = modelType, inputs = mask14_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 14, saveModelName = "saved_model/saved_model_mask14")
mask14_input = None
mask14 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [57:27<00:00,  1.80it/s, MLM_Accuracy=0.4661, MLM_loss=0.0394, NSP_Accuracy=0.7155, NSP_loss=0.3957, Total_loss=0.4351] 
Epoch 1: 100%|██████████| 6207/6207 [57:15<00:00,  1.81it/s, MLM_Accuracy=0.4566, MLM_loss=0.0399, NSP_Accuracy=0.8202, NSP_loss=0.7781, Total_loss=0.8180]
Epoch 2: 100%|██████████| 6207/6207 [57:20<00:00,  1.80it/s, MLM_Accuracy=0.4417, MLM_loss=0.0457, NSP_Accuracy=0.9075, NSP_loss=0.4141, Total_loss=0.4598]
Epoch 3: 100%|██████████| 6207/6207 [57:20<00:00,  1.80it/s, MLM_Accuracy=0.4293, MLM_loss=0.0624, NSP_Accuracy=0.9455, NSP_loss=0.6049, Total_loss=0.6673]
Epoch 4: 100%|██████████| 6207/6207 [57:16<00:00,  1.81it/s, MLM_Accura

In [11]:
mask13_input = getData(modelType = modelType, datapath = datapath, maskPercent = 13, nspTask = "NSP")
mask13 = trainModel(modelType = modelType, inputs = mask13_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 13, saveModelName = "saved_model/saved_model_mask13")
mask13_input = None
mask13 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [57:45<00:00,  1.79it/s, MLM_Accuracy=0.4732, MLM_loss=0.0461, NSP_Accuracy=0.7114, NSP_loss=0.5922, Total_loss=0.6383] 
Epoch 1: 100%|██████████| 6207/6207 [57:07<00:00,  1.81it/s, MLM_Accuracy=0.4603, MLM_loss=0.0621, NSP_Accuracy=0.8184, NSP_loss=0.1282, Total_loss=0.1903]
Epoch 2: 100%|██████████| 6207/6207 [57:05<00:00,  1.81it/s, MLM_Accuracy=0.4448, MLM_loss=0.0405, NSP_Accuracy=0.9073, NSP_loss=0.0359, Total_loss=0.0764]
E

In [12]:
mask12_input = getData(modelType = modelType, datapath = datapath, maskPercent = 12, nspTask = "NSP")
mask12 = trainModel(modelType = modelType, inputs = mask12_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 12, saveModelName = "saved_model/saved_model_mask12")
mask12_input = None
mask12 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [56:00<00:00,  1.85it/s, MLM_Accuracy=0.4767, MLM_loss=0.0527, NSP_Accuracy=0.7179, NSP_loss=0.4914, Total_loss=0.5441] 
Epoch 1: 100%|██████████| 6207/6207 [56:04<00:00,  1.84it/s, MLM_Accuracy=0.4597, MLM_loss=0.0461, NSP_Accuracy=0.8199, NSP_loss=0.6716, Total_loss=0.7177]
Epoch 2: 100%|██████████| 6207/6207 [56:05<00:00,  1.84it/s, MLM_Accuracy=0.4462, MLM_loss=0.0342, NSP_Accuracy=0.9091, NSP_loss=0.2677, Total_loss=0.3020]
E

In [13]:
mask11_input = getData(modelType = modelType, datapath = datapath, maskPercent = 11, nspTask = "NSP")
mask11 = trainModel(modelType = modelType, inputs = mask11_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 11, saveModelName = "saved_model/saved_model_mask11")
mask11_input = None
mask11 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [56:41<00:00,  1.82it/s, MLM_Accuracy=0.4821, MLM_loss=0.0536, NSP_Accuracy=0.7226, NSP_loss=0.5463, Total_loss=0.5999] 
Epoch 1: 100%|██████████| 6207/6207 [57:11<00:00,  1.81it/s, MLM_Accuracy=0.4663, MLM_l

In [12]:
mask10_input = getData(modelType = modelType, datapath = datapath, maskPercent = 10, nspTask = "NSP")
mask10 = trainModel(modelType = modelType, inputs = mask10_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 10, saveModelName = "saved_model/saved_model_mask10")
mask10_input = None
mask10 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [56:35<00:00,  1.83it/s, MLM_Accuracy=0.4812, MLM_loss=0.0355, NSP_Accuracy=0.7214, NSP_loss=0.5366, Total_loss=0.5721] 
Epoch 1: 100%|██████████| 6207/6207 [56:47<00:00,  1.82it/s, MLM_Accuracy=0.4670, MLM_loss=0.0333, NSP_Accuracy=0.8232, NSP_loss=0.3186, Total_loss=0.3519]
Epoch 2: 100%|██████████| 6207/6207 [56:48<00:00,  1.82it/s, MLM_Accuracy=0.4542, MLM_loss=0.0400, NSP_Accuracy=0.9114, NSP_loss=0.2130, Total_loss=0.2530]
E

In [11]:
mask9_input = getData(modelType = modelType, datapath = datapath, maskPercent = 9, nspTask = "NSP")
mask9 = trainModel(modelType = modelType, inputs = mask9_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 9, saveModelName = "saved_model/saved_model_mask9")
mask9_input = None
mask9 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [57:19<00:00,  1.80it/s, MLM_Accuracy=0.4887, MLM_loss=0.0189, NSP_Accuracy=0.7187, NSP_loss=0.8390, Total_loss=0.8578] 
Epoch 1: 100%|██████████| 6207/6207 [57:42<00:00,  1.79it/s, MLM_Accuracy=0.4717, MLM_loss=0.0160, NSP_Accuracy=0.8203, NSP_loss=0.1324, Total_loss=0.1484]
Epoch 2: 100%|██████████| 6207/6207 [57:48<00:00,  1.79it/s, MLM_Accuracy=0.4504, MLM_loss=0.0346, NSP_Accuracy=0.9078, NSP_loss=0.2396, Total_loss=0.2743]
E

In [11]:
mask8_input = getData(modelType = modelType, datapath = datapath, maskPercent = 8, nspTask = "NSP")
mask8 = trainModel(modelType = modelType, inputs = mask8_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 8, saveModelName = "saved_model/saved_model_mask8")
mask8_input = None
mask8 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [56:36<00:00,  1.83it/s, MLM_Accuracy=0.4841, MLM_loss=0.0137, NSP_Accuracy=0.7229, NSP_loss=0.4346, Total_loss=0.4484] 
Epoch 1: 100%|██████████| 6207/6207 [56:41<00:00,  1.82it/s, MLM_Accuracy=0.4688, MLM_loss=0.0132, NSP_Accuracy=0.8220, NSP_loss=0.3078, Total_loss=0.3211]
Epoch 2: 100%|██████████| 6207/6207 [56:47<00:00,  1.82it/s, MLM_Accuracy=0.4497, MLM_loss=0.0171, NSP_Accuracy=0.9059, NSP_loss=0.3251, Total_loss=0.3422]
E

In [12]:
mask7_input = getData(modelType = modelType, datapath = datapath, maskPercent = 7, nspTask = "NSP")
mask7 = trainModel(modelType = modelType, inputs = mask7_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 7, saveModelName = "saved_model/saved_model_mask7")
mask7_input = None
mask7 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [55:52<00:00,  1.85it/s, MLM_Accuracy=0.4867, MLM_loss=0.0252, NSP_Accuracy=0.7240, NSP_loss=0.7841, Total_loss=0.8093] 
Epoch 1: 100%|██████████| 6207/6207 [55:54<00:00,  1.85it/s, MLM_Accuracy=0.4748, MLM_loss=0.0270, NSP_Accuracy=0.8249, NSP_loss=1.0036, Total_loss=1.0306]
Epoch 2: 100%|██████████| 6207/6207 [56:30<00:00,  1.83it/s, MLM_Accuracy=0.4556, MLM_loss=0.0348, NSP_Accuracy=0.9073, NSP_loss=0.1858, Total_loss=0.2206]
E

In [12]:
mask6_input = getData(modelType = modelType, datapath = datapath, maskPercent = 6, nspTask = "NSP")
mask6 = trainModel(modelType = modelType, inputs = mask6_input.returnInput(), batch_size = batch_size, epoch = epoch, maskPercent = 6, saveModelName = "saved_model/saved_model_mask6")
mask6_input = None
mask6 = None

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 6207/6207 [54:09<00:00,  1.91it/s, MLM_Accuracy=0.4921, MLM_loss=0.0043, NSP_Accuracy=0.7222, NSP_loss=0.4447, Total_loss=0.4490] 
Epoch 1: 100%|██████████| 6207/6207 [54:05<00:00,  1.91it/s, MLM_Accuracy=0.4767, MLM_l

In [None]:
# mask15_input_sop = getData(modelType = modelType, datapath = datapath, maskPercent = 15, nspTask = "SOP")
# mask15_sop = trainModel(modelType = modelType, inputs = mask15_input_sop.returnInput(), batch_size = 6, epoch = epoch, maskPercent = 15, saveModelName = "saved_model_mask15_sop", saveCSV = False, nspTask = "SOP")