In [1]:
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM

In [2]:
DATASET_PATH = "../../data/meditations.txt"
PRE_TRAINED_MODEL_PATH = "../../models/meditations_mlm_bert_base_uncased_continual_pretraining"
MODEL_NAME = "bert-base-uncased"
NUM_EPOCHS = 2
BATCH_SIZE = 2
MAX_LENGTH = 256
SHUFFLE    = True

In [3]:
with open(DATASET_PATH, "r") as fin:
    text = fin.read().split('\n')

In [4]:
def generate_mlm_dataset(tokenizer, inputs):
    # create labels tensor by cloning the input_ids tensor
    inputs['labels'] = inputs.input_ids.detach().clone()
    
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    
    # create mask array except for special tokens [CLS], [SEP], [PAD]
    mask_arr = (rand < 0.15) \
             * (inputs.input_ids != 101) \
             * (inputs.input_ids != 102) \
             * (inputs.input_ids != 0)
    
    selection = []
    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    
    # For each row in the input_ids, assign 103 [MASK] token to the selection indices
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
    
    return inputs

In [5]:
class MLMDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings.input_ids)
    
    def __getitem__(self, index):
        return {key: torch.tensor(value[index].clone().detach()) for key, value in self.encodings.items()}

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForMaskedLM.from_pretrained(MODEL_NAME)

# tokenize the text
inputs = tokenizer(text,
                   return_tensors='pt',
                   max_length=MAX_LENGTH,
                   truncation=True,
                   padding='max_length')

inputs = generate_mlm_dataset(tokenizer, inputs)

#
dataset = MLMDataset(inputs)
loader  = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370116979/work/torch/csrc/utils/python_arg_parser.cpp:882.)


In [None]:
model.to(device)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(NUM_EPOCHS):
    # this just creates blank progress bars for both our epochs
    loop = tqdm(loader, leave=True) #leave=True enables us to see the progress bars for each epoch
    for batch in loop:
        # zero_grad sets the gradients of all optimized tensors to zero
        # we need to set the gradients to zero b/f starting backprop b/c pytoch accumulates the gradients on subsequent backward pass (this is convenient while training RNNs)
        # 
        optimizer.zero_grad() # stops the gradient calculations from the previous set being carried over to the next set
        
        # batch in loop contains the 4 inputs (ie. input_ids, token_ids, attention_masks and labels)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # forward pass, feeding input data through all the neurons in the network from first to last layer
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss

        # Backward propagation, compute the gradient
        loss.backward()
        
        # update model parameters and take a step using the computerd gradient
        optimizer.step()
        loop.set_description("Epoch {}".format(epoch))
        loop.set_postfix(loss=loss.item())
        
model.save_pretrained(PRE_TRAINED_MODEL_PATH)