# The Masked Language Modelling Task

In [1]:
from transformers import BertForMaskedLM, pipeline, BertForNextSentencePrediction, BertTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained 'bert-base-cased' checkpoint into BERT with a masked-language-modelling head
bert_lm = BertForMaskedLM.from_pretrained('bert-base-cased')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

In [3]:
# Display the module tree to inspect the model's architecture
bert_lm

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

This shows that BertForMaskedLM is a standard BERT model at its core, with an additional layer on top of it.

[cls]: this is the masked-LM head. Its decoder layer has in_features=768, the hidden size that the standard BERT encoder outputs and feeds into this layer, and out_features=28996, the size of the vocabulary. So for the [MASK] position it produces one score for every token in the vocabulary.

In [4]:
# A transformers pipeline bundles a model with its tokenizer and is an easy way to run common tasks.
# Here: a fill-mask pipeline backed by the 'bert-base-cased' checkpoint.
nlp = pipeline(task="fill-mask", model="bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [5]:
# Check which model class the pipeline instantiated
type(nlp.model)

transformers.models.bert.modeling_bert.BertForMaskedLM

In [6]:
# Show the tokenizer the pipeline is using
nlp.tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [7]:
# Ask the fill-mask pipeline for its candidate tokens for the [MASK] position.
# Plain string: the original f-string had no placeholders.
masked_sentence = "If you don't [MASK] at the sign, you will get a ticket."
preds = nlp(masked_sentence)

# Print the prompt with the mask shown as ***; it is derived from the same
# string that was sent to the model, so the two can never drift apart.
print(masked_sentence.replace("[MASK]", "***"))

# One line per candidate: the predicted token and its score as a percentage.
for p in preds:
    print(f"Token: {p['token_str']}.  |  Score: {100*p['score']:,.2f}%")

If you don't *** at the sign, you will get a ticket.
Token: stop.  |  Score: 51.11%
Token: look.  |  Score: 38.41%
Token: arrive.  |  Score: 1.11%
Token: glance.  |  Score: 1.05%
Token: turn.  |  Score: 0.72%


# Next Sentence Prediction Task

In [9]:
# Load the tokenizer and the NSP model from a single checkpoint name so that
# their vocabularies always match.
# NOTE: this section uses the uncased checkpoint, whereas the masked-LM
# section above used 'bert-base-cased'.
NSP_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(NSP_CHECKPOINT)

bert_nsp = BertForNextSentencePrediction.from_pretrained(NSP_CHECKPOINT)

In [10]:
# Display the module tree to inspect the NSP model's architecture
bert_nsp

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

This shows that BertForNextSentencePrediction is a standard BERT model at its core, with an additional layer on top of it.

[pooler]: this layer takes the final hidden state of the [CLS] token (the first token) from the last encoder layer and passes it through a dense layer with a tanh activation, producing a single pooled representation of the whole input sequence.

[cls]: this is the NSP head. Its seq_relationship linear layer has in_features=768, the size of the pooled output that the standard BERT model feeds into it, and out_features=2, i.e. one logit each for is_next and not_next.

In [11]:
# Sentence pair for the NSP demo: text1 is sentence A, text2 is the candidate next sentence B.
# NOTE(review): "imporvements" in text1 is misspelled; the outputs recorded in the
# following cells were produced from this exact text, so correct it only when re-running them.
text1 = "Deliver huge imporvements to your machine learning pipelines without spending hours fine-tuning parameters!"
text2 = "This book's practical case-studies reveal feature engineering techniques that upgrade your data wrangling-and optimization."

In [12]:
# Encode the two sentences together as one pair and return PyTorch tensors.
inputs = tokenizer(text1, text_pair=text2, return_tensors="pt")

In [13]:
# Show the full encoding returned by the tokenizer
inputs

{'input_ids': tensor([[  101,  8116,  4121, 17727,  2953,  3726,  8163,  2000,  2115,  3698,
          4083, 13117,  2015,  2302,  5938,  2847,  2986,  1011, 17372, 11709,
           999,   102,  2023,  2338,  1005,  1055,  6742,  2553,  1011,  2913,
          7487,  3444,  3330,  5461,  2008, 12200,  2115,  2951, 23277,  5654,
          2989,  1011,  1998, 20600,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# Token ids of sentence A and sentence B, encoded together as a single sequence
inputs.input_ids

tensor([[  101,  8116,  4121, 17727,  2953,  3726,  8163,  2000,  2115,  3698,
          4083, 13117,  2015,  2302,  5938,  2847,  2986,  1011, 17372, 11709,
           999,   102,  2023,  2338,  1005,  1055,  6742,  2553,  1011,  2913,
          7487,  3444,  3330,  5461,  2008, 12200,  2115,  2951, 23277,  5654,
          2989,  1011,  1998, 20600,  1012,   102]])

In [16]:
# Class indices of the NSP logits:
# 0 == "isNextSentence"
# 1 == "notNextSentence"

# Forward pass without labels, so only the logits are computed (loss is None).
# NOTE(review): this runs with autograd enabled; consider torch.no_grad() for pure inference.
output = bert_nsp(**inputs)

output

NextSentencePredictorOutput(loss=None, logits=tensor([[ 6.0527, -5.6015]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Looking at the logits in this case:

the model favours class 0 over class 1: the logit is 6.0527 for class 0 (isNextSentence) and -5.6015 for class 1 (notNextSentence).

In [17]:
# Calculate the loss by passing a label along with the inputs.
# Label 0 tells the model to compute the loss on the assumption that
# sentence B comes after sentence A.
# torch.tensor infers int64 from the Python int, replacing the legacy
# torch.LongTensor constructor.
outputs = bert_nsp(**inputs, labels=torch.tensor([0]))

outputs

NextSentencePredictorOutput(loss=tensor(8.7022e-06, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.0527, -5.6015]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
# Calculate the loss by passing a label along with the inputs.
# Label 1 tells the model to compute the loss on the assumption that
# sentence B does not come after sentence A.
# torch.tensor infers int64 from the Python int, replacing the legacy
# torch.LongTensor constructor.
outputs = bert_nsp(**inputs, labels=torch.tensor([1]))

outputs

NextSentencePredictorOutput(loss=tensor(11.6542, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.0527, -5.6015]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# Fine-Tuning BERT for NLP Problems