In [None]:
# We can use the function encode_plus, which does the following in one go

'''
1. Tokenize the input sentence
2. Add the [CLS] and [SEP] tokens.
3. Pad or truncate the sentence to the maximum length allowed
4. Encode the tokens into their corresponding IDs.
5. Create the attention masks which explicitly differentiate real tokens from [PAD] tokens

'''

In [7]:
from transformers import BertTokenizer
import torch

In [27]:
# Load the pretrained tokenizer for 'bert-base-uncased' (this is the tokenizer, not the model itself)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [29]:
# Sample sentence that will be passed to the tokenizer
sentence = "I Love Mysore"

In [36]:
# Encode the sentence in one call: tokenize, add the special tokens,
# pad/truncate to a fixed length, and build the attention mask.
encoded = tokenizer.encode_plus(
    text=sentence,               # the sentence to be encoded
    add_special_tokens=True,     # add [CLS] and [SEP]
    max_length=64,               # fixed sequence length after padding/truncation
    truncation=True,             # cut anything longer than max_length
    padding='max_length',        # add [PAD]s up to max_length (replaces the deprecated pad_to_max_length=True)
    return_attention_mask=True,  # generate the attention mask
    return_tensors='pt',         # ask the function to return PyTorch tensors
)




In [37]:
# Pull the token-ID tensor and the attention-mask tensor out of the encoding
input_ids, attn_mask = encoded['input_ids'], encoded['attention_mask']

In [38]:
input_ids

tensor([[  101,  1045,  2293, 20761,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])

In [39]:
attn_mask

tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# The “attention mask” tells the model which tokens should be attended to and which (the [PAD] tokens) should not