In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import required libraries
from transformers import BertModel, AutoTokenizer
import pandas as pd

In [3]:
# Checkpoint identifier shared by the model and the tokenizer loaded below.
model_name = "bert-base-cased"

In [4]:
# Load the pretrained model and its tokenizer from the same checkpoint name,
# so that both are created from one and the same `model_name`.
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Example sentence used throughout the notebook.
sentence = "When life gives you lemons, don't make lemonade."
# Split the sentence into sub-word pieces. A piece that continues the previous
# word carries a "##" prefix (e.g. 'lemon' + '##s'). No special tokens are
# added at this stage.
tokens = tokenizer.tokenize(sentence)
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade',
 '.']

In [6]:
# Token -> token_id mapping exposed by the tokenizer.
vocab = tokenizer.vocab

# One row per vocabulary entry, built from the (token, token_id) pairs.
vocab_pairs = list(vocab.items())
vocab_df = pd.DataFrame(vocab_pairs, columns=["token", "token_id"])
vocab_df.head()

Unnamed: 0,token,token_id
0,Retired,14454
1,handball,17678
2,##ℓ,28734
3,surf,20114
4,Yellow,8278


In [7]:
# Order the vocabulary by token ID and use the ID as the index, so a token can
# be looked up by its ID.
#
# The frame is rebuilt from `vocab` rather than derived from `vocab_df` itself:
# the self-referential form raised a KeyError in set_index("token_id") when the
# cell was run a second time, because "token_id" had already become the index.
vocab_df = (
    pd.DataFrame(list(vocab.items()), columns=["token", "token_id"])
    .sort_values(by="token_id")
    .set_index("token_id")
)
vocab_df.head()

Unnamed: 0_level_0,token
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]


In [8]:
# Vocabulary size = number of rows in the vocabulary table.
print(vocab_df.shape[0])

28996


In [9]:
# Encode the sentence into token_ids using the tokenizer.
# Unlike tokenize(), the encoded sequence is two entries longer: it starts
# with ID 101 and ends with ID 102 (see the output of this cell).
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 119,
 102]

In [10]:
# Compare the length of the plain token list with the encoded ID sequence.
n_tokens = len(tokens)
n_token_ids = len(token_ids)
print("Number of tokens:", n_tokens)
print("Number of token IDs:", n_token_ids)

Number of tokens: 14
Number of token IDs: 16


In [11]:
# 101 and 102 are the first and last entries of `token_ids`.
# Look them up by token ID (the index label of `vocab_df`) instead of by row
# position, which only matched the ID because the frame is sorted with
# contiguous IDs. Select the "token" column so only the token string is
# printed, not the whole row.
print("Token with ID 101:", vocab_df.loc[101, "token"])
print("Token with ID 102:", vocab_df.loc[102, "token"])

Token at position 101: token    [CLS]
Name: 101, dtype: object
Token at position 102: token    [SEP]
Name: 102, dtype: object


In [12]:
# Drop the first and last ID so the remaining IDs line up one-to-one with
# `tokens`, then pair each token with its ID.
inner_ids = token_ids[1:-1]
[(tok, tok_id) for tok, tok_id in zip(tokens, inner_ids)]

[('When', 1332),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397),
 ('.', 119)]

In [13]:
# Map the full ID sequence back to text; the first and last IDs show up as
# "[CLS]" and "[SEP]" in the decoded string.
tokenizer.decode(token_ids)

"[CLS] When life gives you lemons, don't make lemonade. [SEP]"

In [14]:
# Decode only the IDs between the first and last entry, i.e. without the
# leading and trailing special tokens.
ids_without_specials = token_ids[1:-1]
tokenizer.decode(ids_without_specials)

"When life gives you lemons, don't make lemonade."

In [15]:
# Recombine pieces taken from different words of the sentence:
# 1297 -> 'life', 1116 -> '##s', 6397 -> '##ade' (IDs from the pairing above).
mixed_piece_ids = [1297, 1116, 6397]
tokenizer.decode(mixed_piece_ids)

'lifesade'

In [16]:
# Calling the tokenizer object directly returns a mapping that holds the
# 'input_ids' together with 'token_type_ids' and 'attention_mask'
# (see the output of this cell).
tokenizer_out = tokenizer(sentence)
tokenizer_out

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Handling Multiple Sentences

In [17]:
# Build a shorter variant of the sentence by removing the negation
# (the phrase includes its trailing space so no double space is left behind).
removed_phrase = "don't "
sentence2 = sentence.replace(removed_phrase, "")
sentence2

'When life gives you lemons, make lemonade.'

In [18]:
# Tokenize both sentences as one batch.
# With padding enabled, the shorter sentence is filled up with dummy tokens so
# that every row of the batch has the same length.
sentence_batch = [sentence, sentence2]
tokenizer_out2 = tokenizer(sentence_batch, padding=True)
tokenizer_out2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [19]:
# Decode each row of the two-sentence batch back to text.
# The padding tokens appear at the end of the shorter sentence.
for row_ids in tokenizer_out2['input_ids']:
    print(tokenizer.decode(row_ids))

[CLS] When life gives you lemons, don't make lemonade. [SEP]
[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]
