In [2]:
# Import necessary libraries
from transformers import AutoTokenizer

# Walkthrough of a Hugging Face tokenizer: vocabulary inspection, tokenize /
# encode / decode round trip, unknown-token handling, sentence pairs, and
# batch padding.

# Define the model checkpoint (BERT base uncased)
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fetch the vocabulary once: on fast tokenizers every `tokenizer.vocab`
# access rebuilds the full token -> id dict, so reading it inside a loop
# would reconstruct it on each iteration.
vocab = tokenizer.get_vocab()

# Display a subset of the tokenizer's vocabulary and its size.
# NOTE: the sample follows the dict's iteration order, which is not
# necessarily sorted by token ID.
print("Sample of Tokenizer's vocabulary (first 20 tokens):")
vocab_sample = {token: vocab[token] for token in list(vocab)[:20]}
print(vocab_sample)
print(f'\nThe vocabulary size is {len(vocab)}')

# Tokenizing a sentence; also show the special tokens that `encode` adds
# around the sequence.
sentence = 'Transformers are revolutionizing NLP!'
print(f"\nOriginal sentence: {sentence}")
print(f'{tokenizer.cls_token} -> {tokenizer.cls_token_id}')
print(f'{tokenizer.sep_token} -> {tokenizer.sep_token_id}')

# Split the sentence into tokens (no special tokens are added here)
tokens = tokenizer.tokenize(sentence)
print(f"Tokens: {tokens}")

# Encode the sentence into token IDs (special tokens included)
ids = tokenizer.encode(sentence)
print(f"Token IDs: {ids}")

# Decode the token IDs back to text. This is not an exact round trip: the
# special tokens are kept and the uncased checkpoint lowercases the input.
decoded_sentence = tokenizer.decode(ids)
print(f"Decoded sentence: {decoded_sentence}")

# Example: Tokenizing a sentence with an emoji (out-of-vocabulary handling)
emoji_sentence = 'I love NLP 😊'
print(f"\nOriginal sentence with emoji: {emoji_sentence}")
emoji_tokens = tokenizer.tokenize(emoji_sentence)
print(f"Tokens (emoji handling): {emoji_tokens}")

# Handling paired sentences. The result is named `paired_input` rather than
# `input` so the Python builtin `input` is not shadowed.
first_sentence = 'Natural language processing is interesting.'
second_sentence = 'What do you think about it?'
paired_input = tokenizer(first_sentence, second_sentence, return_tensors='pt')
print(f"\nToken IDs for paired sentences: {paired_input['input_ids']}")
print(f"Token type IDs for paired sentences: {paired_input['token_type_ids']}")
print(f"Attention mask: {paired_input['attention_mask']}")

# Padding sentences in a batch: padding=True pads to the longest sequence in
# the batch so the result can be returned as a single tensor.
first_sentence = 'Machine learning models require lots of data.'
second_sentence = 'Deep learning models learn through neural networks.'
batch_input = tokenizer([first_sentence, second_sentence], padding=True, return_tensors='pt')
print(f"\nPadded attention mask for batched sentences: {batch_input['attention_mask']}")

# Experiment with tokenizing a longer sentence
long_sentence = 'Tokenization in NLP is a crucial step for preparing text data for machine learning models.'
long_tokens = tokenizer.tokenize(long_sentence)
print(f"\nTokens for a longer sentence: {long_tokens}")


Sample of Tokenizer's vocabulary (first 20 tokens):
{'edmonton': 10522, 'android': 11924, 'inscription': 9315, '##raße': 27807, 'furious': 9943, '##œ': 29674, 'sighted': 19985, 'combatants': 26622, '##00': 8889, 'legion': 8009, 'giacomo': 22873, 'transit': 6671, 'wheels': 7787, 'lift': 6336, '##rlin': 19403, 'cyclist': 14199, 'persuaded': 11766, 'metre': 7924, '##physics': 15638, 'ol': 19330}

The vocabulary size is 30522

Original sentence: Transformers are revolutionizing NLP!
[CLS] -> 101
[SEP] -> 102
Tokens: ['transformers', 'are', 'revolution', '##izing', 'nl', '##p', '!']
Token IDs: [101, 19081, 2024, 4329, 6026, 17953, 2361, 999, 102]
Decoded sentence: [CLS] transformers are revolutionizing nlp! [SEP]

Original sentence with emoji: I love NLP 😊
Tokens (emoji handling): ['i', 'love', 'nl', '##p', '[UNK]']

Token IDs for paired sentences: tensor([[ 101, 3019, 2653, 6364, 2003, 5875, 1012,  102, 2054, 2079, 2017, 2228,
         2055, 2009, 1029,  102]])
Token type IDs for paired se