<a href="https://colab.research.google.com/github/Sumit73102/Tokenization_python/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Byte Pair Encoding Algorithm of Roberta-tokenizer

In [8]:
from transformers import RobertaTokenizer

# Load RoBERTa tokenizer (uses Byte-level BPE)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

text = "He was walking around streets of Kalifornia."
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['He', 'Ġwas', 'Ġwalking', 'Ġaround', 'Ġstreets', 'Ġof', 'ĠKal', 'if', 'ornia', '.']
Token IDs: [0, 894, 21, 3051, 198, 2827, 9, 5507, 1594, 43052, 4, 2]


Byte Pair Encoding Algorithm for tokenization example (GPT-2)

In [6]:
from transformers import GPT2Tokenizer

# Load a pretrained BPE tokenizer (GPT-2 uses BPE)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Input text
text = "He was walking around streets of Kalifornia."

# Tokenize the text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)

# Decode to check reconstruction
decoded = tokenizer.decode(token_ids)

# Print results
print("Token IDs:", token_ids)
print("Raw tokens:", tokens)

Token IDs: [1544, 373, 6155, 1088, 6483, 286, 12612, 361, 3317, 13]
Raw tokens: ['He', 'Ġwas', 'Ġwalking', 'Ġaround', 'Ġstreets', 'Ġof', 'ĠKal', 'if', 'ornia', '.']


Word-piece algorithm for tokenization example (BERT)

In [4]:
from transformers import BertTokenizer

# Load pretrained WordPiece tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Tokenize some text
text = "He was walking around streets of Kalifornia."
# Encode text
input_ids = tokenizer.encode(text, add_special_tokens=True)

# Output the token IDs
print("Token IDs:", input_ids)

# Convert token IDs back to raw tokens and output them
raw_tokens = [tokenizer.decode([token_id]) for token_id in input_ids]
print("Raw tokens:", raw_tokens)

Token IDs: [101, 1124, 1108, 3179, 1213, 4324, 1104, 22576, 14467, 4558, 1465, 119, 102]
Raw tokens: ['[CLS]', 'He', 'was', 'walking', 'around', 'streets', 'of', 'Kali', '##fo', '##rn', '##ia', '.', '[SEP]']


Embedding Vector Generation for same word with different meanings

In [2]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Two sentences with the word "bank" having different meanings
sentence1 = "He deposited money in the bank."
sentence2 = "The boat reached the river bank."

# Tokenize and convert to tensor
inputs1 = tokenizer(sentence1, return_tensors='pt')
inputs2 = tokenizer(sentence2, return_tensors='pt')
print("Tokenization of input 1:")
print(inputs1.input_ids)
tokens = tokenizer.convert_ids_to_tokens(inputs1["input_ids"][0])
print(tokens,"\n")
print("Tokenization of input 2:")
print(inputs2.input_ids)
tokens = tokenizer.convert_ids_to_tokens(inputs2["input_ids"][0])
print(tokens,"\n")
# Generate embeddings
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

print("Dimension of Embedding matrix 1:", outputs1.last_hidden_state[0].shape)
print("Dimension of Embedding matrix 2:", outputs2.last_hidden_state[0].shape)
print("\n")
# Extract the embeddings for the word "bank"
bank_embedding1 = outputs1.last_hidden_state[0][5]  # Position of 'bank' in sentence 1
bank_embedding2 = outputs2.last_hidden_state[0][5]  # Position of 'bank' in sentence 2

print("Bank (Financial Institution) Embedding:", bank_embedding1[:5])
print("Bank (River) Embedding:", bank_embedding2[:5])

Tokenization of input 1:
tensor([[  101,  2002, 14140,  2769,  1999,  1996,  2924,  1012,   102]])
['[CLS]', 'he', 'deposited', 'money', 'in', 'the', 'bank', '.', '[SEP]'] 

Tokenization of input 2:
tensor([[ 101, 1996, 4049, 2584, 1996, 2314, 2924, 1012,  102]])
['[CLS]', 'the', 'boat', 'reached', 'the', 'river', 'bank', '.', '[SEP]'] 

Dimension of Embedding matrix 1: torch.Size([9, 768])
Dimension of Embedding matrix 2: torch.Size([9, 768])


Bank (Financial Institution) Embedding: tensor([ 0.5439, -0.3908, -0.3082,  0.3952,  0.3161])
Bank (River) Embedding: tensor([ 0.4154,  0.1082, -0.3767,  0.1519,  0.4705])
