##1.Tokenize with python

In [3]:
class SimpleTokenizer:
    def __init__(self):
        self.vocab = {}
        self.add_word('<PAD>') #Padding token
        self.add_word('<UNK>') #Unknown token

    def add_word(self,word):
        if word not in self.vocab:
            self.vocab[word] = len(self.vocab)

    def tokenize(self,text):
        return [word if word in self.vocab else '<UNK>' for word in text.split()]

    def convert_tokens_to_ids(self,tokens):
        return [self.vocab[token] for token in tokens]

    def convert_ids_to_tokens(self,ids):
        reverse_vocab = {id: word for word, id in self.vocab.items()}
        return [reverse_vocab[id] for id in ids]


In [5]:
sentences = [
        'I love Vietnam',
        'Vietnamese people are pretty friendly',
        'My mom loves cooking',
        'I am Vietnamese'
    ]
tokenizer = SimpleTokenizer()
for st in sentences:
    for word in st.split():
        tokenizer.add_word(word)
print(f'Vocabulary: {tokenizer.vocab}')

Vocabulary: {'<PAD>': 0, '<UNK>': 1, 'I': 2, 'love': 3, 'Vietnam': 4, 'Vietnamese': 5, 'people': 6, 'are': 7, 'pretty': 8, 'friendly': 9, 'My': 10, 'mom': 11, 'loves': 12, 'cooking': 13, 'am': 14}


In [6]:
tokenizer = SimpleTokenizer()
sentence = "I love Vietnam"
for word in sentence.split():
    tokenizer.add_word(word)
new_sentence = "I like Vietnam"
tokens = tokenizer.tokenize(new_sentence)
print(f"Tokens: {tokens}")

Tokens: ['I', '<UNK>', 'Vietnam']


In [7]:
tokenizer.vocab

{'<PAD>': 0, '<UNK>': 1, 'I': 2, 'love': 3, 'Vietnam': 4}

In [8]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2, 1, 4]

In [9]:
tokenizer.convert_ids_to_tokens(ids)


['I', '<UNK>', 'Vietnam']

##2.Pytorch tokenizer(BPE)

In [None]:
pip install torchtext

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
!pip uninstall -y torchtext
!pip install torchtext==0.15.2 --quiet


##3.BPE From Scratch

In [10]:
words = [list("toi_")]*5 + [list("la_")]*4 + [list("tung_")]*3 + [list("dep_")]*6 + [list("zai_")]*4

In [11]:
words

[['t', 'o', 'i', '_'],
 ['t', 'o', 'i', '_'],
 ['t', 'o', 'i', '_'],
 ['t', 'o', 'i', '_'],
 ['t', 'o', 'i', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['z', 'a', 'i', '_'],
 ['z', 'a', 'i', '_'],
 ['z', 'a', 'i', '_'],
 ['z', 'a', 'i', '_']]

In [12]:
aSet = set()
for word in words:
    for chr in word:
        aSet.add(chr)
aSet

{'_', 'a', 'd', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 't', 'u', 'z'}

In [13]:
from collections import Counter
def most_common(words_):
    counter = Counter()
    for word in words:
        for i in range(len(word)-1):
            counter["{}{}".format(word[i],word[i+1])]+=1
    print(counter)
    return counter.most_common(1)

In [14]:
common_text_and_cnt = most_common(words)

Counter({'i_': 9, 'de': 6, 'ep': 6, 'p_': 6, 'to': 5, 'oi': 5, 'la': 4, 'a_': 4, 'za': 4, 'ai': 4, 'tu': 3, 'un': 3, 'ng': 3, 'g_': 3})


In [15]:
common_text_and_cnt

[('i_', 9)]

In [16]:
common_text_and_cnt[0][0]

'i_'

In [17]:
#Hợp token
token = common_text_and_cnt[0][0]
def merge_tokens(words,token):
    token_to_merge = token
    for i in range(len(words)):
        word = words[i]
        for j in range(len(word)-1):
            if "{}{}".format(word[j],word[j+1]) == token_to_merge:
                new_word = word[:j] + [token_to_merge] + word[j+2:]
                words[i] = new_word
    return words

In [18]:
new_words = merge_tokens(words,common_text_and_cnt[0][0])
new_words

[['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['d', 'e', 'p', '_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_']]

In [19]:
common_text_and_cnt = most_common(new_words)

Counter({'de': 6, 'ep': 6, 'p_': 6, 'to': 5, 'oi_': 5, 'la': 4, 'a_': 4, 'za': 4, 'ai_': 4, 'tu': 3, 'un': 3, 'ng': 3, 'g_': 3})


In [20]:
common_text_and_cnt

[('de', 6)]

In [21]:
new_words = merge_tokens(words,common_text_and_cnt[0][0])
new_words

[['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['de', 'p', '_'],
 ['de', 'p', '_'],
 ['de', 'p', '_'],
 ['de', 'p', '_'],
 ['de', 'p', '_'],
 ['de', 'p', '_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_']]

In [22]:
common_text_and_cnt = most_common(new_words)

Counter({'dep': 6, 'p_': 6, 'to': 5, 'oi_': 5, 'la': 4, 'a_': 4, 'za': 4, 'ai_': 4, 'tu': 3, 'un': 3, 'ng': 3, 'g_': 3})


In [23]:
common_text_and_cnt

[('dep', 6)]

In [24]:
new_words = merge_tokens(words,common_text_and_cnt[0][0])
new_words

[['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['t', 'o', 'i_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['l', 'a', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['t', 'u', 'n', 'g', '_'],
 ['dep', '_'],
 ['dep', '_'],
 ['dep', '_'],
 ['dep', '_'],
 ['dep', '_'],
 ['dep', '_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_'],
 ['z', 'a', 'i_']]

In [25]:
#Full code
from collections import Counter
#Initial word_list
words = [list('Hoc_')]*5 + [list('AI_')]*4 + [list('Rat_')]*6 + [list('De_')]*5
def most_common(words_):
    counter = Counter()
    for word in words_:
        for i in range(len(word)-1):
            counter["{}{}".format(word[i],word[i+1])]+=1
    return counter.most_common(1)

#Initialize an empty set to store unique characters
aSet = set()
for word in words:
    for chr in word:
        aSet.add(chr)
print(f'Initial set of unique characters:{aSet}')

bpe_vocab = {char: 0 for char in aSet}

def extract_bpe_vocab(words,num_iterations = 10):
    from collections import Counter
    for _ in range(num_iterations):
        counter = Counter()
        for word in words:
            for i in range(len(word)-1):
                token_pair = "{}{}".format(word[i],word[i+1])
                counter[token_pair]+=1
        if not counter:
            break
        most_common_pair,frequency = counter.most_common(1)[0]
        bpe_vocab[most_common_pair] = frequency
        words = merge_tokens(words,most_common_pair)
    bpe_vocab_sorted = dict(sorted(bpe_vocab.items(),key = lambda item:-item[1]))
    return bpe_vocab_sorted

def merge_tokens(words,token):
    token_to_merge = token
    for i in range(len(words)):
        word = words[i]
        for j in range(len(word)-1):
            if "{}{}".format(word[j],word[j+1]) == token_to_merge:
                new_word = word[:j] + [token_to_merge] + word[j+2:]
                words[i] = new_word
    return words

bpe_vocab = extract_bpe_vocab(words)
print(f"Vocabulary in bpe_vocab: {bpe_vocab}")

token_to_idx = {token: idx for idx,token in enumerate(bpe_vocab.keys())}
print(f"Token to Index mapping: {token_to_idx}")


    


Initial set of unique characters:{'I', 'R', 'e', 'c', 'D', 't', '_', 'H', 'o', 'a', 'A'}
Vocabulary in bpe_vocab: {'Ra': 6, 'Rat': 6, 'Rat_': 6, 'Ho': 5, 'Hoc': 5, 'Hoc_': 5, 'De': 5, 'De_': 5, 'AI': 4, 'AI_': 4, 'I': 0, 'R': 0, 'e': 0, 'c': 0, 'D': 0, 't': 0, '_': 0, 'H': 0, 'o': 0, 'a': 0, 'A': 0}
Token to Index mapping: {'Ra': 0, 'Rat': 1, 'Rat_': 2, 'Ho': 3, 'Hoc': 4, 'Hoc_': 5, 'De': 6, 'De_': 7, 'AI': 8, 'AI_': 9, 'I': 10, 'R': 11, 'e': 12, 'c': 13, 'D': 14, 't': 15, '_': 16, 'H': 17, 'o': 18, 'a': 19, 'A': 20}


In [26]:
bpe_vocab

{'Ra': 6,
 'Rat': 6,
 'Rat_': 6,
 'Ho': 5,
 'Hoc': 5,
 'Hoc_': 5,
 'De': 5,
 'De_': 5,
 'AI': 4,
 'AI_': 4,
 'I': 0,
 'R': 0,
 'e': 0,
 'c': 0,
 'D': 0,
 't': 0,
 '_': 0,
 'H': 0,
 'o': 0,
 'a': 0,
 'A': 0}

In [27]:
token_to_idx = {token: idx for idx,token in enumerate(bpe_vocab.keys())}
print(f"Token to Index mapping: {token_to_idx}")

Token to Index mapping: {'Ra': 0, 'Rat': 1, 'Rat_': 2, 'Ho': 3, 'Hoc': 4, 'Hoc_': 5, 'De': 6, 'De_': 7, 'AI': 8, 'AI_': 9, 'I': 10, 'R': 11, 'e': 12, 'c': 13, 'D': 14, 't': 15, '_': 16, 'H': 17, 'o': 18, 'a': 19, 'A': 20}


##4.Encoding Process

In [28]:
bpe_vocab

{'Ra': 6,
 'Rat': 6,
 'Rat_': 6,
 'Ho': 5,
 'Hoc': 5,
 'Hoc_': 5,
 'De': 5,
 'De_': 5,
 'AI': 4,
 'AI_': 4,
 'I': 0,
 'R': 0,
 'e': 0,
 'c': 0,
 'D': 0,
 't': 0,
 '_': 0,
 'H': 0,
 'o': 0,
 'a': 0,
 'A': 0}

In [29]:
def encode_to_numbers(sentence,bpe_vocab,token_to_index):
    words = list(sentence)
    while True:
        merged = False
        for token in sorted(bpe_vocab.keys(), key = lambda k: -bpe_vocab[k]):
            token_len = len(token)
            if token_len == 1:
                continue
            for i in range(len(words) - token_len + 1):
                print('===words', words)
                if "".join(words[i:i+token_len]) == token:
                    words = word[:i] + [token] + word[i+token_len:]
                    print(f"Merging token '{token}' at position {i}: {words}")
                    merged = True
                    break
            if merged:
                break
        if not merged:
            break
    print('token_to_index', token_to_index)
    # Convert the final list of tokens in 'words' to their corresponding numeric indices
    encoded_numbers = [token_to_index[token] for token in words if token in token_to_index]

    # Print the final merged words and their numeric encoding for debugging
    print('--- Final merged words:', words)
    print('--- Encoded numeric representation:', encoded_numbers)

    # Return the list of numeric indices representing the encoded sentence
    return encoded_numbers

In [30]:
sentence = "cam cam nham tam can ham"
sentence = sentence.replace(" ", "_")

print('--sentence', sentence)

encoded_numbers = encode_to_numbers(sentence, bpe_vocab, token_to_idx)
print(f"Encoded Sentence as Numbers: {encoded_numbers}")

--sentence cam_cam_nham_tam_can_ham
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n', 'h', 'a', 'm', '_', 't', 'a', 'm', '_', 'c', 'a', 'n', '_', 'h', 'a', 'm']
===words ['c', 'a', 'm', '_', 'c', 'a', 'm', '_', 'n',

In [33]:
#Decode process
def decode_from_numbers(encoded_numbers,index_to_token):
    decoded_tokens = [index_to_token[idx] for idx in encoded_numbers]
    decoded_sentence = "".join(decoded_tokens)
    return decoded_sentence

index_to_token = {idx:token for token,idx in token_to_idx.items()}
decoded_sentence = decode_from_numbers(encoded_numbers,index_to_token)
decoded_sentence = decoded_sentence.replace("_"," ")
print(f"Decoded sentence: {decoded_sentence}")

Decoded sentence: ca ca a ta ca a


##4.Huggingface BPE

In [34]:
!wget https://huggingface.co/gpt2/raw/main/vocab.json

--2025-08-14 02:56:29--  https://huggingface.co/gpt2/raw/main/vocab.json
Resolving huggingface.co (huggingface.co)... 18.239.50.49, 18.239.50.80, 18.239.50.16, ...
Connecting to huggingface.co (huggingface.co)|18.239.50.49|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: /openai-community/gpt2/raw/main/vocab.json [following]
--2025-08-14 02:56:29--  https://huggingface.co/openai-community/gpt2/raw/main/vocab.json
Reusing existing connection to huggingface.co:443.
HTTP request sent, awaiting response... 200 OK
Length: 1042301 (1018K) [text/plain]
Saving to: ‘vocab.json’


2025-08-14 02:56:29 (4.08 MB/s) - ‘vocab.json’ saved [1042301/1042301]



In [35]:
import json 
with open('/kaggle/working/vocab.json','r',encoding = 'utf-8') as file:
    data = json.load(file)

In [36]:
data

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93,
 '¡': 94,
 '¢': 95,
 '£': 96,
 '¤': 97,
 '¥': 98,
 '¦': 99,
 '§': 100

In [38]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = 'Toi la Tung Tom'
encoded_input = tokenizer(text,return_tensors = 'pt')

2025-08-14 03:00:14.039193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755140414.285960      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755140414.352646      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [39]:
encoded_input

{'input_ids': tensor([[2514,   72, 8591,  309, 2150, 4186]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [40]:
tokenizer.convert_ids_to_tokens([2514,   72, 8591,  309, 2150, 4186])

['To', 'i', 'Ġla', 'ĠT', 'ung', 'ĠTom']