In [7]:
from huggingface_hub import login

login(token = "YOUR_ID")
from transformers import AutoTokenizer


llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
print(llama_tokenizer.all_special_tokens)



['<|begin_of_text|>', '<|end_of_text|>']


In [8]:

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from tokenizers.normalizers import Sequence, Lowercase, NFKC
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

from datasets import load_dataset



dataset = load_dataset("Skylion007/openwebtext", split="train", streaming=True)

num_rows = 10_000_000 # Change the number if you want to take larger/smaller portion of data
texts = (example["text"] for example in dataset.take(num_rows)) 


tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
tokenizer.pre_tokenizer = Whitespace()

trainer = WordPieceTrainer(vocab_size=128256, special_tokens=['<|begin_of_text|>', '<|end_of_text|>'])
# vocab size is equal to llama tokenizer vocab size


tokenizer.train_from_iterator(texts, trainer)

tokenizer_name = "wordpiece_tokenizer"
tokenizer.save(f"{tokenizer_name}.json")

print(f"Training is done. Tokenizer is saved as {tokenizer_name}.json")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 85f6d14a-06f2-4f40-93db-fddfeefef2b3)')' thrown while requesting GET https://huggingface.co/datasets/Skylion007/openwebtext/resolve/f3808c30e817981b845ec549c43e82bb467d8144/subsets/urlsf_subset18.tar
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 335caf93-ac9f-4edc-a909-3bedddc3e711)')' thrown while requesting GET https://huggingface.co/datasets/Skylion007/openwebtext/resolve/f3808c30e817981b845ec549c43e82bb467d8144/subsets/urlsf_subset20.tar
Retrying in 1s [Retry 1/5].


Training is done. Tokenizer is saved as wordpiece_tokenizer.json


### Test the functionality

In [9]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("wordpiece_tokenizer.json")
tokenizer.get_vocab_size()

128256

In [18]:
test_sentences = [
    "The government is planning new policies.",
    "AI models are becoming more powerful.",
    "A quick brown fox jumps over the lazy dog."
]

for sentence in test_sentences:
    encoded = tokenizer.encode(sentence)
    print(f"Input: {sentence}")

    print(f"Tokens: {encoded.tokens}")
    print(f"IDs: {encoded.ids}")
    print("-" * 50)


Input: The government is planning new policies.
Tokens: ['the', 'government', 'is', 'planning', 'new', 'policies', '.']
IDs: [46345, 46914, 46385, 50176, 46539, 49811, 15]
--------------------------------------------------
Input: AI models are becoming more powerful.
Tokens: ['ai', 'models', 'are', 'becoming', 'more', 'powerful', '.']
IDs: [53017, 50421, 46443, 50119, 46532, 49439, 15]
--------------------------------------------------
Input: A quick brown fox jumps over the lazy dog.
Tokens: ['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
IDs: [40, 47922, 49015, 49540, 61897, 46633, 46345, 58745, 49709, 15]
--------------------------------------------------
