In [102]:
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

from tokenizers.processors import TemplateProcessing

from tokenizers.trainers import BpeTrainer

import nltk
from nltk.data import find
from nltk.corpus import gutenberg


nltk.download('gutenberg', download_dir='V:/llm-project/datasets')
file_path = 'V:/llm-project/datasets/corpora/gutenberg/'
try:
    find('V:/llm-project/datasets/corpora/gutenberg')
    print('Corpora Gutenberg is There')
except LookupError:
    print('Corpora Gutenberg is Not There')
# nltk.download('V:/llm-project/tokenizer/punkt')


[nltk_data] Downloading package gutenberg to V:/llm-
[nltk_data]     project/datasets...


Corpora Gutenberg is Not There


[nltk_data]   Unzipping corpora\gutenberg.zip.


In [103]:
vocab_size = 1000000
plays = [
    f'{file_path}austen-sense.txt',
    f'{file_path}blake-poems.txt',
    f'{file_path}austen-persuasion.txt',
    f'{file_path}austen-emma.txt',
    f'{file_path}bryant-stories.txt',
    f'{file_path}burgess-busterbrown.txt',
    f'{file_path}bismarck.txt',
    f'{file_path}carroll-alice.txt',
    f'{file_path}chesterton-ball.txt',
    f'{file_path}chesterton-brown.txt',
    f'{file_path}chesterton-thursday.txt',
    f'{file_path}corpus1.txt',
    f'{file_path}corpus2.txt',
    f'{file_path}corpus3.txt',
    f'{file_path}corpus4.txt',
    f'{file_path}corpus5.txt',
    f'{file_path}edgeworth-parents.txt',
    f'{file_path}melville-moby_dick.txt',
    f'{file_path}milton-paradise.txt',
    f'{file_path}shakespeare-macbeth.txt',
    f'{file_path}shakespeare-hamlet.txt',
    f'{file_path}shakespeare-caesar.txt',
    f'{file_path}whitman-leaves.txt'
    ]
text = [" ".join(s) for ply in plays for s in gutenberg.sents(ply)]

In [104]:
print(len(text))
print(text[100])

160453
But , then , if Mrs . Dashwood should live fifteen years we shall be completely taken in ."


In [107]:
special_tokens=["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]"]
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
    ],
)

In [108]:
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([NFD(),Lowercase(),StripAccents()])
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
tokenizer.post_processor=temp_proc

In [109]:
trainer = BpeTrainer(vocab_size=vocab_size,special_tokens=special_tokens)
tokenizer.train_from_iterator(text, trainer=trainer)

In [89]:
print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

Trained vocab size: 133242


In [110]:
sen = "In a random hidden valley surrounded by towering mountains, the ancient village of Eldoria thrived in harmonious isolation. The villagers spoke of magical creatures dwelling in the dense forests, guardians of secrets lost to time. Each spring, a festival of lights transformed the village into a realm of wonder, with lanterns floating like ethereal fireflies against the star-studded sky. Among the villagers was a young dreamer named Elara, whose curiosity about the world beyond the mountains sparked tales of adventure and discovery. Little did she know, her destiny was intertwined with the very legends that danced in the shadows of Eldoria's enchanted night."

In [111]:
sen_enc=tokenizer.encode(sen)
print(f"Output: {sen_enc.tokens}")

Output: ['[CLS]', 'in', 'a', 'random', 'hidden', 'valley', 'surrounded', 'by', 'towering', 'mountains', ',', 'the', 'ancient', 'village', 'of', 'eld', 'oria', 'thrived', 'in', 'harmonious', 'isolation', '.', 'the', 'villagers', 'spoke', 'of', 'magical', 'creatures', 'dwelling', 'in', 'the', 'dense', 'forests', ',', 'guardians', 'of', 'secrets', 'lost', 'to', 'time', '.', 'each', 'spring', ',', 'a', 'festival', 'of', 'lights', 'transformed', 'the', 'village', 'into', 'a', 'realm', 'of', 'wonder', ',', 'with', 'lanterns', 'floating', 'like', 'ethereal', 'fire', 'flies', 'against', 'the', 'star', '-', 'studded', 'sky', '.', 'among', 'the', 'villagers', 'was', 'a', 'young', 'dreamer', 'named', 'el', 'ara', ',', 'whose', 'curiosity', 'about', 'the', 'world', 'beyond', 'the', 'mountains', 'sparked', 'tales', 'of', 'adventure', 'and', 'discovery', '.', 'little', 'did', 'she', 'know', ',', 'her', 'destiny', 'was', 'intertwined', 'with', 'the', 'very', 'legends', 'that', 'danced', 'in', 'the', 