In [32]:
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

from tokenizers.processors import TemplateProcessing

from tokenizers.trainers import BpeTrainer

# Target vocabulary size handed to the BPE trainer further down.
vocab_size = 5_000

In [33]:
import nltk
from nltk.corpus import gutenberg

# Fetch the NLTK data packages used below; NLTK reports "already up-to-date"
# when they are present locally.
nltk.download('gutenberg')
# presumably needed by gutenberg.sents() for sentence splitting — confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt'; verify against the installed version.
nltk.download('punkt')

# The three Shakespeare plays taken from the Gutenberg corpus.
plays = [
    'shakespeare-macbeth.txt',
    'shakespeare-hamlet.txt',
    'shakespeare-caesar.txt',
]

# Each sentence arrives as a sequence of tokens; join them with single spaces
# so every training example is one plain string.
shakespeare = [
    " ".join(sentence_tokens)
    for play_file in plays
    for sentence_tokens in gutenberg.sents(play_file)
]

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ZeroAsWill\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ZeroAsWill\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Sanity check: number of sentences in the corpus, then one sample sentence.
print(len(shakespeare), shakespeare[100], sep="\n")

7176
Enter Macbeth and Banquo .


In [35]:
# Special tokens in the order they are handed to the trainer; the
# post-processor below uses each token's position in this list as its id.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Positions of the two tokens the template inserts.
cls_id, sep_id = (special_tokens.index(tok) for tok in ("[CLS]", "[SEP]"))

# Wrap a single sequence as "[CLS] A [SEP]" and a pair as
# "[CLS] A [SEP] B [SEP]"; the ":1" suffix assigns type id 1 to the second
# segment and its closing separator.
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [36]:
# Assemble the tokenizer pipeline: normalizer -> pre-tokenizer -> BPE model ->
# post-processor, plus a decoder for turning ids back into text.
#
# unk_token has to be set on the model itself: listing "[UNK]" among the
# trainer's special tokens only reserves a vocabulary entry for it. Without
# this argument the model has no fallback token for characters that never
# appeared in the training corpus.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Unicode-decompose, lowercase, then drop the combining accent marks.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Split on whitespace/punctuation before BPE merges are applied.
tokenizer.pre_tokenizer = Whitespace()

# NOTE(review): BPEDecoder rebuilds word boundaries from an end-of-word suffix
# (default "</w>"), but neither the model nor the trainer in this notebook
# configures one, so decode() will likely return text without spaces — confirm
# and, if so, set end_of_word_suffix consistently on the model and the trainer.
tokenizer.decoder = BPEDecoder()

# Adds [CLS]/[SEP] around encoded sequences.
tokenizer.post_processor = temp_proc

In [37]:
# Configure BPE training: cap the vocabulary at vocab_size and register the
# special tokens with the trainer.
trainer = BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)

# Train the tokenizer's model on the in-memory list of sentences.
tokenizer.train_from_iterator(shakespeare, trainer=trainer)

In [38]:
# Report the vocabulary size the tokenizer ended up with after training.
trained_vocab_size = tokenizer.get_vocab_size()
print(f"Trained vocab size: {trained_vocab_size}")

Trained vocab size: 5000


In [39]:
# Encode a sample sentence to see how the trained model splits its words
# into subword pieces.
sen = "in the village churches the medals won at Waterloo were hung up by those of Grossbehren and Leipzig."
sen_enc = tokenizer.encode(sen)

# .tokens holds the string pieces, including the [CLS]/[SEP] added by the
# post-processor.
encoded_tokens = sen_enc.tokens
print(f"Output: {encoded_tokens}")

Output: ['[CLS]', 'in', 'the', 'vill', 'age', 'chur', 'ches', 'the', 'med', 'als', 'won', 'at', 'water', 'loo', 'were', 'hung', 'u', 'p', 'by', 'those', 'of', 'gros', 's', 'beh', 'ren', 'and', 'le', 'i', 'p', 'z', 'ig', '.', '[SEP]']
