## Building a WordPiece tokenizer from scratch

- Create a `Tokenizer` object with a model, then set its `normalizer`, `pre_tokenizer`, `post_processor`, and `decoder` attributes to the values we want.

In [57]:
from tokenizers  import (
    decoders, 
    models,
    normalizers,
    pre_tokenizers, 
    processors, 
    trainers,
    Tokenizer,
)
# Start from an untrained WordPiece model; "[UNK]" is the token used for
# pieces that are not in the vocabulary.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization pipeline, applied in order: NFD, lowercase, strip accents.
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

# Pre-tokenization pipeline, applied in order: whitespace split, then
# punctuation handling.
pre_tokenization_steps = [
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
]
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pre_tokenization_steps)

# Trainer configuration: target vocabulary size plus the special tokens that
# are handed to the trainer.
special_tokens = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']
trainer = trainers.WordPieceTrainer(
    vocab_size=35000,
    special_tokens=special_tokens,
)

# train the tokenizer
# Reset to an untrained WordPiece model so training starts from a clean state.
tokenizer.model = models.WordPiece(unk_token="[UNK]")
# Train on both corpora in ONE call. Each `Tokenizer.train` call rebuilds the
# model's vocabulary from the files it is given, so training on the English
# file and then separately on the French file would leave a vocabulary learned
# from the French file only.
training_files = [
    '/Data/deeksha/disha/code_p/transformers/train.en',
    '/Data/deeksha/disha/code_p/transformers/train.fr',
]
tokenizer.train(training_files, trainer=trainer)

# post-processing
# Look up the ids assigned to the special tokens so the template below can
# insert them around encoded sequences.
cls_token_id = tokenizer.token_to_id('[CLS]')
sep_token_id = tokenizer.token_to_id('[SEP]')
# Template: a single sequence becomes "[CLS] A [SEP]" (all type id 0); a pair
# becomes "[CLS] A [SEP] B [SEP]" with the second segment marked as type id 1.
# The templates contain no placeholders, so they are plain strings rather than
# f-strings.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
# Encode a sample sentence with the fully configured pipeline.
encoding = tokenizer.encode("Hello how are you?")

## decoder
# The WordPiece decoder is configured with the "##" continuation prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")
# NOTE: the decoded string is not stored or printed here; as a bare expression
# it is only displayed when it is the last expression of a notebook cell.
tokenizer.decode(encoding.ids)

## Persist the tokenizer to a JSON file, then reload it from that same file.
tokenizer_path = "tokenizer.json"

# Write the tokenizer to disk.
tokenizer.save(tokenizer_path)

# Build a new tokenizer object from the saved file.
new_tokenizer = Tokenizer.from_file(tokenizer_path)







