# BPE Tokenizer usage exemple

## imports

In [1]:
import re
from json import load
from pathlib import Path

from tokenizer import BPETokenizer

## read and clean data

In [2]:
def clean_text(text:str):
    return re.sub(r'([^\w\s])\1+', r'\1', text)

In [3]:
files = list(Path('corpus/').glob('**/*.json'))[:5000]
corpus = "\n".join([load(file.open())["text"] for file in files])
corpus = clean_text(corpus)

## Instanciate and train BPE

In [5]:
tokenizer = BPETokenizer()
tokenizer.train(corpus, n_merges=1000)

tokenized 6,866,738 basic tokens
Counted 269,051 different basic tokens
Initial tokens: 256


100%|██████████| 1000/1000 [06:11<00:00,  2.69it/s]

Tokens 1,256





## Test BPE Tokenizer

In [9]:
tokens_ids = tokenizer.encode('Oi como você está?')
tokens_ids

[79, 105, 775, 111, 366, 360, 379, 834, 289, 63]

In [10]:
tokenizer.convert_ids_to_tokens(tokens_ids)

['O', 'i', ' com', 'o', ' v', 'oc', 'ê', ' est', 'á', '?']

In [11]:
tokenizer.decode(tokens_ids)

'Oi como você está?'