In [1]:
import sys
from pathlib import Path
from typing import Union

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from tokenizers import BertWordPieceTokenizer

## Training the Tokenizer
We will use our `enwiki` data (from the `0_Setup` notebook) to train the tokenizer. Note that the training cell below currently reads `big.txt` and `Movie_Reviews.txt`.

In [4]:
# Training corpora, given as paths relative to the working directory.
# big.txt can be fetched with: !wget -c https://norvig.com/big.txt
files = [
    "big.txt",
    "Movie_Reviews.txt",
]

In [5]:
%%time 
tokenizer = BertWordPieceTokenizer(
    clean_text=True, strip_accents=False, lowercase=True,
)

CPU times: user 83 µs, sys: 38 µs, total: 121 µs
Wall time: 125 µs


In [6]:
%%time
tokenizer.train(
    files,
    vocab_size=100000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=100,
    wordpieces_prefix="##",
)

CPU times: user 1min 28s, sys: 25 s, total: 1min 53s
Wall time: 1min 1s


In [7]:
# Write the trained vocabulary into the current directory using the
# "bert-wordpiece" name; the recorded output lists ./bert-wordpiece-vocab.txt.
# NOTE(review): newer `tokenizers` releases appear to expose this operation as
# `save_model(directory, name)` — confirm against the installed version.
tokenizer.save("./", "bert-wordpiece")

['./bert-wordpiece-vocab.txt']

## Get data for Tokenizer

In [8]:
def read_file(filepath: Union[str, Path] = "Movie_Reviews.txt", encoding: str = "utf-8") -> str:
    """Read an entire text file into memory and return its contents.

    Args:
        filepath: Location of the file, given as a string or ``Path``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full contents of the file as a single string.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
    """
    # read_text opens, reads, and closes the file in one call.
    return Path(filepath).read_text(encoding=encoding)

In [9]:
# Load the whole review corpus into memory as a single string.
movie_review_text = read_file(filepath="Movie_Reviews.txt")
# Display (size of the str object in bytes, number of characters).
# sys.getsizeof is the public API for what __sizeof__() reports; the recorded
# output shows roughly two bytes per character for this corpus.
sys.getsizeof(movie_review_text), len(movie_review_text)

(264133428, 132066677)

## Example from the Official Hugging Face Docs

In [10]:
from tokenizers import BertWordPieceTokenizer

# Reload the tokenizer from the vocabulary file written by the save cell above.
tokenizer = BertWordPieceTokenizer(
    "bert-wordpiece-vocab.txt",
    lowercase=True,
)

# Sample sentence containing punctuation and an emoji.
sample_sentence = "Hello, y'all! How are you 😁 ?"
output = tokenizer.encode(sample_sentence)
print(output.tokens)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']


## Run on Our Data

In [11]:
%%time
# Rebuild the tokenizer from the saved vocabulary file, then encode the whole
# review corpus held in `movie_review_text` as one single input.
# NOTE(review): the recorded run took about 1.5 minutes of wall time; if
# per-review encodings are acceptable, encoding smaller pieces may be worth
# trying — TODO confirm the downstream cells do not need a single Encoding.
tokenizer = BertWordPieceTokenizer("bert-wordpiece-vocab.txt", lowercase=True)
output = tokenizer.encode(movie_review_text)

CPU times: user 1min 20s, sys: 16.4 s, total: 1min 36s
Wall time: 1min 36s


## Get Tokens

In [14]:
# Time how long it takes to read the token strings off the encoded corpus
# and keep them in `tokens`.
%time tokens = output.tokens

CPU times: user 7.89 s, sys: 1.39 s, total: 9.28 s
Wall time: 9.35 s
