In [17]:
import os
import datasets

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder, BPEDecoder
from transformers import PreTrainedTokenizerFast

In [6]:
# Notebook-wide configuration: training corpus, output directory, and the
# default vocabulary budget used by the tokenizer-training cells.
dataset = datasets.load_dataset("ag_news")
# Materialise the "text" column of the train split once so every trainer can
# iterate over the same in-memory list.
tokenizer_train = list(dataset["train"]["text"])
tokenizer_path = "tokenizers"
vocab_size = 10000

# The training cells save JSON files into `tokenizer_path`; `Tokenizer.save`
# does not create missing directories, so on a fresh checkout the first save
# would fail. Create the directory up front (no-op if it already exists).
os.makedirs(tokenizer_path, exist_ok=True)

In [7]:
# BPE tokenizer with no pre-tokenizer attached: the trainer consumes the raw
# strings in `tokenizer_train` exactly as they are.
char_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
char_tokenizer.decoder = BPEDecoder()

# The special tokens are handed to the trainer together with the shared
# `vocab_size` budget from the configuration cell.
trainer = BpeTrainer(
    special_tokens=["<pad>", "<unk>", "<mask>"],
    vocab_size=vocab_size,
    show_progress=True,
)
char_tokenizer.train_from_iterator(tokenizer_train, trainer=trainer)

# Persist the trained tokenizer as JSON under `tokenizer_path`.
char_tokenizer_file = os.path.join(tokenizer_path, "char_tokenizer.json")
char_tokenizer.save(char_tokenizer_file)

In [8]:
# Byte-level BPE tokenizer: ByteLevel pre-tokenization (no prefix space added)
# paired with the matching ByteLevel decoder.
byte_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
byte_tokenizer.decoder = ByteLevelDecoder()
byte_tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)

# Seed the trainer with the full ByteLevel alphabet and the special tokens;
# the overall size is capped by the shared `vocab_size`.
trainer = BpeTrainer(
    special_tokens=["<pad>", "<unk>", "<mask>"],
    initial_alphabet=ByteLevel.alphabet(),
    vocab_size=vocab_size,
    show_progress=True,
)
byte_tokenizer.train_from_iterator(tokenizer_train, trainer=trainer)

# Persist the trained tokenizer as JSON under `tokenizer_path`.
byte_tokenizer_file = os.path.join(tokenizer_path, "byte_tokenizer.json")
byte_tokenizer.save(byte_tokenizer_file)

In [None]:
# Load the saved byte-level tokenizer file through the transformers
# fast-tokenizer wrapper, declaring which entries act as unk/pad/mask.
hf_tokenizer_file = os.path.join(tokenizer_path, "byte_tokenizer.json")
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=hf_tokenizer_file,
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

In [15]:
# Round trip: encode a sample containing a non-ASCII character, then decode
# the resulting ids back to text.
sample_ids = hf_tokenizer("HelloØ")["input_ids"]
hf_tokenizer.decode(sample_ids)

'HelloØ'

In [16]:
# Display the vocabulary size reported by the wrapped tokenizer.
hf_tokenizer.vocab_size

10000

In [None]:
# Byte-level tokenizer whose vocabulary budget is exactly the ByteLevel
# alphabet plus the special tokens.
raw_byte_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
raw_byte_tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
raw_byte_tokenizer.decoder = ByteLevelDecoder()

raw_byte_special_tokens = ["<pad>", "<unk>", "<mask>"]
raw_byte_alphabet = ByteLevel.alphabet()
# Use a cell-specific name instead of rebinding the notebook-wide `vocab_size`:
# overwriting it here made any later re-run of the earlier training cells
# silently train with this much smaller budget.
# For the 256-symbol byte-level alphabet and 3 special tokens this is 259.
raw_byte_vocab_size = len(raw_byte_alphabet) + len(raw_byte_special_tokens)

trainer = BpeTrainer(
    vocab_size=raw_byte_vocab_size,
    special_tokens=raw_byte_special_tokens,
    initial_alphabet=raw_byte_alphabet,
    show_progress=True,
)
raw_byte_tokenizer.train_from_iterator(tokenizer_train, trainer)
raw_byte_tokenizer.save(os.path.join(tokenizer_path, "raw_byte_tokenizer.json"))


In [48]:
# Check the size of the tokenizer trained in the preceding cell. The original
# cell queried `byte_tokenizer`, which is trained with vocab_size=10000 and so
# cannot produce the recorded 259 on a top-to-bottom run.
raw_byte_tokenizer.get_vocab_size()

259

In [53]:
# Rebind `raw_byte_tokenizer` to a transformers fast-tokenizer wrapper loaded
# from the saved JSON file, with the unk/pad/mask tokens declared explicitly.
raw_byte_tokenizer_file = os.path.join(tokenizer_path, "raw_byte_tokenizer.json")
raw_byte_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=raw_byte_tokenizer_file,
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

In [54]:
# Rebind `byte_tokenizer` to a transformers fast-tokenizer wrapper loaded from
# the saved JSON file, with the unk/pad/mask tokens declared explicitly.
byte_tokenizer_json = os.path.join(tokenizer_path, "byte_tokenizer.json")
byte_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=byte_tokenizer_json,
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

In [55]:
# Encode the sample string with the alphabet-only tokenizer and display the
# full encoding (input ids, token type ids, attention mask).
raw_byte_tokenizer(text="HelloØ")

{'input_ids': [42, 71, 78, 78, 81, 130, 249], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [56]:
# Encode the same sample string with the larger byte-level tokenizer for
# comparison against the alphabet-only encoding.
byte_tokenizer(text="HelloØ")

{'input_ids': [42, 9533, 130, 249], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [50]:
# Round-trip check repeated: the ids produced for the sample string are
# decoded straight back to text.
hf_tokenizer.decode(hf_tokenizer.encode("HelloØ"))

'HelloØ'