In [14]:
import os
import lzma
from tqdm import tqdm
import time

from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

from tokenizers.processors import TemplateProcessing

from tokenizers.trainers import BpeTrainer

vocab_size = 5000

In [12]:
special_tokens=["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]"]
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
    ],
)

In [13]:
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([NFD(),Lowercase(),StripAccents()])
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
tokenizer.post_processor=temp_proc

In [15]:
def xz_files_in_dir(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".xz") and os.path.isfile(os.path.join(directory, filename)):
            files.append(filename)
    return files

In [16]:
folder_path = "V:/llm-project/datasets/openwebtext"
vocab_file = "vocab.txt"

files = xz_files_in_dir(folder_path)

In [17]:
trainer = BpeTrainer(vocab_size=vocab_size,special_tokens=special_tokens)

start_time = time.time()

with open(vocab_file, "w", encoding="utf-8") as outfile:
    for filename in tqdm(files, total=len(files)):
        file_path = os.path.join(folder_path, filename)
        with lzma.open(file_path, "rt", encoding="utf-8") as infile:
            text = infile.read()
            tokenizer.train_from_iterator(text, trainer=trainer)
            # outfile.write(text)
            # char = set(text)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"{elapsed_time:.8f}")

In [18]:
print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

Trained vocab size: 1153


In [20]:
sen = "hello who is this, taliban?"
sen_enc=tokenizer.encode(sen)
print(f"Output: {sen_enc.tokens}")

Output: ['[CLS]', 'e', 'l', 'l', 's', 't', 's', 't', 'a', 'l', 'b', 'a', '[SEP]']
