In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence
from tokenizers.pre_tokenizers import BertPreTokenizer
from itertools import islice
from multiprocessing import Pool, cpu_count
import os
from transformers import AutoTokenizer
from huggingface_hub import login

import os
import re

import multiprocessing as mp

# Set TOKENIZERS_PARALLELISM before any tokenizer is created or used.
# NOTE(review): presumably this switches off the tokenizers library's own
# parallelism so it does not clash with the multiprocessing Pool used later in
# this notebook -- confirm against the library documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Tokenize dataset

In [None]:
# Size tag of the BPE tokenizer under test. It is interpolated as
# "{tokenizer_size}k" into the vocab/merges input paths and into the name of
# the JSON file written at the end of the notebook.
tokenizer_size = 10

In [None]:
# Plain-text corpus files whose lines are tokenized and counted.
train_text_list = [
    "/kaggle/input/ubettextfiles/ubertext_court.txt",
    "/kaggle/input/ubettextfiles/ubertext_fiction.txt",
    "/kaggle/input/ubettextfiles/ubertext_social.txt",
    "/kaggle/input/ubettextfiles/ubertext_wikipedia.txt",
    "/kaggle/input/ubettextfiles/ubertext_news.txt",
]

In [None]:
# Locations of the BPE merges and vocabulary files; the directory name is
# selected by `tokenizer_size` (e.g. 10 -> ".../bpe_10k_full/").
MERGES_PATH = f"/kaggle/input/morphemetokenizers/bpe_{tokenizer_size}k_full/merges.txt"
VOCAB_PATH = f"/kaggle/input/morphemetokenizers/bpe_{tokenizer_size}k_full/vocab.json"

In [None]:
# Number of raw lines read from a file and tokenized in one call.
BATCH_SIZE = 512
# Size of the worker pool: one process per CPU reported by cpu_count().
NUM_WORKERS = cpu_count()
# Matches every character that should be removed from a line, i.e. anything that
# is NOT: a letter in the Cyrillic ranges а-я / А-Я, one of the Ukrainian letters
# і, ї, є, ґ (either case), a digit, whitespace, or one of . , ! ? " ' ( ) -
RE_PATTERN = re.compile(r"[^а-яА-ЯіІїЇєЄґҐ0-9\s.,!?\"'()-]")

In [None]:
def batch_line_generator(file, batch_size):
    """Yield batches of cleaned, non-empty lines read from ``file``.

    Up to ``batch_size`` raw lines are consumed per step, each is passed
    through ``clean_line`` and lines that come back falsy are dropped.

    Args:
        file: Iterator over raw text lines (e.g. an open text file).
        batch_size: Maximum number of raw lines consumed per batch.

    Yields:
        list: The cleaned lines of one batch. A batch may contain fewer than
        ``batch_size`` entries but is never empty.
    """
    while True:
        raw_lines = list(islice(file, batch_size))
        # Stop only when the input itself is exhausted. Testing the *cleaned*
        # batch instead would end iteration early as soon as one whole batch of
        # raw lines happened to be blank or fully filtered out, silently
        # skipping the remainder of the file.
        if not raw_lines:
            break
        lines = [cleaned for cleaned in map(clean_line, raw_lines) if cleaned]
        if lines:
            yield lines

def clean_line(line):
    """Remove disallowed characters from ``line`` and trim surrounding whitespace.

    Every character matched by ``RE_PATTERN`` is deleted. Whitespace is trimmed
    *after* the substitution so that a line consisting only of removed
    characters and spaces collapses to nothing instead of surviving as a
    whitespace-only string.

    Args:
        line: One raw line of text.

    Returns:
        The cleaned line, or ``None`` when nothing but whitespace remains.
    """
    cleaned = RE_PATTERN.sub("", line).strip()
    return cleaned or None

def process_file(args):
    """Count the tokens of every cleaned, non-empty line in one text file.

    Two kinds of tokenizer are supported:

    * a callable tokenizer (called as
      ``tokenizer(batch, add_special_tokens=False, return_length=True)`` and
      expected to return a mapping with a ``"length"`` entry), and
    * a non-callable tokenizer exposing ``encode_batch`` whose results carry an
      ``ids`` sequence, such as the raw ``Tokenizer`` built from a BPE model
      elsewhere in this notebook.

    Args:
        args: ``(file_path, tokenizer)`` tuple; packed into one argument so the
            function can be used with ``Pool.map``.

    Returns:
        list: One token count per line yielded by ``batch_line_generator``,
        in file order.
    """
    file_path, tokenizer = args
    token_counts = []
    # Decide once which API to use; it cannot change between batches.
    use_call_api = callable(tokenizer)

    # errors="ignore" drops undecodable bytes instead of aborting the file.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f_in:
        for batch in batch_line_generator(f_in, BATCH_SIZE):
            if use_call_api:
                encoded_batch = tokenizer(batch, add_special_tokens=False, return_length=True)
                token_counts.extend(encoded_batch["length"])  # use precomputed lengths
            else:
                encodings = tokenizer.encode_batch(batch, add_special_tokens=False)
                token_counts.extend(len(encoding.ids) for encoding in encodings)

    return token_counts


def parallel_tokenize(file_list, tokenizer):
    """Tokenize several files in a process pool and merge their line counts.

    Each file becomes one ``process_file`` task; tasks are distributed over a
    pool of ``NUM_WORKERS`` processes.

    Args:
        file_list: Paths of the text files to process.
        tokenizer: Tokenizer object handed to every task alongside its path.

    Returns:
        list: Per-line token counts of all files, concatenated in the order of
        ``file_list``.
    """
    tasks = [(path, tokenizer) for path in file_list]

    with Pool(processes=NUM_WORKERS) as pool:
        # map() blocks until every file is done and keeps the input order.
        per_file_counts = pool.map(process_file, tasks)

    # Flatten the list of per-file lists into a single list.
    return [count for counts in per_file_counts for count in counts]

In [None]:
# Authenticate against the Hugging Face Hub before downloading the tokenizer.
# The access token is read from the environment rather than written into the
# notebook, so the secret never ends up in a saved or shared copy.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set; provide a Hugging Face access token via the "
        "HF_TOKEN environment variable before running this cell."
    )
login(token=hf_token)

# NOTE: a later cell assigns a different object to the same `tokenizer` name;
# whichever of the two cells ran last determines what gets measured.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

In [None]:
# Load the BPE model from its vocab/merges files. BPE.from_file is the
# supported way to build a model from paths; handing file paths directly to the
# BPE(...) constructor is deprecated and emits a warning.
bpe_model = BPE.from_file(VOCAB_PATH, MERGES_PATH, unk_token="[UNK]")

# NOTE: this rebinds `tokenizer`, replacing any tokenizer assigned by an
# earlier cell.
tokenizer = Tokenizer(bpe_model)
# Normalization: NFD Unicode decomposition followed by lowercasing.
# NOTE(review): presumably this mirrors the settings the BPE model was trained
# with -- confirm against the training notebook.
tokenizer.normalizer = Sequence([NFD(), Lowercase()])
tokenizer.pre_tokenizer = BertPreTokenizer()

  bpe_model = BPE(


In [None]:
# Run the full measurement: tokenize every corpus file and report how many
# lines were counted and the mean number of tokens per line.
if __name__ == "__main__":
    token_counts = parallel_tokenize(
        train_text_list,
        tokenizer
    )

    print("Lines:", len(token_counts))
    # Guard the average: with no counted lines the division would raise
    # ZeroDivisionError and hide the real problem (empty or unreadable input).
    if token_counts:
        print("Avg tokens/line:", sum(token_counts) / len(token_counts))
    else:
        print("Avg tokens/line: n/a (no non-empty lines found)")

Lines: 16930386
Avg tokens/line: 30.51459252021779


In [None]:
# Total number of tokens over all counted lines.
sum(token_counts)

516623830

In [None]:
# Number of counted lines (token_counts holds one entry per tokenized line).
len(token_counts)

16930386

In [None]:
import json

# Persist the per-line token counts so they can be analysed without re-running
# the tokenization; the file name carries the tokenizer size tag.
output_path = f'/kaggle/working/tokenized_{tokenizer_size}k.json'
payload = {'token_counts': token_counts}
with open(output_path, 'w') as out_file:
    json.dump(payload, out_file)