In [19]:
import regex as re
word_pattern = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
test_str = "some text that i'll pre-tokenize"
for m in re.finditer(word_pattern, test_str):
    word = m.group()
    print(word)

some
 text
 that
 i
'll
 pre
-
tokenize


In [20]:
test_str = '''u don't have to be scared of the loud dog, I'll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.
<|endoftext|>
Once upon a time<|endoftext|>'''
special_tokens = ["<|endoftext|>"]
special_tokens = list(map(lambda x: re.escape(x), special_tokens))
split_pattern = "|".join(special_tokens)
for m in re.splititer(split_pattern, test_str):
    line = m
    print(f"LINE: {line}")


LINE: u don't have to be scared of the loud dog, I'll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.

LINE: 
Once upon a time
LINE: 


In [21]:
import os
from typing import BinaryIO


def find_chunk_boundaries(
    file: BinaryIO,
    desired_num_chunks: int,
    split_special_token: bytes,
) -> list[int]:
    """
    Chunk the file into parts that can be counted independently.
    May return fewer chunks if the boundaries end up overlapping.
    """
    assert isinstance(split_special_token, bytes), "Must represent special token as a bytestring"

    # Get total file size in bytes
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)

    chunk_size = file_size // desired_num_chunks

    # Initial guesses for chunk boundary locations, uniformly spaced
    # Chunks start on previous index, don't include last index
    chunk_boundaries = [i * chunk_size for i in range(desired_num_chunks + 1)]
    chunk_boundaries[-1] = file_size

    mini_chunk_size = 4096  # Read ahead by 4k bytes at a time

    for bi in range(1, len(chunk_boundaries) - 1):
        initial_position = chunk_boundaries[bi]
        file.seek(initial_position)  # Start at boundary guess
        while True:
            mini_chunk = file.read(mini_chunk_size)  # Read a mini chunk

            # If EOF, this boundary should be at the end of the file
            if mini_chunk == b"":
                chunk_boundaries[bi] = file_size
                break

            # Find the special token in the mini chunk
            found_at = mini_chunk.find(split_special_token)
            if found_at != -1:
                chunk_boundaries[bi] = initial_position + found_at
                break
            initial_position += mini_chunk_size

    # Make sure all boundaries are unique, but might be fewer than desired_num_chunks
    return sorted(set(chunk_boundaries))

In [None]:
from rich.progress import track
word_count_dict = dict[str, int]()
def process_doc(doc: str) -> int:
    count = 0
    for m in re.finditer(word_pattern, doc):
        word = m.group()
        key = word
        count += 1
        if  key in word_count_dict:
            word_count_dict[key] += 1
        else:
            word_count_dict[key] = 1
    return count

## Usage
with open("../data/TinyStoriesV2-GPT4-valid.txt", "rb") as f:
        num_processes = 4
        boundaries = find_chunk_boundaries(f, num_processes, b"<|endoftext|>")

        # The following is a serial implementation, but you can parallelize this
        # by sending each start/end pair to a set of processes.
        for start, end in zip(boundaries[:-1], boundaries[1:]):
            f.seek(start)
            chunk = f.read(end - start).decode("utf-8", errors="ignore")

            # Run pre-tokenization on your chunk and store the counts for each pre-token
            for t, doc in enumerate(track(re.split(split_pattern, chunk))):
                count = process_doc(doc)
            break

print(f"Total unique pre-tokens: {len(word_count_dict)}")

Total unique pre-tokens: 8307


{'u': 1,
 ' don': 658,
 "'t": 2860,
 ' have': 2681,
 ' to': 37577,
 ' be': 3177,
 ' scared': 1552,
 ' of': 6277,
 ' the': 52454,
 ' loud': 499,
 ' dog': 3353,
 ',': 59196,
 ' I': 4678,
 "'ll": 110,
 ' protect': 59,
 ' you': 7056,
 '".': 217,
 ' The': 11679,
 ' mole': 59,
 ' felt': 2208,
 ' so': 4536,
 ' safe': 662,
 ' with': 10675,
 ' little': 5984,
 ' girl': 3187,
 '.': 105299,
 ' She': 9697,
 ' was': 27162,
 ' very': 6320,
 ' kind': 788,
 ' and': 49225,
 ' soon': 455,
 ' came': 2037,
 ' trust': 45,
 ' her': 10142,
 ' He': 12295,
 ' leaned': 19,
 ' against': 26,
 ' she': 5443,
 ' kept': 397,
 ' him': 2935,
 ' had': 7222,
 ' found': 2556,
 ' his': 9649,
 ' best': 1165,
 ' friend': 2412,
 '\n': 38176,
 'Once': 4426,
 ' upon': 4193,
 ' a': 38047,
 ' time': 5584,
 ' in': 9736,
 ' warm': 449,
 ' sunny': 387,
 ' place': 487,
 ' there': 5411,
 ' big': 8648,
 ' pit': 57,
 ' A': 382,
 ' boy': 2706,
 ' named': 5264,
 ' Tom': 4778,
 ' liked': 2287,
 ' play': 5950,
 ' near': 672,
 ' One': 2236,
 