In [6]:
# test chunking

import os
from typing import BinaryIO
from find_chunk_boundaries import find_chunk_boundaries

TEST_FILE = os.path.join("..", "tests", "fixtures", "tinystories_sample.txt")
SPLIT_TOKEN = b"<|endoftext|>"
def test_find_chunk_boundaries():
    with open(TEST_FILE, "rb") as f:
        boundaries = find_chunk_boundaries(f, desired_num_chunks=2, split_special_token=SPLIT_TOKEN)
    
        # output the first chunk
        start = boundaries[0]
        end = boundaries[1] if len(boundaries) > 1 else None
        f.seek(start)
        chunk = f.read(end - start if end else None)
        print(chunk.decode("utf-8", errors="ignore"))

        # check the size of the chunk
        chunk_size = end - start if end else os.path.getsize(TEST_FILE) - start
        print(f"Chunk size: {chunk_size} bytes")
        return chunk
        

if __name__ == "__main__":
    chunk = test_find_chunk_boundaries()


Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!
He said, “Wow, that is a really amazing vase! Can I buy it?”
The shopkeeper smiled and said, “Of course you can. You can take it home and show all your friends how amazing it is!”
So Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn't believe how lucky Ben was.
And that's how Ben found an amazing vase in the store!
<|endoftext|>
Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.
One day, Ollie's mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish. He saw his friend,

In [11]:
import regex as re
chunk = str(chunk)
special_tokens = [SPLIT_TOKEN.decode("utf-8")]
re.split(f"{'|'.join(re.escape(special_token) for special_token in special_tokens)}",chunk)

["b'\\nOnce upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!\\nHe said, \\xe2\\x80\\x9cWow, that is a really amazing vase! Can I buy it?\\xe2\\x80\\x9d\\nThe shopkeeper smiled and said, \\xe2\\x80\\x9cOf course you can. You can take it home and show all your friends how amazing it is!\\xe2\\x80\\x9d\\nSo Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn\\'t believe how lucky Ben was.\\nAnd that\\'s how Ben found an amazing vase in the store!\\n",
 '\\nOnce upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.\\nOne day, Ollie\\\'s mom said, "Ollie, hurry and g

In [1]:
# test the pretonizer on the first chunk

import os
from typing import BinaryIO
from find_chunk_boundaries import find_chunk_boundaries
from pretokenization import pretokenize

TEST_FILE = os.path.join("..", "tests", "fixtures", "tinystories_sample.txt")
SPLIT_TOKEN = b"<|endoftext|>"

def test_pretokenize_first_chunk():
    with open(TEST_FILE, "rb") as f:
        boundaries = find_chunk_boundaries(f, desired_num_chunks=2, split_special_token=SPLIT_TOKEN)
        
        # read the first chunk
        start = boundaries[0]
        end = boundaries[1]
        f.seek(start)
        chunk = f.read(end - start)
        print(f"Chunk size: {len(chunk)} bytes")
        
        # pretokenize the chunk
        tokens = pretokenize(
            file_dir=TEST_FILE,
            start=start,
            end=end,
            special_tokens=[SPLIT_TOKEN.decode("utf-8")]
        )
        return tokens
        print(f"Number of tokens in first chunk: {len(tokens)}")
        print(f"First 10 tokens: {list(tokens.items())[:10]}")

        # check the most frequent pretokens
        sorted_tokens = sorted(tokens.items(), key=lambda item: item[1], reverse=True)
        print("pretokens:")
        for token, count in sorted_tokens[:]:
            print(f"Pretoken: {token}, Count: {count}")

if __name__ == "__main__":
    tokens = test_pretokenize_first_chunk()

Chunk size: 1940 bytes


In [2]:
tokens

{"\nOnce upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!\nHe said, “Wow, that is a really amazing vase! Can I buy it?”\nThe shopkeeper smiled and said, “Of course you can. You can take it home and show all your friends how amazing it is!”\nSo Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn't believe how lucky Ben was.\nAnd that's how Ben found an amazing vase in the store!\n": 1,
 '\nOnce upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.\nOne day, Ollie\'s mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish. He saw his f

In [3]:
# test multiprocessing pretokenization
import os
from typing import BinaryIO
from train_bpe import train_bpe

# TEST_FILE = os.path.join("..", "data", "TinyStoriesV2-GPT4-valid.txt")
TEST_FILE = os.path.join("..", "tests", "fixtures", "german.txt")
vocab, merges = train_bpe(
    input_path=TEST_FILE,
    vocab_size=1000,
    special_tokens=["<|endoftext|>"],
    num_processes=4,
    chunk_num=16
)