In [10]:
# test chunking

import os
from typing import BinaryIO
from find_chunk_boundaries import find_chunk_boundaries

# Path to the input text file, relative to the current working directory
# (one level up, inside the "data" folder).
TEST_FILE = os.path.join("..", "data", "TinyStoriesV2-GPT4-valid.txt")
# Byte sequence handed to find_chunk_boundaries as its split_special_token.
SPLIT_TOKEN = b"<|endoftext|>"

def test_find_chunk_boundaries():
    """Print the first chunk of TEST_FILE and its size in bytes.

    Opens TEST_FILE in binary mode and asks ``find_chunk_boundaries`` for
    boundaries (requesting 20000 chunks, split on SPLIT_TOKEN). The bytes
    between the first two boundaries are read and printed. If only one
    boundary comes back, the chunk runs from that boundary to end of file.

    This is a smoke check: it prints results for inspection and asserts
    nothing.
    """
    with open(TEST_FILE, "rb") as f:
        boundaries = find_chunk_boundaries(
            f, desired_num_chunks=20000, split_special_token=SPLIT_TOKEN
        )

        # Boundaries are used as file offsets (they are passed to seek()).
        start = boundaries[0]
        # None is the sentinel for "no second boundary": read to end of file.
        end = boundaries[1] if len(boundaries) > 1 else None

        # Compare against None explicitly. A plain truthiness check would
        # mistake an offset of 0 for the sentinel and read the whole file.
        f.seek(start)
        chunk = f.read(end - start if end is not None else None)
        # errors="ignore" drops undecodable bytes instead of raising.
        print(chunk.decode("utf-8", errors="ignore"))

        # Size comes from the offsets, or from the file size when the chunk
        # extends to end of file.
        chunk_size = end - start if end is not None else os.path.getsize(TEST_FILE) - start
        print(f"Chunk size: {chunk_size} bytes")
        

# Run the check when this code executes as the top-level module; inside the
# notebook cell this condition holds, so the function runs immediately.
if __name__ == "__main__":
    test_find_chunk_boundaries()

u don't have to be scared of the loud dog, I'll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.
<|endoftext|>
Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.
Tom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."
Sam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."
They went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could hear t

In [5]:
# test the pretokenizer on the first chunk

import os
from typing import BinaryIO
from find_chunk_boundaries import find_chunk_boundaries
from pretokenization import pretokenize

# Path to the input text file, relative to the current working directory
# (one level up, inside the "data" folder).
TEST_FILE = os.path.join("..", "data", "TinyStoriesV2-GPT4-valid.txt")
# Byte sequence used as the chunk split token; it is also decoded to str and
# passed to pretokenize as the only special token.
SPLIT_TOKEN = b"<|endoftext|>"

def test_pretokenize_first_chunk():
    """Pretokenize the first chunk of TEST_FILE and print summary statistics.

    Reads the bytes between the first two boundaries returned by
    ``find_chunk_boundaries`` (or from the first boundary to end of file when
    only one boundary exists), decodes them as UTF-8 and passes the text to
    ``pretokenize`` with the decoded SPLIT_TOKEN as the only special token.

    Prints the chunk size, the number of distinct entries returned by
    ``pretokenize``, the first ten entries, and the ten entries with the
    highest counts. This is a smoke check: it asserts nothing.

    Raises:
        UnicodeDecodeError: if the chunk is not valid UTF-8 (decoding is
            strict here).
    """
    with open(TEST_FILE, "rb") as f:
        boundaries = find_chunk_boundaries(
            f, desired_num_chunks=20000, split_special_token=SPLIT_TOKEN
        )

        # Boundaries are used as file offsets (they are passed to seek()).
        start = boundaries[0]
        # None is the sentinel for "no second boundary": read to end of file.
        end = boundaries[1] if len(boundaries) > 1 else None

        # Compare against None explicitly. A plain truthiness check would
        # mistake an offset of 0 for the sentinel and read the whole file.
        f.seek(start)
        chunk = f.read(end - start if end is not None else None)
        print(f"First chunk size: {len(chunk)} bytes")

        # The result is used as a mapping below (.items()), with values that
        # serve as sortable counts.
        tokens = pretokenize(chunk.decode("utf-8"), special_tokens=[SPLIT_TOKEN.decode("utf-8")])
        print(f"Number of tokens in first chunk: {len(tokens)}")
        print(f"First 10 tokens: {list(tokens.items())[:10]}")

        # Highest count first; sorted() is stable, so ties keep the
        # mapping's iteration order.
        sorted_tokens = sorted(tokens.items(), key=lambda item: item[1], reverse=True)
        print("Top 10 most frequent pretokens:")
        for token, count in sorted_tokens[:10]:
            print(f"Pretoken: {token}, Count: {count}")

# Run the check when this code executes as the top-level module; inside the
# notebook cell this condition holds, so the function runs immediately.
if __name__ == "__main__":
    test_pretokenize_first_chunk()

First chunk size: 3035 bytes
Number of tokens in first chunk: 256
First 10 tokens: [((b'u',), 1), ((b' don',), 1), ((b"'t",), 1), ((b' have',), 3), ((b' to',), 21), ((b' be',), 1), ((b' scared',), 5), ((b' of',), 6), ((b' the',), 28), ((b' loud',), 2)]
Top 10 most frequent pretokens:
Pretoken: (b'.',), Count: 69
Pretoken: (b' and',), Count: 31
Pretoken: (b' the',), Count: 28
Pretoken: (b',',), Count: 26
Pretoken: (b' to',), Count: 21
Pretoken: (b'\n',), Count: 21
Pretoken: (b' They',), Count: 16
Pretoken: (b' was',), Count: 13
Pretoken: (b' a',), Count: 11
Pretoken: (b' Tom',), Count: 11
