In [1]:
import sys
sys.path.append('..')
import pickle
import psutil
import time
from tokenization_utils import train_bpe

# --- Training configuration ---
# Active settings: a small vocabulary (500) on the validation split. The
# commented-out assignments are alternative settings kept for manual switching.
# NOTE(review): the commented-out train path below starts with "../data" while
# the other paths start with "../../data" — confirm the prefix before enabling it.
# input_path = "../data/TinyStoriesV2-GPT4-train.txt"
input_path = "../../data/TinyStoriesV2-GPT4-valid.txt"
# vocab_size = 10_000
# input_path = "../../data/sampled_50k.txt"
vocab_size = 500
# Passed to train_bpe as `special_tokens` in the next cell.
special_tokens = ["<|endoftext|>"]

# Baseline resident set size (RSS) of this process, sampled before training so
# the next cell can report the increase. Bytes are divided by 1024 ** 3, so the
# figure labelled "GB" is strictly GiB.
process = psutil.Process()
memory_before = process.memory_info().rss / (1024 ** 3)  # in GB

In [2]:
    # Train the BPE tokenizer and time the call. perf_counter() is a monotonic,
    # high-resolution clock, so the measured duration cannot be skewed by
    # system clock adjustments (NTP sync, manual changes) during training.
    start_time = time.perf_counter()
    vocab, merges = train_bpe(
        input_path=input_path,
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        num_processes=None,  # None: use all CPUs
        use_naive_merge=False
    )
    elapsed_time = time.perf_counter() - start_time

    # RSS growth of this process relative to the baseline taken in the setup
    # cell (bytes / 1024 ** 3).
    # NOTE(review): this is a before/after difference, not a peak, and it is
    # sampled on the notebook process only — presumably it excludes memory held
    # by worker processes spawned by train_bpe; confirm before quoting it.
    memory_after = process.memory_info().rss / (1024 ** 3)  # in GB
    memory_used = memory_after - memory_before

    # Serialize vocab and merges into the current working directory.
    with open("bpe_vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)

    with open("bpe_merges.pkl", "wb") as f:
        pickle.dump(merges, f)

    # The tokens are read from the dict keys here (the recorded run decodes
    # them as bytes), i.e. vocab is treated as {raw_token: id}.
    # NOTE(review): the Tokenizer cell further down reads tokens from
    # vocab.values() instead — confirm the two APIs really use opposite layouts.
    longest_token = max(vocab.keys(), key=len)

    print(f"Training completed in {elapsed_time / 60:.2f} minutes")
    print(f"Memory used: {memory_used:.2f} GB")
    print(f"Longest token length: {len(longest_token)} bytes")
    print(f"Longest token (decoded): {longest_token.decode('utf-8', errors='replace')}")

[merge_with_heap] took 31.64 seconds.
[train_bpe] took 32.14 seconds.
Training completed in 0.54 minutes
Memory used: 0.93 GB
Longest token length: 42 bytes
Longest token (decoded): Once upon a time, there was a little girl 


In [2]:
# NOTE(review): the output recorded for this cell is
# "NameError: name 'Iterable' is not defined". Nothing in this cell references
# `Iterable`, so the error presumably originates inside the `tokenizer` module
# (e.g. a missing `from typing import Iterable`) — confirm and fix it there.
from tokenizer import Tokenizer

# Same configuration as the train_bpe run above, repeated so this cell can be
# executed on its own once the imports cell has run.
# input_path = "../data/TinyStoriesV2-GPT4-train.txt"
# vocab_size = 10_000
input_path = "../../data/TinyStoriesV2-GPT4-valid.txt"
vocab_size = 500
special_tokens = ["<|endoftext|>"]

# Fresh RSS baseline for this run (bytes / 1024 ** 3).
process = psutil.Process()
memory_before = process.memory_info().rss / (1024 ** 3)  # in GB

# Train tokenizer. perf_counter() is monotonic, so the elapsed time cannot be
# distorted by system clock adjustments while training runs.
start_time = time.perf_counter()
tokenizer = Tokenizer.train(input_path=input_path, vocab_size=vocab_size, special_tokens=special_tokens)
elapsed_time = time.perf_counter() - start_time

# Before/after RSS difference of the notebook process; not a peak measurement.
memory_after = process.memory_info().rss / (1024 ** 3)  # in GB
memory_used = memory_after - memory_before

# Serialize vocab and merges.
# NOTE(review): these are the same file names the train_bpe cell writes, so
# running this cell overwrites that cell's pickles — confirm this is intended.
with open("bpe_vocab.pkl", "wb") as f:
    pickle.dump(tokenizer.vocab, f)

with open("bpe_merges.pkl", "wb") as f:
    pickle.dump(tokenizer.merges, f)

# Tokens are read from the dict values here, i.e. tokenizer.vocab is treated as
# {id: raw_token}; the longest one is picked by length.
longest_token = max(tokenizer.vocab.values(), key=len)

print(f"Training completed in {elapsed_time / 60:.2f} minutes")
print(f"Memory used: {memory_used:.2f} GB")
print(f"Longest token length: {len(longest_token)} bytes")
print(f"Longest token (decoded): {longest_token.decode('utf-8', errors='replace')}")

NameError: name 'Iterable' is not defined