# Byte Pair Encoding (BPE) Tokenizer - Testing Notebook

This notebook tests the functionality of the BPE tokenizer you've implemented.

## Setup


Run the cells below to load the necessary modules.


In [None]:
import os
import sys
# Report where the notebook is running from, then put that directory on the
# module search path so modules located there can be imported.
working_dir = os.getcwd()
print("Current working directory:", working_dir)
sys.path.append(working_dir)

In [None]:
from regex_tokenizer import RegexTokenizer
from basic import BasicTokenizer
# Load the sample text. The encoding is passed explicitly so the file is
# decoded as UTF-8 on every platform instead of falling back to the
# locale-dependent default (which can mis-decode or raise on Windows).
sample_text_path = "sample_text.txt"
with open(sample_text_path, "r", encoding="utf-8") as file:
    sample_text = file.read()

# Preview only the first 200 characters to keep the cell output short.
print("Sample Text Loaded:")
print(sample_text[:200])


### Please complete the tokenizer code before running the cells below.

In [None]:
import os
import time

# Train each tokenizer class on the sample text and save it under models/.

# Read the text inside a context manager so the file handle is closed
# promptly instead of being left for the garbage collector.
with open("sample_text.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
os.makedirs("models", exist_ok=True)

# perf_counter() is the clock intended for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
t0 = time.perf_counter()
for TokenizerClass, name in ((BasicTokenizer, "basic"), (RegexTokenizer, "regex")):
    tokenizer = TokenizerClass()
    # NOTE(review): 512 is presumably the target vocabulary size -- confirm
    # against the building_merges signature.
    tokenizer.building_merges(text, 512, verbose=True)
    prefix = os.path.join("models", name)
    tokenizer.save(prefix)
t1 = time.perf_counter()

# The measured span covers both building the merges and saving the models.
print(f"Merging took {t1 - t0:.2f} seconds")

In [None]:
# Push one short sentence through both tokenizers and print every stage.
# Note: this rebinds sample_text, replacing whatever it held before.
sample_text = "This is an example sentence to encode."

# Restore each tokenizer from its model file under models/.
# NOTE(review): presumably these are the files written by save() in the
# training cell -- confirm that save() appends the ".model" suffix.
basic_tokenizer = BasicTokenizer()
basic_tokenizer.load("models/basic.model")
regex_tokenizer = RegexTokenizer()
regex_tokenizer.load("models/regex.model")

# Encode with both tokenizers first, then show the two results together.
encoded_basic, encoded_regex = (
    basic_tokenizer.encode(sample_text),
    regex_tokenizer.encode(sample_text),
)
print("Encoded with Basic Tokenizer:", encoded_basic)
print("Encoded with Regex Tokenizer:", encoded_regex)

# Decode each encoding with the tokenizer that produced it.
decoded_basic, decoded_regex = (
    basic_tokenizer.decode(encoded_basic),
    regex_tokenizer.decode(encoded_regex),
)
print("Decoded with Basic Tokenizer:", decoded_basic)
print("Decoded with Regex Tokenizer:", decoded_regex)
