In [1]:
########################################################################################################################
## -- libraries and packages -- ########################################################################################
########################################################################################################################
import os
import sys
# Append the parent of the current working directory (as an absolute path) to the module
# search path so that `calm` can be imported below.
# NOTE(review): this presumably assumes the notebook is run from a folder one level below
# the project root -- confirm against the repository layout.
sys.path.append(os.path.abspath(".."))
from calm import tokenization

In [2]:
########################################################################################################################
## -- sample paragraphs for pretraining tokenizer and validating it -- #################################################
########################################################################################################################
# tr_text: the paragraph handed to tokenization.Tokenizer(...) in the pretraining cell,
# before generate_vocab() is called.
tr_text = """The sun dipped below the horizon, casting long shadows across the quiet town. Birds chirped their evening songs, and the gentle rustle of leaves filled the air. In the distance, the faint hum of a passing train echoed through the streets, blending with the soft murmur of conversations from open windows. Lanterns flickered to life, illuminating cobblestone paths and the faces of people winding down their day. Even in the stillness, there was a sense of anticipation, as if the night held secrets waiting to be discovered."""
# va_text: a different paragraph that the later cells pass through encoder()/decoder()
# and compare against the decoded result.
va_text = """The library was nearly empty, its tall shelves stretching into dimly lit aisles that smelled faintly of old paper and dust. A single desk lamp glowed in the corner where a student scribbled furiously into a notebook, the sound of pen scratching mixing with the occasional creak of the wooden floor. Outside, rain tapped steadily against the windows, a soft percussion that made the silence inside even heavier. Somewhere deep in the stacks, a book thudded shut, as if reminding anyone listening that the night was far from over."""

In [3]:
########################################################################################################################
## -- pretraining the tokenizer, and validating encoder and decoder performance -- #####################################
########################################################################################################################
# Build a tokenizer from the training paragraph, raise its vocabulary cap to
# base_vocab_size + 64, and generate the vocabulary.
tokenizer = tokenization.Tokenizer(tr_text)
tokenizer.max_vocab_size = tokenizer.base_vocab_size + 64
tokenizer.generate_vocab()

# Run the validation paragraph through encoder() and decoder() exactly once;
# the decoded text is reused for the round-trip check at the end of the cell.
print("original text:", va_text)
tokens = tokenizer.encoder(va_text)
print("tokens extracted:", tokens)
new_text = tokenizer.decoder(tokens)
print("decoded tokens:", new_text, end = "\n\n")

print("original text length:", len(va_text), "length after tokenization:", len(tokens))
print("generated pairs:", tokenizer.new_merged_indexes)

# Compare against the already-decoded text instead of encoding/decoding a second time,
# and fail loudly so a broken round trip cannot slip through a "Run All".
round_trip_ok = va_text == new_text
print("validation results:", round_trip_ok)
assert round_trip_ok, "tokenizer round trip did not reproduce the validation text"

original text: The library was nearly empty, its tall shelves stretching into dimly lit aisles that smelled faintly of old paper and dust. A single desk lamp glowed in the corner where a student scribbled furiously into a notebook, the sound of pen scratching mixing with the occasional creak of the wooden floor. Outside, rain tapped steadily against the windows, a soft percussion that made the silence inside even heavier. Somewhere deep in the stacks, a book thudded shut, as if reminding anyone listening that the night was far from over.
tokens extracted: [84, 104, 256, 108, 105, 98, 114, 97, 114, 121, 32, 119, 97, 261, 110, 101, 97, 114, 108, 121, 32, 101, 109, 112, 116, 121, 269, 313, 115, 257, 97, 108, 108, 278, 104, 290, 118, 101, 261, 265, 114, 282, 99, 104, 270, 259, 116, 111, 32, 277, 109, 108, 121, 32, 108, 105, 287, 97, 105, 115, 302, 115, 258, 276, 278, 109, 290, 108, 264, 32, 304, 259, 116, 108, 121, 307, 111, 108, 100, 32, 112, 97, 112, 274, 32, 300, 32, 100, 117, 265, 275,

In [4]:
########################################################################################################################
## -- loading the pretrained tokenizer, and validating encoder and decoder performance -- ##############################
########################################################################################################################
# Start from an empty tokenizer and restore its state from disk.
# NOTE(review): the ".pkl" extension suggests load_state() unpickles the file -- if so,
# only load state files from a trusted source; confirm in calm.tokenization.
tokenizer = tokenization.Tokenizer()
tokenizer.load_state("../data/vocab/tokenizer_state.pkl")

# Run the validation paragraph through encoder() and decoder() exactly once;
# the decoded text is reused for the round-trip check at the end of the cell.
print("original text:", va_text)
tokens = tokenizer.encoder(va_text)
print("tokens extracted:", tokens)
new_text = tokenizer.decoder(tokens)
print("decoded tokens:", new_text, end = "\n\n")

print("original text length:", len(va_text), "length after tokenization:", len(tokens))
print("generated pairs:", tokenizer.new_merged_indexes)

# Compare against the already-decoded text instead of encoding/decoding a second time,
# and fail loudly so a broken round trip cannot slip through a "Run All".
round_trip_ok = va_text == new_text
print("validation results:", round_trip_ok)
assert round_trip_ok, "loaded tokenizer round trip did not reproduce the validation text"

original text: The library was nearly empty, its tall shelves stretching into dimly lit aisles that smelled faintly of old paper and dust. A single desk lamp glowed in the corner where a student scribbled furiously into a notebook, the sound of pen scratching mixing with the occasional creak of the wooden floor. Outside, rain tapped steadily against the windows, a soft percussion that made the silence inside even heavier. Somewhere deep in the stacks, a book thudded shut, as if reminding anyone listening that the night was far from over.
tokens extracted: [84, 104, 256, 108, 105, 98, 114, 97, 114, 121, 32, 119, 97, 261, 110, 101, 97, 114, 108, 121, 32, 101, 109, 112, 116, 121, 269, 313, 115, 257, 97, 108, 108, 278, 104, 290, 118, 101, 261, 265, 114, 282, 99, 104, 270, 259, 116, 111, 32, 277, 109, 108, 121, 32, 108, 105, 287, 97, 105, 115, 302, 115, 258, 276, 278, 109, 290, 108, 264, 32, 304, 259, 116, 108, 121, 307, 111, 108, 100, 32, 112, 97, 112, 274, 32, 300, 32, 100, 117, 265, 275,