### Get File Size

In [1]:
from pathlib import Path

# Locate the corpus on disk; `file_path` is reused later when the file is read.
file_path = "boericke_full_text.txt"
path = Path(file_path)

# Derive each unit from the previous one: bytes -> KB -> MB (factor 1024 each).
size_bytes = path.stat().st_size
size_kb = size_bytes / 1024
size_mb = size_kb / 1024

print(f"Size in MB: {size_mb:.3f}")


Size in MB: 1.241


In [2]:
from time import time

In [3]:
# NOTE: this rebinds the name `print` to rich's implementation, shadowing the
# built-in for every cell executed after this one.
from rich import print
from rich.console import Console

# Console fixed at a width of 140; later cells write through console.print(...).
console = Console(width=140)

### Read and Load File Contents

In [4]:
# Read the entire corpus into memory as one UTF-8 decoded string.
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Sanity check: show only the first 100 characters.
print(text[:100])

# Document Structure Aware

## 1. Sentence-based

### NLTK

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

# Recent NLTK releases (3.9+) load the sentence tokenizer from the "punkt_tab"
# resource instead of the pickled "punkt" models, so sent_tokenize() raises
# LookupError on a fresh environment that only has "punkt". Fetch both so the
# notebook survives Restart & Run All on old and new NLTK versions alike.
nltk.download('punkt_tab', quiet=True)
# Kept as the last expression so the cell still displays the download status.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/srijanshovit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def sentence_based_chunking(text):
    """Split ``text`` into sentences with NLTK and return them as a list.

    Every sentence yielded by ``sent_tokenize`` has its leading and trailing
    whitespace removed before it is added to the result.
    """
    stripped_sentences = []
    for raw_sentence in sent_tokenize(text):
        stripped_sentences.append(raw_sentence.strip())
    return stripped_sentences

In [7]:
# Wall-clock timing of one full NLTK sentence-chunking pass over the corpus.
start = time()
chunks = sentence_based_chunking(text)
end = time()

elapsed = end - start
print(f"Time taken to chunk with custom code: {elapsed:.3f} s")
print(f"Total no of chunks: {len(chunks)}")

In [8]:
# Preview the first five chunks, numbered from 1.
for number, sentence in enumerate(chunks[:5], start=1):
    print(f"Chunk {number}: {sentence}")

### SpaCy

In [9]:
import spacy

# Blank English pipeline whose only component is the sentencizer.
nlp = spacy.blank("en")
# Raise the pipeline's max_length so the whole corpus can be passed in one call.
nlp.max_length = len(text) + 1000
nlp.add_pipe("sentencizer")

start = time()
doc = nlp(text)
sentences = [s.text for s in doc.sents]
end = time()
print(f"Time taken to chunk with custom code: {(end - start):.3f} s")
# Report the spaCy result. The count must come from `sentences`; `chunks` still
# holds the NLTK output from the earlier cell.
print(f"Total no of chunks: {len(sentences)}")



In [10]:
# Preview the spaCy sentences produced in the cell above. The spaCy output is
# stored in `sentences`; iterating `chunks` here would re-print the NLTK result.
for i, chunk in enumerate(sentences[:5]):
    print(f"Chunk {i+1}: {chunk}")

### Llama-Index

##### Sentence Splitter

#### Boundary tokens overlap

In [11]:
from llama_index.core.utils import get_tokenizer

# Tokenizer callable supplied by llama-index; later cells use
# len(tokenizer(...)) to count the tokens in a piece of text.
tokenizer = get_tokenizer()

In [12]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configuration: 256-unit chunks with a 50-unit overlap, splitting on ".".
# NOTE(review): chunk_size / chunk_overlap are presumably measured in tokens —
# confirm against the SentenceSplitter documentation.
splitter = SentenceSplitter(chunk_size=256, chunk_overlap=50, separator=".")

# Time the split. This rebinds `chunks`, replacing the NLTK sentence list.
start = time()
chunks = splitter.split_text(text)
end = time()

elapsed = end - start
print(f"Time taken to chunk with custom code: {elapsed:.3f} s")

In [13]:
print(f"Total no of chunks: {len(chunks)}")

# Show the first three chunks together with their character and token counts.
for number, chunk_text in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk_text}")
    console.print(f"No of chars in the chunk: {len(chunk_text)}")
    console.print(f"No of tokens in the chunk: {len(tokenizer(chunk_text))}")

### SentenceWindowNodeParser

#### Boundary Sentences Overlap

In [14]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.utils import get_tokenizer

# Wrap the raw corpus in a llama-index Document. Note that this rebinds `doc`,
# which previously held the spaCy result of nlp(text).
doc = Document(text=text)


In [15]:
# Passed to SentenceWindowNodeParser.from_defaults(...) and echoed in the
# "WINDOW (±N sentences)" printout when nodes are inspected.
window_size = 1

In [16]:
# Build the sentence-window parser. The "window" metadata key configured here is
# the one read back later via node.metadata["window"].
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=window_size,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)


In [17]:
# Time the conversion of the single Document into nodes.
start = time()
nodes = node_parser.get_nodes_from_documents([doc])
end = time()

elapsed = end - start
print(f"Time taken to chunk with custom code: {elapsed:.3f} s")
print(f"Total no of nodes (sentences): {len(nodes)}")


In [18]:
# Re-fetch the tokenizer and end the cell with the bare name so the notebook
# displays its repr.
tokenizer = get_tokenizer()
tokenizer

functools.partial(<bound method Encoding.encode of <Encoding 'cl100k_base'>>, allowed_special='all')

### What is cl100k_base?

cl100k_base is a Byte Pair Encoding (BPE) tokenizer with a vocabulary of roughly 100k tokens. It is the encoding used by OpenAI models such as GPT-3.5-Turbo and GPT-4; newer models such as GPT-4o use o200k_base instead.

- `cl` → Chat / Completion lineage (an informal reading of the prefix)
- `100k` → vocabulary of ~100,000 tokens
- `base` → base encoding (no chat-specific wrapping like `<|system|>`, `<|user|>`, `<|assistant|>`)

In [19]:


# Inspect three nodes near the end of the document. The slice and the printed
# index share one offset so that "Node i" really is nodes[i]; slicing nodes[:3]
# while numbering from len(nodes) - 4 would label the first three nodes with
# indices that belong to other nodes. max(..., 0) keeps the offset valid when
# there are fewer than four nodes.
first_index = max(len(nodes) - 4, 0)
for i, node in enumerate(nodes[first_index:first_index + 3], start=first_index):
    center = node.text
    window = node.metadata["window"]

    print(f"Node {i}")
    print("CENTER SENTENCE:")
    print(center)

    print(f"WINDOW (±{window_size} sentences):")
    print(window)

    # Token counts come from the llama-index tokenizer fetched earlier.
    print("Stats:")
    print(f"tokens (center): {len(tokenizer(center))}; tokens (window): {len(tokenizer(window))}")

