### Get File Size

In [37]:
from pathlib import Path

# Corpus file used by every chunking experiment in this notebook.
file_path = "boericke_full_text.txt"
path = Path(file_path)

# On-disk size in bytes, then the same figure scaled by powers of 1024.
size_bytes = path.stat().st_size
size_kb, size_mb = size_bytes / 1024, size_bytes / (1024 ** 2)

print(f"Size in MB: {size_mb:.3f}")


In [38]:
from time import time

In [39]:
# NOTE: this rebinds the name `print` to rich's print, so every `print(...)`
# call in cells executed after this one goes through Rich.
from rich import print
from rich.console import Console

# Console with a fixed width of 140 columns; later cells call `console.print`
# on it to preview chunks.
console = Console(width=140)

### Read and Load File Contents

In [40]:
# Load the entire file into memory as one UTF-8 decoded string.
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Sanity check: show only the first 100 characters rather than the whole text.
print(text[:100])

# Fixed-Size Chunking

## 1. Word-based

### Custom Code

In [41]:
def fixed_size_word_chunker(
    input_text: str,
    chunk_size: int,
    overlap: int = 0,
) -> list:
    """
    Split text into fixed-size word chunks, with optional overlap.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    delimiter) and each chunk is its words re-joined with single spaces, so
    the original spacing and line breaks are not preserved.

    Args:
        input_text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        List[str]: Word-based chunks. Empty when the text contains no words.
        The last chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``overlap`` is
            negative, or ``overlap`` is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        # A negative overlap would make the step larger than the window and
        # silently drop the words that fall between two windows.
        raise ValueError("overlap must be non-negative")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = input_text.split()
    step = chunk_size - overlap
    chunks = []

    for window_start in range(0, len(words), step):
        window_end = window_start + chunk_size
        chunks.append(" ".join(words[window_start:window_end]))
        # Once a window reaches the last word, any further window would only
        # repeat overlap words already present in this chunk.
        if window_end >= len(words):
            break

    return chunks


In [42]:
# Time only the chunking call itself; reporting happens afterwards.
start = time()
chunks = fixed_size_word_chunker(text, chunk_size=512, overlap=100)
end = time()

print(f"Time taken to chunk with custom code: {(end - start):.3f} s")
print(f"Total no of chunks: {len(chunks)}")

# Preview the first three chunks, numbered from 1.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")


## 2. Character-based

### Custom Code

In [43]:
def fixed_size_char_chunker(
    input_text: str,
    chunk_size: int,
    overlap: int = 0,
) -> list:
    """
    Split text into fixed-size character chunks, with optional overlap.

    Chunks are plain slices of ``input_text``, so whitespace and line breaks
    are kept exactly as they appear and words may be cut mid-way.

    Args:
        input_text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            be non-negative and smaller than ``chunk_size``.

    Returns:
        List[str]: Character-based chunks. Empty when ``input_text`` is empty.
        The last chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``overlap`` is
            negative, or ``overlap`` is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        # A negative overlap would make the step larger than the window and
        # silently drop the characters that fall between two windows.
        raise ValueError("overlap must be non-negative")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(input_text)
    chunks = []

    for window_start in range(0, text_length, step):
        window_end = window_start + chunk_size
        chunks.append(input_text[window_start:window_end])
        # Once a window reaches the end of the text, any further window would
        # only repeat overlap characters already present in this chunk.
        if window_end >= text_length:
            break

    return chunks


In [44]:
# Time only the chunking call itself; reporting happens afterwards.
start = time()
chunks = fixed_size_char_chunker(text, chunk_size=512, overlap=100)
end = time()

print(f"Total no of chunks: {len(chunks)}")
print(f"Time taken to chunk with custom code: {(end - start):.3f} s")

# Preview the first three chunks, numbered from 1.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")

### LangChain

In [45]:
from langchain_text_splitters import CharacterTextSplitter

# An empty separator asks the splitter to work at the character level instead
# of splitting on a delimiter; sizes below are therefore in characters.
splitter = CharacterTextSplitter(
    separator="",
    chunk_size=512,
    chunk_overlap=100,
)

# Time only the split itself, not the splitter construction.
start = time()
chunks = splitter.split_text(text)
end = time()

# Label the timing with the library actually being measured (this cell does
# not run the custom chunker).
print(f"Time taken to chunk with LangChain CharacterTextSplitter: {(end - start):.3f} s")
print(f"Total no of chunks: {len(chunks)}")


In [46]:
# Preview the first three chunks, numbered from 1.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")

## 3. Token-Based

### LangChain

In [47]:
from langchain_text_splitters import TokenTextSplitter

# Sizes are measured in tokens of the named encoding, not in characters.
splitter = TokenTextSplitter(
    chunk_size=256,
    chunk_overlap=50,
    encoding_name="gpt2",  # tokenizer for GPT-2/3 style
)

# Time only the split itself, not the splitter construction.
start = time()
chunks = splitter.split_text(text)
end = time()

# Label the timing with the library actually being measured (this cell does
# not run the custom chunker).
print(f"Time taken to chunk with LangChain TokenTextSplitter: {(end - start):.3f} s")
print(f"Total no of chunks: {len(chunks)}")

In [48]:
# Preview the first three chunks, numbered from 1.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")

### Llama-Index

In [49]:
import tiktoken
# Encoding object obtained from tiktoken under the name "gpt2"; its `encode`
# method is passed as the tokenizer to the LlamaIndex splitter in the next cell.
encoding = tiktoken.get_encoding("gpt2")  # GPT-2 style tokenizer


In [50]:
# NOTE: this import rebinds the name `TokenTextSplitter`, which the LangChain
# token cell earlier imports from `langchain_text_splitters`. Each cell
# re-imports the class it needs, so re-run the matching cell before reusing
# either splitter.
from llama_index.core.text_splitter import TokenTextSplitter

# Token counts are computed with the tiktoken `encoding` created in the
# previous cell.
splitter = TokenTextSplitter(
    chunk_size=256,
    chunk_overlap=50,
    tokenizer=encoding.encode,
)

# Time only the split itself, not the splitter construction.
start = time()
chunks = splitter.split_text(text)
end = time()

# Label the timing with the library actually being measured (this cell does
# not run the custom chunker).
print(f"Time taken to chunk with LlamaIndex TokenTextSplitter: {(end - start):.3f} s")
print(f"Total no of chunks: {len(chunks)}")


In [51]:
# Preview the first three chunks, numbered from 1.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")

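Note: under the hood, LangChain and LlamaIndex may handle chunk overlap and chunk boundaries differently, so their outputs are not guaranteed to match chunk for chunk.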

### Custom Code Using tiktoken

In [52]:
import tiktoken

encoding = tiktoken.get_encoding("gpt2")

# Window size and overlap, both measured in tokens.
chunk_size = 256
overlap = 50

# The timed region covers tokenization as well as slicing.
start = time()
tokens = encoding.encode(text)
step = chunk_size - overlap
# Exclusive upper bound for window starts: a window is only worth starting if
# it still contains tokens the previous window did not cover. Without this
# bound the final chunk can be a pure repeat of the previous chunk's tail.
# The first window is always kept for non-empty input; empty input yields no
# chunks at all.
start_limit = max(len(tokens) - overlap, 1) if tokens else 0
chunks = [tokens[i:i + chunk_size] for i in range(0, start_limit, step)]
end = time()

print(f"Time taken to chunk with custom code: {(end - start):.3f} s")
print(f"Total no of chunks: {len(chunks)}")



In [53]:
# Each chunk here is a list of token ids, so show the ids followed by how many
# tokens the chunk holds.
for number, chunk in enumerate(chunks[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")
    print(len(chunk))

In [54]:
# Turn every token-id chunk back into a string.
# NOTE(review): chunk boundaries are token positions, so a multi-byte character
# split across two chunks may not decode cleanly - confirm against tiktoken's
# `decode` documentation.
chunks_text = list(map(encoding.decode, chunks))

# Preview the first three decoded chunks, numbered from 1.
for number, chunk in enumerate(chunks_text[:3], start=1):
    console.print(f"Chunk {number}:\n{chunk}\n")