In [None]:
from typing import Dict, List, Callable, Optional
import uuid

In [None]:
# document class from previous module
class Document:
    def __init__(self, pageContent: str, metadata: dict, id: str):
        self.pageContent = pageContent  # raw text from pdf
        self.metadata: dict = metadata  # pdf metadata (author, date ...)
        self.id = id  # pdf id

    def __repr__(self):
        meta_repr = {k: v for k, v in self.metadata.items() if k not in ('full_text',)}
        return f"Document(id={self.id!r}, chunk={self.metadata.get('chunk')}, total={self.metadata.get('totalChunks')}, len={len(self.pageContent)})"

In [None]:
class TextSplitter:
    def __init__(self, chunkSize: int, chunkOverlap: int, lengthfunction: Optional[Callable[[str], int]] = None, keepSeperator:bool = False):
        self.chunkSize = chunkSize
        self.chunkOverlap = chunkOverlap
        self.lengthfunction = lengthfunction or (lambda t: len(t))
        self.keepSeperator = keepSeperator

        if chunkOverlap > chunkSize:
            raise ValueError(f"chunkOverlap cannot be greater than chunkSize: {chunkSize} < {chunkOverlap}")
        
    def splitText(self, text: str):
        # forbidden, can only be implemented by the sub classes
        raise NotImplementedError()

    def splitDocuments(self, documents: List[Document]) -> List[Document]:
        """Split a list of Documents into chunks of Documents"""
        chunks: List[Document] = []
        for doc in documents:
            split_doc = self.createDocuments([doc.pageContent], [doc.metadata], source_id = doc.id)
            chunks.append(split_doc)
        return chunks # chunks of documents

    def createDocuments(self, texts: List[str], metadatas: Optional[List[Dict]] = [], source_id: Optional[str] = None) -> List[Document]:
        metadatas = metadatas or [{}]
        doucments: List[Document] = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if i < len(metadatas) else {}
            chunks = self.splitText(text)
            total = len(chunks)

            for j, chunk in enumerate(chunks):
                chunk_id = f"{source_id or metadata.get('id', str(uuid.uuid4()))}_chunk_{j}"
                doc_meta = {**metadata, "chunk": j, "totalChunk": total}
                doucments.append(Document(chunk, doc_meta, chunk_id))

        return doucments
    
    def mergeSplits(self, splits: List[str], separator: str) -> List[str]:
        """Merge splits of text into chunks of size <= chunkSize"""
        chunks: List[str] = []
        current: List[str] = []
        length = 0

        for split in splits:
            split_length = self.lengthfunction(split)
            extra_length = len(separator) if current else 0
            if length + split_length + extra_length >= self.chunkSize:
                if current:
                    joined = self.joinSplits(current, separator)
                    if joined:
                        chunks.append(joined)
                while length > self.chunkOverlap and current:
                    removed = current.pop(0)
                    length -= (self.lengthfunction(removed) + (len(separator) if current else 0))
            current.append(split)
            length += split_length + (len(separator) if len(current) > 1 else 0)
        if current:
            joined = self.joinSplits(current, separator)
            if joined:
                chunks.append(joined)
        return [c for c in chunks if c]
        

    def joinSplits(self, splits: List[str], separator: str) -> Optional[str]:
        """Join the temp splits"""
        text = separator.join(splits).strip()
        return text if text else None
        
        

In [None]:
class RecursiveCharacterTextSplitter(TextSplitter):
    def __init__(self, separators: Optional[List[str]] = None,
                 chunkSize: int = 1000,
                 chunkOverlap: int = 200,
                 lengthfunction: Optional[Callable[[str], int]] = None,
                 keepSeparator: bool = False):
        if separators is None:
            separators = ["\n\n", "\n", ". ", " ", ""]
        super().__init__(chunkSize=chunkSize, chunkOverlap=chunkOverlap, lengthfunction=lengthfunction, keepSeperator=keepSeparator)
        self.separators =separators

    def splitText(self, text: str) -> List[str]:
        """Recursively split texts based on the next separator from the list of separators"""
        final_chunk: List[str] = []
        separator = self.separators[-1]
        next_separator: List[str] = []

        for i, sep in enumerate(self.separators):
            if sep and sep in text:
                separator = sep
                next_separator = self.separators[i+1:]
                break

            if sep == "" and sep == "":
                separator = ""
                next_separator = []

        splits = [s for s in text.split(separator) if s is not None and (s.strip() if separator else s != "")]
        temp: List[str] = []

        for s in splits:
            if self.lengthfunction(s) <= self.chunkSize:
                temp.append(s)
            else:
                if temp:
                    final_chunk.extend(self.mergeSplits(temp, separator))
                    temp = []
                if not separator:
                    final_chunk.append(s)
                else:
                    recursive = RecursiveCharacterTextSplitter(separators=next_separator,
                                                               chunkSize=self.chunkSize,
                                                               chunkOverlap=self.chunkOverlap,
                                                               lengthfunction=self.lengthfunction,
                                                               keepSeparator=self.keepSeperator)
                    final_chunk.extend(recursive.splitText(s))
                
        if temp:
            final_chunk.extend(self.mergeSplits(temp, separator))
        return final_chunk
        

In [23]:
sample_text = (
    "This is the first paragraph. It has several sentences. Each sentence helps demonstrate the splitter.\n\n"
    "Second paragraph here. It is shorter, but still useful."
)
pdf_doc = Document(pageContent=sample_text, metadata={"source": "sample.pdf", "author": "tester"}, id="pdf1")

splitter = RecursiveCharacterTextSplitter(chunkSize=50, chunkOverlap=10)
chunked_docs = splitter.splitDocuments([pdf_doc])

print("RESULT: created", sum(len(docs) for docs in chunked_docs), "documents (chunks)\n")
for docs in chunked_docs:
    for d in docs:
        print(d)
        print("-> content preview:", repr(d.pageContent))
        print("-> metadata:", d.metadata)
        print("---")

"""print("\n-- CharacterTextSplitter demo --")
csplitter = CharacterTextSplitter(chunk_size=80, chunk_overlap=20, separator="\n\n")
cchunks = csplitter.split_documents([pdf_doc])
print("Character splitter produced", len(cchunks), "chunks. First chunk preview:")
print(cchunks[0].pageContent[:200])

print("\n-- TokenTextSplitter demo --")
tsplitter = TokenTextSplitter(chunk_size=100, chunk_overlap=20)
tchunks = tsplitter.split_documents([pdf_doc])
print("Token splitter produced", len(tchunks), "chunks. First chunk approx tokens:", tsplitter.length_function(tchunks[0].pageContent))"""

RESULT: created 5 documents (chunks)

Document(id='pdf1_chunk_0', chunk=0, total=None, len=27)
-> content preview: 'This is the first paragraph'
-> metadata: {'source': 'sample.pdf', 'author': 'tester', 'chunk': 0, 'totalChunk': 5}
---
Document(id='pdf1_chunk_1', chunk=1, total=None, len=24)
-> content preview: 'It has several sentences'
-> metadata: {'source': 'sample.pdf', 'author': 'tester', 'chunk': 1, 'totalChunk': 5}
---
Document(id='pdf1_chunk_2', chunk=2, total=None, len=45)
-> content preview: 'Each sentence helps demonstrate the splitter.'
-> metadata: {'source': 'sample.pdf', 'author': 'tester', 'chunk': 2, 'totalChunk': 5}
---
Document(id='pdf1_chunk_3', chunk=3, total=None, len=21)
-> content preview: 'Second paragraph here'
-> metadata: {'source': 'sample.pdf', 'author': 'tester', 'chunk': 3, 'totalChunk': 5}
---
Document(id='pdf1_chunk_4', chunk=4, total=None, len=32)
-> content preview: 'It is shorter, but still useful.'
-> metadata: {'source': 'sample.pdf', 'author': '

'print("\n-- CharacterTextSplitter demo --")\ncsplitter = CharacterTextSplitter(chunk_size=80, chunk_overlap=20, separator="\n\n")\ncchunks = csplitter.split_documents([pdf_doc])\nprint("Character splitter produced", len(cchunks), "chunks. First chunk preview:")\nprint(cchunks[0].pageContent[:200])\n\nprint("\n-- TokenTextSplitter demo --")\ntsplitter = TokenTextSplitter(chunk_size=100, chunk_overlap=20)\ntchunks = tsplitter.split_documents([pdf_doc])\nprint("Token splitter produced", len(tchunks), "chunks. First chunk approx tokens:", tsplitter.length_function(tchunks[0].pageContent))'