# Notebook on Document Splitting

In [None]:
# Import the libraries required to interact with the documents

import os
import sys
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredMarkdownLoader,
)
from langchain_core.documents import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter


import requests
import logging
from typing import List, Dict, Any, Optional, Literal
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json # Added for parsing LLM response if needed

from pprint import pprint

In [42]:
# Markdown source files used throughout the notebook.
file_paths = (
    "./data/quantum_computing.md",
    "./data/quantum_physics.md",
)

In [76]:
# Configure logging for the notebook (INFO level, timestamped records) and
# create the logger used by every cell below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Prompt template asking the LLM for a 1-2 sentence description of where a chunk
# sits inside its document. The two placeholders, {full_doc_content} and
# {chunk_content}, are filled in with str.format() before the prompt is sent.
CONTEXTUAL_PROMPT_TEMPLATE = """Given the following document:
<document>
{full_doc_content}
</document>

And this specific chunk from the document:
<chunk>
{chunk_content}
</chunk>

Provide a short, succinct context (1-2 sentences) that helps understand where this chunk fits within the larger document. This context will be used to improve search retrieval for the chunk. Focus on the chunk's relationship to surrounding information or its main topic within the document's scope. Respond *only* with the context itself, nothing else.
"""

# Custom Exception
class LLMError(Exception):
    """Raised when a call to the LLM API fails or its response cannot be used."""

class SemanticSplitter:
    """
    Splits documents into semantic chunks using an LLM, with a fallback mechanism.

    Each document's text is POSTed to an HTTP LLM endpoint together with a system
    prompt asking for chunks separated by ``llm_chunk_separator``. When the call
    fails, or the reply is empty or does not contain the separator, the document
    is split with a ``RecursiveCharacterTextSplitter`` instead.

    Every produced chunk is a ``Document`` carrying a copy of the source
    document's metadata plus two extra keys: ``chunk_number`` (1-based) and
    ``split_method`` (``"llm"`` or ``"recursive"``).
    """
    DEFAULT_SYSTEM_PROMPT = (
        "You are an expert text analyzer. Your task is to segment the provided text "
        "into semantically coherent chunks. Each chunk should represent a distinct idea or topic. "
        "Aim for chunks around {target_size} {size_unit} long, but prioritize semantic boundaries "
        "over strict size adherence. Output *only* the chunks separated by the delimiter '{separator}'. "
        "Try to stick with the document's original structure and meaning. "
        "Sometimes the document may contain tabular data but the structure might be lost in the text. "
        "In such cases, please try to keep the table data as much as possible. and add context about the table and table data in single chunk. "
        "The document may contain technical jargon or specialized terms. "
        "If you encounter any such terms, please try to keep them in the same chunk. "
    )

    DEFAULT_SUMMARY_PROMPT = CONTEXTUAL_PROMPT_TEMPLATE

    def __init__(
        self,
        llm_endpoint_url: str,
        llm_headers: Dict[str, str],
        target_chunk_size: int = 125, # Default size in tokens
        size_unit: Literal['tokens', 'characters'] = 'tokens', # Note: Token count is approximate unless tokenizer is matched
        llm_chunk_separator: str = "\n---CHUNK_SEPARATOR---\n",
        llm_request_timeout: int = 60, # seconds
        fallback_chunk_size: int = 650,
        fallback_chunk_overlap: int = 50,
        add_context: bool = False, # Flag to enable summarization
    ):
        """
        Initializes the SemanticSplitter.

        Args:
            llm_endpoint_url: The URL for the LLM API endpoint.
            llm_headers: Headers for the LLM API request (copied, not stored by reference).
            target_chunk_size: Desired approximate size for LLM-generated chunks.
                Only interpolated into the system prompt; it is not enforced in code.
            size_unit: Unit for target_chunk_size ('tokens' or 'characters').
            llm_chunk_separator: A unique string used to separate chunks in the LLM output.
            llm_request_timeout: Timeout for LLM API calls in seconds.
            fallback_chunk_size: Chunk size for the RecursiveCharacterTextSplitter fallback.
            fallback_chunk_overlap: Chunk overlap for the fallback splitter.
            add_context: Whether to request a per-chunk context summary from the LLM
                for LLM-generated chunks (requires one extra LLM call per chunk).
        """
        self.llm_endpoint_url = llm_endpoint_url
        # Copy so later changes to the caller's dict do not affect this instance.
        self.llm_headers = llm_headers.copy()

        self.target_chunk_size = target_chunk_size
        self.size_unit = size_unit
        self.llm_chunk_separator = llm_chunk_separator
        self.llm_request_timeout = llm_request_timeout
        self.add_context = add_context

        self.fallback_splitter = RecursiveCharacterTextSplitter(
            chunk_size=fallback_chunk_size,
            chunk_overlap=fallback_chunk_overlap,
            separators=["\n\n", "\n", " ", ".", ""], # Added more separators for flexibility
        )
        logger.info(f"SemanticSplitter initialized. LLM Endpoint: {self.llm_endpoint_url}, Target Size: {target_chunk_size} {size_unit}, Fallback Size: {fallback_chunk_size}")

    def _call_llm_api(self, payload: Dict[str, Any]) -> str:
        """
        POSTs ``payload`` as JSON to the LLM endpoint and returns the generated text.

        The reply is expected to be a JSON object whose ``"response"`` key holds
        the generated string; adjust this if your API uses a different shape.

        Args:
            payload: JSON-serialisable request body.

        Returns:
            The generated text with surrounding whitespace stripped.

        Raises:
            LLMError: On timeout, any other request failure (including HTTP 4xx/5xx),
                an undecodable JSON body, a missing ``"response"`` key, or a
                non-string ``"response"`` value. The original exception, when there
                is one, is attached as ``__cause__``.
        """
        # Keep the try body limited to the calls that can actually raise
        # transport or decoding errors.
        try:
            response = requests.post(
                self.llm_endpoint_url,
                headers=self.llm_headers,
                json=payload,
                timeout=self.llm_request_timeout
            )
            response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
            response_data = response.json()
        except requests.exceptions.Timeout as e:
            logger.error(f"LLM API call timed out after {self.llm_request_timeout} seconds.")
            raise LLMError("LLM API call timed out.") from e
        except requests.exceptions.RequestException as e:
            logger.error(f"LLM API request failed: {e}")
            raise LLMError(f"LLM API request failed: {e}") from e
        except (ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError (its subclass) as well as
            # decoders whose error type only derives from ValueError.
            logger.error(f"Failed to parse LLM JSON response or access expected key: {e}")
            raise LLMError(f"Failed to parse LLM JSON response: {e}") from e

        # A JSON array/scalar is as unusable as an object without the key.
        if not isinstance(response_data, dict) or "response" not in response_data:
            raise LLMError(f"Could not extract generated text from LLM response: {response_data}")

        generated_text = response_data["response"]
        if not isinstance(generated_text, str):
            raise LLMError(f"LLM response content is not a string: {type(generated_text)}")

        return generated_text.strip()

    def _split_with_llm(self, doc: Document) -> List[Document]:
        """
        Attempts to split a single document using the LLM.

        Args:
            doc: The document to split.

        Returns:
            The chunk Documents (``split_method="llm"``), or an empty list when the
            document has no non-whitespace content.

        Raises:
            LLMError: If the LLM API call fails.
            ValueError: If the reply is empty, lacks the separator, or yields no
                non-empty chunks.
        """
        source = doc.metadata.get('source', 'N/A')
        page = doc.metadata.get('page', 'N/A')
        logger.info(f"Attempting LLM splitting for source: {source}, page: {page}")

        page_content = doc.page_content
        if not page_content or page_content.isspace():
            logger.warning(f"Skipping empty page content for source: {source}, page: {page}")
            return []

        system_prompt = self.DEFAULT_SYSTEM_PROMPT.format(
            target_size=self.target_chunk_size,
            size_unit=self.size_unit,
            separator=self.llm_chunk_separator
        )

        payload = {
            "prompt": page_content,
            "system_prompt": system_prompt,
            "kwargs": {} # Add any specific LLM params here if needed
        }

        try:
            generated_text = self._call_llm_api(payload)

            if not generated_text or self.llm_chunk_separator not in generated_text:
                # Basic validation: LLM didn't follow instructions or produced empty result.
                # Log the generated text for debugging.
                logger.warning(f"LLm generated text: {generated_text}")
                logger.warning(f"LLM output validation failed for source: {source}, page: {page}. Output didn't contain separator or was empty.")
                raise ValueError("Invalid LLM response format (separator missing or empty).")

            # Drop the empty strings that split() yields around leading/trailing separators.
            raw_chunks = [
                piece.strip()
                for piece in generated_text.split(self.llm_chunk_separator)
                if piece.strip()
            ]

            if not raw_chunks:
                logger.warning(f"LLM output resulted in zero valid chunks after splitting for source: {source}, page: {page}.")
                raise ValueError("LLM output resulted in zero valid chunks.")

            logger.info(f"LLM successfully generated {len(raw_chunks)} raw chunks.")
            return self._post_process_chunks(raw_chunks, doc, split_method="llm")

        except (LLMError, ValueError) as e:
            logger.warning(f"LLM splitting failed for source: {source}, page: {page}. Reason: {e}")
            # Re-raise unchanged (traceback preserved) so the caller can fall back.
            raise

    def _split_with_fallback(self, doc: Document) -> List[Document]:
        """
        Splits a single document using the fallback RecursiveCharacterTextSplitter.

        No LLM call is made here, so chunks produced by this path never receive
        a context summary, even when ``add_context`` is enabled.

        Args:
            doc: The document to split.

        Returns:
            The chunk Documents, each tagged with ``split_method="recursive"``.
        """
        source = doc.metadata.get('source', 'N/A')
        page = doc.metadata.get('page', 'N/A')
        logger.warning(f"Using fallback RecursiveCharacterTextSplitter for source: {source}, page: {page}")

        fallback_chunks_text = self.fallback_splitter.split_text(doc.page_content)

        # Each chunk gets its own metadata dict: original metadata plus the chunk tags.
        processed_chunks = [
            Document(
                page_content=chunk_text,
                metadata={
                    **doc.metadata,
                    "chunk_number": number,
                    "split_method": "recursive",
                },
            )
            for number, chunk_text in enumerate(fallback_chunks_text, start=1)
        ]

        logger.info(f"Fallback splitter created {len(processed_chunks)} chunks.")
        return processed_chunks

    def _summarize_chunk(self, chunk_text: str, doc_content: str) -> Optional[str]:
        """
        Asks the LLM for a short context summary of one chunk.

        Args:
            chunk_text: The chunk to describe.
            doc_content: Full text of the document the chunk came from.

        Returns:
            The summary text, or None when the chunk is empty/whitespace or the
            LLM call fails (summary failure is logged, never raised).
        """
        if not chunk_text or chunk_text.isspace():
            return None

        logger.debug(f"Requesting summary for chunk: {chunk_text[:100]}...") # Log first 100 chars

        summary_prompt = self.DEFAULT_SUMMARY_PROMPT.format(full_doc_content=doc_content, chunk_content=chunk_text)
        payload = {
            "prompt": summary_prompt,
            "kwargs": {}
        }

        try:
            summary = self._call_llm_api(payload)
        except LLMError as e:
            logger.warning(f"Failed to generate summary for chunk: {e}")
            return None # Summaries are best-effort; the chunk is kept without one.

        logger.debug(f"Received summary: {summary}")
        return summary

    def _post_process_chunks(self, raw_chunks: List[str], original_doc: Document, split_method: str) -> List[Document]:
        """
        Formats raw text chunks into Document objects with metadata.

        Empty or whitespace-only chunks are skipped. ``chunk_number`` counts only
        the chunks that are kept, so numbering is contiguous starting at 1. When
        ``add_context`` is enabled and a summary is obtained, it is stored under
        the ``"summary"`` metadata key and also appended to the chunk text.

        Args:
            raw_chunks: Chunk texts in document order.
            original_doc: The document the chunks were produced from.
            split_method: Value stored under the ``"split_method"`` metadata key.

        Returns:
            The chunk Documents in the same order as ``raw_chunks``.
        """
        processed_chunks = []

        for chunk_text in raw_chunks:
            if not chunk_text or chunk_text.isspace():
                continue # Skip empty chunks just in case

            chunk_metadata = original_doc.metadata.copy() # Start with original metadata
            chunk_metadata.update({
                "chunk_number": len(processed_chunks) + 1,
                "split_method": split_method,
            })

            if self.add_context:
                summary = self._summarize_chunk(chunk_text, original_doc.page_content)
                if summary:
                    chunk_metadata["summary"] = summary
                    chunk_text += f"\n\nContext of above chunk based on whole document: {summary}" # Append summary to chunk text

            processed_chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))

        return processed_chunks

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Splits a list of documents using the LLM semantic splitter with fallback.

        Documents with empty or whitespace-only content are skipped. A document
        whose processing raises an unexpected error is logged with its traceback
        and skipped; the remaining documents are still processed.

        Args:
            documents: A list of LangChain Document objects (e.g., from loader.load()).

        Returns:
            A list of new Document objects, each representing a chunk.
        """
        all_chunks = []
        total_docs = len(documents)
        for i, doc in enumerate(documents):
            logger.info(f"Processing document {i+1}/{total_docs}: source={doc.metadata.get('source', 'N/A')}, page={doc.metadata.get('page', 'N/A')}")
            try:
                # Always try the LLM first for any non-empty document.
                if not doc.page_content or doc.page_content.isspace():
                    logger.info(f"Skipping document {i+1}/{total_docs} due to empty content.")
                    continue

                doc_chunks = self._split_single_document(doc) # LLM first, fallback on error
                all_chunks.extend(doc_chunks)
                logger.info(f"Finished processing document {i+1}/{total_docs}. Generated {len(doc_chunks)} chunks.")

            except Exception as e:
                # Per-document boundary: one bad document must not abort the whole batch.
                logger.error(f"Unexpected error processing document {i+1}/{total_docs} (source: {doc.metadata.get('source', 'N/A')}). Skipping this document. Error: {e}", exc_info=True)

        logger.info(f"Completed splitting. Total chunks generated: {len(all_chunks)}")
        return all_chunks

    def _split_single_document(self, doc: Document) -> List[Document]:
        """
        Tries the LLM split for one document and falls back on any error.

        Args:
            doc: The document to split.

        Returns:
            LLM-generated chunks when the LLM path succeeds, otherwise the chunks
            from the fallback splitter.
        """
        try:
            # No size check on the LLM output; it is accepted as returned.
            return self._split_with_llm(doc)
        except (LLMError, ValueError, requests.exceptions.RequestException) as e:
            # Details were logged where the failure happened; record the fallback here.
            logger.warning(f"LLM splitting failed or produced invalid output for doc (source: {doc.metadata.get('source', 'N/A')}, page: {doc.metadata.get('page', 'N/A')}). Activating fallback splitter. Reason: {e}")
            return self._split_with_fallback(doc)
        except Exception as e:
            # Anything else is unexpected: keep the traceback, but still fall back.
            logger.error(f"Unexpected error during LLM splitting attempt for doc (source: {doc.metadata.get('source', 'N/A')}). Error: {e}", exc_info=True)
            logger.warning("Activating fallback splitter due to unexpected error.")
            return self._split_with_fallback(doc)

In [77]:
# Local LLM service settings used to build the splitter.
LLM_ENDPOINT = "http://localhost:8000/api/llm/generate_response"
# Both headers carry the same JSON media type.
LLM_HEADERS = dict.fromkeys(("accept", "Content-Type"), "application/json")
TIMEOUT = 50 * 60  # seconds

In [78]:
# Build the splitter for the local endpoint; only the request timeout is
# overridden, every other option keeps its default.
splitter = SemanticSplitter(LLM_ENDPOINT, LLM_HEADERS, llm_request_timeout=TIMEOUT)

2025-03-31 01:49:39,320 - INFO - SemanticSplitter initialized. LLM Endpoint: http://localhost:8000/api/llm/generate_response, Target Size: 125 tokens, Fallback Size: 650


In [79]:
# loading docs using the langchain document loaders
#docs = PyMuPDFLoader(file_paths[0]).load()
# function for loading documents
def load_documents(file_path: str) -> List[Document]:
    """
    Load a file with the loader that matches its extension.

    The extension is compared case-insensitively: ``.pdf`` files go through
    ``PyMuPDFLoader`` and ``.md`` files through ``UnstructuredMarkdownLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents returned by the selected loader's ``load()``.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.md``.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # extension -> (label used in the log line, loader class)
    loaders_by_suffix = {
        '.pdf': ("PDF", PyMuPDFLoader),
        '.md': ("Markdown", UnstructuredMarkdownLoader),
    }
    if suffix not in loaders_by_suffix:
        raise ValueError(f"Unsupported file extension: {suffix}")

    label, loader_cls = loaders_by_suffix[suffix]
    logger.info(f"Loading {label} file: {file_path}")

    loaded = loader_cls(file_path).load()
    logger.info(f"Loaded {len(loaded)} documents from {file_path}.")
    return loaded

In [80]:
# Load the first configured file; the result is a list of Document objects.
docs = load_documents(file_paths[0])

2025-03-31 01:49:39,691 - INFO - Loading Markdown file: ./data/quantum_computing.md
2025-03-31 01:49:39,711 - INFO - Loaded 1 documents from ./data/quantum_computing.md.


In [81]:
# Show the raw text of the first loaded document.
print(docs[0].page_content)

Quantum Computing: Harnessing the Power of the Quantum Realm

Quantum computing is an emerging field that leverages the principles of quantum mechanics to solve complex problems that are intractable for classical computers. By exploiting phenomena like superposition and entanglement, quantum computers have the potential to revolutionize various industries, from medicine and materials science to finance and artificial intelligence.

The Limitations of Classical Computing

Classical computers, the workhorses of our digital age, store and process information as bits, which can exist in one of two states: 0 or 1. These bits are the fundamental units of information. While classical computers have made tremendous progress, they face fundamental limitations when dealing with certain types of problems, particularly those involving:

Exponential Complexity: Many real-world problems, such as simulating large molecules, factoring large numbers, or optimizing complex systems, exhibit exponential c

In [82]:
# Baseline for comparison: a plain character splitter configured with the same
# chunk size, overlap and separators as SemanticSplitter's fallback defaults.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=650,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ".", ""], # Added more separators for flexibility
)

splitted_docs = text_splitter.split_documents(docs)

print(f"Splitted {len(docs)} documents into {len(splitted_docs)} chunks.")
print(f"First chunk: {splitted_docs[0].page_content[:1000]}...") # Print up to the first 1000 chars of the first chunk

Splitted 1 documents into 18 chunks.
First chunk: Quantum Computing: Harnessing the Power of the Quantum Realm

Quantum computing is an emerging field that leverages the principles of quantum mechanics to solve complex problems that are intractable for classical computers. By exploiting phenomena like superposition and entanglement, quantum computers have the potential to revolutionize various industries, from medicine and materials science to finance and artificial intelligence.

The Limitations of Classical Computing...


In [84]:
# Run the semantic splitter over the loaded documents and report what came back.
try:
    semantic_chunks = splitter.split_documents(docs)

    if not semantic_chunks:
        print("\nNo chunks were generated.")
    else:
        first_chunk = semantic_chunks[0]
        print(f"\nSuccessfully generated {len(semantic_chunks)} chunks.")
        print("\nExample chunk 1:")
        print("Content:", first_chunk.page_content[:500] + "...") # Print first 500 chars
        print("Metadata:", first_chunk.metadata)

        # First chunk produced by the fallback splitter, or None if there is none.
        fallback_chunk = next(
            filter(lambda c: c.metadata.get("split_method") == "recursive", semantic_chunks),
            None,
        )
        if not fallback_chunk:
            print("\nNo fallback chunks were generated in this run.")
        else:
            print("\nExample fallback chunk:")
            print("Content:", fallback_chunk.page_content[:500] + "...")
            print("Metadata:", fallback_chunk.metadata)

except Exception as e:
    logger.error(f"An error occurred during the main splitting process: {e}", exc_info=True)

2025-03-31 01:53:35,634 - INFO - Processing document 1/1: source=./data/quantum_computing.md, page=N/A
2025-03-31 01:53:35,636 - INFO - Attempting LLM splitting for source: ./data/quantum_computing.md, page: N/A
2025-03-31 01:56:59,361 - INFO - LLM successfully generated 9 raw chunks.
2025-03-31 01:56:59,365 - INFO - Finished processing document 1/1. Generated 9 chunks.
2025-03-31 01:56:59,365 - INFO - Completed splitting. Total chunks generated: 9



Successfully generated 9 chunks.

Example chunk 1:
Content: Quantum computing represents a paradigm shift in computation, utilizing quantum mechanics to tackle problems beyond the reach of classical computers. It leverages phenomena like superposition and entanglement to explore vastly larger solution spaces and achieve exponential speedups for specific tasks....
Metadata: {'source': './data/quantum_computing.md', 'chunk_number': 1, 'split_method': 'llm'}

No fallback chunks were generated in this run.


In [85]:
# Number of chunks produced by the semantic splitter.
len(semantic_chunks)

9

In [86]:
# Preview at most the first six semantic chunks, pretty-printing each one's text.
for position, chunk in enumerate(semantic_chunks[:6], start=1):
    print(f"Chunk {position}:")
    pprint(f"Content: {chunk.page_content}")
    print("-" * 40)

Chunk 1:
('Content: Quantum computing represents a paradigm shift in computation, '
 'utilizing quantum mechanics to tackle problems beyond the reach of classical '
 'computers. It leverages phenomena like superposition and entanglement to '
 'explore vastly larger solution spaces and achieve exponential speedups for '
 'specific tasks.')
----------------------------------------
Chunk 2:
('Content: Classical computers store information as bits representing either 0 '
 'or 1, while quantum computers utilize qubits, which can exist in a '
 'superposition of both states simultaneously. This allows quantum computers '
 'to perform multiple calculations concurrently, dramatically increasing '
 'processing power for certain types of problems.')
----------------------------------------
Chunk 3:
('Content: A key limitation of classical computing arises from exponential '
 'complexity – problems that require exponentially increasing computational '
 'resources as their size grows. For instance,

In [87]:
# Side-by-side look at the first five chunks from each splitting strategy.
for llm_chunk, baseline_chunk in zip(semantic_chunks[:5], splitted_docs[:5]):
    print("Chunk from LLM: ", llm_chunk.page_content)
    print("*" * 40)
    print("Chunk from Normal splitter: ", baseline_chunk.page_content)
    print("-" * 40)

Chunk from LLM:  Quantum computing represents a paradigm shift in computation, utilizing quantum mechanics to tackle problems beyond the reach of classical computers. It leverages phenomena like superposition and entanglement to explore vastly larger solution spaces and achieve exponential speedups for specific tasks.
****************************************
Chunk from Normal splitter:  Quantum Computing: Harnessing the Power of the Quantum Realm

Quantum computing is an emerging field that leverages the principles of quantum mechanics to solve complex problems that are intractable for classical computers. By exploiting phenomena like superposition and entanglement, quantum computers have the potential to revolutionize various industries, from medicine and materials science to finance and artificial intelligence.

The Limitations of Classical Computing
----------------------------------------
Chunk from LLM:  Classical computers store information as bits representing either 0 or 1, whi