# Chunking

## Imports

In [9]:
import os
import glob
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
from pprint import pprint

# Import LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

## Document Reading

In [10]:
def read_documents(folder_path: str) -> List[Document]:
    """
    Read every ``*.txt`` file in a folder and return LangChain Document objects.

    Ordering: documents are sorted by the number after the last underscore
    of the file name (``..._endpoint_003`` -> 3); files sharing a number keep
    their alphabetical order. A file without a numeric suffix is ordered by
    its position in the alphabetical file list instead.

    ``doc_index`` in the metadata is the document's position in the returned
    list (0-based, unique). The numeric suffix is deliberately not used as
    the index: several files can share it (``A_001`` and ``B_001``), and code
    that groups chunks by ``doc_index`` would then mix different files.

    Args:
        folder_path: Folder searched (non-recursively) for ``*.txt`` files.

    Returns:
        Ordered list of Documents carrying ``source``, ``doc_name`` and
        ``doc_index`` metadata. Empty when the folder has no ``*.txt`` files.
    """
    print(f"Reading documents from {folder_path}...")
    files = sorted(glob.glob(os.path.join(folder_path, "*.txt")))

    # Pair every file with its ordering key before reading anything.
    keyed_files = []
    for position, file_path in enumerate(files):
        suffix = Path(file_path).stem.split('_')[-1]
        # isdecimal() (not isdigit()) so that only strings int() can parse
        # are converted; e.g. "²" passes isdigit() but int("²") raises.
        sort_key = int(suffix) if suffix.isdecimal() else position
        keyed_files.append((sort_key, file_path))

    # list.sort is stable, so ties keep the alphabetical file order.
    keyed_files.sort(key=lambda item: item[0])

    documents = []
    for doc_index, (_, file_path) in enumerate(keyed_files):
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        documents.append(
            Document(
                page_content=content,
                metadata={
                    "source": file_path,
                    "doc_name": Path(file_path).stem,
                    "doc_index": doc_index,
                },
            )
        )

    print(f"Found {len(documents)} documents")
    return documents

## Document Splitting

In [11]:
def split_documents(documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.

    Each document is split on its own, so no chunk spans two documents.
    Every chunk is annotated with its position inside its document and with
    the ``doc_index`` of the documents immediately before and after it in
    ``documents``.

    Args:
        documents: Documents to split, in reading order. Each must carry
            ``doc_index`` and ``doc_name`` metadata.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, passed to the splitter.

    Returns:
        Flat list of chunks, grouped by document in input order. On top of
        the metadata the splitter returns, each chunk has ``chunk_index``,
        ``total_chunks_in_doc``, ``is_first_chunk``, ``is_last_chunk``,
        ``prev_doc_index`` and ``next_doc_index`` (``None`` at either end
        of the document list).
    """
    print(f"\nSplitting documents with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}...")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    all_chunks = []
    last_position = len(documents) - 1

    # Process each document separately to maintain boundaries
    for position, doc in enumerate(documents):
        doc_index = doc.metadata["doc_index"]
        print(f"Processing document {doc_index}: {doc.metadata['doc_name']}")

        # Take the neighbours' indices from the neighbouring documents
        # themselves. Computing doc_index +/- 1 is only right when doc_index
        # is a dense 0-based position, and otherwise points at documents
        # that do not exist.
        prev_doc_index = documents[position - 1].metadata["doc_index"] if position > 0 else None
        next_doc_index = documents[position + 1].metadata["doc_index"] if position < last_position else None

        doc_chunks = text_splitter.split_documents([doc])
        total_chunks = len(doc_chunks)

        for j, chunk in enumerate(doc_chunks):
            # Position of the chunk inside its own document
            chunk.metadata["chunk_index"] = j
            chunk.metadata["total_chunks_in_doc"] = total_chunks
            chunk.metadata["is_first_chunk"] = (j == 0)
            chunk.metadata["is_last_chunk"] = (j == total_chunks - 1)

            # References to the adjacent documents
            chunk.metadata["prev_doc_index"] = prev_doc_index
            chunk.metadata["next_doc_index"] = next_doc_index

            all_chunks.append(chunk)

    print(f"Created {len(all_chunks)} total chunks across {len(documents)} documents")
    return all_chunks

## Retrieve Adjacent Chunks

In [12]:
def get_adjacent_docs(chunks: List[Document], chunk_index: int, n: int = 1) -> Dict[str, Any]:
    """
    Get the current chunk and its adjacent chunks within the same document.

    Two chunks belong to the same document when both their ``doc_index`` and
    their ``source`` metadata match. ``doc_index`` alone is not enough:
    if two files were given the same index, their chunks would be interleaved
    and the "adjacent" chunks would come from unrelated files. Chunks without
    a ``source`` entry are matched on ``doc_index`` only.

    Args:
        chunks: List of all document chunks
        chunk_index: Index of the current chunk in ``chunks``
        n: Number of adjacent chunks to retrieve in each direction (default: 1).
            Values below 1 yield no neighbours.

    Returns:
        Dictionary with ``current_chunk``, ``prev_chunks`` and ``next_chunks``
        (both in document order, at most ``n`` entries each) and
        ``all_doc_chunks`` (every chunk of the current document, ordered by
        ``chunk_index``).

    Raises:
        ValueError: If ``chunk_index`` is outside ``chunks``.
    """
    if chunk_index < 0 or chunk_index >= len(chunks):
        raise ValueError(f"Chunk index {chunk_index} is out of bounds")

    current_chunk = chunks[chunk_index]
    current_doc_index = current_chunk.metadata["doc_index"]
    current_chunk_idx_in_doc = current_chunk.metadata["chunk_index"]
    current_source = current_chunk.metadata.get("source")

    print(f"\nRetrieving adjacent chunks for chunk {chunk_index}:")
    print(f"  Document: {current_doc_index} ({current_chunk.metadata['doc_name']})")
    print(f"  Chunk position within document: {current_chunk_idx_in_doc + 1} of {current_chunk.metadata['total_chunks_in_doc']}")
    print(f"  Retrieving {n} chunks before and after")

    # All chunks of the current document, in document order
    current_doc_chunks = sorted(
        (
            chunk for chunk in chunks
            if chunk.metadata["doc_index"] == current_doc_index
            and chunk.metadata.get("source") == current_source
        ),
        key=lambda chunk: chunk.metadata["chunk_index"],
    )

    # The current chunk always passes the filter above, so this always
    # finds a position.
    current_position = next(
        i for i, chunk in enumerate(current_doc_chunks)
        if chunk.metadata["chunk_index"] == current_chunk_idx_in_doc
    )

    # A negative n must not turn into a negative slice bound, which would
    # count from the end of the list.
    window = max(n, 0)
    prev_chunks = current_doc_chunks[max(0, current_position - window):current_position]
    next_chunks = current_doc_chunks[current_position + 1:current_position + 1 + window]

    print(f"  Found {len(prev_chunks)} previous chunks and {len(next_chunks)} next chunks")

    return {
        "current_chunk": current_chunk,
        "prev_chunks": prev_chunks,
        "next_chunks": next_chunks,
        "all_doc_chunks": current_doc_chunks
    }

## Storing chunks locally

In [13]:
import pickle
import os
from typing import List
from langchain.schema import Document

def save_chunks_to_disk(chunks: List[Document], output_folder: str = "/workspaces/RAG_BOT/LocalChunks") -> str:
    """
    Pickle the given chunks into a new, timestamped file.

    The file is named ``document_chunks_<YYYYmmdd_HHMMSS>.pkl``. The timestamp
    has one-second resolution, so two saves to the same folder within the
    same second write to the same file.

    Args:
        chunks: List of document chunks
        output_folder: Folder to write into; created if it does not exist

    Returns:
        Path to the saved chunks file
    """
    from datetime import datetime

    print(f"\nSaving {len(chunks)} chunks to disk...")

    os.makedirs(output_folder, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f"document_chunks_{stamp}.pkl"
    target_path = os.path.join(output_folder, file_name)

    with open(target_path, "wb") as handle:
        pickle.dump(chunks, handle)

    print(f"Chunks saved to {target_path}")
    return target_path

## Loading local documents

In [14]:
import pickle
import os
from typing import List
from langchain.schema import Document


def load_chunks_from_disk(chunks_path: str) -> List[Document]:
    """
    Read back a pickled list of document chunks.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    Args:
        chunks_path: Path to the saved chunks file

    Returns:
        List of document chunks

    Raises:
        FileNotFoundError: If nothing exists at ``chunks_path``.
    """
    print(f"\nLoading chunks from {chunks_path}...")

    path_exists = os.path.exists(chunks_path)
    if not path_exists:
        raise FileNotFoundError(f"Chunks file not found at {chunks_path}")

    with open(chunks_path, "rb") as handle:
        loaded = pickle.load(handle)

    print(f"Loaded {len(loaded)} chunks from disk")
    return loaded

## Analysis

In [15]:
def analyze_chunks(chunks: List[Document]) -> None:
    """
    Print per-document and overall statistics about the created chunks.

    For every distinct ``doc_index`` a table row is printed with the document
    name (taken from the first chunk seen for that index), the number of
    chunks, and the minimum, maximum and average ``page_content`` length.
    The table is followed by the total chunk count, the average number of
    chunks per document and the number of documents.

    Args:
        chunks: Chunks carrying ``doc_index`` and ``doc_name`` metadata.
    """
    print("\nChunk Analysis:")

    # Group once instead of rescanning the whole chunk list per document.
    chunks_by_doc = {}
    for chunk in chunks:
        chunks_by_doc.setdefault(chunk.metadata["doc_index"], []).append(chunk)

    # One table row per document, ordered by document index. Every group
    # holds at least one chunk, so min/max/avg are always defined.
    analysis_data = []
    for doc_idx in sorted(chunks_by_doc):
        doc_chunks = chunks_by_doc[doc_idx]
        sizes = [len(chunk.page_content) for chunk in doc_chunks]
        analysis_data.append({
            "Document Index": doc_idx,
            "Document Name": doc_chunks[0].metadata["doc_name"],
            "Number of Chunks": len(doc_chunks),
            "Min Chunk Size": min(sizes),
            "Max Chunk Size": max(sizes),
            "Avg Chunk Size": sum(sizes) / len(sizes)
        })

    # Create a DataFrame for better visualization
    df = pd.DataFrame(analysis_data)
    print(df)

    # Overall statistics
    total_chunks = len(chunks)
    total_docs = len(chunks_by_doc)
    avg_chunks_per_doc = total_chunks / total_docs if total_docs else 0

    print(f"\nTotal chunks: {total_chunks}")
    print(f"Average chunks per document: {avg_chunks_per_doc:.2f}")
    print(f"Total documents: {total_docs}")

## Generate Embedding

In [16]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import os
from typing import List, Dict, Any

def generate_and_save_embeddings(
    chunks: List[Document],
    embedding_document_name: str,
    output_folder: str = "/workspaces/RAG_BOT/LocalEmbeddings",
    model_name: str = "all-MiniLM-L6-v2",
) -> "FAISS":
    """
    Generate embeddings for all chunks using HuggingFaceEmbeddings and save as FAISS index.

    Args:
        chunks: List of document chunks
        embedding_document_name: Name of the index; it is saved under
            ``output_folder/embedding_document_name``
        output_folder: Folder to save the FAISS index; created if missing
        model_name: Model name passed to HuggingFaceEmbeddings

    Returns:
        The in-memory FAISS vector store that was built and saved.
    """
    print(f"\nGenerating embeddings for {len(chunks)} chunks...")

    # Initialize the embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    print(f"Using embedding model: {model_name}")

    # Create FAISS index from documents
    print("Creating FAISS index...")
    db = FAISS.from_documents(chunks, embedding_model)

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Save the index to disk
    index_path = os.path.join(output_folder, embedding_document_name)
    db.save_local(index_path)
    print(f"FAISS index saved to {index_path}")

    return db

## Main

In [17]:
# Main execution
def main(save_to_disk=True):
    """
    Run the whole pipeline: read, split, optionally save, analyze, embed.

    Args:
        save_to_disk: When true, the chunks are also pickled into the local
            chunks folder.

    Returns:
        Tuple of (chunks, db): the list of chunks and the value returned by
        generate_and_save_embeddings.
    """
    # Fixed locations of the input documents and of the generated artifacts
    folder_path = "/workspaces/RAG_BOT/EnrichedData"
    local_chunks_folder = "/workspaces/RAG_BOT/LocalChunks"
    local_embedding_folder = "/workspaces/RAG_BOT/LocalEmbeddings"

    documents = read_documents(folder_path)

    # One summary line per document: list position, name and length
    print("\nDocument Details:")
    for position, document in enumerate(documents):
        name = document.metadata['doc_name']
        size = len(document.page_content)
        print(f"Document {position}: {name} - {size} characters")

    chunks = split_documents(documents, chunk_size=1000, chunk_overlap=150)

    if save_to_disk:
        save_chunks_to_disk(chunks, local_chunks_folder)

    analyze_chunks(chunks)

    # Build the FAISS index and persist it
    db = generate_and_save_embeddings(
        chunks,
        embedding_document_name="Chatgpt_Enriched_Full_Embedding",
        output_folder=local_embedding_folder,
    )

    return chunks, db

# Run the pipeline when the cell executes; keep the chunks and the FAISS
# store in the notebook namespace so later cells can use them.
chunks, db = main(save_to_disk=True)

Reading documents from /workspaces/RAG_BOT/EnrichedData...
Found 77 documents

Document Details:
Document 0: ApplicationManagement_endpoint_001 - 4373 characters
Document 1: PolicyMangement_endpoint_001 - 3950 characters
Document 2: UserManagement_endpoint_001 - 11841 characters
Document 3: ApplicationManagement_endpoint_002 - 3878 characters
Document 4: PolicyMangement_endpoint_002 - 5197 characters
Document 5: UserManagement_endpoint_002 - 15569 characters
Document 6: ApplicationManagement_endpoint_003 - 7771 characters
Document 7: PolicyMangement_endpoint_003 - 5984 characters
Document 8: UserManagement_endpoint_003 - 8009 characters
Document 9: ApplicationManagement_endpoint_004 - 6158 characters
Document 10: PolicyMangement_endpoint_004 - 3544 characters
Document 11: UserManagement_endpoint_004 - 4863 characters
Document 12: ApplicationManagement_endpoint_005 - 16342 characters
Document 13: PolicyMangement_endpoint_005 - 4815 characters
Document 14: UserManagement_endpoint_005 - 3

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


Using embedding model: all-MiniLM-L6-v2
Creating FAISS index...
FAISS index saved to /workspaces/RAG_BOT/LocalEmbeddings/Chatgpt_Enriched_Full_Embedding


## Load chunks from local storage and retrieve adjacent chunks

In [18]:
# Chunk file written by an earlier save_chunks_to_disk() run
local_chunks_path = "/workspaces/RAG_BOT/LocalChunks/document_chunks_20250715_020245.pkl"

local_chunks = load_chunks_from_disk(local_chunks_path)

# Query the chunks that were just loaded from disk, not the in-memory
# `chunks` left over from main(); otherwise the load above is never used.
get_adjacent_docs(local_chunks, 3, n=3)


Loading chunks from /workspaces/RAG_BOT/LocalChunks/document_chunks_20250715_020245.pkl...
Loaded 58 chunks from disk

Retrieving adjacent chunks for chunk 3:
  Document: 1 (ApplicationManagement_endpoint_001)
  Chunk position within document: 4 of 7
  Retrieving 3 chunks before and after
  Found 3 previous chunks and 3 next chunks


{'current_chunk': Document(metadata={'source': '/workspaces/RAG_BOT/EnrichedData/ApplicationManagement_endpoint_001.txt', 'doc_name': 'ApplicationManagement_endpoint_001', 'doc_index': 1, 'chunk_index': 3, 'total_chunks_in_doc': 7, 'is_first_chunk': False, 'is_last_chunk': False, 'prev_doc_index': 0, 'next_doc_index': 2}, page_content='### Response Parameters  \n- **Result (object):** A list of application templates.\n  - **Category (string):** The category of the application (e.g., "Productivity").\n  - **DisplayName (string):** The display name of the application template (e.g., "Task Manager").\n  - **AppType (string):** The type of the application (e.g., "Web").\n  - **AppTypeDisplayName (string):** The display name of the application type (e.g., "Web Application").\n  - **Icon (string):** The path to the application icon image (e.g., "/icons/task-manager.png").\n  - **WebAppType (string):** The web application type (e.g., "Single Page Application").\n  - **Description (string):** 