# 1. Semantic Chunking

**What:** Split text where meaning changes, not at fixed sizes

**Why:** Preserves sentence integrity and topic flow

**When:** Structured documents with clear topic shifts (SOPs, manuals, reports)

**How It Works:**
1. Split text into sentences
2. Compute sentence embeddings
3. Measure cosine similarity between adjacent sentences
4. Insert a split wherever similarity drops sharply, i.e. where the distance between adjacent sentences exceeds the threshold

**Comparison:**
```
Fixed-size:  "The cat sat on the | mat. Dogs are loyal ani | mals..."
Semantic:    "The cat sat on the mat. | Dogs are loyal animals..."
```

**Parameters:**
- `breakpoint_threshold_type`: "percentile", "standard_deviation", "interquartile"
- `breakpoint_threshold_amount`: cutoff value, interpreted according to the threshold type, e.g., 95 (percentile), 3 (std dev), 1.5 (interquartile)

In [None]:
import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

from config import embeddings, load_documents, format_docs, VECTOR_STORE_PATH

## Create Semantic Chunker

In [None]:
def create_semantic_chunker(threshold_type="percentile", threshold_amount=95):
    """Build a ``SemanticChunker`` bound to the shared ``embeddings`` model.

    Args:
        threshold_type: Value passed through as ``breakpoint_threshold_type``.
        threshold_amount: Value passed through as
            ``breakpoint_threshold_amount``.

    Returns:
        A new ``SemanticChunker`` instance.
    """
    chunker_options = {
        "embeddings": embeddings,
        "breakpoint_threshold_type": threshold_type,
        "breakpoint_threshold_amount": threshold_amount,
    }
    return SemanticChunker(**chunker_options)

## Apply Semantic Chunking

In [None]:
def semantic_chunk_documents(documents, threshold_type="percentile", threshold_amount=95):
    """Split documents at semantic breakpoints, keeping a document whole on failure.

    Args:
        documents: Iterable of documents exposing ``page_content`` and a
            mutable ``metadata`` mapping.
        threshold_type: Breakpoint threshold type forwarded to
            ``create_semantic_chunker``.
        threshold_amount: Breakpoint threshold amount forwarded to
            ``create_semantic_chunker``.

    Returns:
        A list of documents. Every item carries ``metadata["chunk_method"]``:
        ``"semantic"`` for chunks produced by the splitter, ``"original"`` for
        documents that were kept unsplit (too short, or the splitter failed).

    Note:
        Documents that are kept unsplit are tagged in place, so the caller's
        input objects gain the ``"chunk_method"`` metadata key.
    """
    import logging

    logger = logging.getLogger(__name__)
    splitter = create_semantic_chunker(threshold_type, threshold_amount)

    all_chunks = []

    for doc in documents:
        # Very short text is kept as a single chunk instead of being split.
        if len(doc.page_content) < 100:
            doc.metadata["chunk_method"] = "original"
            all_chunks.append(doc)
            continue

        try:
            chunks = splitter.split_documents([doc])
        except Exception:
            # Best-effort fallback: keep the document unsplit, but report the
            # failure instead of hiding it. Catching Exception (not a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            logger.warning(
                "Semantic chunking failed for %s; keeping the document unsplit",
                doc.metadata.get("filename", "unknown"),
                exc_info=True,
            )
            doc.metadata["chunk_method"] = "original"
            all_chunks.append(doc)
            continue

        for chunk in chunks:
            # Re-apply the parent's metadata before tagging the chunk.
            chunk.metadata.update(doc.metadata)
            chunk.metadata["chunk_method"] = "semantic"
        all_chunks.extend(chunks)

    return all_chunks

## Compare with Fixed-Size Chunking

In [None]:
def _print_chunk_stats(title, chunks):
    """Print the count and integer average character length of ``chunks``."""
    lengths = [len(chunk.page_content) for chunk in chunks]
    # An empty chunk list would otherwise raise ZeroDivisionError.
    avg_length = sum(lengths) // len(lengths) if lengths else 0
    print(f"\n{title}:")
    print(f"  Count: {len(chunks)}")
    print(f"  Avg length: {avg_length} chars")


def compare_chunking_methods(documents, fixed_size=1000, semantic_threshold=95, chunk_overlap=200):
    """Compare fixed-size and semantic chunking on the same documents.

    Prints the chunk count and average chunk length (in characters) for each
    method. With no chunks, the average is reported as 0.

    Args:
        documents: Documents to chunk with both methods.
        fixed_size: ``chunk_size`` for the ``RecursiveCharacterTextSplitter``.
        semantic_threshold: ``threshold_amount`` passed to
            ``semantic_chunk_documents``.
        chunk_overlap: ``chunk_overlap`` for the fixed-size splitter.

    Returns:
        A dict with the keys ``"fixed_chunks"`` and ``"semantic_chunks"``
        holding the chunk lists produced by each method.
    """
    print("="*60)
    print("CHUNKING COMPARISON")
    print("="*60)

    # Fixed-size chunking
    fixed_splitter = RecursiveCharacterTextSplitter(
        chunk_size=fixed_size,
        chunk_overlap=chunk_overlap,
    )
    fixed_chunks = fixed_splitter.split_documents(documents)
    _print_chunk_stats("Fixed-Size Chunks", fixed_chunks)

    # Semantic chunking
    semantic_chunks = semantic_chunk_documents(documents, threshold_amount=semantic_threshold)
    _print_chunk_stats("Semantic Chunks", semantic_chunks)

    return {
        "fixed_chunks": fixed_chunks,
        "semantic_chunks": semantic_chunks
    }

## Create Vectorstore

In [None]:
def create_semantic_vectorstore(documents, threshold=95, force_rebuild=False):
    """Load or build the FAISS vectorstore of semantic chunks.

    The store lives at ``f"{VECTOR_STORE_PATH}_semantic"``. If that path
    exists and ``force_rebuild`` is false, the saved store is loaded and
    returned without looking at ``documents`` or ``threshold``.

    Args:
        documents: Documents to chunk and index when (re)building.
        threshold: ``threshold_amount`` passed to ``semantic_chunk_documents``.
        force_rebuild: Rebuild and overwrite even if a saved store exists.

    Returns:
        The loaded or newly built FAISS vectorstore.

    Raises:
        ValueError: If a rebuild is needed but chunking produced no chunks.
    """
    store_path = f"{VECTOR_STORE_PATH}_semantic"

    if os.path.exists(store_path) and not force_rebuild:
        # SECURITY: allow_dangerous_deserialization=True lets FAISS unpickle
        # the saved store. Acceptable only while store_path holds files written
        # by this code; never point it at untrusted data.
        # NOTE: the cached store is returned regardless of `threshold`; pass
        # force_rebuild=True after changing it.
        return FAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)

    chunks = semantic_chunk_documents(documents, threshold_amount=threshold)
    print(f"Created {len(chunks)} semantic chunks")

    # Fail with a clear message instead of an opaque error from inside FAISS.
    if not chunks:
        raise ValueError("Cannot build a vectorstore: no chunks were produced from `documents`")

    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(store_path)

    return vectorstore

## Test

In [None]:
# Load the corpus once; every step below reuses it
documents = load_documents()

# Print chunking statistics for the first 10 documents only
sample_documents = documents[:10]
comparison = compare_chunking_methods(sample_documents)

# Rebuild the index over the full corpus, ignoring any saved store
vectorstore = create_semantic_vectorstore(documents, force_rebuild=True)

# Smoke-test retrieval: show the top 2 hits for each question
test_questions = [
    "What is DeMask?",
    "What is the difference between Graph DTA and Graph DF?"
]

for question in test_questions:
    print(f"\nQuestion: {question}")
    hits = vectorstore.similarity_search(question, k=2)
    for rank, hit in enumerate(hits, start=1):
        source_name = hit.metadata.get('filename', 'unknown')
        preview = hit.page_content[:80]
        print(f"  {rank}. [{source_name}] {preview}...")