# Lab 3.5.2 Solutions: Chunking Strategies for RAG

Complete solutions with performance analysis and recommendations.

## Setup

In [None]:
import sys
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

sys.path.insert(0, '..')

import numpy as np
import torch

from langchain.schema import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Report whether PyTorch can see a CUDA device.
print(f"GPU available: {torch.cuda.is_available()}")

In [None]:
# Load sample documents
def load_documents():
    """Load every Markdown file in ``../data/sample_documents`` as a Document.

    Files are read as UTF-8 and visited in sorted path order so the returned
    list (and anything derived from it, such as chunk order) is reproducible
    across runs and platforms; ``Path.glob`` gives no ordering guarantee.

    Returns:
        A list with one ``Document`` per ``*.md`` file. Each document's
        metadata holds the file path under ``"source"`` and the bare file
        name under ``"filename"``. The list is empty when no Markdown files
        are found.
    """
    data_path = Path("../data/sample_documents")

    return [
        Document(
            page_content=file_path.read_text(encoding='utf-8'),
            metadata={"source": str(file_path), "filename": file_path.name},
        )
        for file_path in sorted(data_path.glob("*.md"))
    ]

# Read the sample corpus once; the cells below reuse `documents`.
documents = load_documents()
print(f"Loaded {len(documents)} documents")

# Stitch all pages together (blank line between documents) to size the corpus
full_text = "\n\n".join(doc.page_content for doc in documents)
print(f"Total text length: {len(full_text):,} characters")

## Exercise 1 Solution: Implement All Chunking Strategies

**Task**: Implement fixed-size, recursive, sentence-based, and semantic chunking.

In [None]:
class ChunkingStrategies:
    """Collection of chunking strategies with consistent interface.

    Every public strategy is a static method that takes a list of
    ``Document`` objects and returns a new list of chunk ``Document`` objects.
    The sentence-based and semantic strategies tag each chunk's metadata with
    ``"chunk_type"`` and ``"sentences"`` (number of sentences in the chunk).
    """

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Split *text* after '.', '!' or '?' and drop blank pieces.

        Each returned sentence is stripped of surrounding whitespace, so a
        trailing newline after the last sentence does not yield an empty
        "sentence".
        """
        import re

        pieces = re.split(r'(?<=[.!?])\s+', text)
        return [piece.strip() for piece in pieces if piece.strip()]

    @staticmethod
    def _make_chunk(doc: Document, sentences: List[str], chunk_type: str) -> Document:
        """Build one chunk from *sentences*, inheriting the metadata of *doc*."""
        return Document(
            page_content=' '.join(sentences),
            metadata={
                **doc.metadata,
                "chunk_type": chunk_type,
                "sentences": len(sentences)
            }
        )

    @staticmethod
    def fixed_size(documents: List[Document], chunk_size: int = 500, overlap: int = 0) -> List[Document]:
        """
        Fixed-size chunking - simple but may break mid-sentence.

        Best for: Uniform processing, simple requirements
        Weakness: Ignores semantic boundaries

        Args:
            documents: Documents to split.
            chunk_size: Target chunk length in characters.
            overlap: Number of characters shared by neighbouring chunks.

        Returns:
            The chunks produced by ``CharacterTextSplitter.split_documents``.
        """
        # Empty separator: no preferred split point is given to the splitter.
        # NOTE(review): exact handling of "" depends on the installed
        # langchain version - confirm it cuts purely by size.
        splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separator=""
        )
        return splitter.split_documents(documents)

    @staticmethod
    def recursive(documents: List[Document], chunk_size: int = 500, overlap: int = 100) -> List[Document]:
        """
        Recursive chunking - respects document structure.

        Best for: Structured documents (Markdown, code)
        Strength: Preserves semantic units

        Args:
            documents: Documents to split.
            chunk_size: Target chunk length in characters.
            overlap: Number of characters shared by neighbouring chunks.

        Returns:
            The chunks produced by
            ``RecursiveCharacterTextSplitter.split_documents``.
        """
        # Separators go from coarse to fine: Markdown H2, H3, paragraph,
        # line, sentence, word.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n## ", "\n### ", "\n\n", "\n", ". ", " "]
        )
        return splitter.split_documents(documents)

    @staticmethod
    def sentence_based(documents: List[Document], sentences_per_chunk: int = 5) -> List[Document]:
        """
        Sentence-based chunking - natural language units.

        Best for: Prose text, Q&A content
        Strength: Complete thoughts preserved

        Args:
            documents: Documents to split.
            sentences_per_chunk: Number of consecutive sentences per chunk;
                the last chunk of a document may hold fewer.

        Returns:
            Chunks tagged with ``chunk_type="sentence"``. Documents without
            any text contribute no chunks.

        Raises:
            ValueError: If ``sentences_per_chunk`` is smaller than 1.
        """
        if sentences_per_chunk < 1:
            raise ValueError("sentences_per_chunk must be at least 1")

        chunks = []
        for doc in documents:
            sentences = ChunkingStrategies._split_sentences(doc.page_content)

            # Walk the sentence list in fixed-size windows
            for start in range(0, len(sentences), sentences_per_chunk):
                window = sentences[start:start + sentences_per_chunk]
                chunks.append(ChunkingStrategies._make_chunk(doc, window, "sentence"))

        return chunks

    @staticmethod
    def semantic(
        documents: List[Document],
        embedding_model: HuggingFaceEmbeddings,
        similarity_threshold: float = 0.75,
        max_chunk_size: int = 1000
    ) -> List[Document]:
        """
        Semantic chunking - groups by meaning.

        Best for: Complex documents, varied content
        Strength: Highest semantic coherence
        Weakness: Computationally expensive

        Consecutive sentences are merged while the next sentence stays similar
        (cosine similarity) to the running centroid of the current chunk and
        adding it would not push the chunk past ``max_chunk_size``.

        Args:
            documents: Documents to split.
            embedding_model: Object whose ``embed_documents(list_of_str)``
                returns one embedding vector per input string.
            similarity_threshold: Minimum cosine similarity between the
                chunk centroid and the next sentence for them to be merged.
            max_chunk_size: Upper bound, in characters, for a merged chunk.
                A single sentence longer than this is kept whole rather than
                cut.

        Returns:
            Chunks tagged with ``chunk_type="semantic"``. Documents without
            any text contribute no chunks.
        """
        all_chunks = []

        for doc in documents:
            sentences = ChunkingStrategies._split_sentences(doc.page_content)

            if not sentences:
                # Blank document: nothing worth indexing
                continue

            if len(sentences) == 1:
                # Nothing to compare against, so skip the embedding call
                all_chunks.append(ChunkingStrategies._make_chunk(doc, sentences, "semantic"))
                continue

            # One embedding row per sentence
            embeddings = np.asarray(embedding_model.embed_documents(sentences))

            # The current chunk is sentences[start:i]; track its joined length
            # so the size check does not need to rebuild the string.
            start = 0
            chunk_len = len(sentences[0])
            centroid = embeddings[0]

            for i in range(1, len(sentences)):
                candidate = embeddings[i]
                similarity = np.dot(centroid, candidate) / (
                    np.linalg.norm(centroid) * np.linalg.norm(candidate) + 1e-8
                )

                # +1 accounts for the space used to join sentences
                grown_len = chunk_len + 1 + len(sentences[i])

                if similarity < similarity_threshold or grown_len > max_chunk_size:
                    # Close the current chunk and start a new one at sentence i
                    all_chunks.append(
                        ChunkingStrategies._make_chunk(doc, sentences[start:i], "semantic")
                    )
                    start = i
                    chunk_len = len(sentences[i])
                    centroid = candidate
                else:
                    chunk_len = grown_len
                    # Centroid = mean embedding of every sentence in the chunk
                    centroid = embeddings[start:i + 1].mean(axis=0)

            # Flush whatever is left after the last sentence
            all_chunks.append(
                ChunkingStrategies._make_chunk(doc, sentences[start:], "semantic")
            )

        return all_chunks

print("ChunkingStrategies class defined with 4 strategies")

## Exercise 2 Solution: Compare Chunk Statistics

**Task**: Analyze chunk size distribution for each strategy.

In [None]:
def analyze_chunks(chunks: List[Document], strategy_name: str) -> Dict[str, Any]:
    """Compute comprehensive chunk statistics.

    Args:
        chunks: Chunks to measure; only ``page_content`` is read.
        strategy_name: Label stored under ``"strategy"`` in the result.

    Returns:
        A dict with the chunk count and the mean, standard deviation, minimum,
        maximum and median chunk length in characters, plus
        ``"size_variance_coef"`` (std / mean; lower means more uniform).
        For an empty ``chunks`` list every numeric field is zero, and the
        coefficient is 0.0 whenever the mean length is zero, so callers never
        see NaN or an exception from an empty strategy.
    """
    sizes = [len(c.page_content) for c in chunks]

    if not sizes:
        # np.min / np.max raise on empty input; report an all-zero summary
        return {
            "strategy": strategy_name,
            "num_chunks": 0,
            "mean_size": 0.0,
            "std_size": 0.0,
            "min_size": 0,
            "max_size": 0,
            "median_size": 0.0,
            "size_variance_coef": 0.0
        }

    mean_size = np.mean(sizes)
    std_size = np.std(sizes)

    return {
        "strategy": strategy_name,
        "num_chunks": len(chunks),
        "mean_size": mean_size,
        "std_size": std_size,
        "min_size": np.min(sizes),
        "max_size": np.max(sizes),
        "median_size": np.median(sizes),
        # Lower = more uniform; guard against 0/0 when every chunk is empty
        "size_variance_coef": std_size / mean_size if mean_size else 0.0
    }

# Chunk the corpus with the three strategies that need no embeddings
fixed_chunks = ChunkingStrategies.fixed_size(documents, chunk_size=500)
recursive_chunks = ChunkingStrategies.recursive(documents, chunk_size=500, overlap=100)
sentence_chunks = ChunkingStrategies.sentence_based(documents, sentences_per_chunk=5)

# Semantic chunking embeds every sentence; put the model on the GPU if there is one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": device},
)

print("Computing semantic chunks (this may take a moment)...")
semantic_chunks = ChunkingStrategies.semantic(documents, embedding_model)

# (chunks, label) pairs, reused by the retrieval benchmark further down
strategies = list(zip(
    (fixed_chunks, recursive_chunks, sentence_chunks, semantic_chunks),
    ("Fixed-Size", "Recursive", "Sentence-Based", "Semantic"),
))

print("\n" + "=" * 80, "CHUNK STATISTICS COMPARISON", "=" * 80, sep="\n")

# Collect the statistics and report them strategy by strategy
results = []
for chunks, name in strategies:
    stats = analyze_chunks(chunks, name)
    results.append(stats)
    print(
        f"\n{name}:\n"
        f"  Chunks: {stats['num_chunks']}\n"
        f"  Mean size: {stats['mean_size']:.0f} chars\n"
        f"  Std dev: {stats['std_size']:.0f} chars\n"
        f"  Range: {stats['min_size']}-{stats['max_size']} chars\n"
        f"  Variance coef: {stats['size_variance_coef']:.3f}"
    )

## Exercise 3 Solution: Benchmark Retrieval Quality

**Task**: Compare retrieval performance across chunking strategies.

In [None]:
def benchmark_retrieval(
    chunks: List[Document],
    embedding_model: HuggingFaceEmbeddings,
    test_queries: List[Dict[str, Any]],
    k: int = 5
) -> Dict[str, float]:
    """
    Benchmark retrieval quality for a chunking strategy.

    The chunks are indexed in a throwaway Chroma store (persisted in a
    temporary directory that is removed on exit) and every query is run
    against it.

    Metrics:
    - Recall@k: Did we find the expected source?
    - Avg similarity: How relevant are results?
    - Latency: How fast is retrieval?

    Args:
        chunks: Chunk documents to index; their ``"source"`` metadata is
            matched against each query's expected source.
        embedding_model: Embedding function handed to Chroma.
        test_queries: Dicts with a ``"query"`` string and, optionally, an
            ``"expected_source"`` substring of the source that should be
            retrieved.
        k: Number of results fetched per query.

    Returns:
        A dict with ``"recall@k"``, ``"avg_similarity"`` and
        ``"avg_latency_ms"``. Queries without an ``"expected_source"`` are
        left out of the recall figure (an empty string would match every
        source and count as a hit). A metric with nothing to average is
        reported as 0.0 rather than NaN.
    """
    import tempfile

    recalls = []
    similarities = []
    latencies = []

    # Create temporary vector store
    with tempfile.TemporaryDirectory() as tmpdir:
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=tmpdir
        )

        for query_data in test_queries:
            query = query_data["query"]
            expected = query_data.get("expected_source", "")

            # Time only the retrieval call; perf_counter is monotonic and
            # has finer resolution than time.time() for short intervals
            start = time.perf_counter()
            results = vectorstore.similarity_search_with_score(query, k=k)
            latencies.append(time.perf_counter() - start)

            # Score recall only for labelled queries
            if expected:
                found = any(
                    expected in doc.metadata.get("source", "")
                    for doc, _ in results
                )
                recalls.append(1.0 if found else 0.0)

            # Average similarity (convert distance to similarity)
            if results:
                similarities.append(
                    np.mean([1 / (1 + score) for _, score in results])
                )

    def _mean_or_zero(values: List[float]) -> float:
        """Mean of *values*, or 0.0 for an empty list."""
        return float(np.mean(values)) if values else 0.0

    return {
        "recall@k": _mean_or_zero(recalls),
        "avg_similarity": _mean_or_zero(similarities),
        "avg_latency_ms": _mean_or_zero(latencies) * 1000
    }

# Evaluation set: each question is paired with a fragment of the source
# file name that a correct retrieval should surface
test_queries = [
    {"query": question, "expected_source": source}
    for question, source in (
        ("What is the memory capacity of DGX Spark?", "dgx_spark"),
        ("How does LoRA reduce trainable parameters?", "lora"),
        ("Explain the attention mechanism in transformers", "transformer"),
        ("What quantization methods are available?", "quantization"),
        ("How do vector databases work?", "vector_database"),
    )
]

print("Benchmarking retrieval quality for each strategy...\n")

# Run the benchmark once per strategy, labelling each metrics dict
benchmark_results = []
for chunks, name in strategies:
    print(f"Testing {name}...")
    metrics = benchmark_retrieval(chunks, embedding_model, test_queries)
    metrics["strategy"] = name
    benchmark_results.append(metrics)

print("\n" + "=" * 80, "RETRIEVAL QUALITY COMPARISON", "=" * 80, sep="\n")
print(f"{'Strategy':<20} {'Recall@5':<12} {'Avg Similarity':<16} {'Latency (ms)':<12}")
print("-" * 60)

# One fixed-width row per strategy
for result in benchmark_results:
    print("{:<20} {:<12.3f} {:<16.3f} {:<12.2f}".format(
        result['strategy'],
        result['recall@k'],
        result['avg_similarity'],
        result['avg_latency_ms'],
    ))

## Exercise 4 Solution: Optimal Parameters

**Task**: Find optimal chunk size and overlap for the recursive strategy.

In [None]:
def grid_search_parameters(
    documents: List[Document],
    embedding_model: HuggingFaceEmbeddings,
    test_queries: List[Dict[str, Any]],
    chunk_sizes: Optional[List[int]] = None,
    overlaps: Optional[List[int]] = None
) -> List[Dict[str, Any]]:
    """Grid search for optimal chunking parameters.

    Every (chunk size, overlap) pair is chunked with the recursive strategy
    and scored with ``benchmark_retrieval``. Pairs whose overlap is not
    smaller than the chunk size are reported as skipped and left out of the
    results.

    Args:
        documents: Documents to chunk.
        embedding_model: Embedding model used for the retrieval benchmark.
        test_queries: Queries passed through to ``benchmark_retrieval``.
        chunk_sizes: Chunk sizes to try; defaults to ``[256, 512, 768, 1024]``.
        overlaps: Overlaps to try; defaults to ``[0, 50, 100, 150]``.

    Returns:
        One dict per evaluated pair holding ``"chunk_size"``, ``"overlap"``,
        ``"num_chunks"`` and the metrics returned by ``benchmark_retrieval``,
        in grid order.
    """
    # Defaults are built per call so no list object is shared between calls
    if chunk_sizes is None:
        chunk_sizes = [256, 512, 768, 1024]
    if overlaps is None:
        overlaps = [0, 50, 100, 150]

    results = []
    total = len(chunk_sizes) * len(overlaps)
    current = 0

    for chunk_size in chunk_sizes:
        for overlap in overlaps:
            current += 1

            # Skip invalid combinations, and say so instead of claiming a test
            if overlap >= chunk_size:
                print(f"Skipping {current}/{total}: size={chunk_size}, overlap={overlap} (overlap must be smaller than chunk size)")
                continue

            print(f"Testing {current}/{total}: size={chunk_size}, overlap={overlap}")

            # Create chunks
            chunks = ChunkingStrategies.recursive(documents, chunk_size, overlap)

            # Benchmark
            metrics = benchmark_retrieval(chunks, embedding_model, test_queries)

            results.append({
                "chunk_size": chunk_size,
                "overlap": overlap,
                "num_chunks": len(chunks),
                **metrics
            })

    return results

# Run grid search
print("Running grid search for optimal parameters...\n")
grid_results = grid_search_parameters(
    documents,
    embedding_model,
    test_queries,
    chunk_sizes=[256, 512, 768],
    overlaps=[0, 50, 100]
)

if not grid_results:
    raise ValueError("Grid search produced no results; check chunk_sizes and overlaps")


def _grid_rank(result):
    """Ranking key: recall first, average similarity as the tie-breaker."""
    return (result['recall@k'], result['avg_similarity'])


# With only a handful of queries many configurations tie on recall, so
# ties are ordered by average similarity instead of by grid position.
ranked_results = sorted(grid_results, key=_grid_rank, reverse=True)

# Find best configuration
best = ranked_results[0]

print("\n" + "="*80)
print("PARAMETER OPTIMIZATION RESULTS")
print("="*80)
print(f"{'Size':<8} {'Overlap':<10} {'Chunks':<10} {'Recall@5':<12} {'Similarity':<12}")
print("-"*52)

for r in ranked_results:
    # Identity check: mark exactly one row even if two rows compare equal
    marker = " <-- BEST" if r is best else ""
    print(f"{r['chunk_size']:<8} {r['overlap']:<10} {r['num_chunks']:<10} {r['recall@k']:<12.3f} {r['avg_similarity']:<12.3f}{marker}")

print(f"\nOptimal configuration:")
print(f"  Chunk size: {best['chunk_size']}")
print(f"  Overlap: {best['overlap']}")
print(f"  Recall@5: {best['recall@k']:.3f}")

## Recommendations Summary

Based on the experiments:

In [None]:
# Static summary text printed at the end of the lab.
# NOTE(review): the figures quoted below (e.g. "5x faster", batch size 64)
# are not computed by the cells visible here - confirm them before reuse.
recommendations = """
CHUNKING STRATEGY RECOMMENDATIONS
==================================

1. FIXED-SIZE CHUNKING
   - Use for: Simple documents, uniform processing needs
   - Avoid for: Technical documentation, code
   - Parameters: 500 chars, no overlap

2. RECURSIVE CHUNKING (RECOMMENDED DEFAULT)
   - Use for: Most use cases, structured documents
   - Parameters: 512 chars, 100 overlap
   - Separators: Prioritize section headers, then paragraphs

3. SENTENCE-BASED CHUNKING
   - Use for: Q&A datasets, conversational content
   - Parameters: 5-7 sentences per chunk
   - Best for: High answer relevancy requirements

4. SEMANTIC CHUNKING
   - Use for: Complex documents with varied topics
   - Parameters: 0.75 similarity threshold
   - Trade-off: Best quality, highest compute cost

DGX SPARK OPTIMIZATION
======================
- Use GPU for semantic chunking (5x faster)
- Batch size 64 for embedding computation
- Pre-compute embeddings for large document sets
"""

print(recommendations)