# Notebook 2: Semantic Text Chunking

## Purpose
This notebook implements **semantic chunking** - an advanced RAG technique that creates chunks based on meaning and context rather than arbitrary character counts.

## Semantic Chunking Benefits
1. **Meaning-based boundaries** - Chunks break at natural semantic boundaries
2. **Better retrieval** - More coherent chunks improve embedding quality
3. **Context preservation** - Maintains topic coherence within chunks
4. **Reduced fragmentation** - Avoids splitting related concepts

## Process
1. Load extracted text data from Notebook 1
2. Use sentence embeddings to detect semantic boundaries
3. Group consecutive sentences with similar embeddings into chunks (each page is chunked separately)
4. Add context enrichment (the sentences immediately before and after each chunk)
5. Preserve metadata (book, chapter, page)
6. Save semantic chunks for embedding generation

## Output
- Semantically coherent chunks with metadata
- Context-enriched chunks for better retrieval
- Ready for embedding generation in Notebook 3

In [None]:
# Import required libraries
import os
import json
import re
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Tuple
from tqdm import tqdm
import hashlib
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK data for sentence tokenization.
# Each resource is checked on its own: the original only probed 'punkt', so a
# machine that already had 'punkt' but not 'punkt_tab' never fetched the
# latter, even though both are requested by this notebook.
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_nltk_resource}')
    except LookupError:
        nltk.download(_nltk_resource)

In [None]:
# Configuration: where extracted text is read from and where chunks are written
BASE_DIR = Path(r"d:\AI Book RAG")
EXTRACTED_DIR = BASE_DIR.joinpath("data", "extracted")
CHUNKS_DIR = BASE_DIR.joinpath("data", "chunks")

# Make sure the output directory exists before anything is written to it
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

# Semantic chunking parameters
SIMILARITY_THRESHOLD = 0.75  # Consecutive sentences scoring below this start a new chunk
MIN_CHUNK_SENTENCES = 3      # Fewest sentences a chunk may hold
MAX_CHUNK_SENTENCES = 10     # Sentence count at which a chunk is closed
MIN_CHUNK_CHARS = 200        # Fewest characters a chunk may hold
MAX_CHUNK_CHARS = 1500       # Character count a chunk should not exceed
CONTEXT_SENTENCES = 1        # Sentences taken before/after a chunk as context

# Echo the effective settings so the run is self-describing
print(f"Extracted Data Directory: {EXTRACTED_DIR}")
print(f"Chunks Output Directory: {CHUNKS_DIR}")
print("\nSemantic Chunking Configuration:")
print(f"  - Similarity Threshold: {SIMILARITY_THRESHOLD}")
print(f"  - Chunk Size: {MIN_CHUNK_SENTENCES}-{MAX_CHUNK_SENTENCES} sentences")
print(f"  - Character Range: {MIN_CHUNK_CHARS}-{MAX_CHUNK_CHARS} chars")
print(f"  - Context Enrichment: ±{CONTEXT_SENTENCES} sentences")

In [None]:
# Load embedding model for semantic similarity.
# The instance is kept in the module-level name `embedding_model`, which
# create_semantic_chunks_from_books reads and passes to semantic_chunk_text.
print("Loading sentence embedding model...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("✓ Model loaded successfully")

In [None]:
# Load the combined extraction output (one entry per book)
combined_file = EXTRACTED_DIR / "all_books_combined.json"
print(f"Loading extracted data from: {combined_file}")

all_books_data = json.loads(combined_file.read_text(encoding='utf-8'))

print(f"✓ Loaded data for {len(all_books_data)} books")
# Each book entry carries its own page count; add them up for the summary line
total_pages = sum([book['total_pages'] for book in all_books_data])
print(f"✓ Total pages to process: {total_pages}")

In [None]:
# Helper: Clean text
def clean_text(text: str) -> str:
    """Normalize whitespace in *text* and drop lines that hold only digits.

    Steps, in order: collapse runs of spaces, squeeze blank-line runs down to
    a single blank line, trim every line, blank out digit-only lines
    (standalone page numbers), squeeze blank-line runs again, and finally
    strip the whole string.
    """
    blank_line_run = r'\n\s*\n+'

    collapsed = re.sub(r' +', ' ', text)
    collapsed = re.sub(blank_line_run, '\n\n', collapsed)

    trimmed = '\n'.join(map(str.strip, collapsed.split('\n')))

    # A line consisting solely of digits is treated as a page number
    without_page_numbers = re.sub(r'^\d+\s*$', '', trimmed, flags=re.MULTILINE)

    # Removing page numbers can leave new blank-line runs behind
    return re.sub(blank_line_run, '\n\n', without_page_numbers).strip()

# Helper: Generate chunk ID
def generate_chunk_id(book_title: str, page_num: int, chunk_index: int) -> str:
    """Build a chunk identifier of the form ``<hash8>_p<page>_sc<index>``.

    ``<hash8>`` is the first eight hex digits of the MD5 digest of the book
    title, so the same title always yields the same prefix.
    """
    title_digest = hashlib.md5(book_title.encode()).hexdigest()
    return "_".join((title_digest[:8], f"p{page_num}", f"sc{chunk_index}"))

# Helper: Calculate semantic similarity between sentences
def calculate_sentence_similarities(sentences: List[str], embeddings: np.ndarray) -> List[float]:
    """Return the cosine similarity of every pair of consecutive embeddings.

    The whole sequence is handled in one vectorized pass instead of one
    library call per pair, and the result is a list of plain Python floats,
    as the return annotation promises.

    Args:
        sentences: Not used by the computation; kept so existing callers
            keep working.
        embeddings: One embedding per row. Rows with more than one dimension
            are flattened.

    Returns:
        A list with ``len(embeddings) - 1`` entries, where entry ``i`` is the
        cosine similarity between rows ``i`` and ``i + 1``. Empty when fewer
        than two embeddings are given.
    """
    vectors = np.asarray(embeddings, dtype=np.float64)
    if len(vectors) < 2:
        return []
    vectors = vectors.reshape(len(vectors), -1)

    norms = np.linalg.norm(vectors, axis=1)
    # An all-zero row has no direction; dividing by 1 keeps it at zero so its
    # similarity to any neighbour comes out as 0.0 rather than NaN.
    norms[norms == 0.0] = 1.0
    unit_vectors = vectors / norms[:, np.newaxis]

    # Row-wise dot product of each unit vector with its successor
    return np.sum(unit_vectors[:-1] * unit_vectors[1:], axis=1).tolist()

In [None]:
# Core: Semantic chunking function
def semantic_chunk_text(
    text: str,
    embedding_model: SentenceTransformer,
    similarity_threshold: float = SIMILARITY_THRESHOLD,
    min_sentences: int = MIN_CHUNK_SENTENCES,
    max_sentences: int = MAX_CHUNK_SENTENCES,
    min_chars: int = MIN_CHUNK_CHARS,
    max_chars: int = MAX_CHUNK_CHARS
) -> List[str]:
    """
    Split text into semantic chunks based on sentence similarity.

    Algorithm:
    1. Split text into sentences
    2. Generate embeddings for each sentence
    3. Calculate similarity between consecutive sentences
    4. Keep a sentence in the current chunk while its similarity to the
       previous sentence is >= threshold; a lower similarity is a boundary
    5. Respect min/max constraints

    The minimum constraints take precedence: a boundary (semantic, max
    sentences or max chars) only closes the current chunk once that chunk
    already holds at least ``min_sentences`` sentences and ``min_chars``
    characters, so a chunk can grow past the maxima while it is still below
    a minimum.

    Args:
        text: Text to split.
        embedding_model: Object whose ``encode(sentences, show_progress_bar=False)``
            returns one embedding per sentence.
        similarity_threshold: Similarity below which a boundary is detected.
        min_sentences: Minimum sentences per chunk.
        max_sentences: Sentence count at which a boundary is forced.
        min_chars: Minimum characters per chunk.
        max_chars: Character count a chunk should not exceed.

    Returns:
        Chunks as strings, each built by joining its sentences with a single
        space. When the text has fewer than ``min_sentences`` sentences the
        result is ``[text]`` if it is at least ``min_chars`` long, else ``[]``.
        A trailing group that is too small is appended to the previous chunk
        (which may then exceed the maxima) or dropped if there is none.
    """
    # Tokenize into sentences
    sentences = sent_tokenize(text)

    # `not sentences` also protects the `sentences[0]` access below when
    # min_sentences is 0 or negative and the text yields no sentences.
    if not sentences or len(sentences) < min_sentences:
        return [text] if len(text) >= min_chars else []

    # Generate embeddings for all sentences
    embeddings = embedding_model.encode(sentences, show_progress_bar=False)

    # Calculate similarities between consecutive sentences
    similarities = calculate_sentence_similarities(sentences, embeddings)

    chunks: List[str] = []
    current_chunk = [sentences[0]]
    # Tracks len(' '.join(current_chunk)) so the size checks below are made
    # against the real length of the chunk that would be emitted.
    current_chars = len(sentences[0])

    # similarities[k] compares sentences[k] with sentences[k + 1], so pairing
    # it with sentences[1:] gives each sentence its similarity to the one
    # before it.
    for sentence, similarity in zip(sentences[1:], similarities):
        added_chars = len(sentence) + 1  # +1 for the joining space

        meets_minimums = (
            len(current_chunk) >= min_sentences and current_chars >= min_chars
        )
        boundary_detected = (
            # Semantic boundary detected
            similarity < similarity_threshold
            # Max sentences reached
            or len(current_chunk) >= max_sentences
            # Max chars would be exceeded
            or current_chars + added_chars > max_chars
        )

        if boundary_detected and meets_minimums:
            # Close the current chunk and start a new one with this sentence
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_chars = len(sentence)
        else:
            current_chunk.append(sentence)
            current_chars += added_chars

    # Add final chunk if it meets criteria
    if len(current_chunk) >= min_sentences and current_chars >= min_chars:
        chunks.append(' '.join(current_chunk))
    elif chunks:
        # Merge small final chunk with previous rather than losing its text
        chunks[-1] += ' ' + ' '.join(current_chunk)

    return chunks

In [None]:
# Context enrichment: Add surrounding context to chunks
def enrich_chunk_with_context(
    chunk_text: str,
    page_text: str,
    context_sentences: int = CONTEXT_SENTENCES
) -> Dict[str, str]:
    """
    Add context from surrounding sentences to improve retrieval.

    The chunk is located inside the page by sentence comparison. A position
    where every sentence of the chunk matches the page is preferred, so a
    sentence that merely repeats earlier on the page does not anchor the
    chunk in the wrong place. If no full match exists, the first occurrence
    of the chunk's first sentence is used.

    Args:
        chunk_text: The chunk to enrich.
        page_text: Text of the page the chunk was taken from.
        context_sentences: How many sentences to take before and after the
            chunk. Values of zero or less yield no context.

    Returns:
        Dict with ``'main'`` (the chunk text, unchanged) and ``'context'``:
        the preceding sentences followed by the following sentences, joined
        with single spaces. ``'context'`` is an empty string when the chunk
        has no sentences or cannot be located in the page.
    """
    chunk_sentences = sent_tokenize(chunk_text)
    if not chunk_sentences:
        return {'main': chunk_text, 'context': ''}

    sentences = sent_tokenize(page_text)
    span = len(chunk_sentences)
    first_chunk_sent = chunk_sentences[0]

    # Prefer the first position where the whole chunk lines up with the page.
    # The cheap first-sentence comparison short-circuits most candidates.
    start_idx = next(
        (
            idx
            for idx in range(len(sentences) - span + 1)
            if sentences[idx] == first_chunk_sent
            and sentences[idx:idx + span] == chunk_sentences
        ),
        None,
    )
    if start_idx is None:
        # No full match: fall back to anchoring on the first sentence alone.
        try:
            start_idx = sentences.index(first_chunk_sent)
        except ValueError:
            return {'main': chunk_text, 'context': ''}

    # Clamp so a negative setting cannot turn into a from-the-end slice.
    window = max(context_sentences, 0)
    end_idx = start_idx + span

    context_before = sentences[max(0, start_idx - window):start_idx]
    context_after = sentences[end_idx:end_idx + window]

    return {
        'main': chunk_text,
        'context': ' '.join(context_before + context_after)
    }

In [None]:
# Main: Process all books with semantic chunking
def create_semantic_chunks_from_books(books_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Turn every page of every book into semantic chunk records.

    Each page is cleaned, skipped if shorter than MIN_CHUNK_CHARS, split with
    semantic_chunk_text (using the module-level ``embedding_model``), and each
    resulting chunk is enriched with surrounding context. Chunks never span
    pages. Progress and a per-book chunk count are printed.

    Args:
        books_data: Book entries; each must provide ``'book_title'`` and
            ``'pages'``, and each page ``'text'``, ``'page_number'`` and
            ``'chapter'``.

    Returns:
        A flat list of chunk dicts in processing order. ``'global_index'``
        is the chunk's position in that list.
    """
    all_chunks = []

    for book_data in books_data:
        book_title = book_data['book_title']
        pages = book_data['pages']

        print(f"\nProcessing: {book_title}")
        print(f"Pages: {len(pages)}")

        # Remember where this book starts so its chunk count can be reported
        chunks_before_book = len(all_chunks)

        for page_data in tqdm(pages, desc=f"Semantic chunking {book_title}"):
            cleaned_text = clean_text(page_data['text'])

            # Pages with too little text cannot produce a valid chunk
            if len(cleaned_text) < MIN_CHUNK_CHARS:
                continue

            page_chunks = semantic_chunk_text(cleaned_text, embedding_model)

            for chunk_index, chunk_text in enumerate(page_chunks):
                enriched = enrich_chunk_with_context(chunk_text, cleaned_text)
                context = enriched['context']

                # Text that gets embedded: context (if any) followed by the chunk
                full_text = f"{context} {chunk_text}" if context else chunk_text

                all_chunks.append({
                    "chunk_id": generate_chunk_id(
                        book_title,
                        page_data['page_number'],
                        chunk_index
                    ),
                    # Position in all_chunks, read before this record is appended
                    "global_index": len(all_chunks),
                    "book_title": book_title,
                    "chapter": page_data['chapter'],
                    "page_number": page_data['page_number'],
                    "chunk_index": chunk_index,
                    "text": chunk_text,  # Main chunk for display
                    "text_with_context": full_text,  # For embedding
                    "context": context,
                    "char_count": len(chunk_text),
                    "word_count": len(chunk_text.split()),
                    "sentence_count": len(sent_tokenize(chunk_text)),
                    "citation": f"[{book_title}, {page_data['chapter']}, Page {page_data['page_number']}]",
                    "chunking_method": "semantic"
                })

        print(f"✓ Created {len(all_chunks) - chunks_before_book} semantic chunks from {book_title}")

    return all_chunks

In [None]:
# Run semantic chunking over every loaded book
print("Starting semantic chunking process...\n")
print("=" * 80)

all_chunks = create_semantic_chunks_from_books(books_data=all_books_data)

# Closing banner with the overall chunk count
print("\n" + "=" * 80)
print("Semantic chunking complete!")
print(f"Total chunks created: {len(all_chunks)}")
print("=" * 80)

In [None]:
# Save semantic chunks
print("\nSaving semantic chunks...")

# One file holding every chunk
chunks_file = CHUNKS_DIR / "all_chunks_semantic.json"
with chunks_file.open('w', encoding='utf-8') as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)
print(f"✓ Saved: {chunks_file.name}")

# Group chunks by book title, keeping first-seen order
chunks_by_book = {}
for chunk in all_chunks:
    chunks_by_book.setdefault(chunk['book_title'], []).append(chunk)

# One file per book
for book_title, chunks in chunks_by_book.items():
    # File-name stem: keep ASCII letters/digits, join the words with underscores
    book_title_safe = '_'.join(re.sub(r'[^a-zA-Z0-9\s]', '', book_title).split())

    book_chunks_file = CHUNKS_DIR / f"{book_title_safe}_chunks_semantic.json"
    with book_chunks_file.open('w', encoding='utf-8') as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)

    print(f"✓ Saved: {book_chunks_file.name} ({len(chunks)} chunks)")

print("\nAll semantic chunks saved successfully!")

In [None]:
# Statistics
print("\n" + "="*80)
print("SEMANTIC CHUNKING STATISTICS")
print("="*80)

total_chunks = len(all_chunks)
total_chars = sum(chunk['char_count'] for chunk in all_chunks)
total_words = sum(chunk['word_count'] for chunk in all_chunks)
total_sentences = sum(chunk['sentence_count'] for chunk in all_chunks)

# Every ratio is guarded so a run that produced no chunks reports zeros
# instead of raising ZeroDivisionError.
avg_chars = total_chars / total_chunks if total_chunks > 0 else 0
avg_words = total_words / total_chunks if total_chunks > 0 else 0
avg_sentences = total_sentences / total_chunks if total_chunks > 0 else 0

print(f"\nOverall Statistics:")
print(f"  Total Chunks: {total_chunks:,}")
print(f"  Total Characters: {total_chars:,}")
print(f"  Total Words: {total_words:,}")
print(f"  Total Sentences: {total_sentences:,}")
print(f"\nAverage per Chunk:")
print(f"  Characters: {avg_chars:.1f}")
print(f"  Words: {avg_words:.1f}")
print(f"  Sentences: {avg_sentences:.1f}")

# Chunks with context enrichment (non-empty 'context' field)
chunks_with_context = sum(1 for chunk in all_chunks if chunk['context'])
context_percentage = chunks_with_context / total_chunks * 100 if total_chunks > 0 else 0
print(f"\nContext Enrichment:")
print(f"  Chunks with context: {chunks_with_context} ({context_percentage:.1f}%)")

In [None]:
# Statistics by book: one table row per book with per-chunk averages
book_stats = []
for book_title, chunks in chunks_by_book.items():
    chunk_count = len(chunks)
    book_sentences = sum(chunk['sentence_count'] for chunk in chunks)
    book_words = sum(chunk['word_count'] for chunk in chunks)
    book_chars = sum(chunk['char_count'] for chunk in chunks)

    # Averages are pre-formatted as strings so the table prints them as-is
    row = {
        'Book Title': book_title,
        'Chunks': chunk_count,
        'Avg Sentences': f"{book_sentences / chunk_count:.1f}",
        'Avg Words': f"{book_words / chunk_count:.0f}",
        'Avg Chars': f"{book_chars / chunk_count:.0f}"
    }
    book_stats.append(row)

df_book_stats = pd.DataFrame(book_stats)
print("\n" + "=" * 80)
print("STATISTICS BY BOOK")
print("=" * 80)
print(df_book_stats.to_string(index=False))
print("=" * 80)

In [None]:
# Sample chunks: show the first three chunks as a sanity check
print("\n" + "="*80)
print("SAMPLE SEMANTIC CHUNKS")
print("="*80)

for i, chunk in enumerate(all_chunks[:3], 1):
    chunk_body = chunk['text']
    chunk_context = chunk['context']

    print(f"\nChunk {i}:")
    print("-" * 80)
    print(f"ID: {chunk['chunk_id']}")
    print(f"Book: {chunk['book_title']}")
    print(f"Chapter: {chunk['chapter']}")
    print(f"Page: {chunk['page_number']}")
    print(f"Size: {chunk['sentence_count']} sentences, {chunk['word_count']} words, {chunk['char_count']} chars")
    print(f"Has Context: {'Yes' if chunk_context else 'No'}")
    print(f"\nText Preview (first 300 characters):")
    # Only add the ellipsis when something was actually cut off
    print(chunk_body[:300] + ("..." if len(chunk_body) > 300 else ""))
    if chunk_context:
        print(f"\nContext Preview (first 150 characters):")
        print(chunk_context[:150] + ("..." if len(chunk_context) > 150 else ""))
    print("-" * 80)

## Next Steps

✅ Semantic chunking complete!

### Advantages of Semantic Chunking:
1. **Better coherence** - Chunks maintain topic boundaries
2. **Improved retrieval** - More meaningful embeddings
3. **Context enrichment** - Surrounding sentences provide additional context
4. **Adaptive sizing** - Chunk size varies with content, within the configured minimum/maximum sentence and character bounds

### Output Files:
- `all_chunks_semantic.json` - All semantic chunks
- `[BookTitle]_chunks_semantic.json` - Per-book semantic chunks

### What's Next:
**Notebook 3**: Generate embeddings using the `text_with_context` field and update ChromaDB