# Twilight Imperium Text Chunker
## Step 2: Chunk Text for Vector Embeddings

This notebook takes the extracted PDF text and breaks it into smaller, manageable chunks that can be embedded and searched efficiently by our LangChain assistant.


In [1]:
# Import necessary libraries
import json
from pathlib import Path
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re


In [2]:
# Load the extracted data from Step 1
processed_rules_dir = Path("processed_rules")
combined_data_path = processed_rules_dir / "combined_extracted_data.json"

# Check if the processed data exists
if not combined_data_path.exists():
    print("❌ Error: Could not find processed data from Step 1.")
    print("Please run the pdf_processor.ipynb notebook first!")
else:
    print("✅ Found processed data from Step 1")
    
    # Load the data
    with open(combined_data_path, 'r', encoding='utf-8') as f:
        extracted_data = json.load(f)
    
    print(f"📊 Loaded data contains:")
    print(f"  - Learn to Play: {len(extracted_data['learn_to_play']['full_text']):,} characters")
    print(f"  - Rulebook: {len(extracted_data['rulebook']['full_text']):,} characters")


✅ Found processed data from Step 1
📊 Loaded data contains:
  - Learn to Play: 100,023 characters
  - Rulebook: 57,695 characters


In [3]:
# Configure the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    # Chunk size - optimal for embedding models and retrieval
    chunk_size=800,
    
    # Overlap between chunks to maintain context
    chunk_overlap=100,
    
    # Try to split on these separators in order (preserves structure)
    separators=[
        "\n\n",  # Double newlines (paragraphs)
        "\n",    # Single newlines
        ". ",    # Sentences
        ", ",    # Clauses
        " ",     # Words
        ""       # Characters (last resort)
    ],
    
    # Keep related text together
    keep_separator=True
)

print("🔧 Text splitter configured:")
print(f"  - Chunk size: {text_splitter._chunk_size}")
print(f"  - Chunk overlap: {text_splitter._chunk_overlap}")
print(f"  - Separators: {text_splitter._separators[:3]}... (and more)")


🔧 Text splitter configured:
  - Chunk size: 800
  - Chunk overlap: 100
  - Separators: ['\n\n', '\n', '. ']... (and more)


In [4]:
def create_chunks_with_metadata(text: str, source: str, doc_type: str) -> List[Dict[str, Any]]:
    """
    Split text into chunks and add metadata for better retrieval
    
    Args:
        text: The full text to chunk
        source: Source identifier ('learn_to_play' or 'rulebook')
        doc_type: Human-readable document type
        
    Returns:
        List of chunks with metadata
    """
    print(f"\n📝 Processing {doc_type}...")
    
    # Split the text into chunks
    chunks = text_splitter.split_text(text)
    
    # Create chunks with metadata
    chunks_with_metadata = []
    
    for i, chunk in enumerate(chunks):
        # Extract potential section headers (lines that are short and may be titles)
        lines = chunk.split('\n')
        potential_header = None
        
        # Look for short lines that might be headers (under 100 chars, not mostly spaces)
        for line in lines[:3]:  # Check first 3 lines
            if line.strip() and len(line.strip()) < 100 and not line.strip().startswith('---'):
                if len(line.strip().split()) <= 8:  # Likely a header if 8 words or fewer
                    potential_header = line.strip()
                    break
        
        # Create metadata for this chunk
        metadata = {
            'source': source,
            'doc_type': doc_type,
            'chunk_id': f"{source}_chunk_{i:03d}",
            'chunk_index': i,
            'total_chunks': len(chunks),
            'char_count': len(chunk),
            'word_count': len(chunk.split()),
        }
        
        # Add section header if found
        if potential_header:
            metadata['section'] = potential_header
        
        # Store the chunk with its metadata
        chunks_with_metadata.append({
            'content': chunk.strip(),
            'metadata': metadata
        })
    
    print(f"  ✅ Created {len(chunks_with_metadata)} chunks")
    print(f"  📊 Average chunk size: {sum(len(c['content']) for c in chunks_with_metadata) // len(chunks_with_metadata)} characters")
    
    return chunks_with_metadata


In [5]:
# Process Learn to Play document
learn_to_play_chunks = create_chunks_with_metadata(
    text=extracted_data['learn_to_play']['full_text'],
    source='learn_to_play',
    doc_type='Learn to Play Guide'
)



📝 Processing Learn to Play Guide...
  ✅ Created 182 chunks
  📊 Average chunk size: 581 characters


In [6]:
# Process Rulebook document
rulebook_chunks = create_chunks_with_metadata(
    text=extracted_data['rulebook']['full_text'],
    source='rulebook',
    doc_type='Official Rulebook'
)



📝 Processing Official Rulebook...
  ✅ Created 104 chunks
  📊 Average chunk size: 585 characters


In [7]:
# Combine all chunks
all_chunks = learn_to_play_chunks + rulebook_chunks

# Display comprehensive statistics
print("\n" + "="*60)
print("CHUNKING SUMMARY")
print("="*60)

print(f"\n📊 Overall Statistics:")
print(f"  Total chunks created: {len(all_chunks)}")
print(f"  Learn to Play chunks: {len(learn_to_play_chunks)}")
print(f"  Rulebook chunks: {len(rulebook_chunks)}")

# Calculate size statistics
chunk_sizes = [len(chunk['content']) for chunk in all_chunks]
avg_size = sum(chunk_sizes) // len(chunk_sizes)
min_size = min(chunk_sizes)
max_size = max(chunk_sizes)

print(f"\n📏 Chunk Size Distribution:")
print(f"  Average size: {avg_size} characters")
print(f"  Minimum size: {min_size} characters")
print(f"  Maximum size: {max_size} characters")

# Count chunks with sections
chunks_with_sections = sum(1 for chunk in all_chunks if 'section' in chunk['metadata'])
print(f"\n🏷️  Metadata Statistics:")
print(f"  Chunks with section headers: {chunks_with_sections}")
print(f"  Percentage with headers: {chunks_with_sections/len(all_chunks)*100:.1f}%")



CHUNKING SUMMARY

📊 Overall Statistics:
  Total chunks created: 286
  Learn to Play chunks: 182
  Rulebook chunks: 104

📏 Chunk Size Distribution:
  Average size: 583 characters
  Minimum size: 14 characters
  Maximum size: 800 characters

🏷️  Metadata Statistics:
  Chunks with section headers: 4
  Percentage with headers: 1.4%


In [8]:
# Preview sample chunks
print("\n" + "="*60)
print("SAMPLE CHUNKS PREVIEW")
print("="*60)

# Show a few sample chunks from each document
sample_learn_to_play = learn_to_play_chunks[0] if learn_to_play_chunks else None
sample_rulebook = rulebook_chunks[0] if rulebook_chunks else None

if sample_learn_to_play:
    print(f"\n🔹 Learn to Play - Sample Chunk:")
    print(f"   Chunk ID: {sample_learn_to_play['metadata']['chunk_id']}")
    print(f"   Size: {sample_learn_to_play['metadata']['char_count']} characters")
    if 'section' in sample_learn_to_play['metadata']:
        print(f"   Section: {sample_learn_to_play['metadata']['section']}")
    print(f"   Content Preview:")
    print("-" * 40)
    print(sample_learn_to_play['content'][:300] + "..." if len(sample_learn_to_play['content']) > 300 else sample_learn_to_play['content'])

if sample_rulebook:
    print(f"\n🔹 Rulebook - Sample Chunk:")
    print(f"   Chunk ID: {sample_rulebook['metadata']['chunk_id']}")
    print(f"   Size: {sample_rulebook['metadata']['char_count']} characters")
    if 'section' in sample_rulebook['metadata']:
        print(f"   Section: {sample_rulebook['metadata']['section']}")
    print(f"   Content Preview:")
    print("-" * 40)
    print(sample_rulebook['content'][:300] + "..." if len(sample_rulebook['content']) > 300 else sample_rulebook['content'])



SAMPLE CHUNKS PREVIEW

🔹 Learn to Play - Sample Chunk:
   Chunk ID: learn_to_play_chunk_000
   Size: 33 characters
   Section: ®
   Content Preview:
----------------------------------------
--- Page 1 ---

®

--- Page 2 ---

🔹 Rulebook - Sample Chunk:
   Chunk ID: rulebook_chunk_000
   Size: 33 characters
   Section: ®
   Content Preview:
----------------------------------------
--- Page 1 ---

®

--- Page 2 ---


In [9]:
# Save the chunked data for the next step (embeddings)
chunked_data = {
    'all_chunks': all_chunks,
    'learn_to_play_chunks': learn_to_play_chunks,
    'rulebook_chunks': rulebook_chunks,
    'chunking_config': {
        'chunk_size': text_splitter._chunk_size,
        'chunk_overlap': text_splitter._chunk_overlap,
        'total_chunks': len(all_chunks)
    },
    'statistics': {
        'total_chunks': len(all_chunks),
        'learn_to_play_chunks': len(learn_to_play_chunks),
        'rulebook_chunks': len(rulebook_chunks),
        'avg_chunk_size': avg_size,
        'min_chunk_size': min_size,
        'max_chunk_size': max_size,
        'chunks_with_sections': chunks_with_sections
    },
    'processing_complete': True,
    'next_steps': [
        'Generate embeddings for each chunk using OpenAI or local model',
        'Store chunks and embeddings in FAISS or Chroma vector database',
        'Create LangChain retrieval tool',
        'Build the chatbot agent',
        'Test with sample queries'
    ]
}

# Save to file
chunked_data_path = processed_rules_dir / "chunked_data.json"
with open(chunked_data_path, 'w', encoding='utf-8') as f:
    json.dump(chunked_data, f, indent=2, ensure_ascii=False)

print(f"\n✅ Text chunking complete!")
print(f"📁 Chunked data saved to: {chunked_data_path}")
print(f"🚀 Ready for Step 3: Generate embeddings")
print(f"\n📋 Summary:")
print(f"  - Total chunks ready for embedding: {len(all_chunks)}")
print(f"  - Average chunk size: {avg_size} characters")
print(f"  - Data saved with metadata for easy retrieval")



✅ Text chunking complete!
📁 Chunked data saved to: processed_rules\chunked_data.json
🚀 Ready for Step 3: Generate embeddings

📋 Summary:
  - Total chunks ready for embedding: 286
  - Average chunk size: 583 characters
  - Data saved with metadata for easy retrieval
