# 2. Parent-Child Retriever

**What:** Index small chunks for precise search, return large parent chunks for context

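**Why:** Solves the chunk-size dilemma: small chunks retrieve precisely but lack context, while large chunks give the model enough context to generate a good answer but retrieve noisily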

**When:** You need a balance between precise matching and complete context

**The Dilemma:**

| Chunk Size | Retrieval | Generation |
|------------|-----------|------------|
| Small (~200 chars) | Precise | Missing context |
| Large (~2000 chars) | Noisy | Good context |

**Architecture:**
```
Parent (2000 chars) --> stored in docstore with UUID
    |
    +-- Child 1 (400 chars) --> embedded in vectorstore
    +-- Child 2 (400 chars) --> embedded in vectorstore
    +-- Child 3 (400 chars) --> embedded in vectorstore

Query matches Child 2 --> Returns full Parent
```

In [None]:
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from config import embeddings, model, load_documents, format_docs

# Configuration: chunk sizes and overlaps passed to the text splitters.

# Parent level: large chunks that are handed to the LLM as context.
PARENT_CHUNK_SIZE = 2_000
PARENT_CHUNK_OVERLAP = 400

# Child level: small chunks that are embedded and searched.
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 50

## Create Parent-Child Index

In [None]:
def create_parent_child_index(documents):
    """Build a two-level index: small child chunks for search, large parents for context.

    Every document is split into parent chunks (``PARENT_CHUNK_SIZE`` /
    ``PARENT_CHUNK_OVERLAP``). Each parent receives a fresh UUID4 string and is
    kept in a plain dict. Each parent is then split again into child chunks
    (``CHILD_CHUNK_SIZE`` / ``CHILD_CHUNK_OVERLAP``); every child is tagged with
    ``parent_id`` and ``chunk_type="child"`` in its metadata. Only the children
    are embedded into the FAISS vector store.

    Args:
        documents: Non-empty sized collection of documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        A ``(vectorstore, parent_store)`` tuple: the FAISS store built from the
        child chunks, and a dict mapping parent-id strings to parent chunks.

    Raises:
        ValueError: If ``documents`` is empty, or if splitting produces no
            child chunks (for example, every document has empty content).
    """
    # Fail fast with a clear message: building a FAISS index from an empty
    # list otherwise fails deep inside the vector store with an obscure error.
    if not documents:
        raise ValueError("No documents provided: cannot build a parent-child index")

    print(f"Processing {len(documents)} documents...")

    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=PARENT_CHUNK_SIZE,
        chunk_overlap=PARENT_CHUNK_OVERLAP,
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHILD_CHUNK_SIZE,
        chunk_overlap=CHILD_CHUNK_OVERLAP,
    )

    parent_store = {}
    all_children = []

    for doc in documents:
        for parent in parent_splitter.split_documents([doc]):
            parent_id = str(uuid.uuid4())
            parent_store[parent_id] = parent

            # Children are cut from the parent (not the raw document) so each
            # child maps back to exactly one parent.
            children = child_splitter.split_documents([parent])
            for child in children:
                child.metadata["parent_id"] = parent_id
                child.metadata["chunk_type"] = "child"

            all_children.extend(children)

    if not all_children:
        raise ValueError(
            "Documents produced no chunks to index (is their content empty?)"
        )

    print(f"Parents: {len(parent_store)}, Children: {len(all_children)}")
    # Every child has a parent, so parent_store is non-empty at this point.
    print(f"Avg children per parent: {len(all_children) / len(parent_store):.1f}")

    # Only the children are embedded; parents are looked up by id at query time.
    vectorstore = FAISS.from_documents(all_children, embeddings)

    return vectorstore, parent_store

## Retrieve with Parent Context

In [None]:
def retrieve_with_parent(question, vectorstore, parent_store, k=3):
    """Run a similarity search over child chunks and map the hits to their parents.

    Asks ``vectorstore`` for ``k * 2`` children, then walks them in the order
    returned, collecting each distinct parent found in ``parent_store`` until
    ``k`` parents have been gathered. Children without a ``parent_id`` or
    whose parent is not in ``parent_store`` are skipped.

    Args:
        question: Query text passed to ``vectorstore.similarity_search``.
        vectorstore: Object exposing ``similarity_search(question, k=...)``
            whose results carry a ``metadata`` mapping.
        parent_store: Mapping from parent id to parent chunk.
        k: Maximum number of parents to return.

    Returns:
        ``(parent_docs, child_results)``: the de-duplicated parents in
        first-hit order, and the full list of children from the search.
    """
    print(f"Question: {question}\n")

    # Over-fetch children: several hits may belong to the same parent.
    matches = vectorstore.similarity_search(question, k=k * 2)
    print(f"Found {len(matches)} matching children")

    # A dict keeps insertion order, so it serves as both the "seen" set and
    # the ordered result list.
    parents_by_id = {}
    for match in matches:
        pid = match.metadata.get("parent_id")
        if pid and pid not in parents_by_id:
            candidate = parent_store.get(pid)
            if candidate:
                parents_by_id[pid] = candidate

        if len(parents_by_id) >= k:
            break

    parent_docs = list(parents_by_id.values())
    print(f"Retrieved {len(parent_docs)} parent documents")
    return parent_docs, matches

## Complete Parent-Child RAG

In [None]:
def parent_child_rag(question, vectorstore, parent_store, k=3):
    """Answer *question* using parent chunks retrieved through their children.

    Retrieves up to ``k`` parent chunks with ``retrieve_with_parent``, formats
    them with ``format_docs``, and runs a prompt -> ``model`` ->
    ``StrOutputParser`` chain over the resulting context and the question.
    The answer is printed as well as returned.

    Args:
        question: The user question.
        vectorstore: Vector store holding the child chunks.
        parent_store: Mapping from parent id to parent chunk.
        k: Maximum number of parent chunks to use as context.

    Returns:
        ``(answer, parent_docs)``: the chain output and the parent chunks
        that were supplied as context.
    """
    # The matched children are not needed here; only the parents feed the prompt.
    parent_docs, _ = retrieve_with_parent(question, vectorstore, parent_store, k)
    context_text = format_docs(parent_docs)

    rag_prompt = ChatPromptTemplate.from_template(
        "Answer based on context:\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "\n"
        "Answer:"
    )
    rag_chain = rag_prompt | model | StrOutputParser()

    answer = rag_chain.invoke({"context": context_text, "question": question})

    print(f"\nAnswer: {answer}")
    return answer, parent_docs

## Test

In [None]:
# Build the index once from the loaded corpus, then reuse it for every query.
documents = load_documents()
vectorstore, parent_store = create_parent_child_index(documents)

# Sample questions used to exercise the pipeline end to end.
test_questions = [
    "What optimization techniques were used?",
    "What is RF and RDKit?",
]

for q in test_questions:
    print("=" * 60)
    answer, docs = parent_child_rag(q, vectorstore, parent_store)

    # List the parent chunks that were used as context for this answer.
    print(f"\nSources: {len(docs)} parent documents")
    for position, doc in enumerate(docs, start=1):
        source_name = doc.metadata.get('filename', 'unknown')
        print(f"  {position}. {source_name} ({len(doc.page_content)} chars)")
    print()