# 5. Document Summary Indexing

**What:** Generate a summary for each document and index those summaries

**Why:** Summaries capture core meaning, enabling fast document discovery

**When:** Documents are large, or you need document-level retrieval rather than chunk-level retrieval

**How It Relates to Multi-Representation:**
- Multi-Rep: More complex; uses UUID linking and always returns the full document
- Doc Summary: Simpler; can return either just the summary or the full document

**Use Cases:**
1. Document Discovery - quickly identify relevant documents
2. Summary-First RAG - retrieve summaries first, then fetch details
3. Hybrid Retrieval - search both summaries and chunks

**Variations:**
1. Summary Only - retrieve summaries
2. Summary + Full Document - store original, return on match
3. Summary + Chunks - index both for broad and fine-grained retrieval

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

from config import model, embeddings, load_documents, format_docs

## Create Summary Index

In [None]:
def create_summary_index(documents, max_docs=None, max_chars=4000):
    """Build a FAISS vectorstore whose entries are LLM-written summaries.

    Every selected document is summarized by the ``prompt | model | parser``
    chain and the summary text is what gets embedded. The untruncated original
    text travels along in the summary's metadata so a match can later be
    expanded back into the full document.

    Args:
        documents: Sliceable sequence of documents exposing ``page_content``
            and ``metadata``.
        max_docs: Optional cap on how many leading documents are indexed.
            ``None`` (default) indexes all of them; ``0`` selects none.
        max_chars: How many leading characters of each document are sent to
            the summarizer. ``None`` sends the whole document. Defaults to
            4000.

    Returns:
        A FAISS vectorstore with one entry per selected document. Each entry's
        metadata holds ``original_content``, ``summary``, ``doc_index`` and
        ``source``.

    Raises:
        ValueError: If no documents remain to index.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # max_docs=0 as "no limit" and silently index every document.
    if max_docs is not None:
        documents = documents[:max_docs]

    # Fail fast with a clear message; an empty batch would otherwise only
    # surface later as an obscure error while the FAISS index is being built.
    if not documents:
        raise ValueError("create_summary_index requires at least one document")

    total = len(documents)
    print(f"Creating summary index for {total} documents...")

    template = """Summarize in 2-3 sentences:

{document}

Summary:"""

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model | StrOutputParser()

    summary_docs = []

    for i, doc in enumerate(documents):
        print(f"  [{i+1}/{total}] Summarizing...")

        # Only the leading max_chars characters go to the model; slicing with
        # None keeps the full text.
        summary = chain.invoke({"document": doc.page_content[:max_chars]})

        # The summary is the searchable text; the full original rides along
        # in metadata so retrieval can return it on a match.
        summary_docs.append(
            Document(
                page_content=summary,
                metadata={
                    "original_content": doc.page_content,
                    "summary": summary,
                    "doc_index": i,
                    "source": doc.metadata.get("filename", "unknown"),
                },
            )
        )

    # Embed and index the summaries (not the originals)
    vectorstore = FAISS.from_documents(summary_docs, embeddings)

    print(f"Created index with {len(summary_docs)} summaries")

    return vectorstore

## Retrieve with Summaries

In [None]:
def retrieve_with_summaries(query, vectorstore, k=3, return_original=True):
    """Run a similarity search over summaries and optionally expand the hits.

    Args:
        query: Text to search for.
        vectorstore: Store whose ``similarity_search(query, k=k)`` returns
            summary documents.
        k: Number of summaries requested from the store.
        return_original: When true, each hit that carries a non-empty
            ``original_content`` metadata entry is replaced by a new document
            holding that original text; hits without one are returned as-is.
            When false, the search results are returned untouched.

    Returns:
        List of documents, in the order the store returned them.
    """
    print(f"Query: {query}\n")

    matches = vectorstore.similarity_search(query, k=k)

    # Show a short preview of each matched summary
    print(f"Matched {len(matches)} summaries:")
    for rank, match in enumerate(matches, start=1):
        print(f"  {rank}. {match.page_content[:80]}...")

    if not return_original:
        return matches

    def _expand(match):
        # Swap a summary for its stored original, keeping the summary text
        # and source in the new document's metadata.
        full_text = match.metadata.get("original_content", "")
        if not full_text:
            # Nothing stored to expand to: fall back to the summary itself
            return match
        return Document(
            page_content=full_text,
            metadata={
                "summary": match.page_content,
                "source": match.metadata.get("source"),
            },
        )

    return [_expand(match) for match in matches]

## Complete Document Summary RAG

In [None]:
def summary_indexing_rag(question, vectorstore, k=3):
    """Answer a question using documents found through the summary index.

    Retrieves via ``retrieve_with_summaries`` with ``return_original=True``,
    formats the results with ``format_docs``, and asks the model to answer
    from that context.

    Args:
        question: The user question; used both as the search query and in
            the answer prompt.
        vectorstore: Summary vectorstore to search.
        k: Number of summaries to retrieve.

    Returns:
        The model's answer as a string (also printed).
    """
    answer_template = """Answer based on context:

Context:
{context}

Question: {question}

Answer:"""

    # Retrieval step: search summaries, then build the context string
    retrieved = retrieve_with_summaries(question, vectorstore, k, return_original=True)
    context_text = format_docs(retrieved)

    # Generation step: prompt -> model -> plain string
    rag_chain = (
        ChatPromptTemplate.from_template(answer_template)
        | model
        | StrOutputParser()
    )
    response = rag_chain.invoke({"context": context_text, "question": question})

    print(f"\nAnswer: {response}")
    return response

## Test

In [None]:
# Load the corpus through the shared config helper
documents = load_documents()

# Each indexed document costs one summarization call, so cap the index at 5
vectorstore = create_summary_index(documents, max_docs=5)

# Generic probe questions to exercise the end-to-end pipeline
test_queries = [
    "What is the main topic?",
    "What research was conducted?",
    "What are the key findings?",
]

for q in test_queries:
    print("=" * 60)  # visual divider between queries
    summary_indexing_rag(q, vectorstore, k=2)
    print()