# 3. Multi-Representation Indexing

**What:** Search on short summaries, return full documents

**Why:** Summaries are precise for search; full documents provide complete context for generation

**When:** Long documents where you need full context for generation

**The Problem:**
- Hard to find relevant parts in lengthy documents
- Chunking loses document-level context
- Full doc embedding loses precision

**Architecture:**
```
Full Document (5000+ chars) --> stored in docstore with UUID
        |
        v LLM generates summary
    Summary (2-3 sentences) --> embedded in vectorstore
        |
        doc_id: links back to full document

Query matches Summary --> Returns Full Document
```

**Variations:**
1. Summary-based (this implementation)
2. Hypothetical Questions - LLM generates questions the doc answers
3. Propositions - Extract key facts

In [None]:
import uuid
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from config import embeddings, model, load_documents, format_docs

## Generate Document Summary

In [None]:
def generate_summary(document, max_chars=8000):
    """Generate a concise 2-3 sentence summary of a document.

    Args:
        document: Object exposing the text to summarize as ``page_content``.
        max_chars: Number of leading characters of ``page_content`` that are
            sent to the model. Defaults to 8000, the limit that used to be
            hard-coded here. Pass ``None`` to send the whole text.

    Returns:
        The summary as produced by the ``prompt | model | StrOutputParser()``
        chain.
    """
    template = """Summarize in 2-3 sentences. Focus on main topics and key concepts.

Document:
{content}

Summary:"""

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model | StrOutputParser()

    # Cap the prompt size so very long documents do not blow past the
    # model's input budget; slicing with None keeps the full text.
    content = document.page_content[:max_chars]

    return chain.invoke({"content": content})

## Create Multi-Representation Index

In [None]:
def create_multi_rep_index(documents, max_docs=None):
    """Create a summary index whose entries link back to the full documents.

    Every document gets a fresh UUID. The full document is kept in an
    in-memory dict under that UUID, and an LLM-generated summary carrying the
    same UUID in ``metadata["doc_id"]`` is embedded into a FAISS vectorstore.

    Args:
        documents: Sequence of documents exposing ``page_content`` and a
            ``metadata`` dict.
        max_docs: Optional cap on how many leading documents are indexed.
            ``None`` (default) indexes all of them. ``0`` means "no
            documents" rather than being treated as "no limit".

    Returns:
        Tuple ``(vectorstore, doc_store)``: the FAISS store built from the
        summaries, and the dict mapping each ``doc_id`` to its full document.

    Raises:
        ValueError: If there is nothing to index after applying ``max_docs``.
    """
    # Compare against None explicitly so that max_docs=0 is honoured as a
    # real limit instead of falling through as "unlimited".
    if max_docs is not None:
        documents = documents[:max_docs]

    # Fail early with a clear message: building the FAISS index needs at
    # least one embedded summary.
    if not documents:
        raise ValueError("No documents to index")

    total = len(documents)
    print(f"Processing {total} documents...")

    doc_store = {}
    summary_docs = []

    for i, doc in enumerate(documents):
        doc_id = str(uuid.uuid4())

        # Store full document
        doc_store[doc_id] = doc

        # Generate summary
        print(f"  [{i+1}/{total}] Summarizing...")
        summary = generate_summary(doc)

        # The summary is what gets embedded; doc_id is the link that lets
        # retrieval swap the matched summary for the full document.
        summary_docs.append(
            Document(
                page_content=summary,
                metadata={
                    "doc_id": doc_id,
                    "source": doc.metadata.get("filename", "unknown"),
                },
            )
        )

    # Create vectorstore from summaries
    vectorstore = FAISS.from_documents(summary_docs, embeddings)

    print(f"Indexed {len(summary_docs)} summaries")

    return vectorstore, doc_store

## Retrieve Full Documents

In [None]:
def retrieve_multi_rep(question, vectorstore, doc_store, k=3):
    """Search the summary index and return the linked full documents.

    Args:
        question: Query text passed to the vectorstore's similarity search.
        vectorstore: Store exposing ``similarity_search(question, k=k)`` whose
            hits carry ``page_content`` and a ``metadata`` dict.
        doc_store: Mapping from ``doc_id`` to the full document.
        k: Number of summary hits to request.

    Returns:
        Tuple ``(full_docs, summaries)``. ``full_docs`` holds each linked
        document at most once, ordered by its first matching summary;
        ``summaries`` is the unfiltered list of hits from the vectorstore.
    """
    print(f"Question: {question}\n")

    # Search summaries
    summaries = vectorstore.similarity_search(question, k=k)
    print(f"Matched {len(summaries)} summaries")

    # Get full documents. Several representations may point at the same
    # document (e.g. multiple questions or propositions per document), so
    # track the ids already emitted to avoid repeating a full document in
    # the returned context.
    full_docs = []
    seen_ids = set()
    for summary in summaries:
        doc_id = summary.metadata.get("doc_id")
        if not doc_id or doc_id in seen_ids or doc_id not in doc_store:
            continue
        seen_ids.add(doc_id)
        full_docs.append(doc_store[doc_id])
        print(f"  Summary: {summary.page_content[:80]}...")

    print(f"\nReturning {len(full_docs)} full documents")
    return full_docs, summaries

## Complete Multi-Rep RAG

In [None]:
def multi_rep_rag(question, vectorstore, doc_store, k=3):
    """Answer a question using full documents retrieved via their summaries.

    Retrieves through ``retrieve_multi_rep``, formats the full documents with
    ``format_docs``, sends context and question through the answer prompt,
    and prints the answer.

    Returns:
        Tuple ``(answer, full_docs)``: the generated answer and the full
        documents that were supplied as context.
    """
    answer_template = """Answer based on context:

Context:
{context}

Question: {question}

Answer:"""

    # Only the full documents feed the prompt; the matched summaries that
    # come back alongside them are not used here.
    retrieved_docs, _matched = retrieve_multi_rep(
        question, vectorstore, doc_store, k
    )
    prompt_inputs = {
        "context": format_docs(retrieved_docs),
        "question": question,
    }

    rag_chain = (
        ChatPromptTemplate.from_template(answer_template)
        | model
        | StrOutputParser()
    )
    response = rag_chain.invoke(prompt_inputs)

    print(f"\nAnswer: {response}")
    return response, retrieved_docs

## Test

In [None]:
# Load the corpus via the project's config helper
documents = load_documents()

# Build the index over the first 5 documents only, to keep the demo fast
vectorstore, doc_store = create_multi_rep_index(documents, max_docs=5)

# Sample questions to run through the full pipeline
test_questions = ["What is RF and RDKit?", "What is LibINVENT?"]

separator = "=" * 60
for query in test_questions:
    print(separator)
    response, sources = multi_rep_rag(query, vectorstore, doc_store)
    print(f"\nSources: {len(sources)} full documents")
    # List each source with its file name and size
    for rank, source in enumerate(sources, start=1):
        name = source.metadata.get('filename', 'unknown')
        size = len(source.page_content)
        print(f"  {rank}. {name} ({size} chars)")
    print()