RETRIEVAL-AUGMENTED GENERATION (RAG) SYSTEM TEMPLATE
====================================================
Use Case: Document Q&A, Knowledge Base, Chatbot with External Knowledge

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Install Required Libraries

In [None]:
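# !pip install langchain langchain-community langchain-openai
# !pip install chromadb faiss-cpu sentence-transformers transformers accelerate
# !pip install pypdf python-docx unstructured rank_bm25
# !pip install openai tiktoken
# !pip install gradio streamlit fastapi uvicorn
# !pip install pandas numpy scikit-learn matplotlib seaborn psutil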

## 1.2 Import Libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# LangChain Core
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain.document_loaders import (
    PyPDFLoader,
    DirectoryLoader,
    TextLoader,
    CSVLoader,
    UnstructuredMarkdownLoader
)
from langchain.vectorstores import FAISS, Chroma
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory, ConversationSummaryMemory
from langchain.prompts import PromptTemplate
from langchain.callbacks import StreamingStdOutCallbackHandler

# LLMs
from langchain.llms import HuggingFacePipeline, OpenAI
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Evaluation
from langchain.evaluation import load_evaluator

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Set environment variables.
# setdefault keeps a key that is already exported in the environment instead
# of overwriting it with the placeholder below. Replace the placeholder only
# in a private copy of this notebook; never commit a real key.
os.environ.setdefault('OPENAI_API_KEY', 'your-api-key-here')  # Or use local models
# Turn off tokenizers parallelism for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## 1.3 Configuration

In [None]:
# Central configuration for the whole pipeline. Keys are grouped by stage and
# keep the same insertion order as the grouping below.
CONFIG = dict(
    # --- Documents ---
    data_dir='./documents',
    chunk_size=1000,
    chunk_overlap=200,
    separator='\n\n',

    # --- Embeddings ---
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',  # or 'text-embedding-ada-002'
    embedding_device='cpu',

    # --- Vector store ---
    vector_store_type='faiss',  # 'faiss' or 'chroma'
    vector_store_path='./vector_store',
    collection_name='knowledge_base',

    # --- LLM ---
    llm_type='huggingface',  # 'openai' or 'huggingface'
    model_name='google/flan-t5-base',  # or 'gpt-3.5-turbo'
    temperature=0.7,
    max_tokens=512,

    # --- Retrieval ---
    top_k=4,
    search_type='similarity',  # 'similarity' or 'mmr'
    mmr_diversity=0.3,  # forwarded as lambda_mult to the MMR search

    # --- Chain ---
    chain_type='stuff',  # 'stuff', 'map_reduce', 'refine', 'map_rerank'
    return_source_documents=True,

    random_seed=42,
)

# 2. DOCUMENT LOADING

## 2.1 Load Documents from Directory

In [None]:
def load_documents(data_dir, file_types=('pdf', 'txt', 'md', 'csv')):
    """Load every supported document found under ``data_dir``.

    Args:
        data_dir: Root directory, searched recursively (``**/*.<ext>``).
        file_types: Iterable of file extensions without the leading dot.
            Supported values are 'pdf', 'txt', 'md' and 'csv'; any other
            value is reported and skipped.

    Returns:
        A flat list of the documents produced by all loaders. When loading
        one extension fails, the error is printed and that extension
        contributes nothing.
    """
    documents = []

    for file_type in file_types:
        # Resolve the per-file loader class for this extension.
        if file_type == 'pdf':
            loader_cls = PyPDFLoader
        elif file_type == 'txt':
            loader_cls = TextLoader
        elif file_type == 'md':
            loader_cls = UnstructuredMarkdownLoader
        elif file_type == 'csv':
            loader_cls = CSVLoader
        else:
            # Without this branch an unknown extension would silently reuse
            # the loader left over from the previous iteration and load the
            # same files twice.
            print(f"Skipping unsupported file type: .{file_type}")
            continue

        print(f"Loading .{file_type} files...")
        loader = DirectoryLoader(
            data_dir,
            glob=f"**/*.{file_type}",
            loader_cls=loader_cls,
            show_progress=True
        )

        try:
            docs = loader.load()
        except Exception as e:
            # Best effort: one broken file type must not abort the others.
            print(f"Error loading {file_type} files: {e}")
        else:
            documents.extend(docs)
            print(f"Loaded {len(docs)} documents from .{file_type} files")

    return documents

# Run the loader over the configured data directory and report the total.
documents = load_documents(data_dir=CONFIG['data_dir'])
print("\nTotal documents loaded: {}".format(len(documents)))

## 2.2 Document Exploration

In [None]:
# Summarise document sizes (in characters), plot them, and preview the first
# document. Everything is skipped when no documents were loaded.
if documents:
    doc_lengths = [len(doc.page_content) for doc in documents]

    print("\nDocument Statistics:")
    print("Total documents: {}".format(len(documents)))
    print("Average length: {:.0f} characters".format(np.mean(doc_lengths)))
    print("Median length: {:.0f} characters".format(np.median(doc_lengths)))
    print("Min length: {} characters".format(np.min(doc_lengths)))
    print("Max length: {} characters".format(np.max(doc_lengths)))

    # Left panel: histogram of lengths; right panel: box plot of the same data.
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    hist_ax.hist(doc_lengths, bins=30, edgecolor='black')
    hist_ax.set_xlabel('Document Length (characters)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Document Lengths')

    box_ax.boxplot(doc_lengths)
    box_ax.set_ylabel('Document Length (characters)')
    box_ax.set_title('Document Length Box Plot')

    plt.tight_layout()
    plt.show()

    # Preview: metadata plus the first 500 characters of the first document.
    first_doc = documents[0]
    print("\nSample Document:")
    print(f"Source: {first_doc.metadata}")
    print(f"Content Preview: {first_doc.page_content[:500]}...")

# 3. TEXT SPLITTING & CHUNKING

## 3.1 Configure Text Splitter

In [None]:
# Character-based recursive splitter; chunk sizes are measured with len().
# The separators are listed from coarsest ("\n\n") to finest ("").
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_overlap=CONFIG['chunk_overlap'],
    chunk_size=CONFIG['chunk_size']
)

# Alternative: measure chunk size in tokens of the model's own tokenizer.
# NOTE(review): TokenTextSplitter's encoding_name expects a tiktoken encoding,
# not a HuggingFace model name, hence the classmethod below.
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
#
# text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
#     tokenizer,
#     chunk_size=CONFIG['chunk_size'],
#     chunk_overlap=CONFIG['chunk_overlap']
# )

print(
    "Text splitter configured:",
    f"Chunk size: {CONFIG['chunk_size']}",
    f"Chunk overlap: {CONFIG['chunk_overlap']}",
    sep="\n"
)

## 3.2 Split Documents into Chunks

In [None]:
# Split documents into chunks.
chunks = text_splitter.split_documents(documents)

print(f"\nTotal chunks created: {len(chunks)}")

# Lengths are computed once and reused for the average and the histogram.
chunk_lengths = [len(chunk.page_content) for chunk in chunks]

if chunks:
    print(f"Average chunk size: {np.mean(chunk_lengths):.0f} characters")

    # Visualize chunk distribution
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.hist(chunk_lengths, bins=30, edgecolor='black')
    plt.xlabel('Chunk Length (characters)')
    plt.ylabel('Frequency')
    plt.title('Distribution of Chunk Lengths')

    plt.subplot(1, 2, 2)
    sources = [chunk.metadata.get('source', 'Unknown') for chunk in chunks]
    source_counts = pd.Series(sources).value_counts().head(10)
    source_counts.plot(kind='barh')
    plt.xlabel('Number of Chunks')
    plt.title('Top 10 Sources by Chunk Count')

    plt.tight_layout()
    plt.show()

    # Display sample chunk
    print("\nSample Chunk:")
    print(f"Metadata: {chunks[0].metadata}")
    print(f"Content: {chunks[0].page_content[:300]}...")
else:
    # Mirrors the `if documents:` guard in the exploration cell: with nothing
    # to split, the statistics, plots and chunks[0] preview would all fail.
    print("No chunks were created - check that documents were loaded.")

# 4. EMBEDDINGS GENERATION

## 4.1 Initialize Embedding Model

In [None]:
# Pick the embedding backend from the configured embedding model rather than
# from CONFIG['llm_type'], so the embedding model and the LLM can be chosen
# independently and CONFIG['embedding_model'] is always the model in use.
embedding_model_name = CONFIG['embedding_model']

if embedding_model_name.startswith('text-embedding-'):
    # Option 2: OpenAI Embeddings (API), e.g. 'text-embedding-ada-002'
    embeddings = OpenAIEmbeddings(
        model=embedding_model_name,
        openai_api_key=os.environ.get('OPENAI_API_KEY')
    )
else:
    # Option 1: HuggingFace Embeddings (Local)
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs={'device': CONFIG['embedding_device']},
        encode_kwargs={'normalize_embeddings': True}
    )

print(f"Embedding model loaded: {embedding_model_name}")

# Smoke test: embed one sentence and report the vector dimension.
sample_text = "This is a test sentence."
sample_embedding = embeddings.embed_query(sample_text)
print(f"Embedding dimension: {len(sample_embedding)}")

## 4.2 Embedding Performance Analysis

In [None]:
import time

def benchmark_embeddings(embeddings, texts, num_samples=10):
    """Time one ``embed_documents`` call over the first ``num_samples`` texts.

    Args:
        embeddings: Object exposing ``embed_documents(list_of_texts)``.
        texts: Sequence of strings to draw the sample from.
        num_samples: Upper bound on how many texts are embedded.

    Prints the total and per-text time. Nothing is embedded (and nothing is
    divided) when the sample is empty.
    """
    test_texts = texts[:num_samples]
    if not test_texts:
        print("No texts available to benchmark")
        return

    # perf_counter is monotonic, so the interval cannot be skewed by clock
    # adjustments.
    start = time.perf_counter()
    _ = embeddings.embed_documents(test_texts)
    elapsed = time.perf_counter() - start

    # Report the number actually embedded, which may be below num_samples.
    embedded = len(test_texts)
    print(f"Embedded {embedded} texts in {elapsed:.2f}s")
    print(f"Average: {elapsed/embedded:.3f}s per text")

# Time the embedding model on the text of the first ten chunks.
sample_texts = [doc.page_content for doc in chunks[:10]]
benchmark_embeddings(embeddings=embeddings, texts=sample_texts)

# 5. VECTOR STORE CREATION

## 5.1 Create Vector Store

In [None]:
print("Creating vector store...")

if not chunks:
    # Building an index from zero vectors fails deep inside the store; fail
    # early with an actionable message instead.
    raise ValueError("No chunks to index - load and split documents first")

if CONFIG['vector_store_type'] == 'faiss':
    # FAISS vector store (in-memory, fast)
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embeddings
    )

    # Save to disk
    vectorstore.save_local(CONFIG['vector_store_path'])
    print(f"Vector store saved to {CONFIG['vector_store_path']}")

    # Ask the index itself how many vectors it holds.
    vector_count = vectorstore.index.ntotal

elif CONFIG['vector_store_type'] == 'chroma':
    # Chroma vector store (persistent)
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=CONFIG['collection_name'],
        persist_directory=CONFIG['vector_store_path']
    )

    # NOTE(review): newer Chroma integrations persist automatically and may
    # not expose persist() any more - call it only when it exists.
    if hasattr(vectorstore, 'persist'):
        vectorstore.persist()
    print(f"Vector store persisted to {CONFIG['vector_store_path']}")

    vector_count = vectorstore._collection.count()

else:
    # Previously an unknown type fell through and the final print failed with
    # a NameError on `vectorstore`.
    raise ValueError(
        f"Unsupported vector_store_type: {CONFIG['vector_store_type']!r} "
        "(expected 'faiss' or 'chroma')"
    )

print(f"Vector store created with {vector_count} vectors")

## 5.2 Load Existing Vector Store

In [None]:
def load_vector_store(path, embeddings, store_type='faiss', collection_name=None):
    """Load a previously saved vector store from ``path``.

    Args:
        path: Directory the store was saved or persisted to.
        embeddings: Embedding object; must match the one used to build the
            store.
        store_type: 'faiss' or 'chroma'.
        collection_name: Chroma collection to open. Defaults to
            ``CONFIG['collection_name']`` when omitted.

    Returns:
        The loaded vector store.

    Raises:
        ValueError: If ``store_type`` is not one of the supported values.
    """
    if store_type == 'faiss':
        # SECURITY: the FAISS docstore is unpickled on load, and
        # allow_dangerous_deserialization=True opts in to that. Only load
        # index directories that this pipeline wrote itself.
        vectorstore = FAISS.load_local(
            path,
            embeddings,
            allow_dangerous_deserialization=True
        )
    elif store_type == 'chroma':
        if collection_name is None:
            collection_name = CONFIG['collection_name']
        vectorstore = Chroma(
            persist_directory=path,
            embedding_function=embeddings,
            collection_name=collection_name
        )
    else:
        # Without this branch the return below raised UnboundLocalError.
        raise ValueError(
            f"Unsupported store_type: {store_type!r} (expected 'faiss' or 'chroma')"
        )

    return vectorstore

# Load vector store
# vectorstore = load_vector_store(CONFIG['vector_store_path'], embeddings, CONFIG['vector_store_type'])

# 6. RETRIEVAL TESTING

## 6.1 Similarity Search

In [None]:
def test_retrieval(query, vectorstore, k=4):
    """Run a similarity search and print each hit.

    Prints the query, a separator, and for every hit its ``source`` metadata
    ('Unknown' when absent) and the first 300 characters of its content.
    Returns the list produced by ``vectorstore.similarity_search``.
    """
    print(f"\nQuery: {query}")
    print("=" * 80)

    hits = vectorstore.similarity_search(query, k=k)

    for rank, hit in enumerate(hits, start=1):
        print(
            f"\n--- Result {rank} ---",
            f"Source: {hit.metadata.get('source', 'Unknown')}",
            f"Content: {hit.page_content[:300]}...",
            sep="\n"
        )

    return hits

# Sample queries used to eyeball retrieval quality.
test_queries = [
    "What is machine learning?",
    "Explain neural networks",
    "How does training work?"
]

for query in test_queries:
    retrieved_docs = test_retrieval(query, vectorstore, k=CONFIG['top_k'])

## 6.2 Similarity Search with Scores

In [None]:
def test_retrieval_with_scores(query, vectorstore, k=4):
    """Run a scored similarity search and print each hit with its score.

    For every (document, score) pair the score is printed with four decimals,
    followed by the ``source`` metadata ('Unknown' when absent) and the first
    200 characters of the content. Returns the list of pairs unchanged.
    """
    print(f"\nQuery: {query}")
    print("=" * 80)

    scored_hits = vectorstore.similarity_search_with_score(query, k=k)

    for rank, scored in enumerate(scored_hits, start=1):
        hit, hit_score = scored
        print(
            f"\n--- Result {rank} (Score: {hit_score:.4f}) ---",
            f"Source: {hit.metadata.get('source', 'Unknown')}",
            f"Content: {hit.page_content[:200]}...",
            sep="\n"
        )

    return scored_hits

# Scored retrieval for one sample query.
query = "What is deep learning?"
docs_with_scores = test_retrieval_with_scores(query, vectorstore, k=CONFIG['top_k'])

## 6.3 MMR (Maximal Marginal Relevance) Search

In [None]:
# MMR search for diverse results
def test_mmr_retrieval(query, vectorstore, k=4, fetch_k=20):
    """Run a max-marginal-relevance search and print each hit.

    ``fetch_k`` and ``k`` are forwarded to
    ``vectorstore.max_marginal_relevance_search``; ``lambda_mult`` is read
    from ``CONFIG['mmr_diversity']``. Prints the ``source`` metadata
    ('Unknown' when absent) and the first 200 characters of every hit, then
    returns the hits.
    """
    print(f"\nQuery (MMR): {query}")
    print("=" * 80)

    mmr_kwargs = {
        'k': k,
        'fetch_k': fetch_k,
        'lambda_mult': CONFIG['mmr_diversity'],
    }
    hits = vectorstore.max_marginal_relevance_search(query, **mmr_kwargs)

    for rank, hit in enumerate(hits, start=1):
        print(
            f"\n--- Result {rank} ---",
            f"Source: {hit.metadata.get('source', 'Unknown')}",
            f"Content: {hit.page_content[:200]}...",
            sep="\n"
        )

    return hits

# Try MMR on a sample query with the default k and fetch_k.
mmr_docs = test_mmr_retrieval(query="Explain AI", vectorstore=vectorstore)

# 7. LLM INITIALIZATION

## 7.1 Initialize Language Model

In [None]:
if CONFIG['llm_type'] == 'openai':
    # OpenAI LLM
    llm = ChatOpenAI(
        model_name=CONFIG['model_name'],
        temperature=CONFIG['temperature'],
        max_tokens=CONFIG['max_tokens'],
        openai_api_key=os.environ.get('OPENAI_API_KEY')
    )

elif CONFIG['llm_type'] == 'huggingface':
    # HuggingFace Local LLM
    from transformers import AutoConfig, AutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

    # Encoder-decoder checkpoints (such as the default google/flan-t5-base)
    # cannot be loaded through AutoModelForCausalLM / "text-generation", so
    # pick the model class and pipeline task from the checkpoint's own config.
    model_config = AutoConfig.from_pretrained(CONFIG['model_name'])
    if model_config.is_encoder_decoder:
        model_cls, pipeline_task = AutoModelForSeq2SeqLM, "text2text-generation"
    else:
        model_cls, pipeline_task = AutoModelForCausalLM, "text-generation"

    model = model_cls.from_pretrained(
        CONFIG['model_name'],
        torch_dtype='auto',
        device_map='auto'
    )

    pipe = pipeline(
        pipeline_task,
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=CONFIG['max_tokens'],
        # temperature/top_p only take effect when sampling is enabled;
        # a temperature of 0 falls back to greedy decoding.
        do_sample=CONFIG['temperature'] > 0,
        temperature=CONFIG['temperature'],
        top_p=0.95,
        repetition_penalty=1.15
    )

    llm = HuggingFacePipeline(pipeline=pipe)

else:
    # Previously an unknown type left `llm` undefined and failed much later.
    raise ValueError(
        f"Unsupported llm_type: {CONFIG['llm_type']!r} "
        "(expected 'openai' or 'huggingface')"
    )

print(f"LLM initialized: {CONFIG['model_name']}")

# 8. RAG CHAIN CONSTRUCTION

## 8.1 Create Custom Prompt Template

In [None]:
# Prompt handed to the LLM: instructions, the retrieved context, then the
# question. {context} and {question} are filled in by the chain.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, "
    "don't try to make up an answer.\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Helpful Answer:"
)

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

## 8.2 Basic RetrievalQA Chain

In [None]:
# Retrieval QA chain: retriever settings, chain type and source-document
# behaviour all come from CONFIG; the custom PROMPT is used for the answer.
_qa_retriever = vectorstore.as_retriever(
    search_kwargs={'k': CONFIG['top_k']},
    search_type=CONFIG['search_type']
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=_qa_retriever,
    chain_type=CONFIG['chain_type'],
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=CONFIG['return_source_documents']
)

print("RetrievalQA chain created")

## 8.3 Conversational Retrieval Chain

In [None]:
# Buffer memory keeps the chat history as messages under "chat_history";
# output_key names the chain output that is stored in the memory.
memory = ConversationBufferMemory(
    output_key='answer',
    return_messages=True,
    memory_key="chat_history"
)

# Conversational chain over the same vector store, with verbose logging.
_chat_retriever = vectorstore.as_retriever(
    search_kwargs={'k': CONFIG['top_k']},
    search_type=CONFIG['search_type']
)

conversational_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=_chat_retriever,
    memory=memory,
    verbose=True,
    return_source_documents=True
)

print("Conversational Retrieval chain created")

# 9. QUERY & ANSWER GENERATION

## 9.1 Single Query

In [None]:
def ask_question(question, chain):
    """Send ``question`` to ``chain`` and print the answer with its sources.

    The chain is called with ``{"query": question}``. Its ``'result'`` entry
    is printed as the answer; when ``'source_documents'`` is present and
    non-empty, each source name and a 150-character preview is printed too.
    Returns the chain's full result.
    """
    print(f"\nQuestion: {question}")
    print("=" * 80)

    response = chain({"query": question})
    reply_text = response['result']
    cited_docs = response.get('source_documents', [])

    print(f"\nAnswer: {reply_text}")

    if cited_docs:
        print(f"\nSources ({len(cited_docs)}):")
        for number, cited in enumerate(cited_docs, start=1):
            origin = cited.metadata.get('source', 'Unknown')
            preview = cited.page_content[:150]
            print(f"\n{number}. {origin}")
            print(f"   {preview}...")

    return response

# Generic questions to exercise the QA chain end to end.
questions = [
    "What is the main topic of the documents?",
    "Explain the key concepts",
    "What are the practical applications?"
]

_question_divider = "\n" + "=" * 80 + "\n"
for question in questions:
    result = ask_question(question, qa_chain)
    print(_question_divider)

## 9.2 Conversational Q&A

In [None]:
def chat(message, chain):
    """Send one conversational turn to ``chain`` and print the exchange.

    The chain is called with ``{"question": message}``. Its ``'answer'`` is
    printed after the user message; when ``'source_documents'`` is present
    and non-empty, the list of source names is printed as well. Returns the
    chain's full result.
    """
    response = chain({"question": message})
    reply_text = response['answer']
    cited_docs = response.get('source_documents', [])

    print(f"User: {message}")
    print(f"Assistant: {reply_text}")

    if cited_docs:
        source_names = [doc.metadata.get('source', 'Unknown') for doc in cited_docs]
        print(f"\nSources: {source_names}")

    return response

# A short multi-turn exchange; later turns rely on the chain's memory.
conversation = [
    "What is machine learning?",
    "Can you give me an example?",
    "How is it different from traditional programming?"
]

_turn_divider = "\n" + "-" * 80 + "\n"
for msg in conversation:
    chat(msg, conversational_chain)
    print(_turn_divider)

# 10. ADVANCED RAG TECHNIQUES

## 10.1 Multi-Query Retrieval

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Multi-query retriever: wraps the vector-store retriever and uses the LLM to
# generate variations of the question.
_multi_query_base = vectorstore.as_retriever(search_kwargs={'k': CONFIG['top_k']})
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=_multi_query_base
)

# Try it on a sample question and report how many documents came back.
question = "How does AI work?"
docs = multi_query_retriever.get_relevant_documents(question)
print("Retrieved {} documents using multi-query".format(len(docs)))

## 10.2 Contextual Compression

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Contextual compression: an LLM-based extractor post-processes the documents
# returned by the base vector-store retriever.
compressor = LLMChainExtractor.from_llm(llm)

_compression_base = vectorstore.as_retriever(search_kwargs={'k': CONFIG['top_k']})
compression_retriever = ContextualCompressionRetriever(
    base_retriever=_compression_base,
    base_compressor=compressor
)

# Try it on a sample question and report how many documents remain.
compressed_docs = compression_retriever.get_relevant_documents("What is deep learning?")
print("Compressed to {} relevant documents".format(len(compressed_docs)))

## 10.3 Self-Query Retriever

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Define metadata fields the LLM may filter on.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The source file of the document",
        type="string"
    ),
    AttributeInfo(
        name="page",
        description="The page number",
        type="integer"
    ),
]

document_content_description = "Technical documentation about AI and machine learning"

# Create self-query retriever.
# NOTE(review): from_llm raises ValueError for vector stores that have no
# built-in query translator - FAISS, the default store in CONFIG, appears to
# be one of them - and ImportError when the optional `lark` parser is missing.
# Confirm against the installed LangChain version. Degrade to None so that the
# rest of the notebook keeps running.
try:
    self_query_retriever = SelfQueryRetriever.from_llm(
        llm=llm,
        vectorstore=vectorstore,
        document_contents=document_content_description,
        metadata_field_info=metadata_field_info,
        verbose=True
    )
except (ValueError, ImportError) as exc:
    self_query_retriever = None
    print(f"Self-query retriever unavailable: {exc}")

## 10.4 Parent Document Retriever

In [None]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# In-memory store for the parent documents.
parent_store = InMemoryStore()

# Two splitters: parents at 2000 characters, children at 400 characters.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

parent_retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=parent_store,
    vectorstore=vectorstore,
)

# NOTE(review): this reuses the vector store that already holds the chunks
# indexed earlier, so both kinds of entries end up in one index - confirm
# that this is intended.
parent_retriever.add_documents(documents)

# 11. EVALUATION

## 11.1 Create Evaluation Dataset

In [None]:
# Evaluation set: parallel lists of questions and reference answers.
eval_data = dict(
    questions=[
        "What is machine learning?",
        "Explain neural networks",
        "How does training work?"
    ],
    ground_truth=[
        "Machine learning is a subset of AI...",
        "Neural networks are computational models...",
        "Training involves adjusting weights..."
    ]
)

# Collect the QA chain's answer for every evaluation question, in order.
predictions = []
for question in eval_data['questions']:
    result = qa_chain({"query": question})
    predictions += [result['result']]

## 11.2 Custom Evaluation Metrics

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_relevance(question, answer, retrieved_docs, embeddings):
    """Score an answer against its question and its retrieved context.

    The question, the answer and each retrieved document's content are
    embedded with ``embeddings.embed_query``. Two cosine similarities are
    printed and returned in a dict:

    * ``'question_answer_sim'`` - question versus answer.
    * ``'answer_context_sim'`` - mean of answer versus each document.
    """
    embed = embeddings.embed_query

    question_vec = embed(question)
    answer_vec = embed(answer)
    context_vecs = [embed(doc.page_content) for doc in retrieved_docs]

    def _pair_similarity(left, right):
        # cosine_similarity works on 2-D inputs; unwrap the single cell.
        return cosine_similarity([left], [right])[0][0]

    q_a_sim = _pair_similarity(question_vec, answer_vec)
    per_doc_sims = [_pair_similarity(answer_vec, vec) for vec in context_vecs]
    avg_context_sim = np.mean(per_doc_sims)

    print(f"Question-Answer Similarity: {q_a_sim:.4f}")
    print(f"Average Answer-Context Similarity: {avg_context_sim:.4f}")

    scores = {}
    scores['question_answer_sim'] = q_a_sim
    scores['answer_context_sim'] = avg_context_sim
    return scores

# Score the chain's answer to the first evaluation question.
question = eval_data['questions'][0]
result = qa_chain({"query": question})
metrics = evaluate_relevance(
    question=question,
    answer=result['result'],
    retrieved_docs=result['source_documents'],
    embeddings=embeddings
)

# 12. DEPLOYMENT

## 12.1 Gradio Interface

In [None]:
import gradio as gr

def create_gradio_interface(chain):
    """Build a Gradio Blocks chat UI that answers questions through ``chain``.

    The chain is called with ``{"query": message}``; its ``'result'`` plus a
    numbered list of source names is appended to the chat history. Returns
    the ``gr.Blocks`` app without launching it.
    """

    def respond(message, chat_history):
        # Ask the chain, then attach the numbered source list to the answer.
        output = chain({"query": message})
        answer_text = output['result']
        cited_docs = output.get('source_documents', [])

        source_lines = [
            f"{number}. {doc.metadata.get('source', 'Unknown')}\n"
            for number, doc in enumerate(cited_docs, start=1)
        ]
        reply = answer_text + "\n\nSources:\n" + "".join(source_lines)
        chat_history.append((message, reply))

        # First value goes to the textbox, second to the chatbot.
        return "", chat_history

    with gr.Blocks() as demo:
        gr.Markdown("# RAG-based Q&A System")
        gr.Markdown("Ask questions about your documents!")

        chatbot = gr.Chatbot()
        msg = gr.Textbox(placeholder="Ask a question...")
        clear = gr.Button("Clear")

        # Submitting the textbox runs respond; Clear sends None to the chatbot.
        msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)

    return demo

# Launch Gradio interface
# demo = create_gradio_interface(qa_chain)
# demo.launch(share=True)

## 12.2 Streamlit Interface

In [None]:
# Source of a standalone Streamlit app (written to streamlit_app.py below).
# Fixes relative to the first draft: the title emoji was mis-encoded, and
# FAISS.load_local needs allow_dangerous_deserialization=True, exactly like
# load_vector_store in this notebook. Only load index folders you created.
streamlit_code = '''
import streamlit as st
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

# Load components
@st.cache_resource
def load_rag_system():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local("./vector_store", embeddings, allow_dangerous_deserialization=True)
    qa_chain = RetrievalQA.from_chain_type(...)
    return qa_chain

qa_chain = load_rag_system()

st.title("📚 RAG Q&A System")

# Sidebar
with st.sidebar:
    st.header("Settings")
    k = st.slider("Number of documents to retrieve", 1, 10, 4)

# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    
    with st.chat_message("user"):
        st.markdown(prompt)
    
    with st.chat_message("assistant"):
        result = qa_chain({"query": prompt})
        response = result['result']
        st.markdown(response)
        
        # Show sources
        with st.expander("View Sources"):
            for i, doc in enumerate(result['source_documents'], 1):
                st.write(f"**Source {i}:** {doc.metadata.get('source')}")
                st.write(doc.page_content[:200] + "...")
    
    st.session_state.messages.append({"role": "assistant", "content": response})
'''

# Save to file (UTF-8 so the emoji in the title is written correctly on every
# platform). Fill in the RetrievalQA.from_chain_type(...) placeholder first.
# with open('streamlit_app.py', 'w', encoding='utf-8') as f:
#     f.write(streamlit_code)

# Run with: streamlit run streamlit_app.py

## 12.3 FastAPI REST API

In [None]:
# Source of a standalone FastAPI service (written to api.py below).
# Fixes relative to the first draft: the LangChain names used at startup are
# now imported, qa_chain exists before startup runs, and FAISS.load_local gets
# allow_dangerous_deserialization=True like load_vector_store in this
# notebook. Only load index folders you created yourself.
fastapi_code = '''
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

app = FastAPI(title="RAG Q&A API")

qa_chain = None

# Load RAG system on startup
@app.on_event("startup")
async def startup_event():
    global qa_chain
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local("./vector_store", embeddings, allow_dangerous_deserialization=True)
    qa_chain = RetrievalQA.from_chain_type(...)

class Question(BaseModel):
    query: str
    k: Optional[int] = 4

class Answer(BaseModel):
    answer: str
    sources: List[dict]

@app.post("/ask", response_model=Answer)
async def ask_question(question: Question):
    try:
        result = qa_chain({"query": question.query})
        
        sources = [
            {
                "source": doc.metadata.get("source"),
                "content": doc.page_content[:200]
            }
            for doc in result.get("source_documents", [])
        ]
        
        return Answer(answer=result["result"], sources=sources)
    
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
'''

# Save to file
# with open('api.py', 'w') as f:
#     f.write(fastapi_code)

# Run with: uvicorn api:app --reload

# 13. OPTIMIZATION & BEST PRACTICES

## 13.1 Caching for Performance

In [None]:
from functools import lru_cache

@lru_cache(maxsize=100)
def _cached_retrieval_hits(query: str, k: int) -> tuple:
    """Return the similarity-search hits for (query, k) as an immutable tuple."""
    return tuple(vectorstore.similarity_search(query, k=k))


def cached_retrieval(query: str, k: int = 4):
    """Cache retrieval results for repeated queries.

    Args:
        query: Search text.
        k: Number of documents to retrieve.

    Returns:
        A new list of documents on every call. The cache keeps a tuple
        internally, so a caller that appends to or reorders its result can no
        longer corrupt what later callers receive.

    Note:
        Entries are not invalidated when the vector store changes; call
        ``cached_retrieval.cache_clear()`` after adding documents.
    """
    # Passing k positionally gives cached_retrieval(q), (q, 4) and (q, k=4)
    # the same cache key.
    return list(_cached_retrieval_hits(query, k))


# Keep the lru_cache management helpers reachable under the public name.
cached_retrieval.cache_info = _cached_retrieval_hits.cache_info
cached_retrieval.cache_clear = _cached_retrieval_hits.cache_clear

## 13.2 Hybrid Search (Keyword + Semantic)

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Keyword retriever (BM25) built from the same chunks, limited to top_k hits.
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = CONFIG['top_k']

# Embedding-based retriever over the vector store, same top_k.
semantic_retriever = vectorstore.as_retriever(search_kwargs={'k': CONFIG['top_k']})

# Ensemble of both retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, semantic_retriever]
)

# Try the hybrid retriever on a sample question.
hybrid_docs = ensemble_retriever.get_relevant_documents("What is AI?")
print("Hybrid search retrieved {} documents".format(len(hybrid_docs)))

## 13.3 Re-ranking Retrieved Documents

In [None]:
# Alternative: Cross-encoder reranking
from sentence_transformers import CrossEncoder

# Cross-encoder model that scores (query, passage) pairs for reranking.
_CROSS_ENCODER_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
cross_encoder = CrossEncoder(_CROSS_ENCODER_NAME)

def rerank_documents(query, docs, top_k=4):
    """Rerank documents using the module-level cross-encoder.

    Args:
        query: Search text each document is scored against.
        docs: Candidate documents exposing ``page_content``.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, highest cross-encoder score first. An
        empty input yields an empty list without calling the model.
    """
    if not docs:
        # Nothing to score; avoids handing an empty batch to predict().
        return []

    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)

    # argsort is ascending, so reverse it to put the best score first.
    ranked_indices = np.argsort(scores)[::-1][:top_k]
    return [docs[i] for i in ranked_indices]

# Fetch ten candidates by similarity, then keep the four best after reranking.
query = "Explain deep learning"
initial_docs = vectorstore.similarity_search(query, k=10)
reranked_docs = rerank_documents(query=query, docs=initial_docs, top_k=4)

## 13.4 Metadata Filtering

In [None]:
# Filter by metadata
def filtered_search(query, metadata_filter, k=4):
    """Run a similarity search on the module-level vector store with a filter.

    ``metadata_filter`` is forwarded unchanged as the ``filter`` argument of
    ``vectorstore.similarity_search``; ``k`` limits the number of hits.
    Returns whatever the search returns.
    """
    search_options = {'k': k, 'filter': metadata_filter}
    return vectorstore.similarity_search(query, **search_options)

# Example: Filter by source
# docs = filtered_search("AI", {"source": "document1.pdf"})

# 14. MONITORING & LOGGING

## 14.1 Query Analytics

In [None]:
import json
from datetime import datetime

class QueryLogger:
    """Log queries and responses to a JSON-lines file for later analysis."""

    def __init__(self, log_file='query_logs.jsonl'):
        """Remember the path of the log file; the file is created lazily."""
        self.log_file = log_file

    def log_query(self, query, answer, sources, latency):
        """Append one query-answer record to the log file.

        Args:
            query: The question that was asked.
            answer: The generated answer.
            sources: Documents exposing a ``metadata`` dict; only their
                ``'source'`` value is stored.
            latency: Elapsed time in seconds (stored in milliseconds).
        """
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'query': query,
            'answer': answer,
            'sources': [s.metadata.get('source') for s in sources],
            'latency_ms': latency * 1000
        }

        # Explicit encoding keeps the file independent of the platform locale.
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(log_entry) + '\n')

    def analyze_logs(self):
        """Print summary statistics and return all log records.

        Returns:
            A DataFrame with one row per logged query. It is empty when the
            log file does not exist yet or holds no records.
        """
        try:
            with open(self.log_file, 'r', encoding='utf-8') as f:
                # Skip blank lines (e.g. a trailing newline from manual edits).
                logs = [json.loads(line) for line in f if line.strip()]
        except FileNotFoundError:
            # Nothing has been logged yet.
            logs = []

        df = pd.DataFrame(logs)

        print("\nQuery Analytics:")
        print(f"Total queries: {len(df)}")
        if df.empty:
            # No 'latency_ms' / 'sources' columns exist on an empty frame.
            print("No queries have been logged yet.")
            return df

        print(f"Average latency: {df['latency_ms'].mean():.2f}ms")
        print("\nMost used sources:")
        all_sources = [s for sources in df['sources'] for s in sources]
        print(pd.Series(all_sources, dtype=object).value_counts().head(5))

        return df

# Shared logger instance writing to the default JSON-lines file.
logger = QueryLogger(log_file='query_logs.jsonl')

# Log queries
import time

def ask_and_log(question, chain, logger):
    """Ask a question through ``chain`` and record the interaction.

    Args:
        question: Text sent to the chain as ``{"query": question}``.
        chain: Callable returning a mapping with ``'result'`` and, optionally,
            ``'source_documents'``.
        logger: Object exposing ``log_query(query, answer, sources, latency)``.

    Returns:
        The chain's full result.
    """
    # perf_counter is monotonic, so the measured latency cannot be distorted
    # (or turn negative) when the system clock is adjusted mid-call.
    start = time.perf_counter()
    result = chain({"query": question})
    latency = time.perf_counter() - start

    logger.log_query(
        query=question,
        answer=result['result'],
        sources=result.get('source_documents', []),
        latency=latency
    )

    return result

## 14.2 Performance Monitoring

In [None]:
import psutil

def monitor_system_resources():
    """Print the current CPU and memory utilisation reported by psutil.

    CPU usage is sampled with ``interval=1``; memory usage is the ``percent``
    field of ``psutil.virtual_memory()``.
    """
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent

    print(
        "\nSystem Resources:",
        f"CPU Usage: {cpu_percent}%",
        f"Memory Usage: {memory_percent}%",
        sep="\n"
    )

# Take one reading now.
monitor_system_resources()

# 15. ADVANCED FEATURES

## 15.2 Streaming Responses

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

def create_streaming_chain(llm, vectorstore):
    """Create a RetrievalQA chain whose LLM streams tokens to stdout.

    Args:
        llm: Language model to derive the streaming model from. It is not
            modified.
        vectorstore: Vector store used as the chain's retriever.

    Returns:
        A RetrievalQA chain that returns source documents.
    """
    import copy

    # Work on a shallow copy: `streaming_llm = llm` was only an alias, so the
    # caller's model (shared with the other chains) had streaming and the
    # stdout callback switched on as a side effect.
    streaming_llm = copy.copy(llm)

    # NOTE(review): not every LLM wrapper seems to declare a `streaming`
    # field (the local HuggingFace pipeline wrapper presumably does not), and
    # assigning an undeclared field can be rejected - set it only if present.
    if hasattr(streaming_llm, 'streaming'):
        streaming_llm.streaming = True
    streaming_llm.callbacks = [StreamingStdOutCallbackHandler()]

    chain = RetrievalQA.from_chain_type(
        llm=streaming_llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True
    )

    return chain

## 15.3 Agent-based RAG

In [None]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType

def _knowledge_base_answer(question):
    """Return only the answer text produced by the RetrievalQA chain."""
    # qa_chain.run cannot be used as the tool function here: run() is limited
    # to chains with exactly one output key, and qa_chain also returns
    # 'source_documents' when CONFIG['return_source_documents'] is True.
    return qa_chain({"query": question})['result']


# Create tools
tools = [
    Tool(
        name="Knowledge Base",
        func=_knowledge_base_answer,
        description="Useful for answering questions about documents in the knowledge base"
    ),
    # Add more tools as needed
]

# Initialize agent
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# Use agent
# response = agent.run("What is machine learning?")

# 16. TESTING & DEBUGGING

## 16.1 Debug Retrieval Quality

In [None]:
def debug_retrieval(query, vectorstore, k=10):
    """Plot the retrieval scores for ``query`` and print the top five hits.

    Fetches ``k`` (document, score) pairs with
    ``vectorstore.similarity_search_with_score``, draws the scores as a bar
    chart by rank, then prints score, source and a 150-character preview of
    the first five results.

    NOTE(review): depending on the vector store the score may be a distance
    (lower is better) rather than a similarity - confirm before reading the
    chart.
    """
    print(f"Query: {query}\n")

    # Deliberately fetch more results than a normal query would.
    scored_hits = vectorstore.similarity_search_with_score(query, k=k)
    score_values = [pair[1] for pair in scored_hits]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(range(len(score_values)), score_values)
    ax.set_xlabel('Document Rank')
    ax.set_ylabel('Similarity Score')
    ax.set_title('Retrieval Scores Distribution')
    plt.show()

    print("\nTop Retrieved Documents:")
    for rank, (hit, hit_score) in enumerate(scored_hits[:5], start=1):
        print(
            f"\n{rank}. Score: {hit_score:.4f}",
            f"   Source: {hit.metadata.get('source')}",
            f"   Content: {hit.page_content[:150]}...",
            sep="\n"
        )

# Inspect one sample query.
debug_retrieval("What is neural networks?", vectorstore)

## 16.2 Test Chain Components

In [None]:
def test_chain_components(query, chain):
    """Exercise the retriever, the raw LLM and the full chain in turn.

    Step 1 fetches documents through ``chain.retriever``. Step 2 calls the
    LLM found at ``chain.combine_documents_chain.llm_chain.llm`` directly
    with a hand-built prompt. Step 3 runs the chain itself. Each step prints
    a short summary; nothing is returned.
    """
    print(f"Testing query: {query}\n")

    # Step 1: retriever only
    print("1. Testing Retriever...")
    retrieved = chain.retriever.get_relevant_documents(query)
    print(f"   Retrieved {len(retrieved)} documents")

    # Step 2: LLM only, with the retrieved text as context
    print("\n2. Testing LLM...")
    joined_context = "\n".join(doc.page_content for doc in retrieved)
    manual_prompt = f"Context: {joined_context}\n\nQuestion: {query}\n\nAnswer:"
    raw_llm = chain.combine_documents_chain.llm_chain.llm
    llm_output = raw_llm(manual_prompt)
    print(f"   Generated response: {llm_output[:100]}...")

    # Step 3: the whole chain
    print("\n3. Testing Full Chain...")
    outcome = chain({"query": query})
    print(f"   Final answer: {outcome['result'][:100]}...")

# Test components
# test_chain_components("What is AI?", qa_chain)

# 17. SAVE & LOAD CONFIGURATION

In [None]:
def save_rag_config(config, path='rag_config.json'):
    """Write ``config`` to ``path`` as JSON indented by two spaces.

    Prints a confirmation that names the target path.
    """
    with open(path, 'w') as out_file:
        json.dump(config, out_file, indent=2)
    print("Configuration saved to {}".format(path))

def load_rag_config(path='rag_config.json'):
    """Load a RAG configuration from a JSON file.

    Args:
        path: File to read.

    Returns:
        The parsed JSON content (a dict for files written by
        ``save_rag_config``).
    """
    # JSON files are UTF-8; do not depend on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Persist the active configuration to the default rag_config.json.
save_rag_config(config=CONFIG)

# Load configuration
# loaded_config = load_rag_config()

# 18. CONCLUSIONS & NEXT STEPS

## Summary:
- Documents processed: {len(documents)}
- Chunks created: {len(chunks)}
- Vector store: {CONFIG['vector_store_type']}
- LLM: {CONFIG['model_name']}
- Retrieval method: {CONFIG['search_type']}

## Next Steps:
- [ ] Implement advanced retrieval strategies (hybrid search, reranking)
- [ ] Add multi-modal support (images, tables)
- [ ] Implement caching for improved performance
- [ ] Set up monitoring and analytics
- [ ] Deploy as production API
- [ ] Add authentication and rate limiting
- [ ] Implement feedback loop for continuous improvement
- [ ] Create comprehensive test suite
- [ ] Add support for multiple languages
- [ ] Implement conversation summarization for long chats