# 🔍 Multi-Document Search Engine with RAG

This notebook implements a complete RAG (Retrieval-Augmented Generation) system that:
- Searches across PDF, DOCX, and CSV documents
- Uses intelligent routing to determine document type
- Provides conversational answers using an LLM

**Model Details:**
- Embeddings: `sentence-transformers/all-mpnet-base-v2` (768 dimensions)
- LLM: `llama-3.3-70b-versatile` via Groq

## Step 1: Load Environment Variables

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the discovered .env file into the process environment
_ = load_dotenv(find_dotenv())

# Read the Groq key explicitly so a missing key fails here with a clear message
# rather than later inside an LLM call. Surrounding whitespace (a common
# copy/paste artifact) is stripped so a blank value counts as missing.
GROQ_API_KEY = (os.getenv('GROQ_API_KEY') or '').strip()

if not GROQ_API_KEY:
    raise ValueError("‚ùå GROQ_API_KEY not found in .env file!")

# Write the cleaned value back so anything reading the environment sees the same key
os.environ['GROQ_API_KEY'] = GROQ_API_KEY

print("‚úÖ Environment variables loaded")
# Notebook output is routinely saved and shared, so only a short suffix of the
# secret is echoed - enough to tell keys apart, not enough to reconstruct one.
print(f"‚úÖ API Key configured: ...{GROQ_API_KEY[-4:]}")

## Step 2: Import Required Libraries

In [None]:
# LangChain core
from langchain_groq import ChatGroq
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, Docx2txtLoader

# LangChain core components
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# HuggingFace for embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Utilities
import glob
from typing import List, Dict
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from the libraries imported above - consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Reaching this line means every import in the cell resolved
print("‚úÖ All libraries imported successfully")

## Step 3: Initialize Embeddings Model

In [None]:
# Announce the load up front; the message itself warns that it can take a while
print("Loading embeddings model... (this may take a minute)")

_EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# Normalised vectors, requested for cosine similarity
_ENCODE_OPTIONS = {"normalize_embeddings": True}

embeddings = HuggingFaceEmbeddings(
    model_name=_EMBEDDING_MODEL_NAME,
    encode_kwargs=_ENCODE_OPTIONS,
)

# Smoke test: embed one sentence and report the vector length plus a few values
test_text = "Hello World, how are you?"
test_embedding = embeddings.embed_query(test_text)
print(
    "\n".join(
        [
            "‚úÖ Embedding model loaded successfully!",
            f"   Embedding dimension: {len(test_embedding)}",
            f"   Sample values: {test_embedding[:5]}",
        ]
    )
)

## Step 4: Initialize LLM (Groq)

In [None]:
# Build the Groq-hosted chat model that both the router and the QA chains use

# GROQ_API_KEY is set by Step 1; stop early with a clear message if it is empty
if not GROQ_API_KEY:
    raise ValueError("‚ùå Run Step 1 first to load the API key!")

# One name for the model id so the status line below cannot drift from the config
_GROQ_MODEL = "llama-3.3-70b-versatile"

llm = ChatGroq(
    model=_GROQ_MODEL,
    api_key=GROQ_API_KEY,  # passed explicitly rather than relying on the environment
    temperature=0.0,
)

print("‚úÖ LLM initialized successfully!")
print(f"   Model: {_GROQ_MODEL}")
print("   Provider: Groq")

# One tiny request to confirm the key and model work before building any chains
print("\nüß™ Testing LLM connection...")
try:
    test_response = llm.invoke("Say 'ready' in one word")
    print(f"   ‚úÖ LLM Test Response: {test_response.content}")
except Exception as err:
    # Report the failure, then re-raise so the notebook run stops at this cell
    print(f"   ‚ùå LLM Test Failed: {err}")
    raise


## Step 5: Load Documents

In [None]:
def load_documents_by_type(directory: str = ".") -> Dict[str, List[Document]]:
    """
    Load every PDF, DOCX, and CSV file found directly inside ``directory``.

    Files are matched by lowercase extension (``*.pdf``, ``*.docx``, ``*.csv``);
    sub-directories are not searched. Every loaded ``Document`` gets
    ``metadata['doc_type']`` set to ``'pdf'``, ``'docx'`` or ``'csv'``.
    A loader error for any file propagates to the caller.

    Args:
        directory: Folder to scan. Defaults to the current working directory.

    Returns:
        A dictionary with keys 'pdf', 'docx', 'csv', each mapping to the list of
        documents loaded for that type (an empty list when no file matched).
    """
    # (doc_type / file extension, progress icon, loader class) - one row per format
    loader_specs = (
        ('pdf', "üìÑ", PyPDFLoader),
        ('docx', "üìù", Docx2txtLoader),
        ('csv', "üìä", CSVLoader),
    )
    documents_by_type: Dict[str, List[Document]] = {
        doc_type: [] for doc_type, _, _ in loader_specs
    }

    # Escape glob metacharacters so a folder such as "data[2025]" is taken literally
    safe_directory = glob.escape(directory)

    for doc_type, icon, loader_cls in loader_specs:
        # glob returns matches in arbitrary order; sorting keeps the load order
        # (and therefore the order documents are indexed in) reproducible
        for file_path in sorted(glob.glob(f"{safe_directory}/*.{doc_type}")):
            print(f"{icon} Loading {doc_type.upper()}: {file_path}")
            docs = loader_cls(file_path).load()
            for doc in docs:
                doc.metadata['doc_type'] = doc_type
            documents_by_type[doc_type].extend(docs)

    return documents_by_type

# Scan the current directory, then report how many documents each type produced
_RULE = "=" * 60

print("\n" + _RULE)
print("Loading documents...")
print(_RULE)
all_documents = load_documents_by_type(".")

print("\n" + _RULE)
print("Document Loading Summary")
print(_RULE)
# Types that produced no documents are left out of the summary
for doc_type, docs in all_documents.items():
    if not docs:
        continue
    print(f"  {doc_type.upper()}: {len(docs)} documents")
total_loaded = sum(map(len, all_documents.values()))
print(f"\n‚úÖ Total documents loaded: {total_loaded}")

## Step 6: Create Vector Stores

In [None]:
print("Creating vector stores...")
vector_stores = {}
retrievers = {}

# Number of hits each retriever returns; CSV asks for 10, PDF and DOCX for 5
_TOP_K_BY_TYPE = {'pdf': 5, 'docx': 5, 'csv': 10}

# One in-memory store and one retriever per document type that has documents
for doc_type, top_k in _TOP_K_BY_TYPE.items():
    docs = all_documents[doc_type]
    if not docs:
        continue
    print(f"  üî® Creating {doc_type.upper()} vector store ({len(docs)} docs)")
    vector_stores[doc_type] = DocArrayInMemorySearch.from_documents(docs, embeddings)
    retrievers[doc_type] = vector_stores[doc_type].as_retriever(
        search_kwargs={"k": top_k}
    )

print(f"\n‚úÖ Vector stores created for: {list(retrievers)}")

## Step 7: Create QA Chains

In [None]:
print("Creating QA chains...")
qa_chains = {}

# Prompt text shared by every QA chain; {question} and {context} are the two inputs
qa_template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know.

Question: {question}

Context: {context}

Answer:"""

qa_prompt = ChatPromptTemplate.from_template(qa_template)

def format_docs(docs):
    """Join the ``page_content`` of each document into one context string.

    Documents are separated by a blank line; an empty input yields ``""``.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

def _build_qa_chain(retriever):
    """Compose retriever -> prompt -> LLM -> string for one document type.

    The incoming question is piped to the retriever (whose hits are flattened
    by ``format_docs``) and is also passed through unchanged as ``question``.
    """
    prompt_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return prompt_inputs | qa_prompt | llm | StrOutputParser()


# One LCEL chain per available retriever, keyed by document type
for doc_type, retriever in retrievers.items():
    print(f"  ‚õìÔ∏è  Creating QA chain for {doc_type.upper()} documents")
    qa_chains[doc_type] = _build_qa_chain(retriever)

print(f"\n‚úÖ QA chains created for: {list(qa_chains)}")

## Step 8: Configure Router

In [None]:
# Define retriever information for the router
retriever_infos = []

if 'pdf' in retrievers:
    retriever_infos.append({
        "name": "pdf",
        "description": "The PDF document contains information about iPhone 17 series launch (September 2025). Good for answering questions about iPhone 17 features, specifications, launch dates, pricing, and product details.",
        "retriever": retrievers['pdf']
    })

if 'docx' in retrievers:
    retriever_infos.append({
        "name": "docx",
        "description": "The Word document contains information about F1 Singapore Grand Prix 2025 (October 2025). Good for answering questions about F1 race, Grand Prix details, race results, and Singapore event information.",
        "retriever": retrievers['docx']
    })

if 'csv' in retrievers:
    retriever_infos.append({
        "name": "csv",
        "description": "The CSV document contains sales data with customer orders. Good for answering questions about sales records, orders, corporate segment, customer information, numerical data, and transaction details.",
        "retriever": retrievers['csv']
    })

print(f"‚úÖ Router configured with {len(retriever_infos)} document types")
for info in retriever_infos:
    print(f"   - {info['name']}: {info['description'][:80]}...")

## Step 9: Create the LLM Router Chain

In [None]:
# LLM-based router: one classification call picks the document type to search
print("Setting up intelligent router...")

# NOTE: the document types below are written into the prompt text itself, so the
# router can name a type even when no retriever was built for it.
router_template = """Given the user question below, classify it to route to the most relevant document type.

Available document types:
- pdf: iPhone 17 series information (features, specs, launch, pricing)
- docx: F1 Singapore Grand Prix 2025 information (race results, event details)
- csv: Sales data with orders, customers, segments

User question: {question}

Respond with ONLY ONE WORD - either 'pdf', 'docx', or 'csv'. Nothing else.
Classification:"""

router_prompt = ChatPromptTemplate.from_template(router_template)
# prompt -> chat model -> plain string reply
router_chain = router_prompt | llm | StrOutputParser()

_BANNER = "=" * 60
_ready_lines = (
    "\n" + _BANNER,
    "‚úÖ SYSTEM READY!",
    _BANNER,
    f"Available document types: {list(retrievers)}",
    "\nYou can now ask questions using: query_documents('your question')",
    _BANNER,
)
print("\n".join(_ready_lines))

## Step 10: Query Function

In [None]:
def _match_doc_type(router_reply, available):
    """Return the available doc type named in ``router_reply``, or None.

    An exact match wins; otherwise the first alphabetic word of the reply that
    is an available type is used, so replies such as ``"pdf."``, ``"'csv'"`` or
    ``"classification: docx"`` still route correctly.
    """
    import re  # function-scope import: the notebook's import cell does not load re

    if router_reply in available:
        return router_reply
    for word in re.findall(r"[a-z]+", router_reply.lower()):
        if word in available:
            return word
    return None


def query_documents(user_query: str):
    """
    Answer a question by routing it to the QA chain of one document type.

    Args:
        user_query: Your question (string)

    The router will analyze your question and route it to:
        - PDF documents (for iPhone 17 info)
        - DOCX documents (for F1 Singapore GP info)
        - CSV files (for sales data analysis)

    If the router's reply names no available type, a warning is printed and the
    first available chain is used instead.

    Returns:
        The answer string, or None when no QA chain exists or any step fails
        (the error and its traceback are printed in that case).
    """
    print(f"\n{'='*60}")
    print(f"USER QUERY: {user_query}")
    print(f"{'='*60}\n")

    try:
        # Without any chain there is nothing to fall back to; say so plainly
        # instead of surfacing an IndexError traceback.
        if not qa_chains:
            print("Error: no QA chains are available - load documents and build the chains first")
            return None

        # Ask the router which document type to search
        raw_route = router_chain.invoke({"question": user_query}).strip().lower()

        # Validate before announcing the route, so the printed route is the one used
        doc_type = _match_doc_type(raw_route, qa_chains)
        if doc_type is None:
            print(f"‚ö†Ô∏è  Warning: '{raw_route}' not found, using first available chain")
            doc_type = next(iter(qa_chains))

        print(f"üéØ Routing to: {doc_type.upper()} documents\n")

        # Run the appropriate QA chain
        answer = qa_chains[doc_type].invoke(user_query)

        print(f"\n{'='*60}")
        print("ANSWER:")
        print(f"{'='*60}")
        print(answer)
        print()

        return answer
    except Exception as e:
        # Notebook-level boundary: report the failure and return None rather
        # than aborting the cell.
        print(f"‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

## Example Queries - Try These!

Run the cells below to test the system with example queries.

In [None]:
# Example 1: a sales-data question, intended for the CSV chain
csv_question = "Can you list me five corporate segment orders in the sales data?"
query_documents(csv_question)

In [None]:
# Example 2: an iPhone 17 question, intended for the PDF chain
pdf_question = "What are the main features of iPhone 17?"
query_documents(pdf_question)

In [None]:
# Example 3: a Singapore Grand Prix question, intended for the DOCX chain
docx_question = "Who won the Singapore Grand Prix 2025?"
query_documents(docx_question)

## Your Custom Queries

Use the cell below to ask your own questions!

In [None]:
# Replace the placeholder text with your own question, then run this cell
my_question = "Your question here"

# Routes the question and prints the answer; the answer string (or None on error)
# is also returned
query_documents(my_question)