In [None]:
from dotenv import load_dotenv

from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFDirectoryLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings 
# Changed: Import Chroma instead of FAISS
from langchain_chroma import Chroma 
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [17]:
# 1. Load environment variables (for API key)
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not found in environment variables. Please set it.")

In [26]:
# --- Configuration ---
MODEL_NAME = "deepseek-r1-distill-llama-70b"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" 

# Changed: Define the directory path where your PDFs are located
PDF_DIRECTORY = "../docs" 

CHROMA_DB_PATH = "chroma_index_hf_directory" # Changed index path for clarity

In [19]:
# --- 2. Initialize LLM from Groq ---
llm = ChatGroq(
    api_key=groq_api_key,
    model_name=MODEL_NAME,
    temperature=0.7 
)


In [27]:
# --- 3. Document Loading and Processing (Indexing Phase) ---

# Check if ChromaDB data already exists to avoid re-processing
# ChromaDB handles persistence by checking the persist_directory
if os.path.exists(CHROMA_DB_PATH) and os.listdir(CHROMA_DB_PATH):
    print(f"Loading existing ChromaDB from {CHROMA_DB_PATH}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    # Changed: Initialize Chroma with persist_directory
    vector_store = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)
    print("ChromaDB loaded.")
else:
    print(f"Loading documents from directory: {PDF_DIRECTORY}...")
    
    if not os.path.exists(PDF_DIRECTORY):
        raise FileNotFoundError(f"The specified PDF directory does not exist: {PDF_DIRECTORY}")

    loader = PyPDFDirectoryLoader(path=PDF_DIRECTORY, glob="**/*.pdf", recursive=True)
    documents = loader.load()

    if not documents:
        print(f"No PDF documents found in {PDF_DIRECTORY}. Please check the path and content.")
        exit() 

    print(f"Loaded {len(documents)} raw documents (pages).")
    
    print(f"Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks from all documents.")

    print(f"Generating embeddings using {EMBEDDING_MODEL_NAME} and creating ChromaDB vector store...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'device': 'cpu'}) 

    # Changed: Create ChromaDB from documents, specifying the persist_directory
    vector_store = Chroma.from_documents(
        chunks, 
        embeddings, 
        persist_directory=CHROMA_DB_PATH
    )
    print("ChromaDB vector store created and persisted.")


Loading documents from directory: ../docs...
Loaded 31 raw documents (pages).
Splitting documents into chunks...
Created 61 chunks from all documents.
Generating embeddings using sentence-transformers/all-MiniLM-L6-v2 and creating ChromaDB vector store...
ChromaDB vector store created and persisted.


In [29]:
# --- 4. Setup Retrieval Chain (Querying Phase) ---

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
If you don't know the answer, just say "I don't know" and do not try to make up an answer.

Context: {context}

Question: {input}
""")
retriever = vector_store.as_retriever()

document_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [31]:
# --- 5. Ask Questions (Static List) ---
print("\n--- RAG System Ready ---")

# Define your static list of questions
static_questions = [
    "Can you tell me about the company overview of evergreen assurance ltd?",
]

for i, question in enumerate(static_questions):
    print(f"\n--- Processing Question {i+1}/{len(static_questions)} ---")
    print(f"Question: {question}")
    
    print("\nSearching for relevant information...")
    response = retrieval_chain.invoke({"input": question})

    print("\n--- Answer ---")
    print(response["answer"])
    print("\n--- Sources (if available) ---")
    if "context" in response:
        sorted_context = sorted(response["context"], key=lambda doc: (doc.metadata.get('source', ''), doc.metadata.get('page', 0)))
        seen_sources = set()
        for doc in sorted_context:
            source_info = f"Source: {doc.metadata.get('source', 'N/A')}"
            page_info = f"Page: {doc.metadata.get('page', 'N/A')}"
            
            if (source_info, page_info) not in seen_sources:
                print(f"- {source_info}, {page_info}")
                seen_sources.add((source_info, page_info))
    else:
        print("No specific sources found in context.")
    print("-" * 30)

print("\n--- All static questions processed. Exiting. ---")


--- RAG System Ready ---

--- Processing Question 1/1 ---
Question: Can you tell me about the company overview of evergreen assurance ltd??

Searching for relevant information...

--- Answer ---
<think>
Okay, I need to answer the question about the company overview of Evergreen Assurance Ltd. using the provided context. Let me read through the context carefully.

The context starts with a disclaimer stating it's a summary of products and general terms, so it's illustrative and not the official policy. Then it goes into the company overview.

From the context, Evergreen Assurance Ltd. is described as a premier insurance provider in India. Their mission is to secure customers' financial futures with transparent, innovative, and customer-centric products. They aim to be a steadfast partner to policyholders, offering peace of mind and robust protection against life's uncertainties. The document outlines their product portfolio and terms, serving as a comprehensive knowledge base.

I shoul