In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Suppress Warnings

In [None]:
# Install Required Libraries
!pip install -U -q langchain langchain-google-genai chromadb pdfplumber langchain-community sentence-transformers google-colab pypdf


In [None]:
# Import Libraries and Mount Drive
print("--- Importing Libraries ---")
import os
import getpass
from pathlib import Path
from google.colab import drive, userdata # For API Key Secret and Drive
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
# Chroma and the PDF loader are imported from langchain_community, matching the
# later cells of this notebook, instead of the deprecated `langchain.*` re-export paths.
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader # Using standard PDF loader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from sentence_transformers.cross_encoder import CrossEncoder # For reranker

print("Libraries imported.")

# Mounting is best-effort: a failure is reported but does not stop the notebook,
# so later cells must cope with the Drive paths being unavailable.
try:
    print("\n--- Mounting Google Drive ---")
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}. Ensure data paths are accessible.")

In [None]:
# Configure API Key and Paths
print("--- Configuring API Key and Paths ---")

# userdata.get() raises (it does not return None) when the secret is missing or the
# notebook has not been granted access. Guard the lookup so the "not set" branch
# below is actually reachable, and fall back to an already-exported environment variable.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
except Exception as e:
    print(f"\nCould not read 'GOOGLE_API_KEY' from Colab secrets: {e}")
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    print("\nError: Gemini API Key is not set. Further steps requiring the API will fail.")
else:
    # Export the key so libraries that read it from the environment can find it.
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
    print("\nGemini API Key is configured.")

pdf_directory_path = "/content/drive/MyDrive/HelpMate AI Codes"
print(f"PDF directory path set to: {pdf_directory_path}")

# The vector store lives in a sub-folder of the PDF directory; deriving it keeps
# the two paths in sync if the PDF directory is ever changed.
chroma_persist_path = os.path.join(pdf_directory_path, "chroma_db_langchain")
print(f"ChromaDB persistent path set to: {chroma_persist_path}")

# Ensure ChromaDB directory exists (best-effort: a failure is only reported)
try:
    os.makedirs(chroma_persist_path, exist_ok=True)
    print(f"Directory '{chroma_persist_path}' ensured.")
except Exception as e:
    print(f"Warning: Could not create directory {chroma_persist_path}: {e}")

In [None]:
# Load Documents

print(f"\n--- Loading Documents from {pdf_directory_path} using PyPDFDirectoryLoader ---")
# Loader and path helper used by this cell
from langchain_community.document_loaders import PyPDFDirectoryLoader
from pathlib import Path

pdf_directory = Path(pdf_directory_path)
docs = []  # stays empty unless loading succeeds

if pdf_directory.is_dir():
    try:
        # The directory loader walks the folder itself; recursive=False keeps it
        # to the top level (set True if PDFs live in subfolders).
        loader = PyPDFDirectoryLoader(path=pdf_directory_path, recursive=False)
        docs = loader.load()

        if not docs:
            print("No text-based PDF documents were loaded.")
            print("Check if the directory contains text-based PDFs or if they are corrupted.")
        else:
            print(f"Loaded {len(docs)} documents (pages) from text-based PDFs.")
            # Report which files contributed and show one metadata sample
            sources = {doc.metadata.get('source', 'Unknown') for doc in docs}
            print("Sources loaded:", sources)
            print("Sample document metadata (first page):", docs[0].metadata)

    except Exception as e:
        print(f"An error occurred loading PDFs with PyPDFDirectoryLoader: {e}")
        print("Ensure only text-based PDFs are in the directory or that 'pypdf' library is working.")
        docs = []  # discard anything partially loaded
else:
    print(f"Error: Directory not found at {pdf_directory_path}")

# Downstream cells check `docs`; warn here when there is nothing to work with.
if not docs:
    print("\nWarning: No documents were loaded. Subsequent steps might fail.")

In [None]:
# Split Documents into Chunks
splits = []
if not docs:
    print("\nSkipping splitting: No documents loaded.")
else:
    print("\n--- Splitting Documents into Chunks ---")
    # 2000-character chunks with a 300-character overlap, so neighbouring chunks
    # share some context; tune both to the embedding model's limits.
    text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=300, chunk_size=2000)
    splits = text_splitter.split_documents(docs)
    print(f"Split into {len(splits)} chunks.")
    if len(splits) > 0:
        print("Sample chunk metadata:", splits[0].metadata)

In [None]:
# Initialize Embedding Model
embedding_model = None  # remains None when the key is missing or construction fails
if not GOOGLE_API_KEY:
    print("\nSkipping embedding model initialization: API Key not configured.")
else:
    print("\n--- Initializing Gemini Embedding Model ---")
    try:
        # Gemini embedding model reached through the API key configured earlier
        embedding_model = GoogleGenerativeAIEmbeddings(
            google_api_key=GOOGLE_API_KEY,
            model="models/text-embedding-004",
        )
        print(f"Initialized embedding model: {embedding_model.model}")
    except Exception as e:
        print(f"Error initializing embedding model: {e}")

In [None]:
# Create or Load Vector Store
# Reuses the persisted Chroma store when it already holds data; otherwise embeds
# `splits` and writes a new store to `chroma_persist_path`.
vectorstore = None
db_creation_needed = True # Flag to check if we need to create the DB

if embedding_model and splits:
    print("\n--- Setting up Chroma Vector Store ---")
    print(f"Using persistent path: {chroma_persist_path}")

    # Only try to load when the path is a directory that already has content
    if os.path.isdir(chroma_persist_path) and os.listdir(chroma_persist_path):
        print("Existing ChromaDB directory found. Attempting to load...")
        try:
            vectorstore = Chroma(
                persist_directory=chroma_persist_path,
                embedding_function=embedding_model
            )
            # Probe the stored records directly instead of running a similarity search:
            # a search needs an embedding API call, and a transient API failure would be
            # mistaken for a broken store, causing every chunk to be re-embedded and
            # appended to the existing collection as duplicates.
            stored = vectorstore.get(limit=1)
            if stored.get("ids"):
                print("Successfully loaded existing vector store.")
                db_creation_needed = False
            else:
                print("Loaded directory, but store seems empty or invalid. Will recreate.")
                # Consider clearing the directory here if needed: shutil.rmtree(chroma_persist_path)
        except Exception as e:
            print(f"Error loading existing vector store: {e}. Will attempt to recreate.")
            # Consider clearing the directory here if needed: shutil.rmtree(chroma_persist_path)

    if db_creation_needed:
        print("Creating new Chroma vector store...")
        print(f"Embedding {len(splits)} chunks. This may take a significant amount of time...")
        try:
            # Embed the document splits with the Gemini embedding model and persist them
            vectorstore = Chroma.from_documents(
                documents=splits,
                embedding=embedding_model,
                persist_directory=chroma_persist_path # Save persistently
            )
            print("New Chroma vector store created and documents embedded.")
        except Exception as e:
            print(f"Error creating new Chroma vector store: {e}")
            vectorstore = None  # do not leave a half-initialised store behind
    else:
        print("Using previously loaded vector store.")

elif not embedding_model:
    print("\nSkipping vector store setup: Embedding model not initialized.")
else:
    # Reached only when the embedding model exists but there are no splits
    print("\nSkipping vector store setup: No document splits available.")

In [None]:
# Create Retriever
# Builds a top-20 similarity retriever and, when possible, wraps it with a
# cross-encoder reranker that keeps the best 3 documents. If the reranker cannot
# be set up, the plain top-20 retriever is used instead.
retriever = None
if vectorstore:
    print("\n--- Creating Retriever ---")

    base_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})  # Fetch more for potential reranking
    print("Base retriever created (fetches top 20).")

    try:
        print("Attempting to set up CrossEncoder reranker using LangChain wrapper...")

        from langchain_community.cross_encoders import HuggingFaceCrossEncoder

        # The wrapper accepts keyword arguments only; passing the model name
        # positionally raises TypeError and silently disabled reranking.
        hf_cross_encoder_model = HuggingFaceCrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')

        reranker = CrossEncoderReranker(
            model=hf_cross_encoder_model,  # Use LangChain wrapper as model
            top_n=3                        # Return top 3 after reranking
        )
        # Use the reranker as a context compression retriever
        reranker_retriever = ContextualCompressionRetriever(
            base_compressor=reranker, base_retriever=base_retriever
        )

        retriever = reranker_retriever  # Use the reranking retriever
        print("Contextual Compression Retriever with reranker created (returns top 3).")

    except ImportError:
        # Handle case where LangChain or sentence-transformers might not be fully installed
        print("ImportError occurred. Ensure 'sentence-transformers' and 'langchain' are installed. Using base retriever instead.")
        retriever = base_retriever

    except Exception as e:
        # Handle other exceptions
        print(f"Could not set up reranker: {e}. Using base retriever instead.")
        retriever = base_retriever  # Fallback to base retriever

else:
    print("\nSkipping retriever creation: Vector store not available.")


In [None]:
# Create Retriever (Direct Implementation of Re-ranking)
# Resets `retriever` and rebuilds it as a reranking retriever. Unlike a plain
# similarity retriever there is no fallback here: on any failure it stays None.
retriever = None

if not vectorstore:
    print("\nSkipping retriever creation: Vector store not available from previous step.")
else:
    print("\n--- Creating Retriever with Re-ranking ---")
    try:
        # Imports live inside the try so a missing package is reported below
        from langchain_community.cross_encoders import HuggingFaceCrossEncoder
        from langchain.retrievers.document_compressors import CrossEncoderReranker
        from langchain.retrievers import ContextualCompressionRetriever

        # Stage 1: similarity search supplies 20 candidates
        base_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
        print("Base retriever created (fetches top 20).")

        # Stage 2: cross-encoder scores the candidates; only the best 3 are kept
        print("Initializing CrossEncoder model...")
        hf_cross_encoder_model = HuggingFaceCrossEncoder(
            model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'
        )
        reranker = CrossEncoderReranker(top_n=3, model=hf_cross_encoder_model)
        print("CrossEncoderReranker component created (will return top 3).")

        # Stage 3: chain both stages behind a single retriever interface
        compression_retriever = ContextualCompressionRetriever(
            base_retriever=base_retriever,
            base_compressor=reranker,
        )
        retriever = compression_retriever
        print("Contextual Compression Retriever setup complete.")

    except Exception as e:
        if isinstance(e, ImportError):
            # A required library is missing
            print(f"ImportError: Could not import necessary components ({e}).")
            print("Ensure 'langchain_community' and 'sentence-transformers' are installed.")
        else:
            # Anything else that went wrong while wiring the retriever
            print(f"An error occurred during retriever setup: {e}")
        print("Retriever setup failed.")

# Final check
print("\nRetriever is ready." if retriever else "\nRetriever was not successfully created.")

In [None]:
# Initialize LLM
llm = None  # remains None when the key is missing or construction fails
if not GOOGLE_API_KEY:
    print("\nSkipping LLM initialization: API Key not configured.")
else:
    print("\n--- Initializing Gemini LLM ---")
    try:
        # NOTE(review): confirm this model id is still served by the Gemini API.
        llm = ChatGoogleGenerativeAI(
            google_api_key=GOOGLE_API_KEY,
            model="gemini-1.5-flash-latest",
            convert_system_message_to_human=True,
            temperature=0.0,  # keep answers deterministic and factual
        )
        print(f"Initialized LLM: {llm.model}")
    except Exception as e:
        print(f"Error initializing LLM: {e}")

In [None]:
# Define RAG Chain
# Wires retriever -> prompt -> LLM -> string output. `rag_chain` stays None when
# either the retriever or the LLM is unavailable.
rag_chain = None
if retriever and llm:
    print("\n--- Defining RAG Chain ---")
    # Define prompt template
    template = """
You are an intelligent assistant helping users understand insurance policies.

Your job is to answer the question strictly based on the provided context.
Do not use any prior knowledge or external sources. If the answer is not in the context, reply:
"The answer is not found in the provided documents."

Instructions:
- Only answer using facts present in the context.
- Summarize the relevant information clearly and accurately.
- When possible, cite the document name and page number like this: [Source: document_name, Page: page_number].

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = ChatPromptTemplate.from_template(template)

    def _display_page(page):
        """Return a one-based page number for citations.

        The PDF loader stores `page` as a zero-based index, which would make the
        model cite "Page: 0" for the first page. Non-integer values (such as the
        'N/A' placeholder) are passed through unchanged.
        """
        if isinstance(page, int) and not isinstance(page, bool):
            return page + 1
        return page

    # Function to format retrieved documents
    def format_docs(docs):
        """Join retrieved documents into one context string.

        Each document contributes a "Source: ..., Page: ..." header line followed
        by its content; documents are separated by a blank line.
        """
        return "\n\n".join(
            f"Source: {doc.metadata.get('source', 'Unknown')}, "
            f"Page: {_display_page(doc.metadata.get('page', 'N/A'))}\n"
            f"Content: {doc.page_content}"
            for doc in docs
        )

    # Define the chain using LangChain Expression Language (LCEL):
    # the question is passed through as-is and also drives retrieval for the context.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    print("RAG chain defined successfully.")
else:
    print("\nCannot define RAG chain: Retriever or LLM not available.")

In [None]:
import textwrap


def _format_answer(response_text, width=80):
    """Wrap a model response to `width` columns, one paragraph per input line.

    Line breaks in the response are kept and blank lines stay blank. Inline
    citations such as "[Source: file.pdf, Page: 3]" are left untouched; the
    previous implementation split the text on "[Source:" and thereby dropped
    that prefix from every citation.
    """
    wrapper = textwrap.TextWrapper(width=width, break_long_words=False)
    wrapped = []
    for line in str(response_text).strip().splitlines():
        line = line.strip()
        wrapped.append(wrapper.fill(line) if line else "")
    return "\n".join(wrapped)


# Run Query
# Interactive loop: reads a question, runs the RAG chain, prints the wrapped answer.
# Type 'quit' (or interrupt the cell) to leave the loop.
if rag_chain:
    print("\n--- Ready to Query ---\n")

    while True:
        # Reading input is handled separately from answering: if stdin is not
        # available (e.g. a non-interactive run) the session must end instead of
        # reporting the same error forever.
        try:
            user_query = input("Enter your question (or type 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting the query session. Goodbye!")
            break
        except Exception as e:
            print(f"\nCould not read input: {e}. Ending the query session.")
            break

        if user_query.lower() == 'quit':
            print("\nExiting the query session. Goodbye!")
            break
        if not user_query:
            print("\nPlease enter a valid query.")
            continue

        try:
            print("\nGenerating response...\n")
            final_response = rag_chain.invoke(user_query)

            print("-"*50)
            print("                  Final Answer")
            print("-"*50)

            # Print the answer wrapped to 80 columns, citations kept inline
            print("\n** Summary **\n")
            print(_format_answer(final_response, width=80))

        except KeyboardInterrupt:
            print("\nExiting the query session. Goodbye!")
            break
        except Exception as e:
            # A failed query is reported; the session continues with the next question
            print(f"\nAn error occurred: {e}")
else:
    print("\nCannot run queries: RAG chain was not set up successfully.")
