If you use a Notebook, use the Markdown cells (text cells) to explain your logic. Explain why you chose a chunk_size of 1000 and how you managed the API key. This shows a "Senior" level of documentation.


Why this happens (The "Interview Answer")
If an interviewer asks about the ModuleNotFoundError raised by the import cell below, you can give a concise, technically grounded answer:

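"During development, I encountered a ModuleNotFoundError caused by the LangChain v1.0 migration, which moved legacy chains such as RetrievalQA out of the core langchain package. I resolved it by using the langchain-classic package for legacy chain support and by installing langchain-community, which ships separately, for the PDF loader and vector store. This experience taught me the importance of managing dependency drift and staying current with documentation in rapidly evolving ecosystems like GenAI."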

In [1]:
# Cell 1: Install needed packages (only run once, or as needed)
!pip install langchain-google-genai langchain chromadb python-dotenv

Collecting langchain-google-genai
  Using cached langchain_google_genai-4.2.0-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain
  Downloading langchain-1.2.4-py3-none-any.whl.metadata (4.9 kB)
Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-win_amd64.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-genai<2.0.0,>=1.56.0 (from langchain-google-genai)
  Using cached google_genai-1.58.0-py3-none-any.whl.metadata (53 kB)
Collecting langchain-core<2.0.0,>=1.2.5 (from langchain-google-genai)
  Using cached langchain_core-1.2.7-py3-none-any.whl.metadata (3.7 kB)
Collecting google-auth<3.0.0,>=2.47.0 (from google-auth[requests]<3.0.0,>=2.47.0->google-genai<2.0.0,>=1.56.0->langchain-google-genai)
  Using cached google_auth-2.47.0-py3-none-any.whl.metadata (6.4 kB)
Collecting tenacity<9.2.0,>=8.2.3 (from google-genai<2.0.0,>=1.56.0->langchain-google-genai)
 

In [3]:
# Cell 2: Import libraries and load .env (don't edit .env here, just load)
import os
import shutil

from dotenv import load_dotenv

# PDF/Text/vector DB
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Gemini AI and retrieval
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    GoogleGenerativeAIEmbeddings,
)
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import ChatPromptTemplate

# Load your API keys from a .env file into the process environment (make sure
# .env is in the same folder as your notebook). Later cells read GOOGLE_API_KEY
# with os.getenv, so the key itself never appears in the notebook.
load_dotenv()

ModuleNotFoundError: No module named 'langchain_community'

In [None]:
# Cell 3: Central configuration - models, prompt text, and storage location

# Gemini chat model used to generate answers
MODEL_NAME = "models/gemini-flash-lite-latest"

# Embedding model used to vectorise the policy chunks
# NOTE(review): confirm this embedding model is still served by the API before relying on it
EMBEDDING_MODEL = "models/text-embedding-004"

# Instruction text meant to keep answers grounded in the policy document
SYSTEM_PROMPT = "".join(
    [
        "You are an insurance policy assistant. ",
        "Answer strictly based on the provided policy text. ",
        "If the answer is not in the document, say so.",
    ]
)

# Directory in which the ChromaDB vector store is persisted
DB_PATH = "./insurance_db"

In [None]:
# Cell 4: Define a function to build and save the vector database

def setup_vector_db(pdf_path="policy.pdf", chunk_size=1000, chunk_overlap=100):
    """Load a PDF, split it into chunks, embed them, and persist a vector DB.

    Any existing directory at DB_PATH is deleted and rebuilt from scratch, but
    only after the PDF has been loaded and split successfully, so a document
    that yields no text cannot wipe a previously built database.

    Args:
        pdf_path: Location of the policy PDF handed to PyPDFLoader.
        chunk_size: Maximum size of each text chunk passed to the splitter.
        chunk_overlap: Amount of overlap between consecutive chunks, so a
            clause cut at a chunk boundary still appears whole in one chunk.

    Returns:
        The Chroma vector store built from the document chunks.

    Raises:
        ValueError: If the PDF produced no text chunks (for example an
            image-only scan with no extractable text).
    """
    print("--- Step 1: Loading Insurance Policy Document ---")
    documents = PyPDFLoader(pdf_path).load()

    print("--- Step 2: Chunking Text ---")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)

    # Fail before touching the existing database: with nothing to index, the
    # old store would otherwise be deleted and then not replaced.
    if not chunks:
        raise ValueError(
            f"No text chunks could be extracted from {pdf_path!r}; "
            "the existing vector database was left untouched."
        )

    print("--- Step 3: Creating Google Embeddings ---")
    embeddings = GoogleGenerativeAIEmbeddings(
        model=EMBEDDING_MODEL,
        google_api_key=os.getenv("GOOGLE_API_KEY"),
    )

    print("--- Step 4: Building Vector Database ---")
    # Start from an empty directory so chunks from an earlier run are not mixed in.
    if os.path.exists(DB_PATH):
        shutil.rmtree(DB_PATH)

    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=DB_PATH,
    )
    return vector_db

In [None]:
# Cell 5: Define the QA function for querying the loaded PDF

def query_policy(question, vector_db):
    """Answer a question about the policy and return the supporting snippets.

    SYSTEM_PROMPT is delivered to the model as the system message of the
    "stuff" chain's prompt, together with the retrieved policy text. It is not
    passed to the ChatGoogleGenerativeAI constructor.

    Args:
        question: Natural-language question about the policy.
        vector_db: Vector store exposing ``as_retriever()``, e.g. the one
            returned by ``setup_vector_db``.

    Returns:
        A tuple ``(answer, snippets)`` where ``answer`` is the chain's result
        text and ``snippets`` is a list of ``{"page": ..., "text": ...}``
        dicts, one per retrieved source document. ``page`` is whatever the
        loader stored in the document metadata (``None`` if absent).

    Raises:
        ValueError: If ``question`` is not a non-empty string.
    """
    if not isinstance(question, str) or not question.strip():
        raise ValueError("question must be a non-empty string")

    llm = ChatGoogleGenerativeAI(
        model=MODEL_NAME,
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0,
    )

    # Escape braces so the instruction text is treated literally by the
    # template engine; only {context} and {question} are real placeholders.
    system_text = SYSTEM_PROMPT.replace("{", "{{").replace("}", "}}")
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_text + "\n\nPolicy text:\n{context}"),
            ("human", "{question}"),
        ]
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    response = qa_chain.invoke({"query": question})
    answer = response["result"]
    snippets = [
        {"page": doc.metadata.get("page"), "text": doc.page_content}
        for doc in response["source_documents"]
    ]
    return answer, snippets

In [None]:
# Cell 6: Index the policy document.
# Put your PDF next to this notebook under the name "policy.pdf" before running.

db = setup_vector_db("policy.pdf")

In [None]:
# Cell 7: Ask a question (customize as you wish!)

question = "What does the policy say about 'unexpected serious illness'?"
answer, snippets = query_policy(question, db)

print("\nAI Answer based on Insurance Document:")
print(answer)

print("\nSource Text Snippets:")
for s in snippets:
    page = s["page"]
    # PyPDFLoader records zero-based page indices; show the 1-based number a
    # reader would see in a PDF viewer. Non-integer values (e.g. None when the
    # metadata is missing) are printed unchanged.
    page_label = page + 1 if isinstance(page, int) else page
    print(f"\n--- Page {page_label} ---\n{s['text']}")