In [None]:
## Step 1 Install and import dependencies
%pip install --upgrade langchain langchain-core langchain-community langchain-text-splitters faiss-cpu pypdf chromadb
%pip install gradio --quiet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import OpenAI
from langchain_core.prompts import ChatPromptTemplate
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [23]:
%pip install pypdf  

Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.4.0-py3-none-any.whl (329 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.4.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
## Step 2 Load the Document
# Use PyPDFLoader for PDF files
from langchain_community.document_loaders import PyPDFLoader

pdf_path = r"C:\Users\schal\Documents\Saruchi\Gen_AI_Training\Practice\Agents\data\1728286846_the_nestle_hr_policy_pdf_2012.pdf"
pdf_loader = PyPDFLoader(pdf_path)
documents = pdf_loader.load()

print(f"Loaded {len(documents)} pages from PDF")
print(f"First page content (first 200 chars):\n{documents[0].page_content[:200]}")

Loaded 8 pages from PDF
First page content (first 200 chars):
Policy
Mandatory
September  2012
The Nestlé  
Human Resources Policy


In [26]:
## Step 3: Split loaded documents into chunks
# Use the 'documents' variable from the previous cell
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)
print(len(chunks), "document chunks created.")
    


35 document chunks created.


In [27]:
# Step 4: Generate embeddings 
MODEL_NAME = "all-MiniLM-L6-v2"
hf_embed = HuggingFaceEmbeddings(model_name=MODEL_NAME)

# Test embedding on first chunk
text = chunks[0].page_content
hf_embed_result = hf_embed.embed_documents([text])
print(f"Embedding dimension: {len(hf_embed_result[0])}")



Embedding dimension: 384


In [28]:
## Step 5 Build the ChromaDB vector store (direct chromadb usage)
import chromadb
# Import embeddings class (ensure available even if import cell wasn't run)
from langchain_community.embeddings import HuggingFaceEmbeddings

# Provide a lightweight Document class fallback if LangChain schema is not installed
try:
    # prefer langchain_core if available
    from langchain_core.schema import Document
except Exception:
    try:
        from langchain.schema import Document
    except Exception:
        from dataclasses import dataclass
        @dataclass
        class Document:
            page_content: str
            metadata: dict = None

# Ensure MODEL_NAME and chunks are defined (from earlier cells)
MODEL_NAME = "all-MiniLM-L6-v2"
hf_embed = HuggingFaceEmbeddings(model_name=MODEL_NAME)

# Create chroma client (avoid deprecated Settings keys)
# Use default client constructor which is compatible across Chroma versions.
chroma_client = chromadb.Client()
collection_name = "genai_training_collection"
collection = chroma_client.get_or_create_collection(name=collection_name)

# Prepare documents and embeddings
doc_texts = [c.page_content for c in chunks]
ids = [str(i) for i in range(len(doc_texts))]

print("Computing embeddings for documents (this may take a moment)...")
# Compute embeddings in batches to avoid memory spikes
batch_size = 64
embeddings_list = []
for i in range(0, len(doc_texts), batch_size):
    batch = doc_texts[i:i+batch_size]
    embeddings_list.extend(hf_embed.embed_documents(batch))

# Add to collection (ids, documents, embeddings)
# If collection already contains data with same ids, add will error; consider delete+add if re-running
try:
    collection.add(ids=ids, documents=doc_texts, embeddings=embeddings_list)
except Exception as e:
    print("Warning: collection.add raised:", e)
    # Try upsert if available
    try:
        collection.upsert(ids=ids, documents=doc_texts, embeddings=embeddings_list)
    except Exception:
        # as a last resort, clear and add
        try:
            collection.delete()
            collection.add(ids=ids, documents=doc_texts, embeddings=embeddings_list)
        except Exception as e2:
            print("Failed to add documents to Chroma collection:", e2)

# Persist where possible
try:
    # Newer versions: collection.persist()
    collection.persist()
except Exception:
    try:
        chroma_client.persist()
    except Exception:
        pass

# Simple retriever wrapper returning Document objects
class ChromaRetriever:
    def __init__(self, collection, embedder, k=3):
        self.collection = collection
        self.embedder = embedder
        self.k = k
    def get_relevant_documents(self, query):
        # compute query embedding
        if hasattr(self.embedder, "embed_query"):
            q_emb = self.embedder.embed_query(query)
        else:
            q_emb = self.embedder.embed_documents([query])[0]
        res = self.collection.query(query_embeddings=[q_emb], n_results=self.k, include=["documents", "distances"]) 
        docs = []
        for doc_text in res.get("documents", [[]])[0]:
            docs.append(Document(page_content=doc_text))
        return docs

retriever = ChromaRetriever(collection, hf_embed, k=3)
print(f"Chroma collection '{collection_name}' built with {len(doc_texts)} documents")

Computing embeddings for documents (this may take a moment)...
Chroma collection 'genai_training_collection' built with 35 documents
Chroma collection 'genai_training_collection' built with 35 documents


In [35]:
# Step 6: Build a question-answering system using the GPT model (citation-aware, structured output)
import os
import json
import re
from langchain_community.llms import OpenAI
from langchain_core.prompts import ChatPromptTemplate


def _extract_text_from_llm_result(res):
    try:
        gens = getattr(res, "generations", None)
        if gens:
            first_list = gens[0]
            if isinstance(first_list, (list, tuple)) and len(first_list) > 0:
                first = first_list[0]
                if hasattr(first, "text"):
                    return first.text
                if isinstance(first, dict) and "text" in first:
                    return first["text"]
        if isinstance(res, dict):
            choices = res.get("choices")
            if choices and len(choices) > 0:
                first = choices[0]
                if isinstance(first, dict):
                    msg = first.get("message") or first.get("delta")
                    if isinstance(msg, dict) and "content" in msg:
                        return msg["content"]
                    if "text" in first:
                        return first["text"]
        return str(res)
    except Exception:
        return str(res)


def answer_question_with_context(query, top_k=3, temperature=0.2):
    """Retrieve top passages, build a citation-aware prompt, call the LLM, and return structured output: {answer: str, sources: [{id:int, excerpt:str}]}

    Returns (result_dict, retrieved_texts) where result_dict contains keys "answer" and "sources".
    """
    docs = retriever.get_relevant_documents(query)
    if not docs:
        return {"answer": "No relevant documents found.", "sources": []}, []

    # Keep only top_k docs
    docs = docs[:top_k]
    retrieved_texts = [d.page_content for d in docs]

    # Build numbered context with explicit source ids
    numbered_passages = [f"[source {i+1}] {txt}" for i, txt in enumerate(retrieved_texts)]
    context = "\n\n".join(numbered_passages)

    # Ask the model to return JSON with answer and sources array (id + short_excerpt)
    prompt_template = ChatPromptTemplate.from_template(
        "You are a precise assistant. Use ONLY the provided context to answer the question.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "Instructions:\n"
        "- Answer concisely (1-3 sentences).\n"
        "- If the answer cannot be found in the context, reply exactly: \"I don't know.\"\n"
        "- Provide a JSON object as the ONLY output with keys: 'answer' (string) and 'sources' (array).\n"
        "- Each element in 'sources' must be an object with keys 'id' (the source number) and 'excerpt' (a short excerpt <= 200 chars from that source used to support the answer).\n"
        "Example output format:\n{{'answer': '...', 'sources': [{{'id': 1, 'excerpt': '...'}}, ...]}}\n\n"
        "Return only the JSON object (no additional commentary)."
    )

    formatted = prompt_template.format(context=context, question=query)

    # instantiate LLM
    llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), temperature=temperature)

    # Try multiple invocation styles
    text = None
    try:
        if hasattr(llm, "generate"):
            res = llm.generate([formatted])
            text = _extract_text_from_llm_result(res)
    except Exception:
        pass

    if text is None:
        try:
            if hasattr(llm, "predict"):
                text = llm.predict(formatted)
        except Exception:
            pass

    if text is None:
        try:
            if callable(llm):
                text = llm(formatted)
        except Exception:
            pass

    # Fallback to OpenAI SDK if nothing yet
    if text is None:
        try:
            from openai import OpenAI as OpenAIClient
            client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"))
            resp = client.chat.completions.create(
                model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
                messages=[{"role": "user", "content": formatted}],
                max_tokens=600,
            )
            text = _extract_text_from_llm_result(resp)
        except Exception as e:
            return {"answer": f"LLM call failed: {e}", "sources": []}, retrieved_texts

    # Try to extract JSON from model output
    parsed = None
    try:
        # 1) direct parse
        parsed = json.loads(text)
    except Exception:
        # 2) try to find JSON substring
        m = re.search(r"\{[\s\S]*\}", text)
        if m:
            try:
                parsed = json.loads(m.group(0))
            except Exception:
                parsed = None

    # If parsing failed, attempt to build structured result heuristically
    if not isinstance(parsed, dict):
        # Heuristic: treat entire text as answer, and list sources = []
        result = {"answer": text.strip(), "sources": []}
        return result, retrieved_texts

    # Normalize parsed structure
    answer = parsed.get("answer") if isinstance(parsed.get("answer"), str) else str(parsed.get("answer", ""))
    sources_out = []
    for s in parsed.get("sources", []):
        try:
            sid = int(s.get("id")) if isinstance(s, dict) and s.get("id") is not None else None
            excerpt = s.get("excerpt") if isinstance(s, dict) else str(s)
            if sid is None:
                # try to infer id by searching excerpt in retrieved_texts
                sid = None
                for i, txt in enumerate(retrieved_texts, start=1):
                    if excerpt and excerpt.strip()[:40] in txt:
                        sid = i
                        break
            sources_out.append({"id": sid, "excerpt": excerpt})
        except Exception:
            continue

    result = {"answer": answer.strip(), "sources": sources_out}
    return result, retrieved_texts


# Example usage
if __name__ == "__main__":
    q = "What does the Nestle HR policy say about maternity leave?"
    res_struct, ctx = answer_question_with_context(q)
    print("Structured result:\n", json.dumps(res_struct, indent=2))
    print("\nRetrieved contexts:\n")
    for i, c in enumerate(ctx, 1):
        print(f"[{i}] {c[:400]}\n")

Structured result:
 {
  "answer": "{'answer': 'The Nestle HR policy does not explicitly mention maternity leave.', 'sources': [{'id': 1, 'excerpt': 'This document encompasses the guidelines which constitute a solid basis for effective Human Resources Management throughout the Nestl\u00e9 Group around the world.'}, {'id': 2, 'excerpt': 'Mandatory'}, {'id': 3, 'excerpt': 'Nestl\u00e9 not only upholds the freedom of association of its employees and the effective recognition of the right to collective bargaining,'}]}",
  "sources": []
}

Retrieved contexts:

[1] The Nestlé Human Resources Policy
1
At Nestlé, we recognize that our employees 
are the key to our success and nothing can be 
achieved without their engagement. 
This document encompasses the guidelines 
which constitute a solid basis for effective Human 
Resources Management throughout the Nestlé 
Group around the world. It explains to all Nestlé 
employees the vision and mission of the Human 
R

[2] Policy
Mandatory
September  2

In [36]:
# Step 7: Prompt template to guide the chatbot
from langchain_core.prompts import ChatPromptTemplate

# Reusable prompt template for conversational QA that uses retrieved context and cites sources.
prompt_template_chatbot = ChatPromptTemplate.from_template(
    "You are a concise, factual assistant. Use ONLY the provided context to answer the user's question.\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}\n\n"
    "Instructions:\n"
    "- Answer concisely (1-3 sentences).\n"
    "- Cite supporting sources inline using [source N] markers that correspond to the numbered context entries.\n"
    "- If the answer cannot be found in the context, reply exactly: \"I don't know.\"\n"
    "- After the answer, include a short 'Sources:' line listing used source numbers, e.g. 'Sources: [source 1], [source 3]'.\n"
)

# Helper to build numbered context from retrieved texts
def build_numbered_context(retrieved_texts: list[str]) -> str:
    """Return a single string with numbered sources suitable for the prompt template."""
    return "\n\n".join([f"[source {i+1}] {t}" for i, t in enumerate(retrieved_texts)])

# Example (commented) showing how to format the prompt before calling your LLM wrapper:
# numbered = build_numbered_context(retrieved_texts)
# formatted_prompt = prompt_template_chatbot.format(context=numbered, question="Your question here")
# print(formatted_prompt)


In [None]:
## • Step 8: Use Gradio to build a user-friendly chatbot interface, enabling interaction and information retrieval.

In [None]:
## Install Gradio
%pip install gradio --quiet

Collecting gradio
  Downloading gradio-6.0.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting audioop-lts<1.0 (from gradio)
  Downloading audioop_lts-0.2.2-cp313-abi3-win_amd64.whl.metadata (2.0 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading brotli-1.2.0-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.123.5-py3-none-any.whl.metadata (30 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-1.0.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==2.0.1 (from gradio)
  Downloading gradio_client-2.0.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Gradio chatbot interface for RAG system — return structured JSON result
import gradio as gr
import json
import re


def run_query(question: str):
    """Run the QA pipeline and return the structured JSON result and retrieved contexts."""
    if not question or not question.strip():
        return json.dumps({"error": "Please enter a question."}, indent=2), ""
    try:
        res_struct, contexts = answer_question_with_context(question)
    except Exception as e:
        return json.dumps({"error": f"Error running QA pipeline: {e}"}, indent=2), ""

    # Ensure res_struct is serializable dict
    try:
        result_json = json.dumps(res_struct, indent=2, ensure_ascii=False)
    except Exception:
        # Fallback: convert to string inside JSON
        result_json = json.dumps({"answer": str(res_struct)}, indent=2, ensure_ascii=False)

    # Short preview of retrieved contexts
    contexts_text = "\n\n".join([f"[{i+1}] {c[:600]}" for i, c in enumerate(contexts)]) if contexts else ""

    return result_json, contexts_text


def launch_gradio():
    iface = gr.Interface(
        fn=run_query,
        inputs=gr.Textbox(lines=2, placeholder="Ask a question about the uploaded documents...", label="Question"),
        outputs=[
            gr.Textbox(label="Result (JSON)"),
            gr.Textbox(label="Retrieved Contexts (truncated)")
        ],
        title="Nestlé Chatbot",
        description="Ask questions about the documents indexed in the Chroma collection. Returns structured JSON result and retrieved contexts.",
    )
    iface.launch()

# Launch when run as a cell (uncomment to start the UI):
launch_gradio()


* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.
* To create a public link, set `share=True` in `launch()`.


: 