# AI‑Powered HR Assistant (Nestlé HR Policy, 2012)
This notebook walks through building a Retrieval‑Augmented Generation (RAG) chatbot:
1) Parse PDFs
2) Chunk & embed text
3) Store in ChromaDB
4) Retrieve & answer with OpenAI chat models
5) Optional Gradio UI

**Note:** Set your `OPENAI_API_KEY` in the environment before running.

In [None]:
# Shell escape: print the version of the `python` executable found on PATH.
# NOTE(review): this may not be the interpreter the notebook kernel runs — confirm if the version matters.
!python --version


## 1) Setup

In [None]:
# Install the packages listed in the project's requirements file; -q keeps pip's output quiet.
%pip -q install -r /mnt/data/hr_assistant_project/requirements.txt


## 2) Configuration

In [None]:
import os, re, hashlib
from typing import List, Tuple, Dict
from dataclasses import dataclass

import numpy as np
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from pypdf import PdfReader
from openai import OpenAI

# Model names can be overridden through environment variables; the second
# argument to os.getenv is the fallback used when the variable is unset.
DEFAULT_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o-mini")  # or "gpt-3.5-turbo"
DEFAULT_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
# Directory handed to chromadb.PersistentClient for on-disk storage.
CHROMA_DIR = "./chroma_hr_policy_db"

# PDFs to parse and index. Paths that do not exist are reported and skipped by build_corpus.
# NOTE(review): the last two entries are not the HR policy itself, yet they are
# indexed into the same collection — confirm this is intended.
DOC_PATHS = [
    "/mnt/data/the_nestle_hr_policy_pdf_2012.pdf",
    "/mnt/data/Course_End_Project_Crafting_an_AI_Powered_HR_Assistant.pdf",
    "/mnt/data/Gradio_Documentation.pdf",
]

# Sent as the "system" message of every chat completion request.
SYSTEM_PROMPT = """You are an HR policy assistant specialized in Nestlé's HR Policy (2012).
Answer ONLY from the provided documents. If not found, say you don't know. Cite filename and page."""

# Sent as the "user" message; filled in with str.format using the
# {question} and {context} placeholders.
ANSWER_PROMPT_TEMPLATE = """Question:
{question}

Retrieved context:
{context}

Answer from the context with citations like [source: filename.pdf p.X]. If unknown, say so."""

def _require_api_key():
    """Fail fast when no OpenAI credential is configured.

    Raises:
        RuntimeError: if the OPENAI_API_KEY environment variable is unset or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        return
    raise RuntimeError("Set OPENAI_API_KEY in your environment.")


## 3) PDF Loading & Chunking

In [None]:
def load_pdf(path: str) -> List[Tuple[int, str]]:
    """Extract the text of every page in a PDF.

    Args:
        path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        One ``(page_number, text)`` tuple per page, numbered from 1. Runs of
        whitespace are collapsed to a single space and the result is stripped.
        A page whose extraction raises, or yields nothing, is kept with an
        empty string so that page numbering stays aligned with the document.
    """
    def _clean_page_text(page) -> str:
        # Best-effort extraction: a failing page becomes "" rather than aborting the file.
        try:
            raw = page.extract_text() or ""
        except Exception:
            raw = ""
        return re.sub(r"\s+", " ", raw).strip()

    reader = PdfReader(path)
    return [
        (number, _clean_page_text(page))
        for number, page in enumerate(reader.pages, start=1)
    ]

def chunk_text(text: str, size: int=900, overlap: int=150) -> List[str]:
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``size - overlap`` characters apart, so the last
    ``overlap`` characters of one chunk are repeated at the start of the next.
    The final chunk may be shorter than ``size``.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        The chunks in document order; an empty list for empty ``text``.

    Raises:
        ValueError: if ``size`` or ``overlap`` is out of range. Without this
            check the window would never advance and the loop would not end.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    stride = size - overlap  # guaranteed >= 1 by the checks above
    total = len(text)
    chunks = []
    for start in range(0, total, stride):
        chunks.append(text[start:start + size])
        # Stop once a window reaches the end, so no trailing chunk is made
        # up purely of already-emitted overlap.
        if start + size >= total:
            break
    return chunks

from dataclasses import dataclass
@dataclass
class ChunkMeta:
    """Provenance record for one text chunk produced by ``build_corpus``."""
    # Base filename of the PDF the chunk came from (directory part removed).
    source: str
    # Page number within that PDF, counted from 1.
    page: int
    # SHA-1 hex digest used as the chunk's ID when it is added to the collection.
    chunk_id: str

def build_corpus(paths: List[str]):
    """Turn a list of PDF paths into parallel lists of chunks and metadata.

    Args:
        paths: PDF file paths. A path that does not exist is reported with a
            ``Missing:`` line on stdout and skipped.

    Returns:
        ``(texts, metas)`` where ``texts[i]`` is a chunk of page text and
        ``metas[i]`` is the ``ChunkMeta`` describing where it came from.
        Pages with no extracted text contribute nothing.
    """
    texts = []
    metas = []
    for path in paths:
        if not os.path.exists(path):
            print("Missing:", path)
            continue
        for page, page_text in load_pdf(path):
            if not page_text:
                continue
            for index, piece in enumerate(chunk_text(page_text)):
                # The ID is derived from path, page, chunk position and chunk length.
                id_seed = f"{path}-{page}-{index}-{len(piece)}"
                cid = hashlib.sha1(id_seed.encode()).hexdigest()
                texts.append(piece)
                metas.append(
                    ChunkMeta(source=os.path.basename(path), page=page, chunk_id=cid)
                )
    return texts, metas

# Build the in-memory corpus from every configured PDF that exists on disk.
texts, metas = build_corpus(DOC_PATHS)
# Cell output: chunk count plus a sample metadata record. The sample is None
# when nothing was loaded (e.g. every path was missing) instead of an IndexError.
len(texts), (metas[0] if metas else None)


## 4) Build / Load Vector Store (ChromaDB)

In [None]:
def embed_and_index(texts, metas, persist_dir=CHROMA_DIR, batch_size=256):
    """Recreate the ``nestle_hr_policy`` collection and fill it with the corpus.

    Any existing collection of that name is dropped first, so every call
    produces a fresh index.

    Args:
        texts: Chunk strings to embed and store.
        metas: ``ChunkMeta`` records parallel to ``texts``; ``chunk_id`` becomes
            the item ID, ``source`` and ``page`` become its metadata.
        persist_dir: Directory used by the persistent Chroma client.
        batch_size: Maximum number of chunks sent per ``add`` call. Adding in
            batches keeps each request bounded, because both the embedding
            endpoint and Chroma limit how many items one call may carry.

    Returns:
        The newly created collection. It is empty when ``texts`` is empty.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set.
        ValueError: if ``texts`` and ``metas`` differ in length, or
            ``batch_size`` is not positive.
    """
    _require_api_key()
    # Validate before touching the store so bad input never wipes a good index.
    if len(texts) != len(metas):
        raise ValueError(
            f"texts and metas must be the same length, got {len(texts)} and {len(metas)}"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    client = chromadb.PersistentClient(path=persist_dir, settings=Settings(anonymized_telemetry=False))
    try:
        client.delete_collection("nestle_hr_policy")
    except Exception:
        # Deliberate best-effort: presumably this fails only when the
        # collection does not exist yet, which is fine on a first run.
        pass
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name=DEFAULT_EMBED_MODEL
    )
    col = client.create_collection("nestle_hr_policy", embedding_function=ef)
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        batch_metas = metas[start:stop]
        col.add(
            documents=texts[start:stop],
            ids=[m.chunk_id for m in batch_metas],
            metadatas=[{"source": m.source, "page": m.page} for m in batch_metas],
        )
    return col

# Rebuild the collection from the corpus; embeddings come from the configured OpenAI embedding function.
col = embed_and_index(texts, metas)
# Cell output: number of items now stored in the collection.
col.count()


## 5) Retrieve + Generate

In [None]:
def retrieve(query: str, k: int=5):
    """Look up the stored chunks most relevant to ``query``.

    Opens the persisted ``nestle_hr_policy`` collection on every call and
    queries it with the same embedding model that was used for indexing.

    Args:
        query: Natural-language question to search with.
        k: Number of results requested from the collection.

    Returns:
        ``(docs, metas)``: the matched chunk texts and their metadata dicts
        for this single query, in the order the collection returned them.
    """
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name=DEFAULT_EMBED_MODEL,
    )
    store = chromadb.PersistentClient(
        path=CHROMA_DIR,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = store.get_collection("nestle_hr_policy", embedding_function=embedder)
    hits = collection.query(query_texts=[query], n_results=k)
    # Results are nested one list per query text; only one query was sent.
    return hits["documents"][0], hits["metadatas"][0]

def render_context(docs, metas):
    """Format retrieved chunks as a citation-prefixed context string.

    Args:
        docs: Chunk texts.
        metas: Metadata dicts parallel to ``docs``, each with ``source`` and
            ``page`` keys.

    Returns:
        One ``[source p.page] text`` block per chunk, separated by blank
        lines. Pairing stops at the shorter of the two inputs.
    """
    return "\n\n".join(
        f"[{meta['source']} p.{meta['page']}] {doc}"
        for doc, meta in zip(docs, metas)
    )

def answer(question: str):
    """Answer a question from the indexed documents.

    Retrieves the five best-matching chunks, renders them as context, and
    asks the configured chat model to answer from that context.

    Args:
        question: The user's question.

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set.
    """
    _require_api_key()
    llm = OpenAI()
    found_docs, found_metas = retrieve(question, k=5)
    user_prompt = ANSWER_PROMPT_TEMPLATE.format(
        question=question,
        context=render_context(found_docs, found_metas),
    )
    completion = llm.chat.completions.create(
        model=DEFAULT_CHAT_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        max_tokens=600,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Smoke test: run one question through retrieval and the chat model, then print the reply.
print(answer("What are the key elements of Nestlé's Total Rewards?"))


## 6) Optional: Launch Gradio UI

In [None]:
import gradio as gr

def do_answer(q):
    """Gradio callback: return the assistant's reply to the question ``q``."""
    reply = answer(q)
    return reply

# Single-textbox Gradio interface; each submitted question is passed to do_answer
# and the returned string is shown as text.
demo = gr.Interface(fn=do_answer, inputs=gr.Textbox(label="Ask about Nestlé HR Policy"), outputs="text", title="Nestlé HR Assistant")
# Uncomment to launch
# demo.launch()
