#import guardrails

In [46]:
from hr__guardrails import (
    violates_safety_policy,
    is_hr_question,
    is_answer_grounded,
    contains_sensitive_advice,
    is_query_too_long
)


#For checking the accuracy of LLM answers against the retrieved context

In [49]:
from sentence_transformers import SentenceTransformer,util
# Sentence-transformer model used later by ask_hr to embed the LLM answer and the
# retrieved context for the semantic-similarity check.
embedding_model= SentenceTransformer('paraphrase-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


#load pdf

In [22]:
import os
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader

def load_all_pdfs(pdf_folder_path):
    """Load every PDF found directly inside a folder.

    Each file is first read with ``PyPDFLoader``. If that produces no text at
    all (every page is empty once whitespace is stripped), the file is read
    again with ``UnstructuredPDFLoader`` using the ``"ocr_only"`` strategy.

    Args:
        pdf_folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A flat list of the documents produced by the loaders, ordered by file
        name. Every document has ``metadata["source"]`` set to the name of the
        file it came from.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the page
    # (and therefore chunk) order reproducible between runs.
    for file_name in sorted(os.listdir(pdf_folder_path)):
        # Case-insensitive match so that e.g. "Policy.PDF" is not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        full_path = os.path.join(pdf_folder_path, file_name)

        # A directory that happens to be named "*.pdf" is not a document.
        if not os.path.isfile(full_path):
            continue

        # 1) Try normal text extraction
        pages = PyPDFLoader(full_path).load()

        # 2) If no page carries any text, fall back to OCR
        has_text = any(page.page_content.strip() for page in pages)
        if not has_text:
            print(f"OCR applied for: {file_name}")
            pages = UnstructuredPDFLoader(
                full_path,
                strategy="ocr_only"
            ).load()

        # 3) Record the originating file name on every document
        for doc in pages:
            doc.metadata["source"] = file_name

        documents.extend(pages)

    return documents


# Folder holding the HR PDFs. Set the HR_PDF_FOLDER environment variable to point
# the notebook at another location; when it is unset the original local path is used.
pdf_folder_path = os.environ.get(
    "HR_PDF_FOLDER",
    r"C:\Users\91880\Desktop\GenAI_Projects\PuchoHR\Data\documents",
)
docs = load_all_pdfs(pdf_folder_path)
print("Total pages:", len(docs))



Total pages: 70


#Chunk PDFs

#Collect ALL chunks

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into overlapping chunks: chunk_size=512 with an overlap of
# 256, i.e. neighbouring chunks share up to half of their content.
# NOTE(review): sizes are presumably measured in characters (the splitter's default
# length function) — confirm if a token-based length function is ever passed in.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=256
)

# `docs` is the list of page documents returned by load_all_pdfs.
chunks = text_splitter.split_documents(docs)
print("Total chunks:", len(chunks))



Total chunks: 512


#Ollama Embeddings

In [24]:
from langchain_ollama import OllamaEmbeddings

# Embedding function backed by the Ollama model "nomic-embed-text"; it is passed to
# Chroma both when the index is built and when it is re-opened for querying.
# NOTE(review): presumably requires a reachable Ollama server with this model
# available — confirm for the target environment.
embedding = OllamaEmbeddings(model="nomic-embed-text")



#Create + FILL Chroma DB

In [25]:
from langchain_community.vectorstores import Chroma

# Embed every chunk with `embedding` and store the vectors in a Chroma collection
# persisted under ./chroma_db.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably adds
# the same chunks again (duplicate search hits) — verify, and clear the directory
# or guard the cell if so.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory="./chroma_db"
)

# NOTE(review): newer Chroma versions are reported to persist automatically, which
# would make this explicit call unnecessary/deprecated — confirm against the
# installed version.
vector_db.persist()


#Test Retrieval

In [26]:
# Re-open the collection stored in ./chroma_db with the same embedding function.
# This rebinds `vector_db`, the object ask_hr runs its similarity search against.
vector_db = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding
)


#Load LLM

In [27]:
from langchain_ollama import OllamaLLM

# LLM that ask_hr invokes with the assembled prompt to generate the answer.
# NOTE(review): temperature=0.2 is presumably chosen to keep answers close to the
# supplied context rather than creative — confirm the intent.
llm = OllamaLLM(
    model="llama3.2",
    temperature=0.2
)


#Ask-HR FUNCTION

In [None]:
def ask_hr(question, k=4, similarity_threshold=0.8):
    """Answer an HR question from the indexed documents, applying guardrails.

    Steps, in order:
      1. Input guardrails (imported from ``hr__guardrails``; their exact
         criteria are defined there): query length, safety policy, and whether
         the question is HR-related.
      2. Retrieve the ``k`` most similar chunks from ``vector_db``.
      3. Ask ``llm`` to answer using only the retrieved context.
      4. Output checks: groundedness guardrail, semantic-similarity check
         against the retrieved chunks, and the sensitive-advice guardrail.

    Args:
        question: The user's question.
        k: Number of chunks to retrieve from the vector store (default 4).
        similarity_threshold: Minimum cosine similarity between the answer and
            its best-matching retrieved chunk for the answer to be returned
            (default 0.8).

    Returns:
        str: The LLM answer, or a fixed refusal/fallback message when a
        guardrail or check rejects the question or the answer.
    """
    # Guardrail: block overlong queries
    if is_query_too_long(question):
        return "Your query is too long, please shorten it."

    # Guardrail: safety check
    if violates_safety_policy(question):
        return "Sorry, I cannot answer that question due to safety policy."

    # Guardrail: only HR-related questions are answered
    if not is_hr_question(question):
        return "Sorry, this question is not related to HR policies."

    retrieved_docs = vector_db.similarity_search(question, k=k)

    if not retrieved_docs:
        return "Information not found in documents."

    # .get() so that a stored document without a "source" key cannot raise KeyError.
    context = "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in retrieved_docs
    )

    prompt = f"""
You are an HR assistant.
Answer ONLY using the provided context.
If the answer is not present, say:
"Information not found in documents."

Context:
{context}

Question:
{question}

Answer:
"""

    answer = llm.invoke(prompt)

    # Guardrail: the answer must be grounded in the retrieved documents
    if not is_answer_grounded(answer, retrieved_docs):
        return "Information not found in documents."

    # Semantic-similarity check on the LLM response.
    # Cosine similarity ranges from -1 to 1 (1 = same direction, ~0 = unrelated).
    # Each chunk is embedded on its own and the best match is used:
    # sentence-transformer encoders truncate input beyond the model's maximum
    # sequence length, so one embedding of all chunks joined together would
    # mostly reflect the first chunk only.
    answer_emb = embedding_model.encode(answer)
    chunk_embs = embedding_model.encode(
        [doc.page_content for doc in retrieved_docs]
    )
    similarity = util.cos_sim(answer_emb, chunk_embs).max().item()

    if similarity < similarity_threshold:
        return "Answer may not be fully accurate based on available documents."

    # Guardrail: avoid sensitive advice phrases
    if contains_sensitive_advice(answer):
        return "I cannot provide legal or medical advice. Please consult the appropriate professional."

    return answer


#Ask Question

In [51]:
# Sample query run through the full ask_hr pipeline; prints whatever the function
# returns, which is either the LLM answer or one of the guardrail messages.
question = "You should consult a lawyer for this issue?"
answer = ask_hr(question)
print(answer)


Sorry, this question is not related to HR policies.
