# Load all PDF data


#Load PDFs

In [40]:
import os
from langchain_community.document_loaders import PyPDFLoader

def load_all_pdfs(pdf_folder_path):
    """Load every PDF found directly inside a folder into one flat list.

    Args:
        pdf_folder_path: Directory that contains the PDF files. Sub-folders
            are not searched.

    Returns:
        A list with everything ``PyPDFLoader(path).load()`` returned for each
        PDF, concatenated in file-name order. ``metadata["source"]`` of each
        returned object is overwritten with the bare file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read.
    """
    documents = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the page
    # (and therefore chunk) order stable between runs and machines.
    for file_name in sorted(os.listdir(pdf_folder_path)):
        full_path = os.path.join(pdf_folder_path, file_name)

        # Compare the extension case-insensitively so "REPORT.PDF" is not
        # silently skipped, and ignore directories that merely end in ".pdf".
        if not file_name.lower().endswith(".pdf") or not os.path.isfile(full_path):
            continue

        pages = PyPDFLoader(full_path).load()

        # Replace the loader-provided source with the file name only, so
        # downstream citations do not depend on the local folder layout.
        for doc in pages:
            doc.metadata["source"] = file_name

        documents.extend(pages)

    return documents


# Folder holding the PDFs to index. The original machine-specific location
# stays the default; set the PUCHOHR_PDF_DIR environment variable to run the
# notebook on another machine without editing this cell.
pdf_folder_path = os.environ.get(
    "PUCHOHR_PDF_DIR",
    r"C:\Users\91880\Desktop\GenAI_Projects\PuchoHR\Data\documents",
)
docs = load_all_pdfs(pdf_folder_path)
print("Total pages:", len(docs))



Total pages: 35


#Chunk PDFs

In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into overlapping chunks. The overlap (256) is half
# the chunk size (512), so neighbouring chunks share a large part of their
# text.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=256, chunk_size=512)
chunks = text_splitter.split_documents(docs)

print("Total chunks:", len(chunks))


Total chunks: 302


# Collect all chunks

In [42]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# `text_splitter` and `chunks` were already built by the chunking cell above
# with these exact settings; splitting `docs` a second time only repeated the
# same work. Validate the existing result instead of recomputing it.
assert chunks, "No chunks were produced - check that the PDF folder contains readable PDFs."
print("Total chunks:", len(chunks))



Total chunks: 302


#Ollama Embeddings

In [43]:
from langchain_ollama import OllamaEmbeddings

# Single embedding object shared by the notebook: it is passed both when the
# Chroma store is filled and when it is reopened for querying.
embedding = OllamaEmbeddings(
    model="nomic-embed-text",
)



#Create + FILL Chroma DB

In [44]:
from langchain_community.vectorstores import Chroma

import hashlib


def _stable_chunk_ids(documents):
    """Return one deterministic, unique id per chunk.

    Each id is the SHA-256 of the chunk's source, page and text, followed by
    an occurrence counter so that byte-identical chunks still receive
    distinct ids (a batch must not contain the same id twice).
    """
    occurrences = {}
    ids = []
    for doc in documents:
        key = "\x1f".join(
            (
                str(doc.metadata.get("source", "")),
                str(doc.metadata.get("page", "")),
                doc.page_content,
            )
        )
        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
        seen = occurrences.get(digest, 0)
        occurrences[digest] = seen + 1
        ids.append(f"{digest}-{seen}")
    return ids


# Without explicit ids, every run of this cell stores the same chunks again
# under freshly generated ids, so the persisted collection grows with
# duplicates and similarity search starts returning repeated passages.
# Stable ids let a re-run write to the same records instead.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    ids=_stable_chunk_ids(chunks),
    persist_directory="./chroma_db",
)

# NOTE(review): recent Chroma versions persist automatically and this call is
# reported as deprecated there; it is kept so older installations still flush
# the store to disk. Confirm against the installed version before removing.
vector_db.persist()


# Reopen the persisted Chroma DB (retrieval is exercised by ask_hr below)

In [45]:
# Rebind `vector_db` to a store opened from the on-disk directory, using the
# same embedding object that was used to fill it.
vector_db = Chroma(embedding_function=embedding, persist_directory="./chroma_db")


#Load LLM

In [46]:
from langchain_ollama import OllamaLLM

# Generation model used by ask_hr(); temperature is set to 0.2.
llm = OllamaLLM(temperature=0.2, model="llama3.2")


#Ask-HR FUNCTION

In [47]:
def ask_hr(question, k=4):
    """Answer an HR question using only passages retrieved from the vector store.

    Args:
        question: The user's question as text.
        k: Number of passages to retrieve from ``vector_db`` (default 4).

    Returns:
        The value returned by ``llm.invoke`` for the assembled prompt, or the
        fixed string "Information not found in documents." when the question
        is blank or the search returns nothing.
    """
    not_found = "Information not found in documents."

    # A blank question gives the search nothing to match; return the fallback
    # instead of embedding an empty string.
    if not question or not question.strip():
        return not_found

    # Named `matches` so the notebook-level `docs` (all loaded pages) is not
    # shadowed inside this function.
    matches = vector_db.similarity_search(question, k=k)

    if not matches:
        return not_found

    # Tolerate records stored without a "source" entry rather than raising
    # KeyError while building the context.
    context = "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in matches
    )

    prompt = f"""
You are an HR assistant.
Answer ONLY using the provided context.
If the answer is not present, say:
"Information not found in documents."

Context:
{context}

Question:
{question}

Answer:
"""

    return llm.invoke(prompt)


#Ask Question

In [52]:
# Out-of-scope probe: the recorded output for this question is the fallback
# message rather than an answer.
question = "Who is president of india?"
answer = ask_hr(question=question)
print(answer)


Information not found in documents.
