In [20]:
#Import necessary libraries
import os
import openai
import sys
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
import gradio as gr
import pysqlite3
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

In [2]:
#Loading Nestle's HR policy using PyPDFLoader

# Read the policy PDF; the loaded result is what the splitter cell consumes.
pdf_path = "the_nestle_hr_policy_pdf_2012.pdf"
Doc_loader = PyPDFLoader(pdf_path)
extracted_text = Doc_loader.load()

In [3]:
# Splitting the text

# Break the loaded pages into overlapping chunks sized for embedding.
# Separators are tried in order: paragraph break, line break, end of sentence,
# single space, and finally character-by-character.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=750,
    chunk_overlap=100,
    # The sentence separator is a regex look-behind. It has to be a raw string
    # (a bare "\." is an invalid escape sequence) and the splitter has to be told
    # to interpret separators as regexes; otherwise the pattern is escaped and
    # matched as literal text, so it never splits on sentence boundaries.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
splitted_text = text_splitter.split_documents(extracted_text)

In [4]:
# Sanity check: how many chunks the splitter produced.
len(splitted_text)

25

In [5]:
# Instantiate the OpenAI embedding client. No vectors are computed in this cell;
# the client is handed to the Chroma store in the next cell.
# NOTE(review): no API key is passed explicitly — presumably it is picked up
# from the environment; confirm before running on a fresh kernel.

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()



In [7]:
# Loading the vectordb with embeddings

persist_directory = "nestle_hr_policy"

# Open the persisted collection (it is created if it does not exist yet).
# Building the store with Chroma.from_documents on every run would re-embed the
# chunks and append them again under fresh IDs, so each re-run of this cell
# duplicated every document in the persisted store and paid for the embeddings
# again.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# Only embed and insert the chunks when the collection is still empty.
# To rebuild after changing the splitter settings, delete the
# `persist_directory` folder and re-run this cell.
if vectordb._collection.count() == 0:
    vectordb.add_documents(splitted_text)

In [8]:
# Printing the first 5 document IDs
# Pull the stored records and show the IDs of the first five.
stored_ids = vectordb._collection.get()['ids']
print(stored_ids[:5])


['a304a25c-4117-11f0-bf7a-0242ac100002', 'a304a3ba-4117-11f0-bf7a-0242ac100002', 'a304a450-4117-11f0-bf7a-0242ac100002', 'a304a4c8-4117-11f0-bf7a-0242ac100002', 'a304a540-4117-11f0-bf7a-0242ac100002']


In [9]:
# Checking the vectordb to ensure that the data is correctly loaded:
# print the ID, text and metadata of up to three stored chunks.

docs = vectordb._collection.get()
preview_count = min(3, len(docs['documents']))
preview_rows = zip(
    docs['ids'][:preview_count],
    docs['documents'][:preview_count],
    docs['metadatas'][:preview_count],
)
for chunk_id, chunk_text, chunk_meta in preview_rows:
    print("ID:", chunk_id)
    print("Document:", chunk_text)
    print("Metadata:", chunk_meta)
    print("---")

ID: a304a25c-4117-11f0-bf7a-0242ac100002
Document: Policy
MandatorySeptember   2012
The Nestlé  
Human Resources Policy
Metadata: {'page': 0, 'source': 'the_nestle_hr_policy_pdf_2012.pdf'}
---
ID: a304a3ba-4117-11f0-bf7a-0242ac100002
Document: Policy
MandatorySeptember 
 20
12Issuing  departement
Hum
an Resources
Target  audience  
All
 employees
Approver
Executive Board, Nestlé S.A.
Repository
All Nestlé Principles and Policies, Standards and  Guidelines can be found in the Centre online repository at:  http://intranet.nestle.com/nestledocs
Copyright
 and confidentiality
Al
l rights belong to Nestec Ltd., Vevey, Switzerland.
© 2012, Nestec Ltd.
Design
Nestec Ltd., Corporate Identity & Design,  Vevey, Switzerland
Production
Metadata: {'page': 1, 'source': 'the_nestle_hr_policy_pdf_2012.pdf'}
---
ID: a304a450-4117-11f0-bf7a-0242ac100002
Document: © 2012, Nestec Ltd.
Design
Nestec Ltd., Corporate Identity & Design,  Vevey, Switzerland
Production
brain’print GmbH, Switzerland
Paper
This r

In [10]:
# Initializing the LLM
from langchain.chat_models import ChatOpenAI
# Chat model used by the QA chain; temperature 0 asks for the least random output.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
)

In [21]:
# Creating Prompt template

# Prompt text: restricts the model to the supplied context and spells out the
# fallback sentence to use when the document does not contain the answer.
# `{context}` and `{question}` are the two placeholders filled in at query time.
template_summary = """You are an intelligent and helpful AI assistant. 
                    Use only the information from the provided document to answer the user's question.
                    If the answer cannot be found in the document, respond with:
                    "I’m sorry, I don’t have access to that information in the provided document."

                    Always be concise, clear, and accurate.You can summarize the answers based on the context from the pdf.
                    Context:{context}

                    Question: {question}
                    Answer:"""

# Wrap the text in a PromptTemplate so it can be passed to the QA chain.
prompt_summary = PromptTemplate(
    template=template_summary,
    input_variables=["context", "question"],
)

In [22]:
# Create the retrieval QA chain (retriever over the vector store + LLM + custom prompt)

from langchain.chains import RetrievalQA
# Question-answering chain: fetches chunks from the vector store, fills the
# custom prompt with them and asks the LLM. Source documents are returned
# alongside the answer so the page can be cited.
Retriever_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": prompt_summary},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

In [23]:
# Defining an output parser that tidies the answer text returned by the chain

from langchain.schema import BaseOutputParser

class NeatAnswerParser(BaseOutputParser):
    """Output parser that trims surrounding whitespace from the answer text."""

    def parse(self, text: str) -> str:
        """Return `text` with leading and trailing whitespace removed."""
        return text.strip()


In [26]:
# Defining a chatbot response for Gradio

def chatbot_response(messages, history=None):
    """Answer one user message for the Gradio chat UI.

    Args:
        messages: The text the user typed. Surrounding whitespace is ignored;
            ``None`` is treated like an empty message.
        history: Previous conversation turns supplied by Gradio. Unused; each
            question is answered independently.

    Returns:
        A dict ``{"role": "assistant", "content": <markdown text>}``. For a
        normal question the content holds the answer followed by the page of
        the top-ranked source chunk. Any exception is caught, printed, and
        reported back in the same dict shape.
    """
    try:
        query = (messages or "").strip()

        if not query:
            return {"role": "assistant", "content": "Please enter a question."}

        # Compare case-insensitively so "Exit" / "BYE" also end the chat.
        if query.lower() in {"exit", "quit", "bye"}:
            return {"role": "assistant", "content": "Goodbye! Chat ended."}

        response = Retriever_chain({"query": query})
        clean_answer = NeatAnswerParser().parse(response["result"])

        # The retriever may return nothing; do not index into an empty list.
        source_docs = response.get("source_documents") or []
        page = "unknown"
        if source_docs:
            page = source_docs[0].metadata.get("page", "unknown")
            # The loader's page metadata is zero-based (the cover is page 0);
            # show the human-friendly one-based number instead.
            if isinstance(page, int):
                page += 1

        # Build the markdown without source-code indentation: a line indented
        # four or more spaces after a blank line is rendered as a code block,
        # which broke the bold "Source" label. The two spaces after
        # "**Answer:**" are an intentional markdown hard line break.
        final_output = (
            "**Answer:**  \n"
            f"{clean_answer}\n\n"
            f"**Source:** Page {page}"
        )
        return {"role": "assistant", "content": final_output}

    except Exception as e:
        # NOTE(review): the raw exception text is shown to the end user; with a
        # public share link consider replacing it with a generic message.
        print("ERROR:", e)
        return {"role": "assistant", "content": f"Internal Error: {e}"}


In [27]:
# Setting up the chat interface

# Build the chat UI around `chatbot_response`, then start it.
# share=True also requests a temporary public URL in addition to the local one.
hr_chat_ui = gr.ChatInterface(
    fn=chatbot_response,
    title="Nestlé HR Policy Chatbot",
    description="Ask any HR policy question. I will respond based on the uploaded document.",
    chatbot=gr.Chatbot(type="messages"),
)
hr_chat_ui.launch(share=True)





* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://b4be832f670fc73b6b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


