In [8]:
from llama_index.core import Settings, SimpleDirectoryReader, Document, VectorStoreIndex
from dotenv import load_dotenv
import nest_asyncio

# Patch asyncio so that an event loop can be re-entered; presumably needed because
# the notebook kernel already runs a loop and later cells drive async code from
# synchronous calls — TODO confirm.
nest_asyncio.apply()
# Load variables from a .env file into the process environment. Kept as the last
# expression of the cell: its return value is what the cell displays (`True` in the
# recorded run).
load_dotenv()

True

In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Chat model: deterministic sampling (temperature 0) with strict mode switched on.
llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0,
    strict=True,
)

# Embedding model, with 768-dimensional vectors requested.
embed = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=768,
)

# Register both models on the shared Settings object, then set the context window.
# The LLM is assigned before the context window, exactly as in the original order.
Settings.llm = llm
Settings.embed_model = embed
Settings.context_window = 128_000

In [10]:
# Read everything under ./data into Document objects, showing a progress bar.
# In the recorded run, 5 files produced 59 documents, each carrying a `page_label`.
documents = SimpleDirectoryReader(input_dir="./data").load_data(
    show_progress=True,
)

# Report how many documents were loaded, then display them as the cell output.
print(f"Len: {len(documents)}")
documents

Loading files: 100%|██████████| 5/5 [00:01<00:00,  3.98file/s]

Len: 59





[Document(id_='f2af6bcb-8044-4cb7-8789-69e65fcb3793', embedding=None, metadata={'page_label': '1', 'file_name': 'NQLD01.pdf', 'file_path': '/Users/chinhdinh/Repos/chatbot/chatbot-core/notebooks/POC1/data/NQLD01.pdf', 'file_type': 'application/pdf', 'file_size': 401404, 'creation_date': '2024-11-15', 'last_modified_date': '2024-11-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Trang 1 \nCÔNG TY CẤP NƯỚC SÀI GÒN \nTRÁCH NHIỆM HỮU HẠN MỘT THÀNH VIÊN \nCÔNG TY CỔ PHẦN CẤP NƯỚC TRUNG AN \n \n \nCỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM \nĐộc lập – Tự do – Hạnh phúc \n \n \nNỘI QUY LAO ĐỘNG \n(Ban hành kèm theo Quyết định số  099 /QĐ-TA-TCHC ngày  07 / 8 /2018) \n \nCHƯƠNG I \nNHỮNG QUY ĐỊNH CHUNG \n \nĐiều 1. Mục đích ban hành Nội quy lao động \nNội quy lao đ

In [11]:
from llama_index.core.node_parser import SentenceSplitter

# Split the loaded documents into nodes with a SentenceSplitter configured to use
# a newline as its separator. In the recorded run, 59 documents became 94 nodes.
parser = SentenceSplitter(separator="\n")
nodes = parser.get_nodes_from_documents(
    documents=documents,
)

# Report the node count, then display the nodes as the cell output.
print(f"Len: {len(nodes)}")
nodes

Len: 94


[TextNode(id_='a59fb8df-069a-4667-a202-74f56ddc0561', embedding=None, metadata={'page_label': '1', 'file_name': 'NQLD01.pdf', 'file_path': '/Users/chinhdinh/Repos/chatbot/chatbot-core/notebooks/POC1/data/NQLD01.pdf', 'file_type': 'application/pdf', 'file_size': 401404, 'creation_date': '2024-11-15', 'last_modified_date': '2024-11-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f2af6bcb-8044-4cb7-8789-69e65fcb3793', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'NQLD01.pdf', 'file_path': '/Users/chinhdinh/Repos/chatbot/chatbot-core/notebooks/POC1/data/NQLD01.pdf', 'file_type': 'application/pdf', 'file_size': 401404, 'creation_date': '2024-11-15', 'last_modified_date': 

In [12]:
from llama_index.core.extractors import QuestionsAnsweredExtractor

# Prompt template handed to QuestionsAnsweredExtractor below.
# It contains two placeholders, {context_str} and {num_questions}; presumably the
# extractor fills them with the node content and the requested question count —
# confirm against the extractor's documentation.
# A trailing backslash inside the literal joins the next physical line onto the
# current one, so those line breaks do not appear in the resulting string.
QUESTION_GEN_TMPL = """\
Here is the context:
{context_str}

Given the contextual information, \
generate {num_questions} questions this context can provide \
specific answers to which are unlikely to be found elsewhere.

Rule: The questions must be related to the section 'text'. \
Do not generate trivial questions such as file size, file name, etc.
These documents contain information about "thoả ước lao động" from \
several companies. The questions should be related to that topic.

Higher-level summaries of surrounding context may be provided \
as well. Try using these summaries to generate better questions \
that this context can answer.

"""

# Build an extractor that requests 3 questions per node using the custom prompt.
qa_extractor = QuestionsAnsweredExtractor(
    prompt_template=QUESTION_GEN_TMPL,
    questions=3,
)

# Run the extractor over every node. Despite its name, `nodes_with_qa` is a list
# of metadata dicts (one per node) keyed by "questions_this_excerpt_can_answer",
# not a list of nodes — see the recorded output of this cell.
nodes_with_qa = qa_extractor.extract(nodes)

# Report how many results came back, then display them as the cell output.
print(f"Len: {len(nodes_with_qa)}")
nodes_with_qa

100%|██████████| 94/94 [01:06<00:00,  1.40it/s]

Len: 94





[{'questions_this_excerpt_can_answer': 'Based on the provided context regarding the labor regulations of Công ty Cổ phần Cấp nước Trung An, here are three specific questions that can be answered from the text:\n\n1. **What is the purpose of the labor regulations as stated in the document?**\n   - The document outlines that the purpose of the labor regulations is to establish the labor discipline that employees must adhere to while working at the company and to specify the penalties for violations of this discipline.\n\n2. **What are the working hours defined for employees at Công ty Cổ phần Cấp nước Trung An?**\n   - The document specifies that the working hours at the company are 8 hours per day, totaling 40 hours per week, with specific schedules for administrative work and direct production roles.\n\n3. **What are the designated working hours for administrative employees according to the labor regulations?**\n   - For administrative employees, the working hours are divided into two 

In [16]:
# Merge the generated question text of every excerpt into a single string for
# inspection, with a newline appended after each entry.
# Each element of `nodes_with_qa` is a metadata dict keyed by
# "questions_this_excerpt_can_answer".
# str.join assembles the result in one call instead of growing a string with
# repeated `+=` inside a loop; the resulting text is identical.
full_extracted_qa = "".join(
    entry["questions_this_excerpt_can_answer"] + "\n" for entry in nodes_with_qa
)

print(full_extracted_qa)

Based on the provided context regarding the labor regulations of Công ty Cổ phần Cấp nước Trung An, here are three specific questions that can be answered from the text:

1. **What is the purpose of the labor regulations as stated in the document?**
   - The document outlines that the purpose of the labor regulations is to establish the labor discipline that employees must adhere to while working at the company and to specify the penalties for violations of this discipline.

2. **What are the working hours defined for employees at Công ty Cổ phần Cấp nước Trung An?**
   - The document specifies that the working hours at the company are 8 hours per day, totaling 40 hours per week, with specific schedules for administrative work and direct production roles.

3. **What are the designated working hours for administrative employees according to the labor regulations?**
   - For administrative employees, the working hours are divided into two sessions: from 7:30 AM to 11:30 AM in the morning