In [1]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")

  from .autonotebook import tqdm as notebook_tqdm


## Prepare PDF File

In [62]:
# import pymupdf
# from langchain_core.documents import Document
# doc = pymupdf.open('../../chunking/pdfs/bts_esg_2023.pdf')

# import sys
# # sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend")
# from chunking.esg_file import extractAll
# doc = extractAll(file_name="../../chunking/pdfs/bts_esg_2023.pdf")

# import pdfplumber

doc = []
with pdfplumber.open("../../chunking/pdfs/bts_esg_2023.pdf") as pdf:
    page_size = len(pdf.pages)
    doc = [i.extract_text() for i in pdf.pages]

#### Recursive Chunking

In [64]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    # length_function=len,
    is_separator_regex=False,
)

text_chunks = text_splitter.create_documents(doc)
# text_chunks = text_splitter.create_documents([d.get_text() for d in doc])

In [65]:
len(text_chunks)

146

#### Semantic Chunking

In [42]:
# import pymupdf
# from langchain_core.documents import Document

# semantic_chunker = SemanticChunker(embeddings=embedding_model, breakpoint_threshold_type="percentile")
# text_chunks = semantic_chunker.create_documents([d.get_text() for d in doc])

#### Hybrid Chunking

In [43]:
# import sys
# sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend/")
# from chunking.ndc_file import ndcFileChunking

# text_chunks = ndcFileChunking(content="../../chunking/pdfs/general/Thailand_INDCs_2015.pdf", file_name="Thailand_INDCs_2015.pdf")

## Setup STELLA CORE DB

In [66]:
import sys
sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend")
sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend/milvus")

from milvus.core import Core
from milvus.schema import INDEX_PARAMS, DATA_SOURCE_SCHEMA

core = Core(
            database_name="esg_evaluate",
            schema=DATA_SOURCE_SCHEMA,
            dense_embedding_model=embedding_model,
            create_first_node=True,
            system_prune_first_node=True,
            token=""
        )

[CORE] Initializing Milvus Database Core...
[DB] init Embedding Model...
[DB] init Embedding Model Successfully.
[DB] Found Database: esg_evaluate
[DB] Found Collection "cnode_1".
[DB] Drop Collection "cnode_1"...
cnode_1 has: 0 entities
[DB] Drop Collection "cnode_1" Successfully.
[DB] Create Collection "cnode_1"
[DB] Collection "cnode_1" Is Ready.
[DB] Found Collection "gnode_1".
[DB] Drop Collection "gnode_1"...
gnode_1 has: 147 entities
[DB] Drop Collection "gnode_1" Successfully.
[DB] Create Collection "gnode_1"
[DB] Collection "gnode_1" Is Ready.
[DB] Found Collection "frontend_query_general_documents".
[DB] Drop Collection "frontend_query_general_documents"...
frontend_query_general_documents has: 1 entities
[DB] Drop Collection "frontend_query_general_documents" Successfully.
[DB] Create Collection "frontend_query_general_documents"
[DB] Collection "frontend_query_general_documents" Is Ready.
Create Schma Successfuly.


## Add Document 

In [67]:
core.add_document(name="esg", documents=text_chunks, node_type="g", description="this is esg file")

<Collection>:
-------------
<name>: gnode_1
<description>: Schema for Data Source Collection
<schema>: {'auto_id': True, 'description': 'Schema for Data Source Collection', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'dense_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8192}}, {'name': 'metadata', 'description': '', 'type': <DataType.JSON: 23>}], 'enable_dynamic_field': True}

<class 'pymilvus.orm.collection.Collection'>
[DB] Insert Query FrontEnd Successfuly.
[DB] Create New Partition
[DB] Partition esg: 146 entities


## Setup Retrieval DB

In [68]:
config ={
    "k": 4,
    "partition_names": ["esg"],
}
chunk_retriver = core.initVectorStore(collection_name="gnode_1", partition_names=["esg"], search_kwargs=config)



In [69]:
chunk_retriver.invoke("มาตราการ")

[Document(metadata={}, page_content='วันที่อนุมัติและเผยแพร่ 23/06/2566'),
 Document(metadata={}, page_content='วันที่อนุมัติและเผยแพร่ 23/06/2566'),
 Document(metadata={}, page_content='บริษัท: แจ้งเบาะแสและคุ้มครองผู้ร้องเรียน x การป้องกันการใช้ข้อมูลภายในเพื่อแสวงหา\nผลประโยชน์'),
 Document(metadata={}, page_content='การกำกับดูแลความเสี่ยงองค์กร คณะกรรมการบริหารความเสี่ยง\nการสรรหา คณะกรรมการสรรหาและกำหนดค่าตอบแทน\nการพิจารณาค่าตอบแทน คณะกรรมการสรรหาและกำหนดค่าตอบแทน\nบรรษัทภิบาล คณะกรรมการพัฒนาเพื่อความยั่งยืน\nการพัฒนาความยั่งยืนขององค์กร คณะกรรมการพัฒนาเพื่อความยั่งยืน')]

## Setup LLM For Generate Synthesised Data

In [70]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] = os.getenv("OPEN_AI_API_KEY")

### LLM RAG

In [71]:
from langchain_core.prompts import ChatPromptTemplate

rag_template = """\
Use the following context to answer the user's query. If you cannot answer, please respond with 'I don't know'.

User's Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_template)

In [72]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo",temperature=0, max_tokens=4096)

recursive_rag_chain = (
    {"context" : chunk_retriver, "question" : RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [73]:
synthetic_data_chunks = text_chunks
synthetic_data_chunks_size = len(synthetic_data_chunks)

## LLM Question

In [74]:
question_prompt = """\
You are a teacher preparing a test. Please create a question that can be answered by referencing the following context.

Context:
{context}
"""

question_prompt = ChatPromptTemplate.from_template(question_prompt)
question_chain = question_prompt | llm | StrOutputParser()

## LLM Ground Truth

In [75]:
ground_truth_prompt = """\
Use the following context and question to answer this question using *only* the provided context.

Question:
{question}

Context:
{context}
"""

ground_truth_prompt = ChatPromptTemplate.from_template(ground_truth_prompt)
ground_truth_chain = ground_truth_prompt | llm | StrOutputParser()

## Create Synthesised Data

In [76]:
questions = []
ground_truths_recursive = []
contexts = []
answers = []

c = 0
for chunk in synthetic_data_chunks:
  # print(c)
  questions.append(question_chain.invoke({"context" : chunk.page_content}))
  # contexts.append(chunk.page_content)
  # ground_truths_recursive.append(ground_truth_chain.invoke({"question" : questions[-1], "context" : str(contexts[-1])}))
  ground_truths_recursive.append(ground_truth_chain.invoke({"question" : questions[-1], "context" : chunk.page_content}))
  

  contexts.append([d.page_content for d in chunk_retriver.get_relevant_documents(questions[-1])])
  answers.append(recursive_rag_chain.invoke(questions[-1]))
  # c += 1

In [77]:
# [d.page_content for d in chunk_retriver.get_relevant_documents("d")]

In [78]:
from datasets import load_dataset, Dataset

qagc_list = []

for question, answer, context, ground_truth in zip(questions, answers, contexts, ground_truths_recursive):
  # print("q",question)
  # print("g",ground_truth)
  # print("a",answer)
  # print("c", context)
  qagc_list.append({
      "question" : question,
      "answer" : answer,
      "contexts" : context,
      "ground_truth" : ground_truth
  })

eval_dataset = Dataset.from_list(qagc_list)
eval_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 146
})

## Evaluation with RAGAS

In [79]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate

result = evaluate(
    eval_dataset,
    metrics=[
        # retrieval
        context_precision,
        context_recall,
    # generation
        # faithfulness,
        # answer_relevancy,
    ],
)
result

Evaluating:  72%|███████▏  | 209/292 [02:30<05:12,  3.77s/it]Exception raised in Job[103]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-jupNxilFfd1cJCkkq5Zqau32 on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  72%|███████▏  | 211/292 [02:57<12:41,  9.41s/it]Exception raised in Job[123]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-jupNxilFfd1cJCkkq5Zqau32 on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  73%|███████▎  | 214/292 [03:22<11:41,  9.00s/it]Exception raised in Job

{'context_precision': 0.6726, 'context_recall': 0.7157}

In [80]:
results_df = result.to_pandas()
results_df

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall
0,Question:\nDoes BTS Group Limited have an Envi...,[ข้อมูลการดำเนินงานด้าน ESG\nบริษัท: บริษัท บี...,"Yes, BTS Group Limited has an Environmental Ma...","Yes, BTS Group Limited has an Environmental Ma...",1.000000,1.0
1,Question: What are some of the environmental m...,[เอกสารนโยบายและแนวปฏิบัติด้านสิ่งแวดล้อมของบร...,Some of the environmental management practices...,The environmental management practices outline...,1.000000,1.0
2,Question: What was the target amount of electr...,[ปริมาณการซื้อหรือผลิตไฟฟ้ามาใช้จากแหล่งพลังงา...,I don't know.,The target amount of electricity usage for the...,0.583333,1.0
3,Question: What is the difference between the t...,[ปริมาณการซื้อหรือผลิตไฟฟ้ามาใช้จากแหล่งพลังงา...,The difference between the total electricity c...,The difference between the total electricity c...,1.000000,1.0
4,Question: In which year did the company exceed...,[ร้อยละของผลต่างระหว่างปริมาณการใช้ไฟฟ้ารวมเที...,I don't know.,"Based on the context provided, the company exc...",0.750000,0.0
...,...,...,...,...,...,...
141,Question: Who are the members of the committee...,[การแต่งตั้งกลับเข้ามาใหม่\nเพศ: ชาย ตำแหน่งกร...,Answer: The members of the committee for evalu...,Members of the committee for evaluating job pe...,,0.0
142,Question: What is the link to the sustainabili...,[performances/relevant-policies/20210616-bts-s...,Link to the sustainability policy of the compa...,The link to the sustainability policy of the c...,,
143,Question: What are the standards for sustainab...,[performances/relevant-policies/20210616-bts-s...,"The company references the GRI Standards, UN G...",The standards for sustainability reporting tha...,,
144,Question:\nWhat disclaimer is provided in the ...,[Task Force on Climate-related Financial Discl...,The disclaimer provided in the context regardi...,The disclaimer provided in the context regardi...,,


In [81]:
# for i in results_df["retrieved_contexts"]:
#     print(len(i))

In [82]:
results_df['context_precision'].mean()

0.6726190475771452

In [83]:
results_df['context_precision'].mean()

0.6726190475771452