In [1]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")

  from .autonotebook import tqdm as notebook_tqdm


## Prepare PDF File

In [2]:
import pymupdf
from langchain_core.documents import Document

doc = pymupdf.open('../../chunking/pdfs/general/Thailand_INDCs_2015.pdf')

#### Recursive Chunking

In [3]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1200,
#     chunk_overlap=100,
#     # length_function=len,
#     is_separator_regex=False,
# )

# text_chunks = text_splitter.create_documents([d.get_text() for d in doc])

#### Semantic Chunking

In [44]:
import pymupdf
from langchain_core.documents import Document

semantic_chunker = SemanticChunker(embeddings=embedding_model, breakpoint_threshold_type="percentile")
text_chunks = semantic_chunker.create_documents([d.get_text() for d in doc])

#### Hybrid Chunking

In [45]:
# import sys
# sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend/")
# from chunking.ndc_file import ndcFileChunking

# text_chunks = ndcFileChunking(content="../../chunking/pdfs/general/Thailand_INDCs_2015.pdf", file_name="Thailand_INDCs_2015.pdf")

## Setup STELLA CORE DB

In [46]:
import sys
sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend")
sys.path.insert(0, "/Users/peerasit/senior_project/STELLA-Backend/milvus")

from milvus.core import Core
from milvus.schema import INDEX_PARAMS, DATA_SOURCE_SCHEMA

core = Core(
            database_name="ndc_evaluate",
            schema=DATA_SOURCE_SCHEMA,
            dense_embedding_model=embedding_model,
            create_first_node=True,
            system_prune_first_node=True,
            token=""
        )

[CORE] Initializing Milvus Database Core...
[DB] init Embedding Model...
[DB] init Embedding Model Successfully.
[DB] Found Database: ndc_evaluate
[DB] Found Collection "cnode_1".
[DB] Drop Collection "cnode_1"...
cnode_1 has: 0 entities
[DB] Drop Collection "cnode_1" Successfully.
[DB] Create Collection "cnode_1"
[DB] Collection "cnode_1" Is Ready.
[DB] Found Collection "gnode_1".
[DB] Drop Collection "gnode_1"...
gnode_1 has: 7 entities
[DB] Drop Collection "gnode_1" Successfully.
[DB] Create Collection "gnode_1"
[DB] Collection "gnode_1" Is Ready.
[DB] Found Collection "frontend_query_general_documents".
[DB] Drop Collection "frontend_query_general_documents"...
frontend_query_general_documents has: 1 entities
[DB] Drop Collection "frontend_query_general_documents" Successfully.
[DB] Create Collection "frontend_query_general_documents"
[DB] Collection "frontend_query_general_documents" Is Ready.
Create Schma Successfuly.


## Add Document 

In [47]:
core.add_document(name="ndc", documents=text_chunks, node_type="g", description="National disclosure standards for financial climate, climate risk, NFCCC")

<Collection>:
-------------
<name>: gnode_1
<description>: Schema for Data Source Collection
<schema>: {'auto_id': True, 'description': 'Schema for Data Source Collection', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'dense_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8192}}, {'name': 'metadata', 'description': '', 'type': <DataType.JSON: 23>}], 'enable_dynamic_field': True}

<class 'pymilvus.orm.collection.Collection'>
[DB] Insert Query FrontEnd Successfuly.
[DB] Create New Partition
[DB] Partition ndc: 16 entities


## Setup Retrieval DB

In [48]:
config ={
    "k": 4,
    "partition_names": ["ndc"],
}
chunk_retriver = core.initVectorStore(collection_name="gnode_1", partition_names=["ndc"], search_kwargs=config)



In [49]:
chunk_retriver.invoke("มาตราการ")

[Document(metadata={}, page_content=''),
 Document(metadata={}, page_content='.......................ร่าง \n....................พิมพ์ \n.....................ทาน \n....................ตรวจ \n \n.......................ร่าง \n....................พิมพ์ \n.....................ทาน \n....................ตรวจ \n \n \n \n \nNo 1006.3/ \n \n \n \n \nOffice of Natural Resources and \nEnvironmental Policy and Planning \n60/1 Soi Phibun-Wattana 7, Rama VI Rd. Samsen-Nai, Phayathai, Bangkok 10400  \nTHAILAND \nTel. / Fax.'),
 Document(metadata={}, page_content='‐2‐\xa0\n\xa0\n- Power Development Plan B.E. 2558–2579 (2015-2036) \n- Thailand Smart Grid Development Master Plan B.E. 2558-\n2579 (2015-2036) \n- Energy Efficiency Plan B.E. 2558–2579 (2015-2036)   \n- Alternative Energy Development Plan B.E. 2558–2579 (2015-\n2036) \n- Environmentally Sustainable Transport System Plan B.E.'),
 Document(metadata={}, page_content='0 2265 6690  \n \nOctober B.E. 2558 (2015) \n \nDear  Executive Secretary,  \n

## Setup LLM For Generate Synthesised Data

In [50]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] = os.getenv("OPEN_AI_API_KEY")

### LLM RAG

In [51]:
from langchain_core.prompts import ChatPromptTemplate

rag_template = """\
Use the following context to answer the user's query. If you cannot answer, please respond with 'I don't know'.

User's Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_template)

In [52]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo",temperature=0, max_tokens=4096)

recursive_rag_chain = (
    {"context" : chunk_retriver, "question" : RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [53]:
synthetic_data_chunks = text_chunks
synthetic_data_chunks_size = len(synthetic_data_chunks)

## LLM Question

In [54]:
question_prompt = """\
You are a teacher preparing a test. Please create a question that can be answered by referencing the following context.

Context:
{context}
"""

question_prompt = ChatPromptTemplate.from_template(question_prompt)
question_chain = question_prompt | llm | StrOutputParser()

## LLM Ground Truth

In [55]:
ground_truth_prompt = """\
Use the following context and question to answer this question using *only* the provided context.

Question:
{question}

Context:
{context}
"""

ground_truth_prompt = ChatPromptTemplate.from_template(ground_truth_prompt)
ground_truth_chain = ground_truth_prompt | llm | StrOutputParser()

## Create Synthesised Data

In [56]:
questions = []
ground_truths_recursive = []
contexts = []
answers = []

for chunk in synthetic_data_chunks:
    # print(chunk)
  questions.append(question_chain.invoke({"context" : chunk.page_content}))
  # contexts.append(chunk.page_content)
  # ground_truths_recursive.append(ground_truth_chain.invoke({"question" : questions[-1], "context" : str(contexts[-1])}))
  ground_truths_recursive.append(ground_truth_chain.invoke({"question" : questions[-1], "context" : chunk.page_content}))
  

  contexts.append([d.page_content for d in chunk_retriver.get_relevant_documents(questions[-1])])
  answers.append(recursive_rag_chain.invoke(questions[-1]))

In [57]:
# [d.page_content for d in chunk_retriver.get_relevant_documents("d")]

In [58]:
from datasets import load_dataset, Dataset

qagc_list = []

for question, answer, context, ground_truth in zip(questions, answers, contexts, ground_truths_recursive):
  # print("q",question)
  # print("g",ground_truth)
  # print("a",answer)
  # print("c", context)
  qagc_list.append({
      "question" : question,
      "answer" : answer,
      "contexts" : context,
      "ground_truth" : ground_truth
  })

eval_dataset = Dataset.from_list(qagc_list)
eval_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 16
})

## Evaluation with RAGAS

In [59]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate

result = evaluate(
    eval_dataset,
    metrics=[
        # retrieval
        context_precision,
        context_recall,
    # generation
        # faithfulness,
        # answer_relevancy,
    ],
)
result

Evaluating: 100%|██████████| 32/32 [00:17<00:00,  1.79it/s]


{'context_precision': 0.8212, 'context_recall': 0.9375}

In [60]:
results_df = result.to_pandas()
results_df

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall
0,Question: What is the address of the Office of...,[.......................ร่าง \n..................,Address of the Office of Natural Resources and...,The address of the Office of Natural Resources...,1.0,1.0
1,Question: Who is the Executive Secretary of th...,[0 2265 6690 \n \nOctober B.E. 2558 (2015) \n...,Answer: Ms. Christiana Figueres is the Executi...,Ms. Christiana Figueres,1.0,1.0
2,Question:\nWhat is Thailand's intended target ...,[‐1‐ \n \nSubmission by Thailand \nIntended Na...,Thailand's intended target for reducing greenh...,Thailand's intended target for reducing greenh...,1.0,1.0
3,Question: What plans were already approved or ...,"[In addition, three \nnational consultations w...",The plans already approved or in the pipeline ...,The plans already approved or in the pipeline ...,1.0,1.0
4,Question: In what years do the Power Developme...,[‐2‐ \n \n- Power Development Plan B.E. 2558–2...,"The Power Development Plan, Smart Grid Develop...",2558-2579 (2015-2036),0.805556,1.0
5,Question:\nWhat percentage of global greenhous...,[2556–2573 (2013-2030) \n- National Industrial...,Thailand represented 0.84% of global greenhous...,Thailand represented 0.84% of global greenhous...,1.0,1.0
6,Question: What percentage of electricity in Th...,[‐3‐ \n \n❑ Thailand has taken early actions i...,72%,72%,1.0,1.0
7,Question:\nWhat is the goal of the EEP in term...,[‐2‐ \n \n- Power Development Plan B.E. 2558–2...,The goal of the Energy Efficiency Plan (EEP) i...,The goal of the EEP in terms of reducing the c...,0.333333,1.0
8,Question: What are some of the major barriers ...,[❑ Major barriers to successful implementation...,Some of the major barriers to successful imple...,Some of the major barriers to successful imple...,0.75,1.0
9,"Question:\nAccording to the context, what are ...",[‐4‐ \n \nshare technology knowledge to enable...,Some of the challenges faced by Thailand in re...,Some of the challenges faced by Thailand in re...,1.0,1.0


In [61]:
# for i in results_df["retrieved_contexts"]:
#     print(len(i))

In [62]:
results_df['context_precision'].mean()

0.8211805554908564

In [63]:
results_df['context_precision'].mean()

0.8211805554908564