# Information

RAG VERSION 8.0

--------------------

Lĩnh vực : Luật doanh nghiệp Việt Nam

Embedding : keepitreal/vietnamese-sbert

Vector Database : Chroma

LLM Model : gemini-2.0-flash-exp

Input document : txt

Reranker Model : namdp-ptit/ViRanker

Keyword search : BM25


# Import Libraries

In [1]:
# Install the third-party packages used by this notebook (quiet mode).
# NOTE(review): no versions are pinned, so a later re-run may install different releases.
!pip install -q -U langchain-community
!pip install -q chromadb
!pip install -q unstructured
!pip install -q sentence-transformers
!pip install -q rank_bm25
!pip install -q -U rouge_score
!pip install -q streamlit
!pip install -q pyngrok

In [2]:
# Mount Google Drive at /content/drive (Colab only); the project folder configured later lives there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DataLoader

In [3]:
%%writefile data_loader.py
import zipfile, os

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

class DataLoader():
    """Unzip the source archives, chunk the extracted text files and build both indexes.

    Args:
        source_path_1: Iterable of paths to the zip archives to extract.
        source_path_2: Directory the archives are extracted into and the ``*.txt``
            files are loaded from.
        chroma_path: Directory in which the Chroma vector store is persisted.
        chunk_size: Maximum chunk length given to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
    """

    def __init__(self, source_path_1, source_path_2, chroma_path, chunk_size=1000, chunk_overlap=200):
        self.source_path_1 = source_path_1
        self.source_path_2 = source_path_2
        # The attribute keeps its original (misspelled) name so existing callers keep working.
        self.choroma_path = chroma_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_data(self):
        """Extract, load, split and index the documents.

        Archives that are missing or invalid are reported and skipped, so the
        remaining archives are still processed.

        Returns:
            A ``(db, bm25_retriever)`` tuple: the persisted Chroma store and the
            BM25 retriever built from the same chunks.
        """
        import shlex

        os.makedirs(self.source_path_2, exist_ok=True)

        for path in self.source_path_1:
            try:
                with zipfile.ZipFile(path, 'r') as zip_ref:
                    zip_ref.extractall(self.source_path_2)
                    print(f"Successfully unzipped '{path}' to '{self.source_path_2}'")
            except FileNotFoundError:
                print(f"Error: Zip file not found at '{path}'")
            except zipfile.BadZipFile:
                print(f"Error: Invalid zip file at '{path}'")
            except Exception as e:
                # Best effort: one broken archive must not stop the others from loading.
                print(f"An error occurred: {e}")

        loader = DirectoryLoader(self.source_path_2, glob="*.txt", show_progress=True)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
        chunks = text_splitter.split_documents(documents)

        bm25_retriever = BM25Retriever.from_documents(chunks)

        os.makedirs(self.choroma_path, exist_ok=True)

        # Quote the path so spaces or shell metacharacters cannot alter the command.
        os.system(f"chmod -R u+w {shlex.quote(str(self.choroma_path))}")

        db = Chroma.from_documents(
            chunks,
            embedding=HuggingFaceEmbeddings(model_name="keepitreal/vietnamese-sbert"),
            persist_directory=self.choroma_path,
        )

        db.persist()

        print(f"Save {len(chunks)} chunks to {self.choroma_path}")

        return db, bm25_retriever


Overwriting data_loader.py


# Hybrid Search

In [4]:
%%writefile hybrid_search.py
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

class HybridSearch:
  """Keyword (BM25), semantic (vector store) and combined retrieval over one corpus.

  Args:
    db: Vector store exposing ``as_retriever`` and ``similarity_search_with_score``.
    retriever: Keyword retriever exposing a ``k`` attribute and ``invoke``.
  """

  def __init__(self, db, retriever):
    self.db = db
    self.retriever = retriever
    self.ensemble_retriever = None

  def setup(self, top_k=None):
    """Build the ensemble retriever (weights 0.3 keyword / 0.7 semantic).

    Args:
      top_k: When given, both underlying retrievers are configured to return
        up to ``top_k`` hits. ``None`` keeps each retriever's own setting.
    """
    if top_k is None:
      chroma_retriever = self.db.as_retriever()
    else:
      self.retriever.k = top_k
      chroma_retriever = self.db.as_retriever(search_kwargs={"k": top_k})

    self.ensemble_retriever = EnsembleRetriever(
      retrievers=[self.retriever, chroma_retriever], weights=[0.3, 0.7]
    )

  def keyword_search(self, query, top_k=10):
    """Return up to ``top_k`` documents from the keyword retriever."""
    self.retriever.k = top_k
    results = self.retriever.invoke(query)
    return results

  def semantic_search(self, query, top_k=10):
    """Return up to ``top_k`` ``(document, score)`` pairs from the vector store."""
    results = self.db.similarity_search_with_score(query, k=top_k)
    return results

  def hybrid_search(self, query, top_k=10):
    """Return up to ``top_k`` documents from the ensemble of both retrievers.

    The ensemble is rebuilt for the requested ``top_k``. Without that, each
    underlying retriever returns only its own default number of hits, so the
    merged list could be shorter than the ``top_k`` the caller asked for.
    """
    self.setup(top_k)

    results = self.ensemble_retriever.invoke(query)
    return results[:top_k]



Overwriting hybrid_search.py


# Reranker

In [5]:
%%writefile reranker.py
from sentence_transformers import CrossEncoder

class Reranker:
  """Re-order retrieved documents with a cross-encoder relevance model."""

  def __init__(self, model_name="namdp-ptit/ViRanker"):
    self.model = CrossEncoder(model_name)

  def rerank(self, query, retrieved_docs, hybrid_search=True):
    """Rank documents by relevance to the query, most relevant first.

    Args:
      query: The query text scored against every document.
      retrieved_docs: Documents exposing ``page_content`` when ``hybrid_search``
        is true; otherwise sequences whose first element exposes ``page_content``
        (e.g. ``(document, score)`` pairs).
      hybrid_search: Selects which of the two input shapes is expected.

    Returns:
      The input items re-ordered by descending model score. An empty input
      returns ``[]`` without calling the model.
    """
    retrieved_docs = list(retrieved_docs)
    if not retrieved_docs:
      return []

    # Pair the query with each document's text for the cross-encoder.
    if hybrid_search:
      query_doc_pairs = [(query, doc.page_content) for doc in retrieved_docs]
    else:
      query_doc_pairs = [(query, doc[0].page_content) for doc in retrieved_docs]
    scores = self.model.predict(query_doc_pairs)

    # Sort on the score only, so the documents themselves are never compared.
    ranked_results = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked_results]

Overwriting reranker.py


# Generate functions

In [6]:
%%writefile utils_function.py

import google.generativeai as genai
import json, os
import config as cfg

from langchain.prompts import ChatPromptTemplate

def json2dict(response):
    """Parse the JSON payload of a model response into a Python object.

    The payload may be wrapped in a Markdown code fence (optionally tagged
    ``json``). Only the fence and a leading ``json`` tag are removed, so the
    word "json" inside the payload itself is left untouched.

    Args:
        response: Object exposing the reply as a ``text`` attribute.

    Returns:
        The decoded JSON value, or ``None`` when the text is not valid JSON.
    """
    import re

    raw_text = response.text

    fenced = re.search(r"```(?:json)?\s*(.*?)```", raw_text, re.DOTALL | re.IGNORECASE)
    payload = fenced.group(1) if fenced else raw_text
    # Also tolerates an unterminated fence or a bare leading "json" tag.
    payload = payload.strip().strip("`").strip()
    if payload.lower().startswith("json"):
        payload = payload[4:]

    try:
        return json.loads(payload)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic text: {raw_text}")
        return None



def __generate_answer(prompt):
    """Send ``prompt`` to the Gemini model and return the raw response object."""
    genai.configure(api_key=cfg.llm_api_key)
    return genai.GenerativeModel("gemini-2.0-flash-exp").generate_content(prompt)


def classfication_article(question, template):
  """Fill ``template`` with ``question`` and return the model's reply as text."""
  filled_prompt = ChatPromptTemplate.from_template(template).format(question=question)
  llm_response = __generate_answer(filled_prompt)
  return llm_response.text


def gen_evaluation_score(template, question, context, answer, ground_truth):
  """Ask the model to grade an answer and return the parsed result of ``json2dict``."""
  filled_prompt = ChatPromptTemplate.from_template(template).format(
      question=question,
      context=context,
      answer=answer,
      ground_truth=ground_truth,
  )
  llm_response = __generate_answer(filled_prompt)
  return json2dict(llm_response)


def gen_hypothetical_document(question, chat_history=""):
    """Ask the model for a short Vietnamese paragraph answering ``question``.

    Returns the raw model response; the prompt is prefixed with the previous
    conversation when ``chat_history`` is non-empty.
    """
    # Add the chat history to the prompt only when there is one.
    history_context = f"Previous conversation:\n{chat_history}\n" if chat_history else ""

    prompt = f"""
    {history_context}
    ------------------------
    Write a paragraph about 200 characters that answers this question, using Vietnamese only, if you can not answer, you just need to answer whatever you want.
    Question: {question}
    """
    return __generate_answer(prompt)

def format_chat_history(messages):
    """Render chat messages as ``"<Speaker>: <content>"`` lines.

    Messages whose role is ``"user"`` are labelled ``User``; every other role
    is labelled ``Assistant``. Each line ends with a newline.
    """
    rendered_lines = []
    for entry in messages:
        speaker = "Assistant" if entry["role"] != "user" else "User"
        rendered_lines.append(f"{speaker}: {entry['content']}\n")
    return "".join(rendered_lines)



def _parse_article_numbers(classifier_reply):
    """Return the article numbers listed between the first ``[`` and the next ``]``.

    Returns an empty list when the reply contains no bracketed list.
    """
    start = classifier_reply.find("[")
    end = classifier_reply.find("]", start + 1)
    if start == -1 or end == -1:
        return []
    inner = classifier_reply[start + 1:end].replace('"', '')
    return [item.strip() for item in inner.split(",") if item.strip()]


def _load_articles(article_numbers):
    """Read every existing ``/content/books/<article>_<law>.txt`` file.

    Returns a ``(contents, paths)`` pair of parallel lists.
    """
    # NOTE(review): LTTNDN_2008 is unpacked by setup.py but is absent here — confirm this is intended.
    law_list = ["LDN_2020", "LDT_2020", "LLD_2019", "LCK_2019", "NQ173_2024", "ND135_2020", "ND145_2020",]

    contents, paths = [], []
    for number in article_numbers:
        for law in law_list:
            path = f"/content/books/{number}_{law}.txt"
            if not os.path.exists(path):
                continue
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: skip an unreadable article instead of failing the whole answer.
                print(f"Could not read '{path}': {e}")
                continue
            contents.append(content)
            paths.append(path)
    return contents, paths


def _describe_source(path):
    """Build the ``[title, body]`` pair for one source file named ``<article>_<law>_<year>.txt``."""
    with open(path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Separate the lines with blank lines for display.
    body = "".join(line + "\n\n" for line in raw_text.splitlines())

    name_parts = os.path.splitext(os.path.basename(path))[0].split("_")
    article_no, law_code = name_parts[0], name_parts[1]
    law_name = cfg.law_dict[law_code]

    # Prefix test: a substring test for "ND" would also match the law code "LTTNDN".
    if law_code.startswith(("ND", "NQ")):
        title = f"{law_name} năm {name_parts[2]}"
    else:
        title = f"{law_name} : Điều {article_no}"

    return [title, body]


def gen_ans(query, template, db, reranker, hybrid_search, chat_history=None):
    """Answer ``query`` from the law corpus.

    When the classifier reports explicit article numbers and matching files
    exist, those files form the context. Otherwise a hypothetical answer is
    generated and used to retrieve and rerank chunks.

    Args:
        query: The user's question.
        template: Answer prompt with ``context``, ``question`` and ``history`` placeholders.
        db: Unused; kept for compatibility with existing callers.
        reranker: Object exposing ``rerank(query, docs)``.
        hybrid_search: Object exposing ``hybrid_search(query, top_k=...)``.
        chat_history: Previous messages as dicts with ``role`` and ``content``.

    Returns:
        ``[answer, relevant_law, details, context_text]``. ``details`` is a list
        of ``[title, body]`` pairs, or the string "Không có luật liên quan" when
        no law applies or the sources could not be described.
    """
    no_law = "Không có luật liên quan"

    formatted_history = format_chat_history(chat_history or [])

    articles = classfication_article(query, cfg.PROMPT_TEMPLATE_ARTICLE)

    content_list, source_law = [], []
    if "Không" not in articles:
        content_list, source_law = _load_articles(_parse_article_numbers(articles))

    if content_list:
        context_text = "\n\n---\n\n".join(content_list)
    else:
        # No article could be looked up directly: retrieve with a hypothetical answer.
        question = gen_hypothetical_document(query, formatted_history).text

        results = hybrid_search.hybrid_search(question, top_k=30)
        results = reranker.rerank(question, results)[:10]

        source_law = []
        for doc in results:
            source = doc.metadata['source']
            if source not in source_law:
                source_law.append(source)

        context_text = "\n\n---\n\n".join([doc.page_content for doc in results])

    prompt_template = ChatPromptTemplate.from_template(template)
    prompt = prompt_template.format(context=context_text, question=query, history=formatted_history)

    response_dict = json2dict(__generate_answer(prompt))

    # json2dict returns None on invalid JSON; report an error answer instead of crashing.
    if not isinstance(response_dict, dict) or "answer" not in response_dict:
        return ["Đã có lỗi xảy ra, vui lòng thử lại", no_law, no_law, context_text]

    answer = response_dict["answer"]
    relevent_law = response_dict.get("relevent_law", no_law)

    if relevent_law == no_law:
        return [answer, no_law, no_law, context_text]

    try:
        details_law_list = [_describe_source(path) for path in source_law]
    except Exception as e:
        # Keep the answer even when a source file cannot be read or its name cannot be parsed.
        print(f"Could not build law details: {e}")
        return [answer, no_law, no_law, context_text]

    return [answer, relevent_law, details_law_list, context_text]


def read_list_question(data_path):
    """Read one question per line from ``data_path``.

    Blank (whitespace-only) lines are skipped. The original ``len(line) != 0``
    test could never skip them, because lines read from a file keep their line
    terminator. The terminator is removed from each returned question.

    Args:
        data_path: Path of the UTF-8 text file to read.

    Returns:
        List of non-blank questions in file order.
    """
    question_list = []

    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                question_list.append(line.rstrip("\r\n"))

    return question_list

Overwriting utils_function.py


# SETUP

## Set up Global Variable

In [7]:
%%writefile config.py

# Answer-generation prompt. It is filled with `history`, `question` and `context`, and asks the
# model to reply in Vietnamese as JSON with the keys "answer" and "relevent_law".
# NOTE(review): requirement number 3 appears twice in the list below — confirm the intended numbering.
PROMPT_TEMPLATE =\
"""
You are an assistant with the role of legal consultant in the field of Vietnamese Business Law, your task is to answer questions from users in the most complete and detailed way from the context in the "`` `" and question in the "'''", your answer should follow these requirement:
1. you need to answer in Vietnamese only.
2. answer in json format, first key is "answer", second key is "relevent_law".
3. In "relevent_law" you need to indicate which law your "answer" belongs to in Vietnamese business law (including relevant articles and clauses).
3. if you can not answer question, in "answer" key, answer "Đã có lỗi xảy ra, vui lòng thử lại".
4. If you can not find any law in context, in "relevent_law", answer "Không có luật liên quan".
5. You will also provided chat history from previous questions in "###"

There is some sample answer:

Question : "Luật doanh nghiệp Việt Nam là gì?"

Answer in json format:

"answer" : "...."
"relevent_law" : "..."

:

###
Chat history:
{history}
###


'''
 {question}
'''

 ```
 {context}
  ```

 """



# Classifier prompt: asks the model which article numbers, if any, a question explicitly refers to.
# The few-shot examples answer either with a list of article numbers or with "Không".
# `question` is the only placeholder.
PROMPT_TEMPLATE_ARTICLE = \
"""
Please let me know if there are any articles in the following text.
Only mention things that are affirmative. If the articles is mentioned but has an exclusionary nature, there is no need to list it.
If question relating to decrees or resolutions, you should skip too.
I have some example for you
-------------------------------------

"Câu hỏi" : "Điều 12 là gì"

"Trả lời" : ["12"]


"Câu hỏi": "Điều 23 của luật doanh nghiệp việt nam là gì"

"Trả lời" : ["23"]


"Câu hỏi" : "Vấn đề này nói gì điều này"

"Trả lời" : "Không"


"Câu hỏi" : "Điều này là điều gì?"

"Trả lời" : "Không"


"Câu hỏi" : "Cô muốn nói điều gì với tôi?"

"Trả lời" : "Không"


"Câu hỏi": "Những điều khoản nào nói về việc thành lập doanh nghiêp"

"Trả lời" : "Không"


"Câu hỏi": "Điều 12 và điều 25 là gì?"

"Trả lời" : ["12", "25"]


"Câu hỏi": "Khoản 2 điều 17 là gì?"

"Trả lời" : ["17"]


"Câu hỏi": "Ngoài điều 13 còn điều nào nói về vấn đề này không?"

"Trả lời" : "Không"


"Câu hỏi": "Điều nào nói về luật thành lập doanh nghiệp, ngoại trừ điều 20"

"Trả lời" : "Không"


"Câu hỏi": "Phụ lục 1 nói về điều gì?"

"Trả lời" : "Không"


"Câu hỏi": "Nghị quyết 173 quy định điều gì?"

"Trả lời" : "Không"

"Câu hỏi": "Nghị quyết 135 là gì?"

"Trả lời" : "Không"

-------------------------------------------------------

Sau đây là câu hỏi của tôi :

"Câu hỏi": {question}

You should answer in json format with is key is "articles" and do not need to explain any else.


"""




# LLM-as-judge prompt filled with `question`, `context`, `answer` and `ground_truth`.
# NOTE(review): the sample output below has no enclosing braces or commas, so a reply that copies
# it literally would not parse with json.loads — confirm the model's actual output format.
PROMPT_TEMPLATE_EVALUATION = \
"""
Evaluate the following RAG-generated answer based on the provided question, context, and ground truth answer.

### Input Data:
Question: {question}

Context: {context}

Answer: {answer}

Ground truth answer : {ground_truth}


### Evaluation Criteria (Score: 1-100):
Accuracy: Does the generated answer correctly align with the ground truth answer and the given context?
Completeness: Does the answer provide all the necessary information and fully capture the key details provided in the ground truth answer?
Clarity: Is the generated answer well-structured, easy to understand, and free of ambiguity?
Naturalness: Does the generated answer sound fluent and human-like, rather than robotic or unnatural?


### Output Format (JSON):
Return only the scores in the following JSON format, without additional reasoning,:

```json
"accuracy" : 100
"completeness" : 50
"clarity" : 60
"naturalness" : 20
"total" : 58
```
"""

# Display name for each law code.
law_dict = dict(
    LDN="Luật Doanh Nghiệp",
    LLD="Luật Lao Động",
    LDT="Luật Đầu Tư",
    LCK="Luật Chứng Khoán",
    LTTNDN="Luật Thuế Thu Nhập Doanh Nghiệp",
    NQ173="Nghị Quyết 173",
    ND135="Nghị Định 135",
    ND145="Nghị Định 145",
)


import os

# Path to the uploaded project folder on Google Drive (including the folder name).
repo_folder = "/content/drive/MyDrive/TDTU/project"

# Credentials come from the environment so that no secret is stored in the notebook.
# Set GOOGLE_API_KEY and NGROK_AUTHTOKEN before running. The keys that used to be
# hard-coded here should be treated as leaked and revoked.
llm_api_key = os.environ.get("GOOGLE_API_KEY", "")


ngrok_api_key = os.environ.get("NGROK_AUTHTOKEN", "")




# Folder holding the zipped law documents.
root_folder = repo_folder + "/source/"


# Folder holding the files used for evaluation.
evaluation_folder = repo_folder + "/evaluation/"


# Shared pipeline objects; they stay None until setup.setup() assigns them.
db = None
bm25_retriever = None
reranker = None
hybrid_search = None

Overwriting config.py


## Set up dataset

In [8]:
%%writefile setup.py

import data_loader as dl
import hybrid_search as hs
import reranker as rk
import config as cfg


def setup():
    """Build the data loader, indexes, reranker and hybrid searcher and store them on ``config``.

    The archives listed below are read from ``cfg.root_folder``, extracted to
    ``/content/books`` and indexed into ``/content/chroma``.
    """
    law_file_name = [
        "LDN_2020", "LDT_2020", "LCK_2019", "LLD_2019",
        "LTTNDN_2008", "NQ173_2024", "ND135_2020", "ND145_2020",
    ]
    source_file = [f"{cfg.root_folder}{law}.zip" for law in law_file_name]

    destination_folder = '/content/books'
    CHROMA_PATH = "/content/chroma"
    chunk_size, chunk_overlap = 4096, 512

    data_loader = dl.DataLoader(source_file, destination_folder, CHROMA_PATH, chunk_size, chunk_overlap)
    db, bm25_retriever = data_loader.load_data()
    reranker = rk.Reranker()
    hybrid_search = hs.HybridSearch(db, bm25_retriever)

    # Publish the objects on the config module for the other modules to read.
    cfg.db, cfg.bm25_retriever = db, bm25_retriever
    cfg.reranker = reranker
    cfg.hybrid_search = hybrid_search

Overwriting setup.py


In [9]:
import setup
import nltk
# Fetch the NLTK resources before building the indexes.
# NOTE(review): presumably needed by the text loader used in data_loader.py — confirm.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Build the vector store, BM25 retriever, reranker and hybrid searcher and store them on `config`.
setup.setup()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/LDN_2020.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/LDT_2020.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/LCK_2019.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/LLD_2019.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/LTTNDN_2008.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/NQ173_2024.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/ND135_2020.zip' to '/content/books'
Successfully unzipped '/content/drive/MyDrive/TDTU/project/source/ND145_2020.zip' to '/content/books'


100%|██████████| 826/826 [00:14<00:00, 55.73it/s]
  db = Chroma.from_documents(chunks, embedding = HuggingFaceEmbeddings(model_name="keepitreal/vietnamese-sbert"),\
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  db.persist()


Save 888 chunks to /content/chroma


config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

# Streamlit UI

## Support function

In [10]:
%%writefile support_function.py

import streamlit as st
from datetime import datetime



def new_chat(timestamp):
    """Save the current chat under ``timestamp`` and start an empty one.

    The new chat gets the current time (``%Y-%m-%d %H:%M:%S``) as its id.
    """
    save_chat(timestamp)

    state = st.session_state
    state.messages = []
    state.law = None
    state.details_law = None
    state.current_chat = datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def get_title(messages, timestamp):
    """Return a chat title: the first user message, cut to 30 characters plus "...".

    Falls back to ``timestamp`` when the chat has no user message.
    """
    for msg in messages:
        if msg["role"] != "user":
            continue
        text = msg["content"]
        return text[:30] + "..." if len(text) > 30 else text
    return timestamp


def save_chat(timestamp):
    """Store the current chat in ``st.session_state.history`` under ``timestamp``.

    Chats with at most one message are not saved.
    """
    state = st.session_state
    if len(state.messages) <= 1:
        return

    chat_title = get_title(state.messages, timestamp)

    # Store a copy of the message list so later appends do not alter the saved entry.
    state.history[timestamp] = dict(
        messages=state.messages.copy(),
        law=state.law,
        details_law=state.details_law,
        title=chat_title,
    )


def load_chat(timestamp):
    """Make the saved chat ``timestamp`` the current one and restore its state."""
    state = st.session_state
    state.current_chat = timestamp
    saved_chat = state.history[timestamp]
    for field in ("messages", "law", "details_law"):
        setattr(state, field, saved_chat[field])




Writing support_function.py


## Main app

In [11]:
%%writefile app.py

import streamlit as st
import utils_function as uf
import config as cfg
import setup
import support_function as sf
import time
from datetime import datetime


import html


def _justified(text):
    """Return ``text`` as a justified HTML paragraph, escaped so it cannot inject markup."""
    return f'<p style="text-align: justify;">{html.escape(str(text))}</p>'


# Make every button fill the width of its container.
st.markdown(
    """
    <style>
    .stButton>button {
        width: 100%;
    }
    </style>
    """,
    unsafe_allow_html=True
)


# Build the retrieval stack only when config does not hold one yet.
if cfg.db is None:
  setup.setup()

db = cfg.db
bm25_retriever = cfg.bm25_retriever
reranker = cfg.reranker
hybrid_search = cfg.hybrid_search
PROMPT_TEMPLATE = cfg.PROMPT_TEMPLATE


st.title("⚖️ Chatbot Tư Vấn Pháp Luật")

# Initialise the session state on first use.
if "messages" not in st.session_state:
    st.session_state.messages = []

if "history" not in st.session_state:
    st.session_state.history = {}

if "current_chat" not in st.session_state:
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.current_chat = timestamp


# An empty chat always starts with the assistant's greeting.
if len(st.session_state.messages) == 0:
    st.session_state.messages = [
          {"role": "assistant", "content": "Xin chào, tôi là chatbot tư vấn về Luật Doanh Nghiệp Việt Nam, tôi có thể giúp gì cho bạn?"}
      ]

if "law" not in st.session_state:
    st.session_state.law = None

if "details_law" not in st.session_state:
    st.session_state.details_law = None


# Replay the conversation so far.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(_justified(message["content"]), unsafe_allow_html=True)


prompt = st.chat_input("Xin chào! Tôi có thể giúp gì cho bạn?")

if prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(_justified(prompt), unsafe_allow_html=True)

    # The history passed to the model excludes the question that was just appended.
    response = uf.gen_ans(prompt, PROMPT_TEMPLATE, db, reranker, hybrid_search, st.session_state.messages[:-1])

    answer = response[0]
    st.session_state.law = response[1]
    st.session_state.details_law = response[2]

    st.session_state.messages.append({"role": "assistant", "content": answer})

    with st.chat_message("assistant"):
        st.markdown(_justified(answer), unsafe_allow_html=True)


# Sidebar
with st.sidebar:

    if len(st.session_state.messages) != 1:
      refresh_button = st.button("Đoạn chat mới")

      if refresh_button:
        sf.new_chat(st.session_state.current_chat)
        # The chat pane above was drawn before the state changed; rerun to refresh it.
        st.rerun()

    if st.session_state.law is not None:

        if st.session_state.law != "Không có luật liên quan":
            st.divider()
            with st.expander("Luật liên quan"):
                st.markdown(_justified(st.session_state.law), unsafe_allow_html=True)

            if st.session_state.details_law != "Không có luật liên quan":
              st.markdown(_justified("Văn bản ngữ cảnh"), unsafe_allow_html=True)

              for article in st.session_state.details_law:
                with st.expander(f"{article[0]}"):
                    st.markdown(_justified(article[1]), unsafe_allow_html=True)

    # Iterate over a copy: new_chat() below may add an entry to the history.
    saved_chats = st.session_state.history.copy()
    current_chat = st.session_state.current_chat
    if len(saved_chats) != 0:
        st.divider()
        st.subheader("Previous Chats")
        for timestamp, chat_data in reversed(saved_chats.items()):
            if st.button(f"{chat_data['title']}", key=timestamp, type="secondary" if timestamp != current_chat else "primary"
            ):
                sf.new_chat(current_chat)
                sf.load_chat(timestamp)
                # Redraw the page so the loaded conversation is shown straight away.
                st.rerun()



Writing app.py


# Run app

In [12]:
import streamlit as st
import os
import config as cfg

from pyngrok import ngrok, exception
# Authenticate the ngrok client with the token stored in config.
ngrok.set_auth_token(cfg.ngrok_api_key)


# Launch the Streamlit app in the background; its output is redirected to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &


tunnel_config = {
    "addr": 8501,  # Port to expose
    "proto": "http",  # Protocol (http or tcp)
    # Add other options as needed
}
# Open the tunnel and print the public URL it returns.
public_url = ngrok.connect(**tunnel_config)
print("Ngrok URL:", public_url)

Ngrok URL: NgrokTunnel: "https://aa0a-34-125-94-25.ngrok-free.app" -> "http://localhost:8501"


# Evaluation

## Evaluation class

In [13]:
import re
from rouge_score import rouge_scorer
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

class Evaluation:
  """Load evaluation files and score generated answers against ground truth with ROUGE and BLEU."""

  def get_data(self, path):
    """Return the full content of the UTF-8 text file at ``path``."""
    with open(path, "r", encoding="utf-8") as f:
      return f.read()

  def remove_special_chars(self, text):
    """Strip every character that is neither a word character, whitespace nor in the listed Unicode ranges."""
    return re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u3000-\u303f\ufb00-\ufb06\ufb13-\ufb17]+', '', text)

  def _split_records(self, raw_text):
    """Split ``<<...>>``-delimited text into one record per entry, joining wrapped lines."""
    flattened = raw_text.replace("\n", "").replace(">>", "\n").replace("<<", "")
    return flattened.strip().split('\n')

  def data_loader(self, question_path, answer_path, context_path, truth_path):
    """Load the four evaluation files.

    Questions are read one per line. Answers, contexts and truths are read as
    ``<<...>>`` records that may span several lines. The truth file is split
    the same way as the others, so the returned truths no longer carry a
    trailing ``>>``.

    Returns:
      ``([questions, answers, contexts, truths], df)``. ``df`` has the columns
      ``question``, ``answer`` and ``truth``, is cleaned with
      ``remove_special_chars`` and is truncated to the shortest of those three lists.
    """
    questions = self.get_data(question_path).strip().split('\n')
    answers = self._split_records(self.get_data(answer_path))
    contexts = self._split_records(self.get_data(context_path))
    truths = self._split_records(self.get_data(truth_path))

    min_len = min(len(questions), len(answers), len(truths))
    data = [
      {'question': questions[i], 'answer': answers[i], 'truth': truths[i]}
      for i in range(min_len)
    ]

    df = pd.DataFrame(data)

    # DataFrame.map only exists in newer pandas releases; fall back to applymap otherwise.
    apply_elementwise = getattr(df, "map", None) or df.applymap
    df = apply_elementwise(self.remove_special_chars)

    return [questions, answers, contexts, truths], df

  def calculate_rouge(self, generated, reference, scorer):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures of ``generated`` against ``reference``."""
    scores = scorer.score(reference, generated)
    return {
        "rouge1": scores['rouge1'].fmeasure,
        "rouge2": scores['rouge2'].fmeasure,
        "rougeL": scores['rougeL'].fmeasure
    }

  def calculate_bleu(self, generated, reference):
    """Return the sentence-level BLEU score using whitespace tokenisation."""
    reference = [reference.split()]  # Reference should be a list of lists of tokens
    generated = generated.split()
    return sentence_bleu(reference, generated)

  def evaluation(self, df):
    """Add ROUGE and BLEU columns to ``df`` in place and print their averages.

    ``df`` needs the columns ``answer`` and ``truth``.
    """
    # NOTE(review): confirm that the scorer's default tokenizer and stemmer suit Vietnamese text.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    df['rouge_scores'] = df.apply(
      lambda row: self.calculate_rouge(row['answer'], row['truth'], scorer),
      axis=1)

    df['rouge1'] = df['rouge_scores'].apply(lambda x: x['rouge1'])
    df['rouge2'] = df['rouge_scores'].apply(lambda x: x['rouge2'])
    df['rougeL'] = df['rouge_scores'].apply(lambda x: x['rougeL'])

    average_rouge1 = df['rouge1'].mean()
    average_rouge2 = df['rouge2'].mean()
    average_rougeL = df['rougeL'].mean()

    print(f"Average ROUGE-1: {average_rouge1:.4f}")
    print(f"Average ROUGE-2: {average_rouge2:.4f}")
    print(f"Average ROUGE-L: {average_rougeL:.4f}")

    df['bleu_score'] = df.apply(
        lambda row: self.calculate_bleu(row['answer'], row['truth']),
        axis=1
    )

    average_bleu = df['bleu_score'].mean()
    print(f"\n\nAverage BLEU Score: {average_bleu:.4f}")


In [14]:
# Evaluator instance used by the evaluation cells that follow.
evaluator = Evaluation()

## RAG with Semantic Search

In [15]:
import config as cfg

# Score the semantic-search run against the shared questions and ground truth.
question_path, answer_path, context_path, truth_path = (
    f"{cfg.evaluation_folder}{file_name}"
    for file_name in ("question.txt", "answer_sematic.txt", "context_sematic.txt", "truth.txt")
)

evaluation_data_sematic, df_sematic = evaluator.data_loader(
    question_path, answer_path, context_path, truth_path
)

evaluator.evaluation(df_sematic)

Average ROUGE-1: 0.7175
Average ROUGE-2: 0.6191
Average ROUGE-L: 0.6181


Average BLEU Score: 0.4597


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## RAG with Hybrid Search

In [16]:
import config as cfg


# Score the hybrid-search run against the shared questions and ground truth.
question_path, answer_path, context_path, truth_path = (
    f"{cfg.evaluation_folder}{file_name}"
    for file_name in ("question.txt", "answer_hybrid.txt", "context_hybrid.txt", "truth.txt")
)

evaluation_data_hybrid, df_hybrid = evaluator.data_loader(
    question_path, answer_path, context_path, truth_path
)

evaluator.evaluation(df_hybrid)


Average ROUGE-1: 0.7347
Average ROUGE-2: 0.6568
Average ROUGE-L: 0.6657


Average BLEU Score: 0.4546


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## RAG with Semantic Search + HyDE

In [17]:
import config as cfg

# Score the semantic-search + HyDE run against the shared questions and ground truth.
question_path, answer_path, context_path, truth_path = (
    f"{cfg.evaluation_folder}{file_name}"
    for file_name in ("question.txt", "answer_sematic_hyde.txt", "context_sematic_hyde.txt", "truth.txt")
)

evaluation_data_sematic_hyde, df_sematic_hyde = evaluator.data_loader(
    question_path, answer_path, context_path, truth_path
)

evaluator.evaluation(df_sematic_hyde)

Average ROUGE-1: 0.7558
Average ROUGE-2: 0.6311
Average ROUGE-L: 0.6379


Average BLEU Score: 0.4568


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## RAG with Hybrid Search + HyDE

In [18]:
import config as cfg

# Score the hybrid-search + HyDE run against the shared questions and ground truth.
question_path, answer_path, context_path, truth_path = (
    f"{cfg.evaluation_folder}{file_name}"
    for file_name in ("question.txt", "answer_hybrid_hyde.txt", "context_hybrid_hyde.txt", "truth.txt")
)

evaluation_data_hybrid_hyde, df_hybrid_hyde = evaluator.data_loader(
    question_path, answer_path, context_path, truth_path
)

evaluator.evaluation(df_hybrid_hyde)

Average ROUGE-1: 0.7756
Average ROUGE-2: 0.6643
Average ROUGE-L: 0.6692


Average BLEU Score: 0.4948


## Fine-tuned model

In [19]:
import config as cfg

# Score the fine-tuned model's answers; this cell reads the semantic-search context file.
question_path, answer_path, context_path, truth_path = (
    f"{cfg.evaluation_folder}{file_name}"
    for file_name in ("question.txt", "answer_ft.txt", "context_sematic.txt", "truth.txt")
)

evaluation_data_finetune, df_finetune = evaluator.data_loader(
    question_path, answer_path, context_path, truth_path
)

evaluator.evaluation(df_finetune)

Average ROUGE-1: 0.2681
Average ROUGE-2: 0.1413
Average ROUGE-L: 0.1972


Average BLEU Score: 0.0118


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [20]:
# total = []
# all_total = []

In [21]:
# import utils_function as uf
# import config as cfg
# import importlib
# import time

# importlib.reload(uf)


# for i in range(0, len(evaluation_data_noHyDE_noHybrid[0])):

#     question = evaluation_data_noHyDE_noHybrid[0][i]
#     context = evaluation_data_noHyDE_noHybrid[2][i]
#     answer = evaluation_data_noHyDE_noHybrid[1][i]
#     ground_truth = evaluation_data_noHyDE_noHybrid[3][i]

#     response = uf.gen_evaluation_score(cfg.PROMPT_TEMPLATE_EVALUATION,question, context, answer, ground_truth)

#     print(f"{i}: {response}")

#     time.sleep(6)



#     total.append(response['total'])
#     all_total.append([response['accuracy'], response['completeness'],response['clarity'],response['naturalness'],response['total']])



In [22]:
# sum = [0,0,0,0,0]

# for i in range(len(all_total)):
#   for j in range(5):
#     sum[j] += all_total[i][j]

# for i in range(5):
#   print(sum[i]/len(all_total))

