In [1]:
Hyde_sys_prompt = """
# Character
You're a financial answer guesser. Even without the latest news or a complete knowledge base, your can produce a simple answer utilizing placeholders such as "xxx", "xxx%", "$xxx", and so on for unknown elements.

## Constraints:
- Aim to provide the simple reply, even when the latest information is not readily available.
- The number of the letter of the answer is limited to 30 words.

## Example
#Example1
--------
Question: What will Apple's market value be in 2023?
Reply: Apple's market cap in 2023 is $xx billion.

#Example2
--------
Question: How much does Meta's AI investment increase in 2023 compared to 2022?
Reply: Meta will invested $xx billion in AI in 2023 and $xx billion in 2022, an increase of xx%.

"""

QA_sys_prompt = """
# Character
You're a skilled chatbot, capable of extracting relevant information from retrieved documents. When a user poses a question, you answer it as though you have firsthand knowledge rather than referencing a document.

## Skills
### Skill 1: Answer questions about the retrieved document
- Understand the user's question.
- Analyze the retrieved document to find relevant information.

### Skill 2: Reply when unable to answer
- If the question can't be answered based on the document, respond, "Sorry, I do not have an accurate answer for this."

## Constraints
- Mimic the tone and language used by a chatbot.
- Do not reference any document or outside source in your answers.
- If no accurate answer can be provided, be honest and inform the user.
"""

QA_user_prompt = """
## Reference document
{}

Answer this query: {} Make the answer short and clean.
"""

verifier_sys_prompt = """## Role: Answer verifier

## Goal
You can judge whether the answer is correct or not. 

## Rule
- If the key information predicted answer is same as the ground truth answer, then the answer is correct.
- If the response is "Sorry, I do not have an accurate answer for this.", it means the answer can not be found, then the answer can be treated as correct.

## Output format
{{
"reason": "fill the reason why the predicted answer is wrong (False) or correct (True).", 
"answer": True or False
}}
"""

verifier_user_prompt = """
The question is: {}
Ground truth is: {}
Predicted answer is: {}
"""


In [2]:
def gpt_llm(system_prompt, user_prompt):
    from openai import OpenAI

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        top_p = 0.9
    )
    return response.choices[0].message.content

# gpt_llm("You are a helpful assistant.", "Who are you?")

In [3]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import glob
from tqdm import tqdm
import json
import numpy as np
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
from langchain_community.retrievers import BM25Retriever
import pickle
from langchain.retrievers import EnsembleRetriever

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file
qa_full_dataset_name = "QA_dataset_v2"  # Define dataset name
TOP_K = 5

def save_pk(chunks, file_path):
    with open(file_path, "wb") as file:
        pickle.dump(chunks, file)

def load_pk(file_path):
    with open(file_path, "rb") as file:
        return pickle.load(file)

def load_doc_chunks(chunk_size, qa_full_dataset_name):
    
    chunks = []
    file_path = f"./chunks/chunks_{chunk_size}_{qa_full_dataset_name}.pkl" 
    
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_size//5,
        length_function=len
    )
    try:
        # Check if the chunks file already exists
        chunks = load_pk(file_path)
        print("Loaded chunks from existing file. ", file_path)
    except FileNotFoundError:
        # If the file does not exist, process the PDFs
        chunks = []
        for pdf in tqdm(glob.glob("./docs/*.pdf")):
            pdf_reader = PdfReader(pdf)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() if page.extract_text() else ""
            chunks += text_splitter.split_text(text)
        
        # Save the chunks to a file after processing
        save_pk(chunks, file_path)
        print("Saved new chunks to file. ", file_path)
    return chunks
        
def kb_initialization(model_names, chunk_size):
    
    chunks = []   
    retrievers = []
    for model_name in model_names:
        
        # define retriever saving path
        index_filename = f"faiss_index_cs-{chunk_size}_" + model_name.split("/")[-1]
        index_path = "./faiss/" + index_filename
    
        # define embeddings
        if model_name == "text-embedding-ada-002":
            embeddings = OpenAIEmbeddings(model=model_name)
        elif model_name != "BM25":
            embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs = {'device': 'cuda:0'},encode_kwargs = {'normalize_embeddings': True})

        # load retriever
        if not os.path.exists(index_path):
            if chunks == []:
                chunks = load_doc_chunks(chunk_size, qa_full_dataset_name)
            if model_name == "BM25":
                retriever = BM25Retriever.from_texts(chunks, metadatas=[{"source": 1}] * len(chunks))
                retriever.k = TOP_K
                save_pk(retriever, index_path)
            else:
                faiss_vectorstore = FAISS.from_texts(chunks, embeddings)
                faiss_vectorstore.save_local(index_path)
                retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": TOP_K})
        else:
            if model_name == "BM25":
                retriever = load_pk(index_path)
                retriever.k = TOP_K
            else:
                faiss_vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
                retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": TOP_K})
        retrievers.append(retriever)
        
    # initialize the ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=retrievers, weights=[1/len(retrievers) for _ in retrievers]
    )
    return ensemble_retriever

def get_response_and_evaluation(data_loaded, knowledge_base):
    
    correctness = []
    results = []
    for item in tqdm(data_loaded[:]):

        try:
            pdf = item["filename"]
            ques= [item["question_1"] , item["question_2"], item["question_3"]]
            anss = [item["answer_1"], item["answer_2"], item["answer_3"]]
        except:
            print("Key error, please check. ", item)
            continue
        
        for query, answer in zip(ques, anss):
            
            # Hyde
            response_for_retrieval = gpt_llm(Hyde_sys_prompt, query)
            docs = knowledge_base.invoke(response_for_retrieval)

            QA_prompt = QA_user_prompt.format("\n".join([docs[i].page_content for i in range(len(docs))]), query)
            response = gpt_llm(QA_sys_prompt, QA_prompt)
            verified_output, verified_bool = response_evaluation(query, answer, response)
            
            print("======================")
            print(f"Query: {query}")
            print(f"Ans: {answer}")
            print(f"Res: {response}")
            print(f"Correct or not: {verified_bool}")
            correctness.append(verified_bool)
            results.append([pdf, query, answer, response, verified_output, verified_bool])
            
    return correctness, results

def response_evaluation(query, answer, response):
    verifier_prompt = verifier_user_prompt.format(query, answer, response)
    verified_output = gpt_llm(verifier_sys_prompt, verifier_prompt)
    print("verified_output: ", verified_output)
    verified_bool = verified_output.split('"answer": ')[-1]
    if "True" in verified_bool:
        verified_bool = 1
    else:
        verified_bool = 0
    return verified_output, verified_bool

def post_response_evaluation(df):
    
    responses = df.loc[:, "response"].values
    verified_output = df.loc[:, "verified_output"].values
    
    judged_res = []
    for res, v_o in zip(responses, verified_output):
        verified_bool = v_o.split('"answer": ')[-1]
        if "True" in verified_bool:
            verified_bool = 1
        else:
            verified_bool = 0
        if "Sorry" in res:
            verified_bool = 0
        judged_res.append(verified_bool)
    return judged_res

model_names = ["BM25", "mixedbread-ai/mxbai-embed-large-v1", "BAAI/bge-large-en-v1.5"]
chunk_sizes = [500, 1000, 1500]

os.makedirs("./results", exist_ok=True)
os.makedirs("./faiss", exist_ok=True)
os.makedirs("./BM25", exist_ok=True)
os.makedirs("./chunks", exist_ok=True)

acc_scores = []
for chunk_size in chunk_sizes:
    
    n = "fused_hybe"
    result_csv = f"./results/qa_full-v2_cs-{chunk_size}_{n}.xlsx"
    if os.path.exists(result_csv):
        df = pd.read_excel(result_csv)
        correctness = df.loc[:,"verified_bool"].values
        
        judged_res = post_response_evaluation(df)
        acc_scores.append(np.mean(judged_res))
        continue
    
    ## Create or load kb
    knowledge_base = kb_initialization(model_names, chunk_size)
    
    ## Get output
    # load qa dataset
    json_filename = "QA_dataset_v2.json"
    with open(json_filename, 'r') as file:
        data_loaded = json.load(file)

    ## Evaluation
    correctness, results = get_response_and_evaluation(data_loaded, knowledge_base)
        
    ## Save results to csv file
    df = pd.DataFrame(results, columns = ["filename", "query", "answer", "response", "verified_output", "verified_bool"])
    df.to_excel(result_csv, index=False)
    
    ## Re-calculate the accuracy
    judged_res = post_response_evaluation(df)
    acc_scores.append(np.mean(judged_res))

df_res = pd.DataFrame([acc_scores], index=model_names, columns=chunk_sizes)
print(df_res)     
        
        

                                    500      1000     1500
BM25                                 1.0  0.97619  0.97619
mixedbread-ai/mxbai-embed-large-v1   1.0  0.97619  0.97619
BAAI/bge-large-en-v1.5               1.0  0.97619  0.97619


In [4]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import glob
from tqdm import tqdm
import json
import numpy as np
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
from langchain_community.retrievers import BM25Retriever
import pickle
from langchain.retrievers import EnsembleRetriever

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file
qa_full_dataset_name = "QA_dataset_v2"  # Define dataset name
TOP_K = 5

def save_pk(chunks, file_path):
    with open(file_path, "wb") as file:
        pickle.dump(chunks, file)

def load_pk(file_path):
    with open(file_path, "rb") as file:
        return pickle.load(file)

def load_doc_chunks(chunk_size, qa_full_dataset_name):
    
    chunks = []
    file_path = f"./chunks/chunks_{chunk_size}_{qa_full_dataset_name}.pkl" 
    
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_size//5,
        length_function=len
    )
    try:
        # Check if the chunks file already exists
        chunks = load_pk(file_path)
        print("Loaded chunks from existing file. ", file_path)
    except FileNotFoundError:
        # If the file does not exist, process the PDFs
        chunks = []
        for pdf in tqdm(glob.glob("./docs/*.pdf")):
            pdf_reader = PdfReader(pdf)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() if page.extract_text() else ""
            chunks += text_splitter.split_text(text)
        
        # Save the chunks to a file after processing
        save_pk(chunks, file_path)
        print("Saved new chunks to file. ", file_path)
    return chunks
        
def kb_initialization(model_names, chunk_size):
    
    chunks = []   
    retrievers = []
    for model_name in model_names:
        
        # define retriever saving path
        index_filename = f"faiss_index_cs-{chunk_size}_" + model_name.split("/")[-1]
        index_path = "./faiss/" + index_filename
    
        # define embeddings
        if model_name == "text-embedding-ada-002":
            embeddings = OpenAIEmbeddings(model=model_name)
        elif model_name != "BM25":
            embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs = {'device': 'cuda:0'},encode_kwargs = {'normalize_embeddings': True})

        # load retriever
        if not os.path.exists(index_path):
            if chunks == []:
                chunks = load_doc_chunks(chunk_size, qa_full_dataset_name)
            if model_name == "BM25":
                retriever = BM25Retriever.from_texts(chunks, metadatas=[{"source": 1}] * len(chunks))
                retriever.k = TOP_K
                save_pk(retriever, index_path)
            else:
                faiss_vectorstore = FAISS.from_texts(chunks, embeddings)
                faiss_vectorstore.save_local(index_path)
                retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": TOP_K})
        else:
            if model_name == "BM25":
                retriever = load_pk(index_path)
                retriever.k = TOP_K
            else:
                faiss_vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
                retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": TOP_K})
        retrievers.append(retriever)
        
    # initialize the ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=retrievers, weights=[1/len(retrievers) for _ in retrievers]
    )
    return ensemble_retriever

def get_response_and_evaluation(data_loaded, knowledge_base):
    
    correctness = []
    results = []
    for item in tqdm(data_loaded[:]):

        try:
            pdf = item["filename"]
            ques= [item["question_1"] , item["question_2"], item["question_3"]]
            anss = [item["answer_1"], item["answer_2"], item["answer_3"]]
        except:
            print("Key error, please check. ", item)
            continue
        
        for query, answer in zip(ques, anss):
            
            # Hyde
            response_for_retrieval = gpt_llm(Hyde_sys_prompt, query)
            docs = knowledge_base.invoke(response_for_retrieval)

            QA_prompt = QA_user_prompt.format("\n".join([docs[i].page_content for i in range(len(docs))]), query)
            response = gpt_llm(QA_sys_prompt, QA_prompt)
            verified_output, verified_bool = response_evaluation(query, answer, response)
            
            print("======================")
            print(f"Query: {query}")
            print(f"Ans: {answer}")
            print(f"Res: {response}")
            print(f"Correct or not: {verified_bool}")
            correctness.append(verified_bool)
            results.append([pdf, query, answer, response, verified_output, verified_bool])
            
    return correctness, results

def response_evaluation(query, answer, response):
    verifier_prompt = verifier_user_prompt.format(query, answer, response)
    verified_output = gpt_llm(verifier_sys_prompt, verifier_prompt)
    print("verified_output: ", verified_output)
    verified_bool = verified_output.split('"answer": ')[-1]
    if "True" in verified_bool:
        verified_bool = 1
    else:
        verified_bool = 0
    return verified_output, verified_bool

def post_response_evaluation(df):
    
    responses = df.loc[:, "response"].values
    verified_output = df.loc[:, "verified_output"].values
    
    judged_res = []
    for res, v_o in zip(responses, verified_output):
        verified_bool = v_o.split('"answer": ')[-1]
        if "True" in verified_bool:
            verified_bool = 1
        else:
            verified_bool = 0
        if "Sorry" in res:
            verified_bool = 0
        judged_res.append(verified_bool)
    return judged_res

model_names = ["BM25", "mixedbread-ai/mxbai-embed-large-v1", "BAAI/bge-large-en-v1.5"]
chunk_sizes = [500, 1000, 1500]

os.makedirs("./results", exist_ok=True)
os.makedirs("./faiss", exist_ok=True)
os.makedirs("./BM25", exist_ok=True)
os.makedirs("./chunks", exist_ok=True)

acc_scores = []
for chunk_size in chunk_sizes:
    
    n = "fused_hybe"
    result_csv = f"./results/qa_full-v3_cs-{chunk_size}_{n}.xlsx"
    if os.path.exists(result_csv):
        df = pd.read_excel(result_csv)
        correctness = df.loc[:,"verified_bool"].values
        
        judged_res = post_response_evaluation(df)
        acc_scores.append(np.mean(judged_res))
        continue
    
    ## Create or load kb
    knowledge_base = kb_initialization(model_names, chunk_size) 
    
    ## Get output
    # load qa dataset
    json_filename = "QA_dataset_v3.json"
    with open(json_filename, 'r') as file:
        data_loaded = json.load(file)

    ## Evaluation
    correctness, results = get_response_and_evaluation(data_loaded, knowledge_base)
        
    ## Save results to csv file
    df = pd.DataFrame(results, columns = ["filename", "query", "answer", "response", "verified_output", "verified_bool"])
    df.to_excel(result_csv, index=False)
    
    ## Re-calculate the accuracy
    judged_res = post_response_evaluation(df)
    acc_scores.append(np.mean(judged_res))

df_res = pd.DataFrame([acc_scores], index=model_names, columns=chunk_sizes)
print(df_res)     
        
        

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/4 [00:00<?, ?it/s]

verified_output:  {
"reason": "The predicted answer's revenue figures for both years do not match the ground truth. The ground truth states Huawei's revenue was 642,338 million CNY in 2021 and 704,174 million CNY in 2022, whereas the predicted answer states the revenue was 636,807 million CNY in 2021 and 642,338 million CNY in 2022.",
"answer": False
}
Query: What was Huawei's revenue in CNY for the years 2021 and 2022?
Ans: 642,338 million CNY in 2021 and 704,174 million CNY in 2022
Res: Huawei's revenue in 2021 was CNY636,807 million and in 2022 was CNY642,338 million.
Correct or not: 0
verified_output:  {
"reason": "The predicted answer matches the ground truth information, stating that Huawei invested more than 20% of its annual sales revenue into R&D over the past three years by the end of 2023.",
"answer": True
}
Query: How much did Huawei invest in R&D as a percentage of its annual sales revenue over the past three years by the end of 2023?
Ans: More than 20%
Res: Over the past 

 25%|██▌       | 1/4 [00:35<01:45, 35.33s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth information exactly, stating that HarmonyOS had been deployed on more than 800 million devices by the end of 2023.",
"answer": True
}
Query: By the end of 2023, how many devices had HarmonyOS been deployed on?
Ans: More than 800 million devices
Res: By the end of 2023, HarmonyOS had been deployed on more than 800 million devices.
Correct or not: 1
verified_output:  {
"reason": "The key information predicted answer (9%) is different from the ground truth answer (Nearly 11%).",
"answer": False
}
Query: What was McDonald's global comparable sales growth rate in 2023 compared to 2022?
Ans: Nearly 11%
Res: McDonald's global comparable sales growth rate in 2023 was 9% compared to 2022.
Correct or not: 0
verified_output:  {
"reason": "The predicted answer is incorrect because it states that McDonald's systemwide sales increased by nearly $30 billion since 2020 by the end of 2022, while the ground truth is that the in

 50%|█████     | 2/4 [00:53<00:50, 25.13s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer, stating that McDonald's had almost 50 million active loyalty users in its top six markets by the end of 2022.",
"answer": True
}
Query: How many active loyalty users did McDonald’s have in its top six markets by the end of 2022?
Ans: Almost 50 million
Res: McDonald's had almost 50 million active loyalty users in its top six markets by the end of 2022.
Correct or not: 1
verified_output:  {
"reason": "The predicted answer matches the ground truth in terms of the net profit figures for both 2021 and 2022, and it correctly states the increase in net profit from 2021 to 2022. The slight difference in the 2021 figure ($4.86 billion vs. S$4,858 million) is due to rounding but does not change the factual accuracy of the response.",
"answer": True
}
Query: What was Oversea-Chinese Banking Corporation Limited's net profit in 2021 and how did it change in 2022?
Ans: In 2021, the net profit was S$4,858 million, an

 75%|███████▌  | 3/4 [01:16<00:24, 24.07s/it]

verified_output:  {
"reason": "The predicted answer indicates an inability to provide specific details requested in the question, which aligns with the protocol for cases where accurate information cannot be found. Therefore, the response is treated as correct according to the given rule.",
"answer": True
}
Query: What was the geographical area featured on OCBC's 2021 annual report cover, and what special project did OCBC launch there as part of its sustainability efforts according to the 2022 report?
Ans: The 2021 report featured Guilin, located in Guangxi Province, China, and as part of its sustainability efforts, OCBC gifted two mangrove projects in 2022, including 9,000 trees at the OCBC Mangrove Park in Singapore.
Res: Sorry, I do not have an accurate answer for this.
Correct or not: 1
verified_output:  {
"reason": "The predicted answer closely matches the ground truth with specific figures provided for both years, which align with the approximate values given in the ground truth.

100%|██████████| 4/4 [01:37<00:00, 24.32s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer exactly, stating the planned investment amount by Shell plc in oil and gas between 2023 and 2025 as around $40 billion.",
"answer": True
}
Query: How much did Shell plc plan to invest in oil and gas between 2023 and 2025?
Ans: Around $40 billion
Res: Shell plc planned to invest around $40 billion in oil and gas between 2023 and 2025.
Correct or not: 1



  0%|          | 0/4 [00:00<?, ?it/s]

verified_output:  {
"reason": "The predicted answer's revenue figures for both 2021 and 2022 do not match the ground truth. The ground truth states Huawei's revenue was 642,338 million CNY in 2021 and 704,174 million CNY in 2022, whereas the predicted answer states Huawei's revenue was 636,807 million CNY in 2021 and 642,338 million CNY in 2022.",
"answer": False
}
Query: What was Huawei's revenue in CNY for the years 2021 and 2022?
Ans: 642,338 million CNY in 2021 and 704,174 million CNY in 2022
Res: Huawei's revenue for 2021 was CNY 636,807 million and for 2022 was CNY 642,338 million.
Correct or not: 0
verified_output:  {
"reason": "The predicted answer matches the ground truth information. Both state that Huawei invested more than 20% of its annual sales revenue into R&D each year over the past three years by the end of 2023.", 
"answer": True
}
Query: How much did Huawei invest in R&D as a percentage of its annual sales revenue over the past three years by the end of 2023?
Ans: Mo

 25%|██▌       | 1/4 [00:25<01:15, 25.09s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth information exactly, stating that HarmonyOS had been deployed on more than 800 million devices by the end of 2023.",
"answer": True
}
Query: By the end of 2023, how many devices had HarmonyOS been deployed on?
Ans: More than 800 million devices
Res: By the end of 2023, HarmonyOS had been deployed on more than 800 million devices.
Correct or not: 1
verified_output:  {
"reason": "The key information predicted answer (9%) is different from the ground truth answer (Nearly 11%).",
"answer": False
}
Query: What was McDonald's global comparable sales growth rate in 2023 compared to 2022?
Ans: Nearly 11%
Res: McDonald's global comparable sales grew by 9% in 2023 compared to 2022.
Correct or not: 0
verified_output:  {
"reason": "Since the predicted answer indicates an inability to provide a specific figure, and it does not contradict the ground truth information, it can be treated as correct because it acknowledges the

 50%|█████     | 2/4 [00:53<00:54, 27.05s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer, stating that McDonald's had almost 50 million active loyalty users in its top six markets by the end of 2022.",
"answer": True
}
Query: How many active loyalty users did McDonald’s have in its top six markets by the end of 2022?
Ans: Almost 50 million
Res: McDonald's had almost 50 million active loyalty users in its top six markets by the end of 2022.
Correct or not: 1
verified_output:  {
"reason": "The predicted answer matches the ground truth in terms of the net profit figures for both 2021 and 2022, with a minor rounding difference for the 2021 figure ($4.86 billion vs. S$4,858 million). The change in net profit from 2021 to 2022 is correctly described as an increase.",
"answer": True
}
Query: What was Oversea-Chinese Banking Corporation Limited's net profit in 2021 and how did it change in 2022?
Ans: In 2021, the net profit was S$4,858 million, and it rose to S$5.75 billion in 2022.
Res: Oversea-Ch

 75%|███████▌  | 3/4 [01:21<00:27, 27.29s/it]

verified_output:  {
"reason": "The predicted answer incorrectly states that the geographical area featured on OCBC's 2021 annual report cover is not specified, whereas the ground truth specifies it as Guilin, located in Guangxi Province, China. Additionally, the predicted answer does not mention the gifting of two mangrove projects in 2022, including 9,000 trees at the OCBC Mangrove Park in Singapore, which is a key detail in the ground truth.", 
"answer": False
}
Query: What was the geographical area featured on OCBC's 2021 annual report cover, and what special project did OCBC launch there as part of its sustainability efforts according to the 2022 report?
Ans: The 2021 report featured Guilin, located in Guangxi Province, China, and as part of its sustainability efforts, OCBC gifted two mangrove projects in 2022, including 9,000 trees at the OCBC Mangrove Park in Singapore.
Res: The geographical area featured on OCBC's 2021 annual report cover is not specified in the provided informa

100%|██████████| 4/4 [01:40<00:00, 25.20s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer exactly, stating the planned investment amount by Shell plc in oil and gas between 2023 and 2025.",
"answer": True
}
Query: How much did Shell plc plan to invest in oil and gas between 2023 and 2025?
Ans: Around $40 billion
Res: Shell plc planned to invest around $40 billion in oil and gas between 2023 and 2025.
Correct or not: 1



  0%|          | 0/4 [00:00<?, ?it/s]

verified_output:  {
"reason": "The predicted answer's revenue figures for both 2021 and 2022 do not match the ground truth data. The ground truth states Huawei's revenue was 642,338 million CNY in 2021 and 704,174 million CNY in 2022, whereas the predicted answer provides different figures for both years.",
"answer": False
}
Query: What was Huawei's revenue in CNY for the years 2021 and 2022?
Ans: 642,338 million CNY in 2021 and 704,174 million CNY in 2022
Res: Huawei's revenue for 2021 was CNY 636,807 million and for 2022 was CNY 642,338 million.
Correct or not: 0
verified_output:  {
"reason": "The predicted answer provides specific percentages of Huawei's R&D investment relative to its annual sales revenue for each of the past three years, all of which are more than 20%. This directly supports the ground truth claim that Huawei invested more than 20% of its annual sales revenue in R&D over the past three years by the end of 2023.", 
"answer": True
}
Query: How much did Huawei invest 

 25%|██▌       | 1/4 [00:27<01:22, 27.64s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer exactly, stating that HarmonyOS had been deployed on more than 800 million devices by the end of 2023.",
"answer": True
}
Query: By the end of 2023, how many devices had HarmonyOS been deployed on?
Ans: More than 800 million devices
Res: By the end of 2023, HarmonyOS had been deployed on more than 800 million devices.
Correct or not: 1
verified_output:  {
"reason": "The key information in the predicted answer does not match the ground truth. The ground truth states a nearly 11% growth rate, whereas the predicted answer states a 9% growth rate.",
"answer": False
}
Query: What was McDonald's global comparable sales growth rate in 2023 compared to 2022?
Ans: Nearly 11%
Res: McDonald's global comparable sales grew by 9% in 2023 compared to 2022.
Correct or not: 0
verified_output:  {
"reason": "The predicted answer is incorrect because it states that McDonald's systemwide sales increased by nearly $30 billio

 50%|█████     | 2/4 [00:58<00:58, 29.38s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth information, stating that McDonald's had almost 50 million active loyalty users in its top six markets by the end of 2022.",
"answer": True
}
Query: How many active loyalty users did McDonald’s have in its top six markets by the end of 2022?
Ans: Almost 50 million
Res: By the end of 2022, McDonald's had almost 50 million active loyalty users in its top six markets.
Correct or not: 1
verified_output:  {
"reason": "The predicted answer matches the ground truth in terms of the net profit figures for both 2021 and 2022, with a minor rounding difference for the 2021 figure (S$4,858 million vs. $4.86 billion). The change in net profit from 2021 to 2022 is correctly described as an increase.",
"answer": True
}
Query: What was Oversea-Chinese Banking Corporation Limited's net profit in 2021 and how did it change in 2022?
Ans: In 2021, the net profit was S$4,858 million, and it rose to S$5.75 billion in 2022.
Res: Over

 75%|███████▌  | 3/4 [01:30<00:30, 30.72s/it]

verified_output:  {
"reason": "The predicted answer incorrectly identifies the geographical area featured on OCBC's 2021 annual report cover as Pulau Ubin, Singapore, while the correct location is Guilin, located in Guangxi Province, China. Additionally, the special project mentioned in the prediction, the OCBC Mangrove Park project, is correctly associated with OCBC's sustainability efforts but incorrectly linked to Pulau Ubin instead of being part of the broader initiative that included gifting two mangrove projects in 2022, including 9,000 trees at the OCBC Mangrove Park in Singapore.",
"answer": False
}
Query: What was the geographical area featured on OCBC's 2021 annual report cover, and what special project did OCBC launch there as part of its sustainability efforts according to the 2022 report?
Ans: The 2021 report featured Guilin, located in Guangxi Province, China, and as part of its sustainability efforts, OCBC gifted two mangrove projects in 2022, including 9,000 trees at th

100%|██████████| 4/4 [01:54<00:00, 28.58s/it]

verified_output:  {
"reason": "The predicted answer matches the ground truth answer exactly, stating the planned investment amount by Shell plc in oil and gas between 2023 and 2025.",
"answer": True
}
Query: How much did Shell plc plan to invest in oil and gas between 2023 and 2025?
Ans: Around $40 billion
Res: Shell plc planned to invest around $40 billion in oil and gas between 2023 and 2025.
Correct or not: 1
                                        500       1000      1500
BM25                                0.666667  0.666667  0.666667
mixedbread-ai/mxbai-embed-large-v1  0.666667  0.666667  0.666667
BAAI/bge-large-en-v1.5              0.666667  0.666667  0.666667



