## Setting

In [1]:
import os
import re
import pandas as pd

import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext,load_index_from_storage

from llama_index.core.evaluation import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset
)

In [2]:
# 设置llm模型
llm_name="qwen2.5:latest"
embedding_name="quentinz/bge-large-zh-v1.5:latest"
base_url='http://localhost:11434'

Settings.llm = Ollama(
    model=llm_name, 
    request_timeout=360.0,
    base_url=base_url)

# 设置embedding model 
Settings.embed_model = OllamaEmbedding(
    model_name=embedding_name,
    base_url=base_url)

## Documents->nodes

In [3]:
import glob
from copy import deepcopy
from llama_index.core.schema import Document,TextNode

def split_markdown_by_headers(markdown_text):
    # 正则表达式匹配Markdown的一级和二级标题
    headers = re.compile(r'^(#+) (.*)$', re.MULTILINE)
    
    # 用于存储结果的列表
    headers_content = {}
    current_block = []
    current_title=''
    no_content_h1=''
    
    # 按行分割文档
    lines = markdown_text.split('\n')
    
    for line in lines:
        # 检查当前行是否是标题
        match = headers.match(line)
        if match:
            # 如果是标题，保存当前块（如果有的话）
            if current_block:
                if len(current_block)<=2:
                    no_content_h1=current_block[0]
                else:
                    headers_content[current_title]=no_content_h1+'\n'+''.join(current_block)
                    current_block=[]
            current_title=match.string
            
        # 如果不是标题，添加到当前块
        current_block.append(line + '\n')
    
    # 添加最后一个块
    headers_content[current_title]=''.join(current_block)
    
    return headers_content

def get_block_images(block):
    images_path = re.findall(r'!\[.*?\]\((.*?)\)', block)
    return images_path

def get_page_nodes(headers_content,separator=""):
    nodes=[]
    
    for header in headers_content:
        # 获取block的图片
        block=headers_content[header]
        images_path=get_block_images(block)

        # 去掉block的图片文本
        pattern = r"!\[.*\)"
        block= re.sub(pattern, "\n", block)

        # 添加metadata：标题、内容等级、图片路径
        metadata={
            'title': block.split('\n')[0].replace('#',''),
            'content_level': header.count('#') ,
            'images_path': images_path
        }
        
 
        node=TextNode(
            text=block,
            metadata=deepcopy(metadata),
        )
        nodes.append(node)
    return nodes

def get_nodes_by_documents():
    documents=[]
    md_files=glob.glob('./preprocess/*.md')
    for md_file in md_files:
        with open(md_file,encoding='utf-8') as fr:
            md_content='\n'.join(fr.readlines())
        documents.append(md_content)
    
    nodes=[]
    for document in documents:
        headers_content=split_markdown_by_headers(document)
        document_nodes=get_page_nodes(headers_content)
        nodes.extend(document_nodes)
    
    return nodes


nodes=get_nodes_by_documents()
print(len(nodes),nodes[30])

563 Node ID: edf882a8-9b37-4524-bcbd-a1a39b04b6e2
Text: ## 打开/关闭全景天窗   手动滑动打开（轻按按钮至第1个停止位置）。  自动滑动打开（按到底）。
手动滑动关闭（轻按按钮至第1个停止位置）。  自动滑动关闭（按到底）。  如果全景天窗和遮阳帘处于完全关闭状态，轻按控制按钮，先打开遮
阳帘，只有再次按下控制按钮后，才能打开全景天窗。  如果全景天窗和遮阳帘处于完全关闭状态，短时间内将控制按钮按到
底两次，遮阳帘和全景天窗同时打开。


In [4]:
import os
import re

import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext,load_index_from_storage

from llama_index.core.evaluation import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset
)

In [5]:
# 设置llm模型
llm_name="qwen2.5:latest"
embedding_name="quentinz/bge-large-zh-v1.5:latest"
base_url='http://localhost:11434'

Settings.llm = Ollama(
    model=llm_name, 
    request_timeout=360.0,
    base_url=base_url)

# 设置embedding model 
Settings.embed_model = OllamaEmbedding(
    model_name=embedding_name,
    base_url=base_url)

## nodes->indexs

In [6]:
# 查看所有的Index
from llama_index.core import indices

indexs=list(filter(lambda att:att.find('Index')>0,dir(indices)))
print(indexs)

['DocumentSummaryIndex', 'EmptyIndex', 'GPTDocumentSummaryIndex', 'GPTEmptyIndex', 'GPTKeywordTableIndex', 'GPTListIndex', 'GPTPandasIndex', 'GPTRAKEKeywordTableIndex', 'GPTSQLStructStoreIndex', 'GPTSimpleKeywordTableIndex', 'GPTTreeIndex', 'GPTVectorStoreIndex', 'KeywordTableIndex', 'KnowledgeGraphIndex', 'ListIndex', 'MultiModalVectorStoreIndex', 'PandasIndex', 'PropertyGraphIndex', 'RAKEKeywordTableIndex', 'SQLStructStoreIndex', 'SimpleKeywordTableIndex', 'SummaryIndex', 'TreeIndex', 'VectorStoreIndex']


In [7]:
def build_index(IndexType,nodes,persist_dir):
    if os.path.exists(persist_dir):
        storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
        index=load_index_from_storage(storage_context=storage_context)
    else:
        index=IndexType(nodes=nodes,show_progress=True)
        index.storage_context.persist(persist_dir=persist_dir)
    
    return index


## indexs-> retrievers

In [8]:
import logging
import os
from llama_index.core.storage.docstore.types import BaseDocumentStore
from typing import Any, Callable, Dict, List, Optional, cast
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.constants import DEFAULT_SIMILARITY_TOP_K
from llama_index.core.schema import BaseNode, IndexNode, NodeWithScore, QueryBundle
from llama_index.core.vector_stores.utils import (
    node_to_metadata_dict,
    metadata_dict_to_node,
)

import itertools
import jieba
from llama_index.retrievers.bm25 import BM25Retriever


class ChineseBM25Retriever(BM25Retriever):
    """A BM25 retriever that uses the BM25 algorithm to retrieve nodes.

    Args:
        nodes (List[BaseNode], optional):
            The nodes to index. If not provided, an existing BM25 object must be passed.
        similarity_top_k (int, optional):
            The number of results to return. Defaults to DEFAULT_SIMILARITY_TOP_K.
        callback_manager (CallbackManager, optional):
            The callback manager to use. Defaults to None.
        objects (List[IndexNode], optional):
            The objects to retrieve. Defaults to None.
        object_map (dict, optional):
            A map of object IDs to nodes. Defaults to None.
        verbose (bool, optional):
            Whether to show progress. Defaults to False.
    """

    def _chinese_tokenizer(self, texts: List[str]) -> tuple[str]:
        # Use jieba to segment Chinese text
        rslts = tuple(itertools.chain.from_iterable(jieba.cut(text) for text in texts))
        return rslts

    def __init__(
            self,
            nodes: Optional[List[BaseNode]] = None,
            similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
            callback_manager: Optional[CallbackManager] = None,
            objects: Optional[List[IndexNode]] = None,
            object_map: Optional[dict] = None,
            verbose: bool = False,
    ) -> None:

        super().__init__(
            nodes=nodes,
            similarity_top_k=similarity_top_k,
            callback_manager=callback_manager,
            objects=objects,
            object_map=object_map,
            verbose=verbose,
        )
        
        # change the stop words for Chinese
        with open(r'./stopwords-zh.txt', encoding='utf-8') as f: # here needs to add in the path of chinese stopwords
            con = f.readlines()
            stop_words = set()
            for i in con:
                i = i.rstrip('\n')
                stop_words.add(i)
        self.stop_words = stop_words

        corpus_tokens = [
            [word for word in jieba.cut_for_search(node.get_content()) if word not in stop_words and word.strip('\n')]
            for node in nodes
        ]
        corpus = [node_to_metadata_dict(node) for node in nodes]
        self.bm25.corpus = corpus
        self.bm25.index(corpus_tokens, show_progress=True)
    
    @classmethod
    def from_defaults(
        cls,
        index: Optional[VectorStoreIndex] = None,
        nodes: Optional[List[BaseNode]] = None,
        docstore: Optional[BaseDocumentStore] = None,
        similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
        verbose: bool = False,
    ) -> "ChineseBM25Retriever":
          # ensure only one of index, nodes, or docstore is passed
        if sum(bool(val) for val in [index, nodes, docstore]) != 1:
            raise ValueError("Please pass exactly one of index, nodes, or docstore.")

        if index is not None:
            docstore = index.docstore

        if docstore is not None:
            nodes = cast(List[BaseNode], list(docstore.docs.values()))
        
        assert (
            nodes is not None
        ), "Please pass exactly one of index, nodes, or docstore."
      
        return cls(
            nodes=nodes,
            similarity_top_k=similarity_top_k,
            verbose=verbose,
        )
      
    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        query = query_bundle.query_str

        tokenized_query = [[word for word in jieba.cut_for_search(query) if word not in self.stop_words]]

        indexes, scores = self.bm25.retrieve(
            tokenized_query, k=self.similarity_top_k, show_progress=self._verbose
        )

        # batched, but only one query
        indexes = indexes[0]
        scores = scores[0]

        nodes: List[NodeWithScore] = []
        for idx, score in zip(indexes, scores):
            # idx can be an int or a dict of the node
            if isinstance(idx, dict):
                node = metadata_dict_to_node(idx)
            else:
                node_dict = self.corpus[int(idx)]
                node = metadata_dict_to_node(node_dict)
            nodes.append(NodeWithScore(node=node, score=float(score)))

        return nodes

resource module not available on Windows


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.retrievers.fusion_retriever import FUSION_MODES


index_type=VectorStoreIndex
index_name=index_type.__name__
index=build_index(index_type,nodes,os.path.join('./Storage',index_name))

retriever = QueryFusionRetriever(
    [
        index.as_retriever(similarity_top_k=5),
        ChineseBM25Retriever.from_defaults(
            index=index,
            similarity_top_k=3
),
    ],
    num_queries=1,
    use_async=True,
    similarity_top_k=3,
    mode=FUSION_MODES.RECIPROCAL_RANK
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WUSHAO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.449 seconds.
Prefix dict has been built successfully.
                                                                        

In [10]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

response_synthesizer=get_response_synthesizer()

query_engine=RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

## eval

### 生成测试数据

In [11]:
def displayify_df(df):
    """For pretty displaying DataFrame in a notebook."""
    display_df = df.style.set_properties(
        **{
            "inline-size": "500px",
            "overflow-wrap": "break-word",
        }
    )
    display(display_df)

In [12]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.prompts.prompt_type import PromptType
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.llama_dataset import RagPredictionDataset

async def Build_test_dataset(test_size=10,data_dir='./TestData/'):
    ragdataset_path=os.path.join(data_dir,'ragdataset.json')
    ragdataset_predictions_path=os.path.join(data_dir,'ragdataset_predictions.jaon')

    if os.path.exists(ragdataset_path) and os.path.exists(ragdataset_predictions_path):
        rag_dataset=LabeledRagDataset.from_json(ragdataset_path)
        rag_predictions_dataset=RagPredictionDataset.from_json(ragdataset_predictions_path)
    else:
        DEFAULT_QUESTION_GENERATION_PROMPT = """\
        Context information is below.
        ---------------------
        {context_str}
        ---------------------
        Given the context information and not prior knowledge.
        generate only questions based on the below query.
        使用中文生成答案
        {query_str}
        """

        DEFAULT_TEXT_QA_PROMPT_TMPL=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information and not prior knowledge,answer the query.\n"
            "使用中文生成答案\n"
            "Query: {query_str}\n"
            "Answer: "
        )

        text_qa_template = PromptTemplate(
            DEFAULT_TEXT_QA_PROMPT_TMPL, prompt_type=PromptType.QUESTION_ANSWER
        )

        text_question_template=PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT)

        num_questions_per_chunk=1
        # role="Teacher/Professor"
        role="资深汽车售后工程师"
        question_gen_query=f"""
            You are a {role}. 
            Your task is to setup {num_questions_per_chunk} questions for an upcoming quiz/examination. 
            The questions should be diverse in nature across the document. 
            Restrict the questions to the context information provided. 
        """

        import random
        random.seed(0)
        test_size=min(len(nodes),test_size)
        sample_nodes=random.sample(nodes,test_size)

        # step1:初始化数据生成器
        print('step1:初始化数据生成器')
        rag_dataset_generator=RagDatasetGenerator(nodes=sample_nodes,
                                                text_question_template=text_question_template,
                                                text_qa_template=text_qa_template,
                                                question_gen_query=question_gen_query,
                                                num_questions_per_chunk=num_questions_per_chunk,
                                                show_progress=True)

        # step2:为每个node生成问题（包含标准答案）
        print('step2:为每个node生成问题（包含标准答案）')
        rag_dataset = rag_dataset_generator.generate_dataset_from_nodes()
        rag_dataset.save_json(ragdataset_path)

        # step3:使用query_engine回答问题
        print('step3:使用query_engine回答问题')
        rag_predictions_dataset=await rag_dataset.amake_predictions_with(
            predictor=query_engine,
            batch_size=2,
            show_progress=True,
            sleep_time_in_seconds=2)
        rag_predictions_dataset.save_json(ragdataset_predictions_path)
    
    return rag_dataset,rag_predictions_dataset

In [13]:
rag_dataset,rag_predictions_dataset=await Build_test_dataset(test_size=10)

In [14]:
# 查看测试数据
show_num=2
contexts_query_answer={
    "上下文":[example.reference_contexts for example in rag_dataset.examples[:show_num]],
    "生成的提问":[example.query for example in rag_dataset.examples[:show_num]],
    "回答":[example.response for example in rag_predictions_dataset.predictions[:show_num]],
}

df=pd.DataFrame(contexts_query_answer)
displayify_df(df)    

Unnamed: 0,上下文,生成的提问,回答
0,['# 检查制动液\n\n1 检查制动液液位。\n\n\n请不定期地检查制动液液位，确保液位在MIN（最低）和MAX（最\n\n高）标记之间。\n\n请参见电子版用户手册保养项目规定的间隔时间定期更换制动液。\n\nLynk & Co领克建议您请前往Lynk & Co领克中心更换制动液。\n\n警告！\n\n■如果制动液液位低于储液罐的最低液位，请勿驾驶车辆。如果制\n\n动液液位下降明显，制动系统可能出现故障，联系Lynk & Co领克\n\n中心进行检查。\n\n■制动液有毒。请保持制动液容器密封，避免儿童接触。如误服制\n\n动液，请立即就医。\n\n■如制动液与皮肤接触或进入眼睛，请立即用大量清水冲洗。\n\n注意！\n\n■建议使用Lynk & Co领克原厂纯正“DOT4”制动液。\n\n■请务必使用处于保质期内且未使用过的制动液。\n\n'],根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。,如果制动液液位低于储液罐的最低液位，应立即停止驾驶，并联系Lynk & Co领克中心进行检查。不要继续使用车辆，因为制动液液位下降明显可能意味着制动系统存在故障。
1,['\n## 手动SOS呼叫\n\n按住\n\n按键约2 秒，手动激活SOS功能。\n\nSOS功能激活后，乘员可直接向Lynk & Co领克客户联络中心寻求紧急\n\n援助。如果Lynk & Co领克客户联络中心未得到任何回应，车辆位置将\n\n发送到Lynk & Co领克客户联络中心。Lynk & Co领克客户联络中心将\n\n提供适当的援助（呼叫救护车、警察等）。\n\n'],在激活SOS功能时，需要按住哪个按键多长时间才能手动激活该功能？当Lynk & Co领克客户联络中心未得到任何回应时，车辆会发送什么信息？（2分）,在激活SOS功能时，需要按住特定按键约2秒才能手动激活该功能。当Lynk & Co领克客户联络中心未得到任何回应时，车辆位置信息将被发送到Lynk & Co领克客户联络中心。


### 评估

In [30]:
from llama_index.core import evaluation

evaluations=list(filter(lambda att:att.find('Evaluator')>0,dir(evaluation)))
print(evaluations)

['AnswerRelevancyEvaluator', 'BaseEvaluator', 'BaseRetrievalEvaluator', 'ContextRelevancyEvaluator', 'CorrectnessEvaluator', 'FaithfulnessEvaluator', 'GuidelineEvaluator', 'MultiModalRetrieverEvaluator', 'PairwiseComparisonEvaluator', 'QueryResponseEvaluator', 'RelevancyEvaluator', 'ResponseEvaluator', 'RetrieverEvaluator', 'SemanticSimilarityEvaluator']


In [20]:
# instantiate the gpt-4 judges
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
)

judges = {}
judges["answer_relevancy"] = AnswerRelevancyEvaluator()
judges["context_relevancy"] = ContextRelevancyEvaluator()
judges["correctness"] = CorrectnessEvaluator()
judges["faithfulness"] = FaithfulnessEvaluator()

In [33]:

# 测试集
examples_with_prediction=list(zip(rag_dataset.examples, rag_predictions_dataset.predictions))

eval_tasks = []
for example, prediction in examples_with_prediction:
    eval_tasks.append(
        judges["answer_relevancy"].aevaluate(
            query=example.query,
            response=prediction.response,
            sleep_time_in_seconds=1.0,
        )
    )
    eval_tasks.append(
        judges["context_relevancy"].aevaluate(
            query=example.query,
            contexts=prediction.contexts,
            sleep_time_in_seconds=1.0,
        )
    )

    eval_tasks.append(
        judges["correctness"].aevaluate(
            query=example.query,
            contexts=prediction.contexts,
            response=prediction.response,
            sleep_time_in_seconds=1.0,
        )
    )

    eval_tasks.append(
        judges["faithfulness"].aevaluate(
            query=example.query,
            contexts=prediction.contexts,
            response=prediction.response,
            sleep_time_in_seconds=1.0,
        )
    )


In [34]:
from tqdm.asyncio import tqdm_asyncio
eval_results = await tqdm_asyncio.gather(*eval_tasks)

100%|██████████| 40/40 [01:07<00:00,  1.68s/it]


In [35]:
# 拆分指标
evals = {
    "answer_relevancy": eval_results[::4],
    "context_relevancy": eval_results[1::4],
    "correctness": eval_results[2::4],
    "faithfulness": eval_results[3::4],
}

In [37]:
from llama_index.core.evaluation.notebook_utils import get_eval_results_df
import pandas as pd

deep_dfs = {}
mean_dfs = {}
for metric in evals.keys():
    deep_df, mean_df = get_eval_results_df(
        names=["baseline"] * len(evals[metric]),
        results_arr=evals[metric],
        metric=metric,
    )
    deep_dfs[metric] = deep_df
    mean_dfs[metric] = mean_df

In [38]:
mean_scores_df = pd.concat(
    [mdf.reset_index() for _, mdf in mean_dfs.items()],
    axis=0,
    ignore_index=True,
)
mean_scores_df = mean_scores_df.set_index("index")
mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])
mean_scores_df

rag,baseline
metrics,Unnamed: 1_level_1
mean_answer_relevancy_score,0.95
mean_context_relevancy_score,0.859375
mean_correctness_score,3.6
mean_faithfulness_score,0.7


In [48]:
deep_dfs["answer_relevancy"][:3]

Unnamed: 0,rag,query,answer,contexts,scores,feedbacks
0,baseline,根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。,如果制动液液位低于储液罐的最低液位，应立即停止驾驶，并联系Lynk & Co领克中心进行检查...,,1.0,1. **Does the provided response match the subj...
1,baseline,在激活SOS功能时，需要按住哪个按键多长时间才能手动激活该功能？当Lynk & Co领克客户...,在激活SOS功能时，需要按住特定按键约2秒才能手动激活该功能。当Lynk & Co领克客户联...,,1.0,1. **Does the provided response match the subj...
2,baseline,如何通过Lynk & Co App操作前排座椅加热功能？请描述具体步骤。,要通过Lynk & Co App操作前排座椅加热功能，可以按照以下步骤进行：\n\n1. 登...,,1.0,1. 该响应匹配用户查询的主题。用户询问如何通过Lynk & Co App操作前排座椅加热功...


In [49]:
deep_dfs["context_relevancy"][:3]

Unnamed: 0,rag,query,answer,contexts,scores,feedbacks
0,baseline,根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。,,[# 检查制动液\n\n1 检查制动液液位。\n\n\n请不定期地检查制动液液位，确保液位在...,,1. **Does the retrieved context match the subj...
1,baseline,在激活SOS功能时，需要按住哪个按键多长时间才能手动激活该功能？当Lynk & Co领克客户...,,[\n## 手动SOS呼叫\n\n按住\n\n按键约2 秒，手动激活SOS功能。\n\nSO...,1.0,### 评价步骤：\n\n#### 第一步：主题匹配度评估\n- **查询内容**：涉及两个...
2,baseline,如何通过Lynk & Co App操作前排座椅加热功能？请描述具体步骤。,,[# 前排座椅加热\n\n## 使用Lynk & Co App打开/关闭前排座椅加热\n\n...,,1. **Does the retrieved context match the subj...


In [50]:
deep_dfs["correctness"][:3]

Unnamed: 0,rag,query,answer,contexts,scores,feedbacks
0,baseline,根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。,如果制动液液位低于储液罐的最低液位，应立即停止驾驶，并联系Lynk & Co领克中心进行检查...,,4.0,该回答直接且明确地指出了当制动液液位低于储液罐最低液位时应采取的措施，并强调了立即停车和联系...
1,baseline,在激活SOS功能时，需要按住哪个按键多长时间才能手动激活该功能？当Lynk & Co领克客户...,在激活SOS功能时，需要按住特定按键约2秒才能手动激活该功能。当Lynk & Co领克客户联...,,4.0,生成的答案针对了两个问题中的一个，并且提供了准确的信息。对于激活SOS功能所需按键的按压时间...
2,baseline,如何通过Lynk & Co App操作前排座椅加热功能？请描述具体步骤。,要通过Lynk & Co App操作前排座椅加热功能，可以按照以下步骤进行：\n\n1. 登...,,3.0,该回答基本符合问题的需求，并且提供了一般性的操作步骤。但是，步骤过于简略，缺乏具体细节，可能...


In [51]:
deep_dfs["faithfulness"][:3]

Unnamed: 0,rag,query,answer,contexts,scores,feedbacks
0,baseline,根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。,如果制动液液位低于储液罐的最低液位，应立即停止驾驶，并联系Lynk & Co领克中心进行检查...,[# 检查制动液\n\n1 检查制动液液位。\n\n\n请不定期地检查制动液液位，确保液位在...,0.0,NO
1,baseline,在激活SOS功能时，需要按住哪个按键多长时间才能手动激活该功能？当Lynk & Co领克客户...,在激活SOS功能时，需要按住特定按键约2秒才能手动激活该功能。当Lynk & Co领克客户联...,[\n## 手动SOS呼叫\n\n按住\n\n按键约2 秒，手动激活SOS功能。\n\nSO...,1.0,YES
2,baseline,如何通过Lynk & Co App操作前排座椅加热功能？请描述具体步骤。,要通过Lynk & Co App操作前排座椅加热功能，可以按照以下步骤进行：\n\n1. 登...,[# 前排座椅加热\n\n## 使用Lynk & Co App打开/关闭前排座椅加热\n\n...,1.0,YES


### 使用BatchEvalRunner进行批量评估

In [22]:
from llama_index.core.evaluation import BatchEvalRunner

runner=BatchEvalRunner(
    evaluators={
        "answer_relevancy":AnswerRelevancyEvaluator(),
        "context_relevancy":ContextRelevancyEvaluator(),
        "correctness":CorrectnessEvaluator(),
        "faithfulness":FaithfulnessEvaluator()        
    },
    workers=8,
    show_progress=True
)

# 没有回答时，传入query_engine回答后评估
# eval_results=await runner.aevaluate_queries(
#     query_engine=query_engine,
#     queries=[example.query for example in rag_dataset.examples],
# )

# 已有回答，直接传入回答评估
eval_results=await runner.aevaluate_response_strs(
    queries=[example.query for example in rag_dataset.examples],
    contexts_list=[example.reference_contexts for example in rag_dataset.examples],
    response_strs=[example.response for example in rag_predictions_dataset.predictions]
)

100%|██████████| 40/40 [01:53<00:00,  2.83s/it]


In [41]:
eval_results['answer_relevancy'][0].dict()

C:\Users\wushaogui\AppData\Local\Temp\ipykernel_59152\2287145998.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  eval_results['answer_relevancy'][0].dict()


{'query': '根据文档内容，如果制动液液位低于储液罐的最低液位，应该如何处理？请描述具体的应对措施。',
 'contexts': None,
 'response': '如果制动液液位低于储液罐的最低液位，应立即停止驾驶，并联系Lynk & Co领克中心进行检查。不要继续使用车辆，因为制动液液位下降明显可能意味着制动系统存在故障。',
 'passing': None,
 'feedback': "1. **Does the provided response match the subject matter of the user's query?**\n   - The response addresses the issue related to brake fluid level being below the minimum mark in a reservoir, which is directly relevant to the user's query.\n   \n2. **Does the provided response attempt to address the focus or perspective on the subject matter taken on by the user's query?**\n   - The response focuses on the appropriate immediate action (stopping driving and contacting professional service) and explains the potential consequences of ignoring this issue, which aligns with addressing the concern about brake fluid levels.\n\nFeedback: The response is relevant to the query as it directly addresses the specific issue of low brake fluid level. It provides clear guidance on what actions shou

In [39]:
def get_eval_results(key, eval_results):
    results = eval_results[key]
    scores = 0
    for result in results:
        score = getattr(result,'score',0)
        if score:
            scores += score
    score = scores / len(results)
    print(f"{key} Score: {score}")
    return score

In [40]:
get_eval_results('answer_relevancy',eval_results),
get_eval_results('context_relevancy',eval_results),
get_eval_results('correctness',eval_results),
get_eval_results('faithfulness',eval_results),

answer_relevancy Score: 0.95
context_relevancy Score: 0.7525000000000001
correctness Score: 3.5
faithfulness Score: 0.7


(0.7,)