在使用 LlamaIndex 构建 RAG 引擎时，有以下两种方式：

- 标准的RAG过程，所有文档构建1个增强查询引擎
- 单独为每个文档构建1个查询引擎，然后让llm选择查询引擎使用

本脚本比较它们之间的差异，并评估不同方法的效果。

|方法|answer_relevancy|context_relevancy|correctness|faithfulness|
|---|---|---|---|---|
|所有文档1个引擎|0.85|0.84375|2.95|0.25|
|每个文档1个引擎|0.7|0.70625|3.025|0.25|

指标仅具有相对参考意义，原因如下：1）没有使用私域数据，文档内的知识可能 LLM 本身就已具备；2）没有定制 prompt，不同方式的倾向不同：有的方法擅长给出步骤，有的方法擅长总结。


In [1]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Address of the Ollama server used for both the chat model and the embedding model.
base_url='http://localhost:11434'
# Generous request timeout (360 s) for the chat model.
llm = Ollama(model="qwen2.5:latest", request_timeout=360.0,base_url=base_url)
# Register the chat model and the embedding model on the shared llama_index Settings object.
Settings.llm = llm
Settings.embed_model = OllamaEmbedding(model_name="quentinz/bge-large-zh-v1.5:latest",base_url=base_url)

## 生成测试数据

In [2]:
# Enable async for the notebook.
# NOTE(review): presumably required because the notebook already runs an event
# loop and later cells trigger nested asyncio calls - confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
def displayify_df(df):
    """Show *df* in the notebook with width and word-wrap CSS applied to its cells."""
    # CSS handed to the pandas Styler: fixed inline size and breaking of long words.
    cell_css = {
        "inline-size": "500px",
        "overflow-wrap": "break-word",
    }
    styled = df.style.set_properties(**cell_css)
    display(styled)

In [34]:
import os
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.prompts.prompt_type import PromptType
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.llama_dataset import RagPredictionDataset

async def Build_test_dataset(nodes,query_engine,test_size=10,data_dir='./data',prefix=''):
    """Build (or load from cache) a labelled RAG dataset and an engine's answers to it.

    Two JSON files under ``data_dir`` act as a cache:

    * ``ragdataset.json`` - generated questions with reference answers. The
      file name does not contain ``prefix``, so the same question set is
      shared by every engine evaluated against this ``data_dir``.
    * ``{prefix}-ragdataset_predictions.json`` - the answers produced by
      ``query_engine`` for those questions.

    If a cache file already exists it is loaded as-is; ``nodes`` and
    ``test_size`` are not consulted for the dataset in that case.

    Args:
        nodes: Nodes from which the question contexts are sampled.
        query_engine: Predictor handed to ``amake_predictions_with``.
        test_size: Maximum number of nodes to sample (capped at ``len(nodes)``).
        data_dir: Directory holding the cache files; created when missing.
        prefix: Tag that distinguishes the predictions file of each engine.

    Returns:
        A ``(rag_dataset, rag_predictions_dataset)`` tuple.
    """
    # Make sure the cache directory exists before anything is saved into it.
    os.makedirs(data_dir, exist_ok=True)
    ragdataset_path=os.path.join(data_dir,'ragdataset.json')
    ragdataset_predictions_path=os.path.join(data_dir,f'{prefix}-ragdataset_predictions.json')

    if os.path.exists(ragdataset_path):
        rag_dataset=LabeledRagDataset.from_json(ragdataset_path)
    else:
        # NOTE(review): this prompt asks for questions, yet its Chinese line says
        # "generate the answer in Chinese" - confirm whether it should say "question".
        DEFAULT_QUESTION_GENERATION_PROMPT = """\
        Context information is below.
        ---------------------
        {context_str}
        ---------------------
        Given the context information and not prior knowledge.
        generate only questions based on the below query.
        使用中文生成答案
        {query_str}
        """

        DEFAULT_TEXT_QA_PROMPT_TMPL=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information and not prior knowledge,answer the query.\n"
            "使用中文生成答案\n"
            "Query: {query_str}\n"
            "Answer: "
        )

        text_qa_template = PromptTemplate(
            DEFAULT_TEXT_QA_PROMPT_TMPL, prompt_type=PromptType.QUESTION_ANSWER
        )

        text_question_template=PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT)

        num_questions_per_chunk=1
        # Persona used in the question-generation query
        # (an earlier alternative was "Teacher/Professor").
        role="小说作家"
        question_gen_query=f"""
            You are a {role}. 
            Your task is to setup {num_questions_per_chunk} questions for an upcoming quiz/examination. 
            The questions should be diverse in nature across the document. 
            Restrict the questions to the context information provided. 
        """

        import random
        # A private RNG seeded with 0 yields the same sample as seeding the
        # module-level generator, without touching the process-wide random state.
        rng=random.Random(0)
        test_size=min(len(nodes),test_size)
        sample_nodes=rng.sample(nodes,test_size)

        # step 1: initialise the dataset generator
        print('step1:初始化数据生成器')
        rag_dataset_generator=RagDatasetGenerator(nodes=sample_nodes,
                                                text_question_template=text_question_template,
                                                text_qa_template=text_qa_template,
                                                question_gen_query=question_gen_query,
                                                num_questions_per_chunk=num_questions_per_chunk)

        # step 2: generate questions (with reference answers) for every sampled node
        print('step2:为每个node生成问题（包含标准答案）')
        # We are already inside a coroutine, so await the async variant rather
        # than calling the blocking wrapper from within a running event loop.
        rag_dataset = await rag_dataset_generator.agenerate_dataset_from_nodes()
        rag_dataset.save_json(ragdataset_path)

    if os.path.exists(ragdataset_predictions_path):
        rag_predictions_dataset=RagPredictionDataset.from_json(ragdataset_predictions_path)
    else:
        # step 3: let the query engine answer the generated questions
        print('step3:使用query_engine回答问题')
        rag_predictions_dataset=await rag_dataset.amake_predictions_with(
            predictor=query_engine,
            batch_size=10,
            sleep_time_in_seconds=2)
        rag_predictions_dataset.save_json(ragdataset_predictions_path)

    return rag_dataset,rag_predictions_dataset

## 定义评估函数

In [5]:
from typing import List

from llama_index.core.evaluation import BatchEvalRunner
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
)

# Shared batch runner: every evaluation call scores the same four metrics.
# The dictionary keys are the metric names printed by eval_query_engine.
runner=BatchEvalRunner(
    evaluators={
        "answer_relevancy":AnswerRelevancyEvaluator(),
        "context_relevancy":ContextRelevancyEvaluator(),
        "correctness":CorrectnessEvaluator(),
        "faithfulness":FaithfulnessEvaluator()        
    },
    workers=12,
    show_progress=True
)

async def eval_query_engine(queries:List[str],contexts_list:List[List[str]],response_strs:List[str]):
    """Score pre-computed answers with the module-level ``runner`` and print the mean per metric.

    Args:
        queries: The questions that were asked.
        contexts_list: For each question, the context strings to judge against.
        response_strs: For each question, the answer text to evaluate.

    The three lists are passed straight to ``runner.aevaluate_response_strs``.
    For every metric the mean score is printed as ``"<metric> Score: <mean>"``.
    A result whose ``score`` is missing, ``None`` or ``0`` adds nothing to the
    sum but still counts in the denominator.
    """
    eval_results=await runner.aevaluate_response_strs(
        queries=queries,
        contexts_list=contexts_list,
        response_strs=response_strs
    )

    for key, results in eval_results.items():
        if not results:
            # Nothing was evaluated for this metric; avoid dividing by zero.
            print(f"{key} Score: n/a (no results)")
            continue
        total = sum(getattr(result,'score',0) or 0 for result in results)
        print(f"{key} Score: {total / len(results)}")

## 所有文档构建1个查询引擎

In [6]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

# Load every file below the corpus directory (recursively) as one document set.
documents=SimpleDirectoryReader(input_dir='../../data/sidaminzhu',recursive=True).load_data(show_progress=True)
# Split the documents into nodes using a chunk size of 1024.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents,show_progress=True)

Loading files:   0%|          | 0/140 [00:00<?, ?file/s]

Loading files: 100%|██████████| 140/140 [00:00<00:00, 521.89file/s]
  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 140/140 [00:01<00:00, 124.27it/s]


In [7]:
# Build a single vector index over all nodes and expose it as a query engine.
from llama_index.core import VectorStoreIndex
index=VectorStoreIndex(nodes=nodes,show_progress=True)
query_engine=index.as_query_engine()

Generating embeddings: 100%|██████████| 962/962 [01:08<00:00, 14.14it/s]


In [10]:
# Generate (or load) the shared question set and the single-engine predictions;
# the predictions are cached under the 'OneEngine' prefix.
rag_dataset,rag_predictions_dataset=await Build_test_dataset(
    nodes,query_engine,test_size=20,prefix='OneEngine')

# 查看测试数据
# import pandas as pd
# show_num=1
# contexts_query_answer={
#     "上下文":[example.reference_contexts for example in rag_dataset.examples[:show_num]],
#     "生成的提问":[example.query for example in rag_dataset.examples[:show_num]],
#     "回答":[example.response for example in rag_predictions_dataset.predictions[:show_num]],
# }
# df=pd.DataFrame(contexts_query_answer)
# displayify_df(df)  

step1:初始化数据生成器
step2:为每个node生成问题（包含标准答案）
step3:使用query_engine回答问题


In [11]:
queries=[example.query for example in rag_dataset.examples]
contexts_list=[example.reference_contexts for example in rag_dataset.examples]
response_strs=[example.response for example in rag_predictions_dataset.predictions]

# 已有回答，直接传入回答评估
await eval_query_engine(queries,contexts_list,response_strs)

100%|██████████| 80/80 [05:51<00:00,  4.39s/it]

answer_relevancy Score: 0.85
context_relevancy Score: 0.84375
correctness Score: 2.95
faithfulness Score: 0.25





## 每个文档构建1个查询引擎

In [24]:
import glob 
from llama_index.core.tools import FunctionTool

# Each entry directly below the corpus directory is treated as one document collection.
# The lists built below stay aligned with dir_list by position.
dir_list=glob.glob('../../data/sidaminzhu/*')

# Load each collection separately so it can get its own index.
documents=[
    SimpleDirectoryReader(input_dir=doc_dir).load_data(show_progress=True)
    for doc_dir in dir_list
]

# Split every collection into nodes using a chunk size of 1024.
splitter = SentenceSplitter(chunk_size=1024)
documents_nodes=[
    splitter.get_nodes_from_documents(doc_group)
    for doc_group in documents
]

# One vector index per collection.
documents_indexs=[
    VectorStoreIndex(nodes=doc_nodes,show_progress=True)
    for doc_nodes in documents_nodes
]

Loading files: 100%|██████████| 41/41 [00:00<00:00, 2022.33file/s]
Loading files: 100%|██████████| 34/34 [00:00<00:00, 2748.14file/s]
Loading files: 100%|██████████| 26/26 [00:00<00:00, 905.32file/s]
Loading files: 100%|██████████| 39/39 [00:00<00:00, 2479.24file/s]
Generating embeddings: 100%|██████████| 127/127 [00:09<00:00, 13.40it/s]
Generating embeddings: 100%|██████████| 311/311 [00:21<00:00, 14.63it/s]
Generating embeddings: 100%|██████████| 340/340 [00:23<00:00, 14.74it/s]
Generating embeddings: 100%|██████████| 184/184 [00:12<00:00, 15.11it/s]


In [23]:
def get_doc_tools(
    vector_index,name: str,
) -> "FunctionTool":
    """Wrap one document's vector index as a ``FunctionTool``.

    Args:
        vector_index: Index whose ``as_query_engine`` provides the query engine.
        name: Document name; used in the tool name (``vector_tool_{name}``)
            and in the tool description.

    Returns:
        The ``FunctionTool`` built around the index's query engine.
    """
    # Build the query engine once per tool instead of on every tool call.
    query_engine = vector_index.as_query_engine(similarity_top_k=2)

    def vector_query(query: str):
        """Answer a question from this document's index.

        Args:
            query: The question text.
        """
        # Return the query engine's response object itself, not its string form.
        return query_engine.query(query)

    vector_query_tool = FunctionTool.from_defaults(
        name=f"vector_tool_{name}", fn=vector_query,description=f"关于{name}问题的回答助手"
    )

    return vector_query_tool

# Document names are the directory names with the "白话文" marker stripped.
dir_info=[os.path.basename(path).replace('白话文','') for path in dir_list]
# Pair every per-document index with its name; both lists follow dir_list order.
documents_tools=[
    get_doc_tools(vector_index,doc_name)
    for vector_index,doc_name in zip(documents_indexs,dir_info)
]

In [25]:
from llama_index.core.objects import ObjectIndex

# Index the per-document tools so that a tool can be retrieved for each query.
tool_index=ObjectIndex.from_objects(
    documents_tools,
    index_cls=VectorStoreIndex
)

# similarity_top_k=1: presumably routes each query to a single per-document tool - confirm.
tool_retriever=tool_index.as_retriever(similarity_top_k=1)

In [30]:
from llama_index.core.agent import FunctionCallingAgent

# Agent that obtains its tools through tool_retriever rather than a fixed tool list.
# No llm is passed here.
# NOTE(review): presumably falls back to Settings.llm - confirm.
agent = FunctionCallingAgent.from_tools(
    tool_retriever=tool_retriever,
    system_prompt="""You are an agent designed to answer queries over a set of given documents.
    Please use the tools provided to answer a question as possible. Do not rely on prior knowledge\
    """,
    verbose=False,
)

In [37]:
# Reuse the cached question set (its file name has no prefix) and generate only
# the agent's predictions, cached under the 'MulEngine' prefix.
rag_dataset,rag_predictions_dataset=await Build_test_dataset(
    nodes,agent,test_size=20,prefix='MulEngine')

step3:使用query_engine回答问题


In [38]:
queries=[example.query for example in rag_dataset.examples]
contexts_list=[example.reference_contexts for example in rag_dataset.examples]
response_strs=[example.response for example in rag_predictions_dataset.predictions]

# 已有回答，直接传入回答评估
await eval_query_engine(queries,contexts_list,response_strs)

100%|██████████| 80/80 [05:23<00:00,  4.05s/it]

answer_relevancy Score: 0.7
context_relevancy Score: 0.70625
correctness Score: 3.025
faithfulness Score: 0.25



