## Setting

In [1]:
import re

import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext,load_index_from_storage

In [2]:
# 设置llm模型
llm_name="qwen2.5:latest"
embedding_name="quentinz/bge-large-zh-v1.5:latest"
base_url='http://localhost:11434'

Settings.llm = Ollama(
    model=llm_name, 
    request_timeout=360.0,
    base_url=base_url,
    num_return_sequences=3)

# 设置embedding model 
Settings.embed_model = OllamaEmbedding(
    model_name=embedding_name,
    base_url=base_url)

## Documents->nodes

In [3]:
import os
import glob
from copy import deepcopy
from llama_index.core.schema import TextNode

def split_markdown_by_headers(markdown_text):
    # 正则表达式匹配Markdown的一级和二级标题
    headers = re.compile(r'^(#+) (.*)$', re.MULTILINE)
    
    # 用于存储结果的列表
    headers_content = {}
    current_block = []
    current_title=''
    no_content_h1=''
    
    # 按行分割文档
    lines = markdown_text.split('\n')
    
    for line in lines:
        # 检查当前行是否是标题
        match = headers.match(line)
        if match:
            # 如果是标题，保存当前块（如果有的话）
            if current_block:
                if len(current_block)<=2:
                    no_content_h1=current_block[0]
                else:
                    headers_content[current_title]=no_content_h1+'\n'+''.join(current_block)
                    current_block=[]
            current_title=match.string
            
        # 如果不是标题，添加到当前块
        current_block.append(line + '\n')
    
    # 添加最后一个块
    headers_content[current_title]=''.join(current_block)
    
    return headers_content

def get_block_images(block):
    images_path = re.findall(r'!\[.*?\]\((.*?)\)', block)
    return images_path

def get_page_nodes(md_files,headers_content):
    nodes=[]
    
    for header in headers_content:
        # 获取block的图片
        block=headers_content[header]
        images_path=get_block_images(block)

        # 去掉block的图片文本
        pattern = r"!\[.*\)"
        block= re.sub(pattern, "\n", block)

        # 添加metadata：标题、内容等级、图片路径
        metadata={
            'title': block.split('\n')[0].replace('#',''),
            'content_level': header.count('#') ,
            'images_path': images_path,
            'file_path': md_files
        }
        
 
        node=TextNode(
            text=block,
            metadata=deepcopy(metadata),
        )
        nodes.append(node)
    return nodes

def get_nodes_by_documents():
    documents=[]
    md_files=glob.glob('./preprocess/*.md')
    for md_file in md_files:
        with open(md_file,encoding='utf-8') as fr:
            md_content='\n'.join(fr.readlines())
        documents.append(md_content)
    
    nodes=[]
    for md_file,document in zip(md_files,documents):
        headers_content=split_markdown_by_headers(document)
        document_nodes=get_page_nodes(md_file,headers_content)
        nodes.extend(document_nodes)
    
    return nodes


nodes=get_nodes_by_documents()
print(len(nodes))
nodes[30]

563


TextNode(id_='69d51383-7c92-4c5c-8506-9fddde60b1a3', embedding=None, metadata={'title': '', 'content_level': 2, 'images_path': ['images\\img110-110.png'], 'file_path': './preprocess\\doc112-安全出行-79.md'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='\n## 打开/关闭全景天窗\n\n\n手动滑动打开（轻按按钮至第1个停止位置）。\n\n自动滑动打开（按到底）。\n\n手动滑动关闭（轻按按钮至第1个停止位置）。\n\n自动滑动关闭（按到底）。\n\n如果全景天窗和遮阳帘处于完全关闭状态，轻按控制按钮，先打开遮\n\n阳帘，只有再次按下控制按钮后，才能打开全景天窗。\n\n如果全景天窗和遮阳帘处于完全关闭状态，短时间内将控制按钮按到\n\n底两次，遮阳帘和全景天窗同时打开。\n\n', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}')

## nodes->indexs

In [4]:
def build_index(IndexType,nodes,persist_dir):
    if os.path.exists(persist_dir):
        storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
        index=load_index_from_storage(storage_context=storage_context)
    else:
        index=IndexType(nodes=nodes,show_progress=True)
        index.storage_context.persist(persist_dir=persist_dir)
    
    return index


## indexs-> retrievers

In [5]:
import logging
import os
from llama_index.core.storage.docstore.types import BaseDocumentStore
from typing import Any, Callable, Dict, List, Optional, cast
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.constants import DEFAULT_SIMILARITY_TOP_K
from llama_index.core.schema import BaseNode, IndexNode, NodeWithScore, QueryBundle
from llama_index.core.vector_stores.utils import (
    node_to_metadata_dict,
    metadata_dict_to_node,
)

import itertools
import jieba
from llama_index.retrievers.bm25 import BM25Retriever


class ChineseBM25Retriever(BM25Retriever):
    """A BM25 retriever that uses the BM25 algorithm to retrieve nodes.

    Args:
        nodes (List[BaseNode], optional):
            The nodes to index. If not provided, an existing BM25 object must be passed.
        similarity_top_k (int, optional):
            The number of results to return. Defaults to DEFAULT_SIMILARITY_TOP_K.
        callback_manager (CallbackManager, optional):
            The callback manager to use. Defaults to None.
        objects (List[IndexNode], optional):
            The objects to retrieve. Defaults to None.
        object_map (dict, optional):
            A map of object IDs to nodes. Defaults to None.
        verbose (bool, optional):
            Whether to show progress. Defaults to False.
    """

    def _chinese_tokenizer(self, texts: List[str]) -> tuple[str]:
        # Use jieba to segment Chinese text
        rslts = tuple(itertools.chain.from_iterable(jieba.cut(text) for text in texts))
        return rslts

    def __init__(
            self,
            nodes: Optional[List[BaseNode]] = None,
            similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
            callback_manager: Optional[CallbackManager] = None,
            objects: Optional[List[IndexNode]] = None,
            object_map: Optional[dict] = None,
            verbose: bool = False,
    ) -> None:

        super().__init__(
            nodes=nodes,
            similarity_top_k=similarity_top_k,
            callback_manager=callback_manager,
            objects=objects,
            object_map=object_map,
            verbose=verbose,
        )
        
        # change the stop words for Chinese
        with open(r'./stopwords-zh.txt', encoding='utf-8') as f: # here needs to add in the path of chinese stopwords
            con = f.readlines()
            stop_words = set()
            for i in con:
                i = i.rstrip('\n')
                stop_words.add(i)
        self.stop_words = stop_words

        corpus_tokens = [
            [word for word in jieba.cut_for_search(node.get_content()) if word not in stop_words and word.strip('\n')]
            for node in nodes
        ]
        corpus = [node_to_metadata_dict(node) for node in nodes]
        self.bm25.corpus = corpus
        self.bm25.index(corpus_tokens, show_progress=True)
    
    @classmethod
    def from_defaults(
        cls,
        index: Optional[VectorStoreIndex] = None,
        nodes: Optional[List[BaseNode]] = None,
        docstore: Optional[BaseDocumentStore] = None,
        similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
        verbose: bool = False,
    ) -> "ChineseBM25Retriever":
          # ensure only one of index, nodes, or docstore is passed
        if sum(bool(val) for val in [index, nodes, docstore]) != 1:
            raise ValueError("Please pass exactly one of index, nodes, or docstore.")

        if index is not None:
            docstore = index.docstore

        if docstore is not None:
            nodes = cast(List[BaseNode], list(docstore.docs.values()))
        
        assert (
            nodes is not None
        ), "Please pass exactly one of index, nodes, or docstore."
      
        return cls(
            nodes=nodes,
            similarity_top_k=similarity_top_k,
            verbose=verbose,
        )
      
    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        query = query_bundle.query_str

        tokenized_query = [[word for word in jieba.cut_for_search(query) if word not in self.stop_words]]

        indexes, scores = self.bm25.retrieve(
            tokenized_query, k=self.similarity_top_k, show_progress=self._verbose
        )

        # batched, but only one query
        indexes = indexes[0]
        scores = scores[0]

        nodes: List[NodeWithScore] = []
        for idx, score in zip(indexes, scores):
            # idx can be an int or a dict of the node
            if isinstance(idx, dict):
                node = metadata_dict_to_node(idx)
            else:
                node_dict = self.corpus[int(idx)]
                node = metadata_dict_to_node(node_dict)
            nodes.append(NodeWithScore(node=node, score=float(score)))

        return nodes

resource module not available on Windows


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.retrievers.fusion_retriever import FUSION_MODES


index_type=VectorStoreIndex
index_name=index_type.__name__
index=build_index(index_type,nodes,os.path.join('./Storage',index_name))

retriever = QueryFusionRetriever(
    [
        index.as_retriever(similarity_top_k=5),
        ChineseBM25Retriever.from_defaults(
            index=index,
            similarity_top_k=3
),
    ],
    num_queries=1,
    use_async=True,
    similarity_top_k=3,
    mode=FUSION_MODES.RECIPROCAL_RANK
)

Generating embeddings: 100%|██████████| 563/563 [00:18<00:00, 30.07it/s]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WUSHAO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.414 seconds.
Prefix dict has been built successfully.
                                                                        

In [8]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

response_synthesizer=get_response_synthesizer()

query_engine=RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

## 查询生成

In [8]:
import tqdm
import json

with open('./data/test_question.json',encoding='utf-8') as fp:
    test_questions=json.load(fp)

answer_num=3
for i,question_answers in enumerate(tqdm.tqdm(test_questions[:15])):
    for j in range(answer_num):
        question=question_answers['question']
        response=query_engine.query(question)
        question_answers[f'answer_{str(j+1)}']=response.response
    test_questions[i]=question_answers

100%|██████████| 15/15 [02:00<00:00,  8.05s/it]


In [10]:
with open('./data/test_question_answers.json',encoding='utf-8',mode='w') as fp:
    json_str =json.dumps(test_questions,ensure_ascii=False,indent=4)
    fp.write(json_str)

## 使用CitationQueryEngine

In [52]:
import math
from IPython.display import Markdown

def generate_markdown_table(image_paths):
    num_images = len(image_paths)
    if num_images==0:
        return ''
    
    num_cols = max(1,math.ceil(math.sqrt(num_images)))
    num_rows = math.ceil(num_images / num_cols)

    table = "| " + " | ".join(["-"] * num_cols) + " |\n"
    table += "| " + " | ".join(["---"] * num_cols) + " |\n"

    for row in range(num_rows):
        row_images = image_paths[row * num_cols:(row + 1) * num_cols]
        table += "| " + " | ".join([f"![Image](./preprocess/{img})" for img in row_images])
        table += " |" + " |" * (num_cols - len(row_images)) + "\n"

    return table

def show_response(response):
    md_content=f'问题：\n{question}\n'
    md_content+='\n---\n'
    md_content+='回答：\n'

    ref_string=''
    ref_files=[]
    imgs=[]
    for ref in response.response.references:
        try:
            ref=int(ref)  # 或者 float(s)
        except ValueError:
            print("不是数字")
            continue

        # 图片内容
        imgs.extend(response.source_nodes[ref-1].metadata['images_path'])

        # 引用内容
        file_path=response.source_nodes[ref-1].metadata['file_path']
        if file_path not in ref_files:
            ref_files.append(file_path)
            ref_string+=f'{ref}. [{os.path.splitext(os.path.basename(file_path))[0]}]({file_path})\n'
        
    
    # 图片
    md_content+='\n'
    md_content+=generate_markdown_table(imgs)

    # 回答
    md_content+='\n\n'
    md_content+=response.response.response
    
    # 引用
    md_content+='\n\n参考：\n\n'
    md_content+=ref_string
    
    # print(md_content)
    display(Markdown(md_content))


In [9]:
from pydantic import BaseModel, Field
from typing import List


class Output(BaseModel):
    """Output containing the response, references."""

    response: str = Field(..., description="The answer to the question.")
    references: List[int] = Field(
        ...,
        description="Which context is referenced to generate the content of the answer",
    )
    
llm = Ollama(
    model=llm_name, 
    request_timeout=360.0,
    base_url=base_url)

sllm = llm.as_structured_llm(output_cls=Output)

In [10]:
# query_engine= index.as_query_engine(similarity_top_k=3)
from llama_index.core.query_engine import CitationQueryEngine
citation_query_engine = CitationQueryEngine.from_args(
    index,
    llm=sllm,
    retriever=retriever,
    similarity_top_k=5,
    citation_chunk_size=512)

In [49]:
question='如何通过空调系统面板调节空调风量？'
response=citation_query_engine.query(question)
show_response(response)

问题：
如何通过空调系统面板调节空调风量？

---
回答：

| - |
| --- |
| ![Image](./preprocess/images\img244-273.png) |


您可以通过按压空调系统面板上的‘01 风速/副驾温度调节切换按钮’并旋转对应的‘02 风速/副驾温度调节旋钮’来调节空调的风量。当图标亮起时，可以开启/关闭鼓风机以及调整鼓风机转速，共有9个速度等级可设置。

参考：
1. [doc244-空调-157](./preprocess\doc244-空调-157.md)


In [50]:
question='车辆尾门的防夹保护功能是如何工作的？'
response=citation_query_engine.query(question)
show_response(response)

问题：
车辆尾门的防夹保护功能是如何工作的？

---
回答：



车辆尾门的防夹保护功能在尾门关闭过程中遇到障碍物时会停止动作并保持当前位置，同时伴有长鸣音。具体来说，在车辆运动状态下如果尾门触碰障碍物，则仅会停止关闭；而在车辆静止状态下则会被迫打开至设定的开启位置 [2]。此外，当尾门在打开过程中碰到障碍物时，它也会停止并在当前位置保持不动，并伴随有长鸣音警告[1]。

参考：
2. [doc34-装载货物-25](./preprocess\doc34-装载货物-25.md)


In [53]:
question='如何创建人脸识别？'
response=citation_query_engine.query(question)
show_response(response)

问题：
如何创建人脸识别？

---
回答：

| - | - |
| --- | --- |
| ![Image](./preprocess/images\img270-316.png) | ![Image](./preprocess/images\img270-317.png) |


要创建人脸识别，请按照以下步骤操作：

1. 点击开启/关闭人脸识别功能。
2. 点击图标，添加人脸信息。

注意，在创建人脸信息时，请将车辆挡位切换至驻车挡（P），并确保面部五官清晰可见，避免佩戴帽子、墨镜或口罩等物品。此外，如果创建成功或失败，中央显示屏会提示相关信息。

参考：

2. [doc271-中央显示屏-172](./preprocess\doc271-中央显示屏-172.md)
3. [doc273-中央显示屏-173](./preprocess\doc273-中央显示屏-173.md)


In [54]:
question='如何调节外后视镜？'
response=citation_query_engine.query(question)
show_response(response)

问题：
如何调节外后视镜？

---
回答：

| - | - |
| --- | --- |
| ![Image](./preprocess/images\img60-55.png) | ![Image](./preprocess/images\img60-56.png) |


要调节外后视镜，您可以按照以下步骤操作：1. 按下L按钮来调整左侧外后视镜；按下R按钮来调整右侧外后视镜。在相应的按钮上会点亮指示灯。2. 使用中间的控制杆来进一步调整外后视镜的角度。3. 再次按下L或R按钮，此时指示灯将熄灭以完成调节。注意，在驾驶过程中请勿调节外后视镜[1]。

参考：

1. [doc60-驾驶前的准备-45](./preprocess\doc60-驾驶前的准备-45.md)
