# 预训练与微调模块

需要安装的python库
```
llama_index
pypdf
jupyter
sentence_transformers
```

## 制作微调数据

In [1]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
import os

import openai

# Credentials are read from the environment so the secret key is never stored
# in the notebook. Export OPENAI_API_KEY (and optionally OPENAI_API_BASE)
# before starting the kernel.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_base = os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1")

In [3]:
!mkdir -p 'data/10k/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10k/lyft_2021.pdf'
# The Lyft 2021 10-K is used for training; the Uber 2021 10-K is held out for validation.
TRAIN_FILES = ["./data/10k/lyft_2021.pdf"]
VAL_FILES = ["./data/10k/uber_2021.pdf"]

# NOTE(review): these two paths are not referenced anywhere in the visible
# notebook; the datasets are saved as "train_dataset.json" / "val_dataset.json".
TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

def load_corpus(files, verbose=False):
    """Read ``files`` and split the resulting documents into nodes.

    Args:
        files: Paths passed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print a message after each stage and ask the
            parser to show its progress bar.

    Returns:
        The nodes produced by ``SimpleNodeParser.from_defaults()`` for the
        loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    splitter = SimpleNodeParser.from_defaults()
    parsed_nodes = splitter.get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

In [None]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
# Parse each PDF into nodes, with progress output enabled.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)
# Build the QA embedding datasets; the optional arguments shown in the
# commented-out lines are left at the library defaults.
train_dataset = generate_qa_embedding_pairs(train_nodes,
                                            #llm=llm, num_questions_per_chunk=2
                                           )
val_dataset = generate_qa_embedding_pairs(val_nodes,  
                                          #nodes, llm=llm, num_questions_per_chunk=2
                                         )
# Persist both datasets; later cells reload them with from_json().
train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

## 对模型进行微调

In [4]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from llama_index.finetuning import (
    EmbeddingQAFinetuneDataset,
)

# Reload the datasets saved by the data-preparation step.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

# Fine-tuning engine for the base model "BAAI/bge-large-en"; "model_flag" is
# passed as the output path and the validation set is supplied as well.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-large-en",
    model_output_path="model_flag",
    val_dataset=val_dataset,
)
finetune_engine.finetune()
# finetune_engine.get_finetuned_model()

# 评估模块

## 加载通用的评估函数 与评估数据集

In [1]:
%autoawait on
import nest_asyncio

nest_asyncio.apply()
import os

# This environment variable must be set: the cluster's default cache location
# is not usable and the author has no permission to write there.
os.environ["LLAMA_INDEX_CACHE_DIR"] = "/home/zhuwenhui.p/project/llamaindex"

from llama_index.schema import TextNode
from llama_index import (
    ServiceContext, # 定义了管道式使用的一组服务和配置
VectorStoreIndex
)
from tqdm import tqdm  # 注意如果写成import tqdm会报错！
import pandas as pd
from llama_index.schema import TextNode
import json
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
from concurrent.futures import ThreadPoolExecutor
#------------------------------------------------------------ 加载环境

train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")
#------------------------------------------------------------ 加载模型


async def evaluate(
    dataset,
    embed_model,
    top_k=2,
    verbose=False,
    workers=10
):
    """Index the dataset corpus with ``embed_model`` and score its retrieval.

    Args:
        dataset: Object exposing ``corpus`` (mapping of node id to text); it
            is also handed unchanged to ``aevaluate_dataset``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        top_k: Number of nodes the retriever returns per query.
        verbose: Unused; kept so existing callers keep working.
        workers: Forwarded to ``RetrieverEvaluator.aevaluate_dataset``.

    Returns:
        The result of ``RetrieverEvaluator.aevaluate_dataset`` for the "mrr"
        and "hit_rate" metrics.
    """
    import os

    from llama_index.evaluation import RetrieverEvaluator

    # The API key comes from the environment instead of being hard-coded in
    # the notebook; the proxy endpoint stays the default base URL.
    llm = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        api_base=os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1"),
    )
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

    # One TextNode per corpus entry, keeping the dataset's ids so retrieved
    # ids can be matched against the dataset's expected ids.
    nodes = [
        TextNode(id_=node_id, text=text)
        for node_id, text in dataset.corpus.items()
    ]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )
    return await retriever_evaluator.aevaluate_dataset(dataset, workers=workers)



def display_results(name, eval_results):
    """Summarise per-query metrics as a one-row table.

    Args:
        name: Label stored in the "retrievers" column.
        eval_results: Iterable of objects exposing ``metric_vals_dict`` with
            "hit_rate" and "mrr" entries.

    Returns:
        A single-row DataFrame with columns "retrievers", "hit_rate" and
        "mrr", where the two metric columns hold the means over all results.
    """
    per_query = pd.DataFrame(
        [single_result.metric_vals_dict for single_result in eval_results]
    )
    summary = {
        "retrievers": [name],
        "hit_rate": [per_query["hit_rate"].mean()],
        "mrr": [per_query["mrr"].mean()],
    }
    return pd.DataFrame(summary)

def evaluate_openai(
    dataset,
    embed_model,
    top_k=2,
    verbose=False,
):
    """Run every dataset query through a fresh index and record the hits.

    Args:
        dataset: Object exposing ``corpus`` (node id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list
            of node ids).
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        top_k: Number of nodes retrieved per query.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A list with one dict per query holding "is_hit", "retrieved",
        "expected" and "query" (the query id).
    """
    import os

    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    # The API key comes from the environment instead of being hard-coded in
    # the notebook; the proxy endpoint stays the default base URL.
    llm = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        api_base=os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1"),
    )
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
    nodes = [TextNode(id_=node_id, text=text) for node_id, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [hit.node.node_id for hit in retrieved_nodes]
        # Only the first relevant document is checked (one relevant doc assumed).
        expected_id = relevant_docs[query_id][0]
        eval_results.append(
            {
                "is_hit": expected_id in retrieved_ids,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
            }
        )
    return eval_results


def display_openai_results(name, eval_results):
    """Summarise the dict records produced by ``evaluate_openai``.

    Args:
        name: Label stored in the "retrievers" column.
        eval_results: Iterable of dicts with keys "is_hit" (bool),
            "retrieved" (ordered list of ids) and "expected" (id).

    Returns:
        A single-row DataFrame with columns "retrievers", "hit_rate" and
        "mrr". Both metrics are 0.0 when ``eval_results`` is empty.
    """
    # Use the argument itself; the previous code read a global named
    # ``results`` and ignored what the caller passed in.
    records = list(eval_results)
    if not records:
        return pd.DataFrame(
            {"retrievers": [name], "hit_rate": [0.0], "mrr": [0.0]}
        )

    hit_rate = pd.DataFrame(records)["is_hit"].mean()

    # Reciprocal rank: 1/position of the expected id, or 0 when it is absent.
    reciprocal_ranks = []
    for record in records:
        retrieved = list(record["retrieved"])
        expected = record["expected"]
        if expected in retrieved:
            reciprocal_ranks.append(1.0 / (retrieved.index(expected) + 1))
        else:
            reciprocal_ranks.append(0.0)
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

    return pd.DataFrame(
        {"retrievers": [name], "hit_rate": [hit_rate], "mrr": [mrr]}
    )


async def cevaluate_openai(
    dataset,
    embed_model,
    top_k=2,
    verbose=False,
):
    """Concurrent counterpart of ``evaluate_openai``.

    Args:
        dataset: Object exposing ``corpus`` (node id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list
            of node ids).
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        top_k: Number of nodes retrieved per query.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A list with one dict per query, in the order of ``dataset.queries``,
        holding "is_hit", "retrieved", "expected" and "query" (the query id).
    """
    import asyncio
    import os

    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    # The API key comes from the environment instead of being hard-coded in
    # the notebook; the proxy endpoint stays the default base URL.
    llm = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        api_base=os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1"),
    )
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
    nodes = [TextNode(id_=node_id, text=text) for node_id, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    # At most five retrievals are in flight at any moment.
    semaphore = asyncio.Semaphore(5)
    progress = tqdm(total=len(queries))

    async def score_query(query_id, query):
        async with semaphore:
            # ``retrieve`` is synchronous and returns a list, which cannot be
            # awaited; ``aretrieve`` is the coroutine entry point.
            retrieved_nodes = await retriever.aretrieve(query)
        progress.update(1)
        retrieved_ids = [hit.node.node_id for hit in retrieved_nodes]
        # Only the first relevant document is checked (one relevant doc assumed).
        expected_id = relevant_docs[query_id][0]
        return {
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }

    try:
        # gather() returns results in submission order, so the output lines
        # up with ``queries`` even though the calls overlap.
        gathered = await asyncio.gather(
            *(score_query(query_id, query) for query_id, query in queries.items())
        )
    finally:
        progress.close()
    return list(gathered)

## 加载微调模型，用微调模型进行测试
平均倒数排名（Mean Reciprocal Rank, MRR）是一个国际上通用的对搜索算法进行评价的机制。

描述：

在结果列表中，若第一个结果匹配，得分为 1；第二个结果匹配，得分为 0.5；第 n 个结果匹配，得分为 1/n；如果没有匹配的结果，得分为 0。最终的分数为所有查询得分之和再求平均：

$$\mathrm{MRR} = \frac{1}{|Q|}\sum_{i=1}^{|Q|}\frac{1}{\mathrm{rank}_i}$$

In [10]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
# Fine-tune "BAAI/bge-large-en" for three epochs; "test_model" is passed as
# the output path.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-large-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
    epochs=3,
)
finetune_engine.finetune()
# Evaluate the fine-tuned embedding model on the validation dataset.
embed_model = finetune_engine.get_finetuned_model()
results = await evaluate(val_dataset, embed_model)
display_results("embedding-tune:", results)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

Iteration:   0%|          | 0/71 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/427 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 860/860 [00:29<00:00, 28.74it/s]


Unnamed: 0,retrievers,hit_rate,mrr
0,embedding-tune:,0.760465,0.706395


## 加载原始模型，用原始模型进行测试

In [2]:
# Baseline: evaluate the original (not fine-tuned) BAAI/bge-large-en model.
bge = "local:BAAI/bge-large-en"
bge_results =await evaluate(val_dataset, bge)
# NOTE(review): the label below still reads "embedding-tune:" although these
# are the baseline results — consider a distinct label.
display_results("embedding-tune:", bge_results)

Generating embeddings:   0%|          | 0/427 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 860/860 [00:27<00:00, 31.38it/s]


Unnamed: 0,retrievers,hit_rate,mrr
0,embedding-tune:,0.705814,0.644767


## 加载openai模型，用openai模型进行对比

In [None]:
embed_model = OpenAIEmbedding(api_key='sk-aJzbu0F3j7bstWlR3e4cA9Db59Ac4f669a9f471aFa66C458',api_base='https://gf.nekoapi.com/v1')
results =await evaluate(val_dataset, embed_model)
display_results("embedding-tune:", results)

Generating embeddings:   0%|          | 0/427 [00:00<?, ?it/s]

## 低信息损失的数据提取

In [None]:
### 测试一个新数据的产生

# 针对实验室的实验测试进行的实验测试

## 构建评估框架

### 构建测试集

为提供的测试用例生成测试集

In [1]:
%autoawait on
import nest_asyncio
nest_asyncio.apply()
import os

# This environment variable must be set: the cluster's default cache location
# is not usable and the author has no permission to write there.
os.environ["LLAMA_INDEX_CACHE_DIR"] = "/home/zhuwenhui.p/project/llamaindex"

from llama_index.schema import TextNode
from llama_index import (
    ServiceContext, # 定义了管道式使用的一组服务和配置
VectorStoreIndex
)
from tqdm import tqdm  # 注意如果写成import tqdm会报错！
import pandas as pd
from llama_index.schema import TextNode
import json
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
from concurrent.futures import ThreadPoolExecutor
#------------------------------------------------------------ 加载环境

# train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")
#------------------------------------------------------------ 加载模型


async def evaluate(
    dataset,
    retriever,
    top_k=2,
    verbose=False,
    workers=10
):
    """Score ``retriever`` on ``dataset`` with the "mrr" and "hit_rate" metrics.

    Args:
        dataset: Passed unchanged to ``aevaluate_dataset``.
        retriever: Retriever handed to ``RetrieverEvaluator``.
        top_k: Not used by this function.
        verbose: Not used by this function.
        workers: Forwarded to ``aevaluate_dataset``.

    Returns:
        Whatever ``RetrieverEvaluator.aevaluate_dataset`` returns.
    """
    from llama_index.evaluation import RetrieverEvaluator

    metric_names = ["mrr", "hit_rate"]
    scorer = RetrieverEvaluator.from_metric_names(metric_names, retriever=retriever)
    return await scorer.aevaluate_dataset(dataset, workers=workers)



def display_results(name, eval_results):
    """Collapse per-query metric dicts into a one-row summary table.

    Args:
        name: Label stored in the "retrievers" column.
        eval_results: Iterable of objects exposing ``metric_vals_dict`` with
            "hit_rate" and "mrr" entries.

    Returns:
        A single-row DataFrame with columns "retrievers", "hit_rate" and
        "mrr"; the metric columns are means over all results.
    """
    metrics_table = pd.DataFrame([item.metric_vals_dict for item in eval_results])
    averages = {
        metric: metrics_table[metric].mean() for metric in ("hit_rate", "mrr")
    }
    return pd.DataFrame(
        {
            "retrievers": [name],
            "hit_rate": [averages["hit_rate"]],
            "mrr": [averages["mrr"]],
        }
    )

In [2]:
from typing import List, Optional
import uuid

def generate_qa_embedding_pairs_v2(
    questions: List[str],
    list_nodes=None
) -> EmbeddingQAFinetuneDataset:
    """Build a retrieval dataset from questions and their reference nodes.

    Args:
        questions: Query strings.
        list_nodes: One list of nodes per question, in the same order as
            ``questions``. Every node must expose ``node_id`` and ``text``.
            ``None`` is treated as an empty list.

    Returns:
        An ``EmbeddingQAFinetuneDataset`` whose queries are keyed by freshly
        generated UUID strings, whose corpus maps node id to node text, and
        whose relevant docs map each query id to its nodes' ids.

    Raises:
        ValueError: If ``questions`` and ``list_nodes`` differ in length.
    """
    node_groups = [] if list_nodes is None else list(list_nodes)
    # zip() would silently drop the surplus and leave some queries without
    # relevant documents, so reject mismatched inputs up front.
    if len(questions) != len(node_groups):
        raise ValueError(
            f"got {len(questions)} questions but {len(node_groups)} node lists"
        )

    corpus = {
        node.node_id: node.text for group in node_groups for node in group
    }

    queries = {}
    relevant_docs = {}
    for question, group in zip(questions, node_groups):
        question_id = str(uuid.uuid4())
        queries[question_id] = question
        relevant_docs[question_id] = [node.node_id for node in group]

    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=corpus, relevant_docs=relevant_docs
    )

from tqdm import tqdm
import json
import re
from llama_index.bridge.pydantic import BaseModel
from llama_index.llms.base import LLM
from llama_index.llms.openai import OpenAI
from llama_index.schema import MetadataMode, TextNode


In [3]:
%load_ext autoreload
%autoreload 2
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.node_parser import SentenceWindowNodeParser, SimpleNodeParser

from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
    MetadataFeatureExtractor,
)
from llama_index.text_splitter import SentenceSplitter
from llama_index.text_splitter import TokenTextSplitter
import os

## Model configuration
# The API key is read from the environment instead of being embedded in the
# notebook; the proxy endpoint stays the default base URL.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    api_key=os.environ.get("OPENAI_API_KEY"),
    api_base=os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1"),
)
embed_model="local:BAAI/bge-large-zh"
## Text splitter
text_splitter = SentenceSplitter(
  separator=" ",
  chunk_size=1024,
  chunk_overlap=20,
  # paragraph_separator="\n\n\n",
  # Raw string: identical pattern text, but without the invalid "\!" / "\?"
  # escape sequences that a normal string literal warns about.
  # Earlier alternative: "[^,.;。]+[,.;。]?"
  secondary_chunking_regex=r'[^。！\!\.？\?]+[。！\!\.？\?]',
)


## Metadata extractors: title and answered-question extraction, both using `llm`
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5, llm=llm),
        QuestionsAnsweredExtractor(questions=3, llm=llm),
        # EntityExtractor(prediction_threshold=0.5),
        # SummaryExtractor(summaries=["prev", "self"], llm=llm),
        # KeywordExtractor(keywords=10, llm=llm),
        # CustomExtractor()
    ],
)

# Node parser
# NOTE: both custom arguments are commented out, so neither `text_splitter`
# nor `metadata_extractor` is passed to the parser here.

node_parser = SimpleNodeParser.from_defaults(
    # text_splitter=text_splitter,
    # metadata_extractor=metadata_extractor
)

# Service context bundling the LLM, embedding model and node parser; it is
# passed to the indexes built later in the notebook.
ctx = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /BAAI/bge-large-zh/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f3fb5150d30>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: a79979e1-f018-4cf3-b5ff-275dfe2e5f44)')' thrown while requesting HEAD https://huggingface.co/BAAI/bge-large-zh/resolve/main/config.json
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /BAAI/bge-large-zh/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f3fa80ceec0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 69d91d45-627e-4288-bfe6-78419a83ed9b)')' thrown while requesting HEAD https://huggingface.co/BAAI/bge-large-zh/resolve/main/tokenizer_config.json


In [4]:
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KGTableRetriever,
)

from typing import List

class YyhRetriever(BaseRetriever):
    """Retrieve child chunks, then return the parent nodes they point to.

    ``index_retriever`` is expected to return hits whose ``node.index_id``
    holds the ``node_id`` of an entry in ``base_nodes``.
    """

    def __init__(self, index_retriever, base_nodes):
        # Let the base class set up whatever state its retrieve() relies on.
        super().__init__()
        self.index_retriever = index_retriever
        self.base_nodes = base_nodes

    def _retrieve(self, query, **kwargs):
        """Return parent nodes for ``query``, deduplicated, in child-hit order.

        Every returned ``NodeWithScore`` carries a constant score of 1.
        """
        child_hits = self.index_retriever.retrieve(query, **kwargs)

        seen_parent_ids = set()
        parent_nodes = []
        for hit in child_hits:
            parent_id = hit.node.index_id
            if parent_id in seen_parent_ids:
                # Several children of one parent were retrieved; keep the parent once.
                continue
            seen_parent_ids.add(parent_id)
            parent_nodes.extend(
                node for node in self.base_nodes if node.node_id == parent_id
            )

        return [NodeWithScore(node=node, score=1) for node in parent_nodes]


class YyhBm25Retriever(BaseRetriever):
    """Hybrid retriever: BM25 hits merged with parents of child-chunk hits.

    ``index_retriever`` is expected to return hits whose ``node.index_id``
    holds the ``node_id`` of an entry in ``base_nodes``.
    """

    def __init__(self, index_retriever,base_nodes,bm25_retriever):
        # Let the base class set up whatever state its retrieve() relies on.
        super().__init__()
        self.index_retriever = index_retriever
        self.bm25_retriever = bm25_retriever
        self.base_nodes = base_nodes

    def _retrieve(self, query, **kwargs):
        """Return BM25 hits followed by parent nodes, without duplicate ids.

        Parent nodes are wrapped in ``NodeWithScore`` with a constant score
        of 1; BM25 hits are returned as the BM25 retriever produced them.
        """
        index_nodes = self.index_retriever.retrieve(query, **kwargs)
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)

        seen_parent_ids = set()
        vector_nodes = []
        for index_node in index_nodes:
            parent_id = index_node.node.index_id
            if parent_id in seen_parent_ids:
                continue
            seen_parent_ids.add(parent_id)
            vector_nodes.extend(
                NodeWithScore(node=node, score=1)
                for node in self.base_nodes
                if node.node_id == parent_id
            )

        # Without an explicit return the method yields None and every caller
        # of retrieve() breaks. Merge both result lists, BM25 first, keeping
        # the first occurrence of each node id.
        all_nodes = []
        node_ids = set()
        for candidate in list(bm25_nodes) + vector_nodes:
            if candidate.node.node_id not in node_ids:
                all_nodes.append(candidate)
                node_ids.add(candidate.node.node_id)
        return all_nodes


class yyhbm25Retriever(BaseRetriever):
    """Hybrid retriever: BM25 and vector hits merged, then reranked."""

    def __init__(self, vector_retriever, bm25_retriever,reranker):
        # Let the base class set up whatever state its retrieve() relies on.
        super().__init__()
        self.vector_retriever = vector_retriever
        self.reranker = reranker
        self.bm25_retriever = bm25_retriever

    def _retrieve(self, query, **kwargs):
        """Return the reranker's output for the deduplicated union of hits.

        BM25 hits come first in the merged list; for a node id present in
        both result sets the BM25 entry is the one kept. ``query`` is handed
        to the reranker as ``query_bundle``.
        """
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        merged = []
        seen_ids = set()
        for candidate in bm25_nodes + vector_nodes:
            node_id = candidate.node.node_id
            if node_id in seen_ids:
                continue
            seen_ids.add(node_id)
            merged.append(candidate)

        return self.reranker.postprocess_nodes(merged, query_bundle=query)

### 构建验证集

In [5]:
from llama_index import SimpleDirectoryReader
# NOTE: despite its name, `documents` is the directory reader; the loaded
# documents end up in `docs`.
documents= SimpleDirectoryReader(
    input_dir="dataset/right"
)
docs = documents.load_data()

In [6]:
import re
from llama_index.schema import IndexNode

# Build two kinds of nodes: parent chunks and sentence-level child chunks.
base_nodes = node_parser.get_nodes_from_documents(docs)
# The keyword is `show_progress`; the misspelt `show_progess` was accepted as
# an unknown extra keyword and silently ignored.
base_index = VectorStoreIndex(base_nodes, service_context=ctx, show_progress=True)

# Each child is one span ending in "。" or "；" and points back to its parent
# through index_id.
# NOTE: text after the last terminator of a node is not matched by the pattern
# and therefore gets no child node.
IndexNodes = []
for node in base_nodes:
    for sentence in re.findall('.*?[。；]', node.text):
        IndexNodes.append(IndexNode(text=sentence, index_id=node.node_id))

child_index = VectorStoreIndex(IndexNodes, service_context=ctx, show_progress=True)

In [7]:
base_retriever = base_index.as_retriever(similarity_top_k=3)

In [25]:
child_retriever= child_index.as_retriever(similarity_top_k=10)

In [27]:
yyh_retriever=YyhRetriever(child_retriever,base_nodes)

In [8]:
# Optional metadata enrichment: fold each node's metadata into its text.
# `0` is not a MetadataMode member; it only produced the "all metadata"
# behaviour because it matched none of the special-cased modes. Spell the
# intended mode out explicitly.
# NOTE: running this cell more than once prepends the metadata again.
from llama_index.schema import MetadataMode

for node in base_nodes:
    node.text = node.get_content(metadata_mode=MetadataMode.ALL)

### 加载数据集、生成测试集、生成评估集

基于私域数据生成验证集，基于GPT生成评估集（经过数据增强看看能否被使用到）

In [8]:
import pandas as pd

# Read the spreadsheet once; both columns come from the same sheet.
test_frame = pd.read_excel('test_large.xlsx')
questions = list(test_frame['question'])
refers = list(test_frame['response'])

# For every reference answer, the nodes returned by `base_retriever` become
# that question's relevant documents.
# NOTE(review): the ground truth is produced by `base_retriever` itself, so
# scores of that same retriever on this dataset are likely optimistic.
list_refers = []
for refer in refers:
    nodes = base_retriever.retrieve(refer)
    list_refers.append(nodes)
test_dataset = generate_qa_embedding_pairs_v2(questions, list_refers)
test_dataset.save_json("dataset/json/test_dataset_default_spliter.json")

#### 基准效果

In [9]:
import nest_asyncio
nest_asyncio.apply()
results =await evaluate(test_dataset,base_retriever)
display_results("base-result:",results)

retrieved_ids: ['4ccdc8d5-6c06-4b7f-91bf-b798249fa3d3', 'b7db86d2-ef76-49a0-a0a9-d5a01a5ed6e8', 'a1abc8b4-9b25-4081-8ecf-5c111beaf063']
retrieved_ids: ['b7db86d2-ef76-49a0-a0a9-d5a01a5ed6e8', '9d137253-7019-4b36-9ae2-174301da4d9e', '8b15946d-c860-487c-bf62-de73d576da23']
retrieved_ids: ['13283afd-a4dd-4c3b-8e0b-7b80a9f5711a', '9d137253-7019-4b36-9ae2-174301da4d9e', '8b15946d-c860-487c-bf62-de73d576da23']
retrieved_ids: ['4ccdc8d5-6c06-4b7f-91bf-b798249fa3d3', '8b15946d-c860-487c-bf62-de73d576da23', 'b7db86d2-ef76-49a0-a0a9-d5a01a5ed6e8']
retrieved_ids: ['8b15946d-c860-487c-bf62-de73d576da23', '9d137253-7019-4b36-9ae2-174301da4d9e', 'e5c86b60-8637-4659-a4cf-d799fdc26d04']
retrieved_ids: ['b2a0bc4f-fd4a-4ec6-8d59-721475a2476a', '044e6688-5be9-44ba-afc7-6fbcec4402e1', 'be4dd3e9-0ced-40f3-80d9-fb3c6bf69a79']
retrieved_ids: ['13283afd-a4dd-4c3b-8e0b-7b80a9f5711a', '8b15946d-c860-487c-bf62-de73d576da23', '0c5f399a-0118-499e-8344-69ed19908d19']
retrieved_ids: ['e5c86b60-8637-4659-a4cf-d799fdc

Unnamed: 0,retrievers,hit_rate,mrr
0,base-result:,0.853333,0.75


: 

#### parent chunk效果

In [29]:
import nest_asyncio
nest_asyncio.apply()
results =await evaluate(test_dataset,yyh_retrieaver)
display_results("yyh_retriever-result:",results)

retrieved_ids: ['66175927-f6ed-4112-a3f1-51e82f8dc389', 'fa36105a-78e7-4b41-bf9c-a2c08671c031', '2b10af05-54d0-4bc2-a26f-c49677a75414', '58e4e2c1-5e62-41f4-b4fc-c74ac07bdb18', 'd4543c8a-980d-4618-9705-62f1f210998c', 'acb21a6d-f353-4b27-af50-925bf4b5c06d']
retrieved_ids: ['67c14236-3401-4cdc-90a7-cb8fa1207a28', '91f4766a-e9bb-43a8-b834-4417ef831af5', '66175927-f6ed-4112-a3f1-51e82f8dc389', '456bab96-b3e5-4934-9fb3-c26c5543f1d1', '58e4e2c1-5e62-41f4-b4fc-c74ac07bdb18']
retrieved_ids: ['67c14236-3401-4cdc-90a7-cb8fa1207a28', 'b9a2cd29-748a-42db-b0b5-4441ab8cfa8c', '37b1ebcc-c372-4a28-a3dc-0ba880343432', '7448d18e-996f-4023-820b-0a4980783049']
retrieved_ids: ['2b10af05-54d0-4bc2-a26f-c49677a75414', 'a9569a21-2be1-4e8c-8974-eb431fd192e7', '0561436f-a3fb-451d-a620-b993bcc355f4', '4d9ce9f8-eb20-49a6-ae6b-1c64f11a5e6c']
retrieved_ids: ['66175927-f6ed-4112-a3f1-51e82f8dc389', '7448d18e-996f-4023-820b-0a4980783049', 'ed3216e6-90c5-46c3-ae3e-fb0d7615b7b9', '58e4e2c1-5e62-41f4-b4fc-c74ac07bdb18', 

Unnamed: 0,retrievers,hit_rate,mrr
0,yyh_retriever-result:,0.76,0.595444


#### parent chunk+Bm25效果+reranker

In [23]:
from llama_index import QueryBundle
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.retrievers import BM25Retriever
# Child-chunk vector retriever returning the 6 most similar nodes.
child_retriever = child_index.as_retriever(similarity_top_k=6)
# Reranker configured with top_n=4.
reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-large")
# Retrieve the top 6 most similar parent nodes using BM25.
bm25_retriever = BM25Retriever.from_defaults(nodes=base_nodes, similarity_top_k=6)
# NOTE(review): the vector side returns child (IndexNode) ids while BM25
# returns parent ids, and the dataset appears to reference parent ids — confirm
# this is intended.
hybrid_retriever = yyhbm25Retriever(child_retriever, bm25_retriever,reranker)

In [24]:
import nest_asyncio
nest_asyncio.apply()
results =await evaluate(test_dataset,hybrid_retriever)
display_results("yyh_retriever-result:",results)

retrieved_ids: ['ee3357bb-543e-481a-9507-664fa7c9d09e', '36b104cb-d374-4864-aa48-dd9a5523f42b', 'b034c0f1-ecc5-44cf-8a8f-f5ad15c3336f', '6f3d8838-4698-44ca-8e8f-cebd93d11f90']
retrieved_ids: ['bb5f429f-6715-4468-ae5a-7e362d4c6e93', '7c8c9953-a3ac-4f90-8d5a-ae6c8275d437', '2f09ca47-dbc8-449c-8298-0cd248d2798a', 'b034c0f1-ecc5-44cf-8a8f-f5ad15c3336f']
retrieved_ids: ['465b231b-7067-4df1-b8ba-1dec90b497c6', '7c8c9953-a3ac-4f90-8d5a-ae6c8275d437', 'e4d779d5-84ef-42f0-ab2b-719876e519e9', 'c6b66140-32cb-46cb-9f89-32cabee466ed']
retrieved_ids: ['ae0bad3a-90fd-4497-a8d2-61dc2e65cf83', 'b034c0f1-ecc5-44cf-8a8f-f5ad15c3336f', '5d9ef5ec-cdb5-4c3a-a526-c82d3ab54bb4', '39d17ee9-c340-4f97-8e7d-7dc2023571a3']
retrieved_ids: ['2f09ca47-dbc8-449c-8298-0cd248d2798a', '7c8c9953-a3ac-4f90-8d5a-ae6c8275d437', 'ae0bad3a-90fd-4497-a8d2-61dc2e65cf83', 'add3046f-3403-457c-b586-1e130fdeaab5']
retrieved_ids: ['ae0bad3a-90fd-4497-a8d2-61dc2e65cf83', 'c5d4e1bb-f3d5-4b87-8c70-0e1b91539892', '16822c6d-e988-4cec-95e1

Unnamed: 0,retrievers,hit_rate,mrr
0,yyh_retriever-result:,0.406667,0.286667


#### 基于basenodes生成测试集

#暂时还不能用

In [18]:
import llama_index
llama_index.set_global_handler("simple")
import nest_asyncio
nest_asyncio.apply()

from llama_index.evaluation import RelevancyEvaluator

In [19]:
import os

# The API key is read from the environment instead of being embedded in the
# notebook; the proxy endpoint stays the default base URL.
service_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model='gpt-3.5-turbo',
        api_key=os.environ.get("OPENAI_API_KEY"),
        api_base=os.environ.get("OPENAI_API_BASE", "https://gf.nekoapi.com/v1"),
    ),
    embed_model="local:BAAI/bge-large-zh",
)

In [20]:
relevance_evaluator = RelevancyEvaluator(service_context=service_context)

In [21]:
# NOTE(review): `question` and `response` are not assigned anywhere in the
# visible notebook; they must be defined before this cell can run.
relevance_evaluator.evaluate_response(question, response)

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f0d99c3b8b0>


KeyboardInterrupt: 

### child chunk索引构建

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x7fe5627e6890>

In [99]:
child_retriever = child_index.as_retriever(similarity_top_k=3)

In [105]:
child_retriever.retrieve('niha')[0].node.index_id

'f3bd2d3e-260c-4e50-a6f9-694fe8371eb2'

In [107]:
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KGTableRetriever,
)

from typing import List

class YyhRetriever(BaseRetriever):
    """Retrieve child chunks, then return at most three distinct parent nodes.

    ``index_retriever`` is expected to return hits whose ``node.index_id``
    holds the ``node_id`` of an entry in ``base_nodes``.
    """

    def __init__(self, index_retriever, base_nodes):
        # Let the base class set up whatever state its retrieve() relies on.
        super().__init__()
        self.index_retriever = index_retriever
        self.base_nodes = base_nodes

    def _retrieve(self, query, **kwargs):
        """Return up to three parent nodes, ordered by child-hit rank."""
        index_nodes = self.index_retriever.retrieve(query, **kwargs)

        # First occurrence wins if base_nodes ever contains a repeated id.
        parents_by_id = {}
        for node in self.base_nodes:
            parents_by_id.setdefault(node.node_id, node)

        # Follow the retriever's ranking rather than the order of base_nodes,
        # so rank-sensitive metrics such as MRR reflect retrieval quality.
        ranked_parents = [
            parents_by_id[hit.node.index_id]
            for hit in index_nodes
            if hit.node.index_id in parents_by_id
        ]
        return self.remove_duplicates_and_limit(ranked_parents, 3)

    def remove_duplicates_and_limit(self,nodes, limit=3):
        """Keep the first occurrence of each node id, at most ``limit`` nodes.

        Args:
            nodes: Candidate nodes in priority order.
            limit: Maximum number of nodes to return.

        Returns:
            A list of ``NodeWithScore`` objects, each with a score of 1.
        """
        kept = []
        seen_ids = set()
        for node in nodes:
            if len(kept) >= limit:
                break
            if node.node_id in seen_ids:
                continue
            seen_ids.add(node.node_id)
            kept.append(node)
        return [NodeWithScore(node=node, score=1) for node in kept]

In [108]:
import nest_asyncio
nest_asyncio.apply()child_retriever = child_index.as_retriever(similarity_top_k=10)
results =await evaluate(test_dataset,yyhretriever)
display_results("base-result:",results)

retrieved_ids: ['cd530ff8-b767-4f8a-9458-9a7a3dc08aa2', '26c23dc7-da06-4268-8531-4148caa28ebe', 'a732b9dc-cc54-487d-a730-42da4d87226d', '6fc26a29-519f-4d0a-a4a8-ceb44cacd3ea', '1f221a6c-352b-450a-9171-43f23134216e', 'd914e84b-49a7-4fb8-9b6d-74440031842c', '332aca86-9c9b-4046-9030-f68380f90933', 'a0e33da7-1c99-4876-99fc-d4b918242923', 'a24ca730-635c-4678-9c76-2cd6698a740a', '4f731d13-38c6-413e-8349-be9760272de2']
retrieved_ids: ['cd530ff8-b767-4f8a-9458-9a7a3dc08aa2', '26c23dc7-da06-4268-8531-4148caa28ebe', 'a732b9dc-cc54-487d-a730-42da4d87226d', '6fc26a29-519f-4d0a-a4a8-ceb44cacd3ea', '1f221a6c-352b-450a-9171-43f23134216e', 'd914e84b-49a7-4fb8-9b6d-74440031842c', '332aca86-9c9b-4046-9030-f68380f90933', 'a0e33da7-1c99-4876-99fc-d4b918242923', 'a24ca730-635c-4678-9c76-2cd6698a740a', '4f731d13-38c6-413e-8349-be9760272de2']
retrieved_ids: ['cd530ff8-b767-4f8a-9458-9a7a3dc08aa2', '26c23dc7-da06-4268-8531-4148caa28ebe', 'a732b9dc-cc54-487d-a730-42da4d87226d', '6fc26a29-519f-4d0a-a4a8-ceb44ca

Unnamed: 0,retrievers,hit_rate,mrr
0,base-result:,0.0,0.0


### 检索相同的规则

In [5]:
# NOTE(review): `simple_node_parser` is not defined in the visible notebook
# (the parser created earlier is named `node_parser`) — confirm which is meant.
base_nodes = simple_node_parser.get_nodes_from_documents(docs)

In [6]:
from llama_index import VectorStoreIndex
# NOTE(review): `nodes` is not created in this section; the query cell below
# reads "window" / "original_text" metadata, so these are presumably
# sentence-window nodes — confirm where they come from.
sentence_index = VectorStoreIndex(nodes, service_context=ctx)

In [7]:
from llama_index import VectorStoreIndex
base_index = VectorStoreIndex(base_nodes, service_context=ctx)

In [9]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Query engine over the sentence index: two hits per query, post-processed
# with the "window" metadata key as the replacement target.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
# The query text asks about securities' closing and opening prices.
window_response = query_engine.query(
    "证券收盘价格与开盘价格?"
)
print(window_response)

The context information does not provide any information about the closing and opening prices of securities.


In [10]:
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_text"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")


Window: 第3页 题应为黑体四号，三级标题应为黑体小四号，且各级标题应分别采用一致的段落间距。  申请文件章与章之间、节与节之间应有明显的分隔标识。为便于阅读，“.doc”“.docx”文档应根据各级标题建立文档结构图，“.pdf”文档应建立书签。  申请文件中的页码应与目录中的页码相符。例如，第四部分4-1的页码标注为：4-1-1，4-1-2，4-1-3，……4-1-n。  第十一条    未按本准则的要求制作和报送发行申请文件的，交易所可按有关规定不予受理。  第十二条    红筹企业发行证券的，应按本准则和相关规定制作和报送申请文件。  第十三条    本准则由中国证监会负责解释。  第十四条    本准则自公布之日起施行。 《公开发行证券的公司信息披露内容与格式准则第10号——上市公司公开发行证券申请文件》 （证监发行字〔2006〕1号） 、 《公开发行证券的公司信息披露内容与格式准则第37号——创业板上市公司发行证券申请文件（2020年修订） 》 （证监会公告〔2020〕35号） 、 《公开发行证券的公司信息披露内容与格式准则第45号——科创板上市公司发行证券申请文件》 （证监会公告〔2020〕39号）同时废止。  
------------------
Original Sentence: 第3页 题应为黑体四号，三级标题应为黑体小四号，且各级标题应分别采用一致的段落间距。  申请文件章与章之间、节与节之间应有明显的分隔标识。为便于阅读，“.doc”“.docx”文档应根据各级标题建立文档结构图，“.pdf”文档应建立书签。  申请文件中的页码应与目录中的页码相符。例如，第四部分4-1的页码标注为：4-1-1，4-1-2，4-1-3，……4-1-n。  第十一条    未按本准则的要求制作和报送发行申请文件的，交易所可按有关规定不予受理。  第十二条    红筹企业发行证券的，应按本准则和相关规定制作和报送申请文件。  第十三条    本准则由中国证监会负责解释。  第十四条    本准则自公布之日起施行。 《公开发行证券的公司信息披露内容与格式准则第10号——上市公司公开发行证券申请文件》 （证监发行字〔2006〕1号） 、 《公开发行证券的公司信息披露内容与格式准则第37号——创业板上市公司发行证券申请文件（2020年修订） 》 （证监会公告〔

## 开始通测试