# Part 12: 多重表征索引
主要思路：先用 LLM 为每篇文档生成摘要，并对摘要做向量化索引；检索时用查询匹配摘要，再通过 doc_id 找回对应的原文档。按照同样的逻辑，还可以为原文档扩展出其他表征（索引）方式。

In [1]:
# 加载网页数据
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Blog posts to index; each URL is fetched with its own WebBaseLoader and the
# resulting documents are collected into a single list.
post_urls = (
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
)

docs = []
for post_url in post_urls:
    loader = WebBaseLoader(post_url)
    docs.extend(loader.load())

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
# 通过llm对文档进行摘要
import uuid
import os

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model, ChatOpenAI
from dotenv import load_dotenv

# Populate the process environment before API_URL / API_KEY / MODEL are read.
load_dotenv()

api_url, api_key, model_name = (
    os.getenv(name) for name in ('API_URL', 'API_KEY', 'MODEL')
)

# Pin the provider explicitly so LangChain does not pick one from the model name.
llm = init_chat_model(
    model=model_name,
    model_provider="openai",
    base_url=api_url,
    api_key=api_key,
)

In [None]:
# Generate one summary per document.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document:\n\n{doc}"
)
# Feed each document's text into the prompt's {doc} slot.
doc_to_prompt_input = {"doc": lambda document: document.page_content}

chain = doc_to_prompt_input | summary_prompt | llm | StrOutputParser()

# Run the chain over all documents with at most 5 concurrent calls.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [None]:
# 使用摘要进行索引
from langchain.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever
from ark_embedding import ArkEmbeddings


# Embedding client used to vectorize the summaries.
embd = ArkEmbeddings(
    batch_size=10,
    model=os.getenv("ALIYUN_EMBEDDING_MODEL"),
    api_key=os.getenv("ALIYUN_API_KEY"),
    api_url=os.getenv("ALIYUN_API_URL"),
)

# Summaries go into the vector store; the full documents go into a byte store.
id_key = "doc_id"
store = InMemoryByteStore()
vectorstore = Chroma(embedding_function=embd, collection_name="summaries")

# The retriever ties each summary vector back to its document through id_key.
retriever = MultiVectorRetriever(
    id_key=id_key,
    byte_store=store,
    vectorstore=vectorstore,
)

# One fresh UUID per source document.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Wrap every summary in a Document that carries the id of the document it
# was generated from.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Index the summaries (embedding them) and store the originals under the same ids.
retriever.vectorstore.add_documents(summary_docs)
id_doc_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(id_doc_pairs)

  vectorstore = Chroma(collection_name="summaries",
  """Field proxy for building Where conditions with operator overloading.


In [10]:
# Search the summary vector store directly: returns the single (k=1) stored
# summary most similar to the query, not the original document.
query = "Memory in agents"
sub_docs = vectorstore.similarity_search(query, k=1)
sub_docs[0]

Document(metadata={'doc_id': '625d221f-ab8e-48eb-866f-7dd58760a6d0'}, page_content='Of course. Here is a summary of the document "LLM Powered Autonomous Agents" by Lilian Weng.\n\n### Document Summary\n\nThis comprehensive blog post explores the architecture, components, and real-world applications of autonomous agents powered by Large Language Models (LLMs). It frames the LLM as the core "brain" of an agent system, which is augmented by three key components to overcome its inherent limitations.\n\n#### Core Components of an LLM Agent:\n\n1.  **Planning:** The agent breaks down complex tasks into smaller, manageable subgoals and can self-reflect to learn from mistakes.\n    *   **Task Decomposition:** Techniques like Chain-of-Thought (CoT) and Tree of Thoughts are used to break problems into steps.\n    *   **Self-Reflection:** Frameworks like **ReAct** (Reason + Act) and **Reflexion** allow the agent to critique its past actions, learn from failures, and refine its future strategy.\n\

In [11]:
# Retrieve the original documents by matching the query against their summaries.
# `invoke` replaces the deprecated `get_relevant_documents`. The former
# `n_results=1` keyword is dropped because it is not an option this retriever
# defines.
# NOTE(review): to cap the number of hits, set `search_kwargs={"k": 1}` on the
# retriever — confirm against the installed LangChain version.
retrieved_docs = retriever.invoke(query)
# Preview the first 500 characters of the best-matching source document.
retrieved_docs[0].page_content[0:500]

  retrieved_docs = retriever.get_relevant_documents(query, n_results=1)


"\n\n\n\n\n\nLLM Powered Autonomous Agents | Lil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\n\n\n\n\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nAgent System Overview\n\nComponent One: Planning\n\nTask Decomposition\n\nSelf-Reflection\n\n\nComponent Two: Memory\n\nTypes of Memory\n\nMaximum Inner Product Search (MIPS)\n\n\nComponent Three:"

# Part 13: RAPTOR
Recursive Abstractive Processing for Tree-Organized Retrieval  
参考代码：https://github.com/parthsarthi03/raptor  
主要的思路：对聚类后的文本块进行摘要并嵌入，递归这个过程，自底向上构建具有树状结构的不同层级摘要和嵌入。在推理时，从该树中进行检索，整合长篇文档中不同抽象层级的信息。 
整体的思想，有点类似GraphRAG的分层聚类，获取不同层级的信息，只是GraphRAG是对知识图谱进行操作，而RAPTOR是直接对分块chunk或文档进行操作。

In [1]:
from pathlib import Path
import sys
import os

# Make the "raptor" directory that sits next to the current working directory
# importable, and echo the resolved path for a quick sanity check.
raptor_dir = Path.cwd().parent.joinpath("raptor").resolve()
project_root = raptor_dir.as_posix()
sys.path.append(project_root)
print(project_root)

/Users/young/project/llmProject/rag-from-scratch/raptor


In [2]:
from raptor import (
    BaseSummarizationModel, 
    BaseQAModel, 
    BaseEmbeddingModel, 
    RetrievalAugmentation,
    RetrievalAugmentationConfig
)

  from .autonotebook import tqdm as notebook_tqdm
2025-11-22 16:01:52,889 - Loading faiss.
2025-11-22 16:01:54,452 - Successfully loaded faiss.


In [3]:
from tenacity import retry, stop_after_attempt, wait_random_exponential
from openai import OpenAI
from dotenv import load_dotenv

# Populate the environment before the model wrappers below read os.environ / os.getenv.
load_dotenv()

True

In [15]:
class MySummarizationModel(BaseSummarizationModel):
    """Summarization backend that calls an OpenAI-compatible chat-completions API.

    The endpoint and credentials are read from the ``API_URL`` and ``API_KEY``
    environment variables each time ``summarize`` is called.
    """

    def __init__(self, model=None):
        """
        Record which chat model to use for summarization.

        Args:
            model (str, optional): Chat model name. Defaults to the value of the
                ``MODEL`` environment variable, read at construction time.

        Raises:
            KeyError: If ``model`` is omitted and ``MODEL`` is not set.
        """
        # Resolve the default here rather than in the signature: a signature
        # default is evaluated once when the class is defined, and the argument
        # used to be ignored altogether.
        self.model = model if model is not None else os.environ["MODEL"]

    # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def summarize(self, context, max_tokens=500, stop_sequence=None):
        """
        Generate a summary of ``context`` with the configured chat model.

        Args:
            context (str): Text to summarize; interpolated into the user prompt.
            max_tokens (int, optional): Token limit forwarded to the API. Defaults to 500.
            stop_sequence (str, optional): Currently unused.

        Returns:
            str: The generated summary, or an empty string if the request failed.
        """
        try:
            client = OpenAI(
                base_url=os.environ["API_URL"],
                api_key=os.environ["API_KEY"],
            )

            response = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {
                        "role": "user",
                        "content": f"Write a summary of the following, including as many key details as possible: {context}:",
                    },
                ],
                max_tokens=max_tokens,
            )

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: report the failure and return an empty summary so
            # callers always receive text. Returning the exception object, as
            # before, handed a non-string to code expecting a summary.
            print("error:", e)
            return ""

In [26]:
class MyQAModel(BaseQAModel):
    """Question-answering backend that calls an OpenAI-compatible chat-completions API."""

    def __init__(self, model=None):
        """
        Create the API client and record which chat model to query.

        Args:
            model (str, optional): Chat model name. Defaults to the value of the
                ``MODEL`` environment variable, read at construction time.

        Raises:
            KeyError: If ``API_KEY`` or ``API_URL`` is not set, or if ``model``
                is omitted and ``MODEL`` is not set.
        """
        # Resolve the default here rather than in the signature: a signature
        # default is evaluated once when the class is defined, and the argument
        # used to be ignored altogether.
        self.model = model if model is not None else os.environ["MODEL"]
        self.client = OpenAI(
            api_key=os.environ["API_KEY"],
            base_url=os.environ["API_URL"],
        )

    # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def answer_question(self, context, question, max_tokens=150, stop_sequence=None):
        """
        Answer ``question`` from the supplied ``context`` with the configured chat model.

        Args:
            context (str): Retrieved text the answer should be based on.
            question (str): The question to answer.
            max_tokens (int, optional): Token limit forwarded to the API. Defaults to 150.
            stop_sequence (str, optional): Currently unused.

        Returns:
            str: The model's answer, or an empty string if the request failed.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are Question Answering Portal."},
                    {
                        "role": "user",
                        "content": f"using the folloing information {context}. Answer the following question in less than 5-7 words, if possible: {question}",
                    },
                ],
                max_tokens=max_tokens,
            )

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: report the failure and fall back to an empty answer.
            print("error:", e)
            return ""

In [17]:
from ark_embedding import ArkEmbeddings


class MyEmbeddingModel(BaseEmbeddingModel):
    """Embedding backend that delegates to an ``ArkEmbeddings`` client."""

    def __init__(self, model=None):
        """
        Build the embedding client.

        Args:
            model (str, optional): Embedding model name. Defaults to the value of
                the ``ALIYUN_EMBEDDING_MODEL`` environment variable. The previous
                default (``"text-embedding-ada-002"``) was never used, so calling
                the constructor without arguments behaves as before.
        """
        self.client = ArkEmbeddings(
            model=model if model is not None else os.getenv("ALIYUN_EMBEDDING_MODEL"),
            api_key=os.getenv("ALIYUN_API_KEY"),
            api_url=os.getenv("ALIYUN_API_URL"),
            batch_size=10
        )

    # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def create_embedding(self, text):
        """
        Embed a single piece of text.

        Args:
            text (str): Text to embed; newlines are replaced with spaces first.

        Returns:
            The first item returned by ``embed_documents`` for the cleaned text,
            or an empty list if anything failed.
        """
        try:
            single_line = text.replace("\n", " ")
            vectors = self.client.embed_documents([single_line])
            return vectors[0]
        except Exception as e:
            # Best-effort: report the failure and fall back to an empty embedding.
            print("error:", e)
            return []

In [27]:
# Plug the three custom model wrappers into the retrieval-augmentation config.
raptor_models = {
    "summarization_model": MySummarizationModel(),
    "qa_model": MyQAModel(),
    "embedding_model": MyEmbeddingModel(),
}
RAC = RetrievalAugmentationConfig(**raptor_models)

In [19]:
# Load the sample text used to build the tree.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('data/sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Show the first 100 characters as a sanity check.
print(text[:100])

The wife of a rich man fell sick, and as she felt that her end
was drawing near, she called her only


In [28]:
# Build the tree-structured retriever from the sample text.
RA = RetrievalAugmentation(config=RAC)

RA.add_documents(text)

2025-11-22 16:49:47,763 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.MySummarizationModel object at 0x30e9f5880>
            Embedding Models: {'EMB': <__main__.MyEmbeddingModel object at 0x10ba10da0>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2025-11-22 16:49:47,764 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length

In [29]:
# Ask a question against the tree built above.
question = "How did Cinderella reach her happy ending?"

answer = RA.answer_question(question=question)

print("Answer: ", answer)

2025-11-22 16:50:10,506 - Using collapsed_tree
2025-11-22 16:50:10,640 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/embeddings "HTTP/1.1 200 OK"
2025-11-22 16:50:11,908 - HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions "HTTP/1.1 200 OK"


Answer:  By marrying the prince.


In [30]:
# Save the built tree so it can be reloaded later without rebuilding.
SAVE_PATH = "data/cinderella"
RA.save(SAVE_PATH)

2025-11-22 16:50:27,262 - Tree successfully saved to data/cinderella


In [31]:
# Recreate the retriever from the saved tree and ask the same question again.
RA = RetrievalAugmentation(config=RAC, tree=SAVE_PATH)

answer = RA.answer_question(question=question)
print("Answer: ", answer)

2025-11-22 16:50:27,564 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.MySummarizationModel object at 0x30e9f5880>
            Embedding Models: {'EMB': <__main__.MyEmbeddingModel object at 0x10ba10da0>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2025-11-22 16:50:27,564 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length

Answer:  With the prince's help.


# Part 14: ColBERT
介绍资料：https://hackernoon.com/how-colbert-helps-developers-overcome-the-limits-of-rag  
论文：https://arxiv.org/abs/2004.12832  
核心原理：
- 通过分词+bert进行向量化（通过双向的transformer编码，得到考虑了上下文的向量）。
- 文档和查询都会做相同的处理。
- 每个文档的总得分（MaxSim）：对查询中的每个 token 向量，取它与文档所有 token 向量相似度的最大值，再把这些最大值按查询 token 求和，即 $S(q, d) = \sum_{i} \max_{j} \, q_i \cdot d_j$。

In [24]:
# 使用专用于colbert的模型
from ragatouille import RAGPretrainedModel

# Load the pretrained "colbert-ir/colbertv2.0" checkpoint through RAGatouille.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


[Nov 22, 16:32:53] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


  self.scaler = torch.cuda.amp.GradScaler()
  super().__init__(


In [32]:
# 下载wiki的数据
import requests

def get_wikipedia_page(title: str, timeout: float = 10.0):
    """
    Retrieve the full text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the HTTP request before giving up.
    :return: str | None - Plain-text extract of the page, or None when the
        response contains no page or the page has no extract.
    :raises requests.RequestException: If the request fails, times out, or the
        server answers with an HTTP error status.
    """
    # Wikipedia API endpoint
    api_url = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    # Without a timeout a stalled connection would block indefinitely.
    response = requests.get(api_url, params=params, headers=headers, timeout=timeout)
    # Surface HTTP errors here instead of failing later while parsing the body.
    response.raise_for_status()
    data = response.json()

    # The result is keyed by page id; a single title yields a single entry.
    # Tolerate an unexpected payload instead of raising KeyError/StopIteration.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

# Fetch the article text that is indexed below.
full_document = get_wikipedia_page("Hayao_Miyazaki")

In [33]:
# Build the index: the single article is split into passages
# (split_documents=True) bounded by max_document_length.
RAG.index(
    collection=[full_document],
    index_name="Miyazaki-123",
    max_document_length=180,
    split_documents=True,
)

This is a behaviour change from RAGatouille 0.8.0 onwards.
This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations.
If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour.
--------------------


[Nov 22, 16:51:05] #> Creating directory .ragatouille/colbert/indexes/Miyazaki-123 


[Nov 22, 16:51:09] [0] 		 #> Encoding 124 passages..


  self.scaler = torch.cuda.amp.GradScaler()
  super().__init__(
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
100%|██████████| 4/4 [00:03<00:00,  1.27it/s]

[Nov 22, 16:51:12] [0] 		 avg_doclen_est = 132.18548583984375 	 len(local_sample) = 124
[Nov 22, 16:51:12] [0] 		 Creating 2,048 partitions.
[Nov 22, 16:51:12] [0] 		 *Estimated* 16,391 embeddings.
[Nov 22, 16:51:12] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/Miyazaki-123/plan.json ..





used 13 iterations (0.5391s) to cluster 15572 items into 2048 clusters
[0.036, 0.038, 0.039, 0.033, 0.031, 0.035, 0.034, 0.034, 0.033, 0.033, 0.034, 0.035, 0.031, 0.035, 0.033, 0.035, 0.03, 0.032, 0.033, 0.036, 0.035, 0.034, 0.033, 0.035, 0.034, 0.031, 0.037, 0.032, 0.033, 0.035, 0.034, 0.036, 0.036, 0.033, 0.034, 0.03, 0.033, 0.032, 0.034, 0.039, 0.035, 0.037, 0.034, 0.032, 0.033, 0.033, 0.035, 0.036, 0.036, 0.031, 0.033, 0.033, 0.032, 0.033, 0.034, 0.033, 0.036, 0.035, 0.04, 0.031, 0.032, 0.032, 0.035, 0.032, 0.037, 0.034, 0.033, 0.036, 0.033, 0.031, 0.035, 0.033, 0.033, 0.035, 0.034, 0.032, 0.033, 0.037, 0.032, 0.033, 0.035, 0.036, 0.031, 0.036, 0.031, 0.034, 0.035, 0.035, 0.032, 0.037, 0.034, 0.036, 0.033, 0.035, 0.035, 0.035, 0.036, 0.033, 0.036, 0.034, 0.037, 0.039, 0.036, 0.035, 0.037, 0.033, 0.034, 0.032, 0.035, 0.031, 0.035, 0.035, 0.034, 0.031, 0.035, 0.034, 0.034, 0.034, 0.036, 0.035, 0.03, 0.032, 0.033, 0.035, 0.032, 0.033, 0.034, 0.035]


0it [00:00, ?it/s]

[Nov 22, 16:51:13] [0] 		 #> Encoding 124 passages..


100%|██████████| 4/4 [00:02<00:00,  1.53it/s]
1it [00:02,  2.66s/it]
100%|██████████| 1/1 [00:00<00:00, 1651.95it/s]

[Nov 22, 16:51:16] #> Optimizing IVF to store map from centroids to list of pids..
[Nov 22, 16:51:16] #> Building the emb2pid mapping..
[Nov 22, 16:51:16] len(emb2pid) = 16391



100%|██████████| 2048/2048 [00:00<00:00, 196148.58it/s]

[Nov 22, 16:51:16] #> Saved optimized IVF to .ragatouille/colbert/indexes/Miyazaki-123/ivf.pid.pt
Done indexing!





'.ragatouille/colbert/indexes/Miyazaki-123'

In [36]:
# Search the index and show the top 3 passages for the query.
results = RAG.search(query="What animation studio did Miyazaki found?", k=3)
results

[{'content': '=== Studio Ghibli ===\n\n\n==== Foundation and Laputa (1985–1987) ====\n\nFollowing the success of Nausicaä of the Valley of the Wind, Miyazaki and Takahata founded the animation production company Studio Ghibli on June 15, 1985, as a subsidiary of Tokuma Shoten, with offices in Kichijōji designed by Miyazaki. The studio\'s name had been registered a year earlier; Miyazaki named it after the nickname of the Caproni Ca.309 aircraft, meaning "a hot wind that blows in the desert" in Italian.',
  'score': 25.73267364501953,
  'rank': 1,
  'document_id': 'c39b400c-b67d-4fa7-b303-b9a7ea91925b',
  'passage_id': 42},
 {'content': 'Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao; [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. He co-founded Studio Ghibli and serves as its honorary chairman. Throughout his career, Miyazaki has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is w

In [35]:
# Expose the ColBERT index as a LangChain retriever (k=3) and run the same query.
retriever = RAG.as_langchain_retriever(k=3)
retriever.invoke("What animation studio did Miyazaki found?")

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


[Document(metadata={}, page_content='=== Studio Ghibli ===\n\n\n==== Foundation and Laputa (1985–1987) ====\n\nFollowing the success of Nausicaä of the Valley of the Wind, Miyazaki and Takahata founded the animation production company Studio Ghibli on June 15, 1985, as a subsidiary of Tokuma Shoten, with offices in Kichijōji designed by Miyazaki. The studio\'s name had been registered a year earlier; Miyazaki named it after the nickname of the Caproni Ca.309 aircraft, meaning "a hot wind that blows in the desert" in Italian.'),
 Document(metadata={}, page_content='Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao; [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. He co-founded Studio Ghibli and serves as its honorary chairman. Throughout his career, Miyazaki has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history o