# 4.4.2 召回内容上下文扩充

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
# Read every source file and gather the loaded documents into one flat list.
loaders = [TextLoader(path) for path in ("./file1.txt", "./file2.txt")]
docs = [doc for loader in loaders for doc in loader.load()]

# Two splitters at different granularities (chunk_size=1000 vs chunk_size=100).
# NOTE(review): presumably the child chunks are what gets embedded and searched,
# while the matching parent chunks are what the retriever returns -- confirm
# against the ParentDocumentRetriever documentation.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=100)

# Chroma collection used as the vector store, plus an in-memory docstore.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="split_parents",
)
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Index the documents, then run a sample query.
retriever.add_documents(docs)
retrieved_docs = retriever.get_relevant_documents("西瓜的品种")

# 4.4.3 文本多向量表示

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import uuid
import os
# OpenAI endpoint and key ("xxx" values are placeholders to be filled in).
os.environ.update({"OPENAI_API_BASE": "xxx", "OPENAI_API_KEY": "xxx"})

# Load the source file and cut it into large chunks (chunk_size=10000).
loader = TextLoader("./file.txt")
docs = list(loader.load())
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=0)
docs = text_splitter.split_documents(docs)

vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="full_documents",
)
store = InMemoryStore()

# Metadata key under which each small chunk records the id of its large chunk.
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# One generated id per large chunk.
doc_ids = [str(uuid.uuid4()) for _ in docs]
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    # Split the large chunk further into small chunks ...
    children = child_text_splitter.split_documents([parent_doc])
    # ... and tag every small chunk with the id of the large chunk it came from.
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)

# The vector store holds the small chunks.
retriever.vectorstore.add_documents(sub_docs)
# The docstore holds the mapping from large-chunk id to large-chunk content.
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Searching the vector store directly yields a small chunk.
child_hits = retriever.vectorstore.similarity_search("user query")
print(len(child_hits[0].page_content))
# Going through the retriever yields the large chunk behind the small chunk.
parent_hits = retriever.get_relevant_documents("user query")
print(len(parent_hits[0].page_content))

# 4.4.4 查询内容优化

In [None]:
# Build a sample vectorDB
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
import logging

# Load blog post
# Fetch the blog post that serves as the sample corpus.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Cut the page into chunks (chunk_size=500, no overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
splits = text_splitter.split_documents(data)

# Build a Chroma vector store from the chunks.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(embedding=embedding, documents=splits)

question = "What are the approaches to Task Decomposition?"
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(),
)

# Turn on INFO-level logging for the multi-query retriever module so its
# query logging becomes visible.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

unique_docs = retriever_from_llm.get_relevant_documents(query=question)
# Bare expression: its value is only displayed when run as a notebook cell.
len(unique_docs)

In [None]:
from langchain.chains import HypotheticalDocumentEmbedder 
from langchain.embeddings import OpenAIEmbeddings 
from langchain.llms import OpenAI 
import os 
# Wrap a base embedding model and an LLM in a HypotheticalDocumentEmbedder,
# using the "web_search" prompt key, then embed a sample query with it.
base_embeddings = OpenAIEmbeddings()
llm = OpenAI()
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=llm,
    base_embeddings=base_embeddings,
    prompt_key="web_search",
)
result = embeddings.embed_query("北京在哪儿")

# 4.4.5 召回文本重排序

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor,LLMChainFilter
import os

# OpenAI endpoint and key ("xxx" values are placeholders to be filled in).
os.environ.update({"OPENAI_API_BASE": "xxx", "OPENAI_API_KEY": "xxx"})

# Load the fruit description file and split it into chunks.
documents = TextLoader("./fruit_information.txt").load()
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(documents)

# Base retriever: a FAISS index over the chunks.
retriever = FAISS.from_documents(texts, OpenAIEmbeddings()).as_retriever()
llm = OpenAI(temperature=0)

# Compressor that selects which recalled texts to keep.
# Alternative: LLMChainExtractor.from_llm(llm), a compressor that extracts
# key sentences instead.
compressor = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

compressed_docs = compression_retriever.get_relevant_documents("介绍一下葡萄")
for kept_doc in compressed_docs:
    print(kept_doc.page_content)

# 4.4.6 多检索器融合

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import os

# Tiny in-memory corpus shared by both retrievers.
doc_list = ["我喜欢吃西瓜", "我喜欢吃葡萄", "葡萄和西瓜都是我喜欢的水果"]

# BM25 retriever, with its `k` attribute set to 2.
bm25 = BM25Retriever.from_texts(doc_list)
bm25.k = 2

# Retriever backed by the OpenAI embedding model and a FAISS index, k=2.
embedding = OpenAIEmbeddings(openai_api_key="xxx", openai_api_base="xxx")
faiss_vectorstore = FAISS.from_texts(doc_list, embedding)
embedding_retriever = faiss_vectorstore.as_retriever(search_kwargs=dict(k=2))

# Fuse the two retrievers.
# NOTE(review): the weights presumably pair up positionally with `retrievers`
# (0.2 for BM25, 0.8 for the embedding retriever) -- confirm.
ensemble_retriever = EnsembleRetriever(
    weights=[0.2, 0.8],
    retrievers=[bm25, embedding_retriever],
)
docs = ensemble_retriever.get_relevant_documents("葡萄")
print(docs)

# 4.4.7 结合元数据召回

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import os
from langchain.docstore.document import Document
# Three price records, each tagged with the shop it belongs to.
doc_list = [
    Document(page_content=text, metadata={"shop_name": shop})
    for text, shop in (
        ("葡萄价格：10元/kg", "水果店1"),
        ("西瓜价格：5元/kg", "水果店1"),
        ("葡萄价格：15元/kg", "水果店2"),
    )
]

# Embedding model used by both retrievers below.
embedding = OpenAIEmbeddings(openai_api_key="xxx", openai_api_base="xxx")

# Retrieval WITH a metadata filter on shop_name.
faiss_vectorstore_filter = FAISS.from_documents(doc_list, embedding)
faiss_retriever_filter = faiss_vectorstore_filter.as_retriever(
    search_kwargs={
        "k": 2,
        "filter": {"shop_name": "水果店1"},
    }
)
print(faiss_retriever_filter.get_relevant_documents("葡萄价格"))
# Output ===
# [Document(page_content='葡萄价格：10元/kg', metadata={'shop_name': '水果店1'}),
#  Document(page_content='西瓜价格：5元/kg', metadata={'shop_name': '水果店1'})]
# ===

# Retrieval WITHOUT a metadata filter.
faiss_vectorstore = FAISS.from_documents(doc_list, embedding)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=dict(k=2))
print(faiss_retriever.get_relevant_documents("葡萄价格"))
# Output ===
# [Document(page_content='葡萄价格：15元/kg', metadata={'shop_name': '水果店2'}),
#  Document(page_content='葡萄价格：10元/kg', metadata={'shop_name': '水果店1'})]
# ===

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger
from langchain.schema import Document
import json
# Schema of the metadata to extract: both fields are strings and both are required.
schema = dict(
    properties=dict(
        fruit=dict(type="string"),
        shop_name=dict(type="string", description="the name of fruit shop"),
    ),
    required=["fruit", "shop_name"],
)

llm = ChatOpenAI(temperature=0, openai_api_key="xxx", openai_api_base="xxx")

# Document transformer built from the schema and the chat model.
gen_tagger = create_metadata_tagger(llm=llm, metadata_schema=schema)

documents = [Document(page_content="味道美水果店卖葡萄，3块钱一斤")]
enhanced_documents = gen_tagger.transform_documents(documents)

for tagged_doc in enhanced_documents:
    print(tagged_doc.metadata)
# Output ===
# {'fruit': '葡萄', 'shop_name': '味道美水果店'}
# ===

In [None]:
import faiss
from langchain.embeddings import OpenAIEmbeddings
from datetime import datetime, timedelta
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
import os

# OpenAI endpoint and key ("xxx" values are placeholders to be filled in).
os.environ["OPENAI_API_BASE"] = "xxx"
os.environ["OPENAI_API_KEY"] = "xxx"

# Embedding model.
embeddings_model = OpenAIEmbeddings()

# Build an empty FAISS vector store backed by a flat L2 index.
# NOTE(review): embedding_size must equal the length of the vectors returned by
# embeddings_model; 1536 is presumably the default OpenAI embedding size -- confirm.
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

# Time-decay retriever that returns a single document (k=1).
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=0.999, k=1)

# The grape document is stamped as last accessed one day ago; the watermelon
# document carries no explicit timestamp.
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents([Document(page_content="葡萄价格3元/斤", metadata={"last_accessed_at": yesterday})])
retriever.add_documents([Document(page_content="西瓜价格2元/斤")])

# Per the sample output recorded after this cell, the watermelon document is
# returned even though the query asks about grapes: with decay_rate=0.999 the
# grape document, last accessed a day earlier, is mostly forgotten.
docs = retriever.get_relevant_documents("查询葡萄价格")
print(docs)
#输出===
#[Document(page_content='西瓜价格2元/斤', metadata={'last_accessed_at': datetime.datetime(2023, 6, 18, 15, 19, 54, 840124), 'created_at': datetime.datetime(2023, 6, 18, 15, 19, 53, 640318), 'buffer_idx': 1})]
#===