# 加载向量数据库

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

from tongyi.embeddings import TongyiEmbeddings
from langchain_community.vectorstores import FAISS

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
raw_documents = TextLoader('./examples/rag.txt').load()
# The text is cut on triple newlines. chunk_size=50 is smaller than almost every
# section, so the splitter logs a "Created a chunk of size ..." warning for each
# oversized section (see the output of this cell).
text_splitter = CharacterTextSplitter(separator='\n\n\n', chunk_size=50, chunk_overlap=4)
documents = text_splitter.split_documents(raw_documents)
# Embed every chunk with TongyiEmbeddings and index the vectors in FAISS.
db = FAISS.from_documents(documents, TongyiEmbeddings())

Created a chunk of size 109, which is longer than the specified 50
Created a chunk of size 65, which is longer than the specified 50
Created a chunk of size 143, which is longer than the specified 50
Created a chunk of size 833, which is longer than the specified 50
Created a chunk of size 263, which is longer than the specified 50
Created a chunk of size 304, which is longer than the specified 50
Created a chunk of size 609, which is longer than the specified 50
Created a chunk of size 174, which is longer than the specified 50
Created a chunk of size 449, which is longer than the specified 50
Created a chunk of size 280, which is longer than the specified 50
Created a chunk of size 325, which is longer than the specified 50
Created a chunk of size 560, which is longer than the specified 50
Created a chunk of size 318, which is longer than the specified 50
Created a chunk of size 68, which is longer than the specified 50


# LCEL

In [7]:
# Expose the FAISS index as a retriever that does a similarity search and
# returns the top 2 chunks (k=2).
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Query used throughout the notebook: "where can I find out my gaokao score".
retriever.invoke("哪里可以了解高考成绩")

[Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'}),
 Document(page_content='2024年高考是黑龙江、甘肃、吉林、安徽、江西、贵州、广西7个省份（中国第四批高考综合改革省份）的第一届落地实施的新高考。 [3]', metadata={'source': './examples/rag.txt'})]

# MultiQueryRetriever

In [13]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.chat_models import ChatTongyi

# Chat model shared by the LLM-backed retrievers in the following cells.
llm = ChatTongyi()

# Wrap the base retriever with MultiQueryRetriever, driven by `llm`.
# include_original=True: presumably also runs the original question next to
# the LLM-generated variants -- confirm against the LangChain docs.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=llm, include_original=True
)

# The recorded output holds 3 documents, versus 2 from the base retriever.
unique_docs = retriever_from_llm.invoke("哪里可以了解高考成绩")
unique_docs

[Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'}),
 Document(page_content='三、高校招生章程有什么作用，如何查询？\n高校招生章程由学校依据相关法律规定和国家招生政策制定，是学校开展招生工作的依据。考生在填报志愿前，应仔细查阅拟报考高校的招生章程，全面了解高校招生办法和相关招生要求。\n主要查询途径有：中国高等教育学生信息网的“阳光高考”信息平台（https://gaokao.chsi.com.cn）；各高校官方招生网站等。', metadata={'source': './examples/rag.txt'}),
 Document(page_content='2024年高考是黑龙江、甘肃、吉林、安徽、江西、贵州、广西7个省份（中国第四批高考综合改革省份）的第一届落地实施的新高考。 [3]', metadata={'source': './examples/rag.txt'})]

In [9]:
# Number of documents returned by the multi-query retriever.
len(unique_docs)

3

# Contextual compression
## LLMChainExtractor

In [14]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Build an LLM-based extractor (reuses `llm` from the MultiQueryRetriever cell)
# and apply it to whatever the base retriever returns.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# The recorded output is a single document containing only an excerpt of the
# original chunk.
compressed_docs = compression_retriever.invoke(
    "哪里可以了解高考成绩"
)
compressed_docs

[Document(page_content='各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。', metadata={'source': './examples/rag.txt'})]

## LLMChainFilter

In [15]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Build an LLM-based filter and use it as the compressor in place of the
# extractor; `compression_retriever` is rebound here.
_filter = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever
)

# The recorded output is one full, unmodified chunk: documents are kept or
# dropped as a whole rather than rewritten.
compressed_docs = compression_retriever.invoke(
    "哪里可以了解高考成绩"
)
compressed_docs

[Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'})]

## EmbeddingsFilter

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Filter retrieved chunks by embedding similarity to the query instead of
# calling the LLM once per document.
embeddings = TongyiEmbeddings()
# NOTE(review): the 0.76 threshold looks carried over from an English example;
# confirm it is a sensible cut-off for Tongyi embeddings.
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=embeddings_filter, base_retriever=retriever
)

# Use the same gaokao question as every other section: the indexed corpus is
# ./examples/rag.txt, so an unrelated English query would mostly be filtered out.
compressed_docs = compression_retriever.invoke(
    "哪里可以了解高考成绩"
)
compressed_docs

## DocumentCompressorPipeline

In [None]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter

# Chain two embedding-based transformers: first drop near-duplicate chunks,
# then keep only chunks similar enough to the query.
# `embeddings` is the TongyiEmbeddings instance created in the EmbeddingsFilter cell.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

# Query the gaokao corpus with the notebook's standard question (the previous
# English query had nothing to do with the indexed text).
compressed_docs = compression_retriever.invoke(
    "哪里可以了解高考成绩"
)
# Display the result, as the other compression cells do.
compressed_docs

# Ensemble Retriever

In [17]:
from langchain_community.retrievers import BM25Retriever

# Build a keyword-based (BM25) retriever over the same chunks that were
# indexed in the vector store.
doc_list = [doc.page_content for doc in documents]
bm25_retriever = BM25Retriever.from_texts(
    doc_list,
    # One fresh dict per text: `[{...}] * n` repeats a single dict object, so
    # the resulting documents could end up sharing (and co-mutating) metadata.
    metadatas=[{"source": "BM25"} for _ in doc_list],
)
# Return the 2 best keyword matches, mirroring k=2 on the vector retriever.
# NOTE(review): BM25Retriever's default tokenisation is presumably
# whitespace-based, which may rank unsegmented Chinese text poorly -- consider
# supplying a Chinese-aware preprocess function.
bm25_retriever.k = 2

In [19]:
from langchain.retrievers import EnsembleRetriever

# Initialize the ensemble retriever: combine the BM25 retriever and the
# current `retriever` with equal weights (0.5 each).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever], weights=[0.5, 0.5]
)

# In the recorded output, BM25 hits carry metadata source 'BM25' and
# vector-store hits carry the file path. This rebinds `docs`.
docs = ensemble_retriever.invoke("哪里可以了解高考成绩")
docs

[Document(page_content='十、录取通知书何时能收到？\n高校一般会在录取结束后一周左右向录取新生寄发录取通知书。若考生在省级招生考试机构或高校官方网站上查询到了录取结果，一直没有收到录取通知书，可及时联系录取高校公布的招生咨询电话查询本人录取通知书邮寄情况。', metadata={'source': 'BM25'}),
 Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'}),
 Document(page_content='九、录取工作采用什么方式，一般什么时间开始？\n高校招生实行计算机远程网上录取，各省（区、市）录取工作一般于7月上旬开始，8月底之前结束。', metadata={'source': 'BM25'}),
 Document(page_content='2024年高考是黑龙江、甘肃、吉林、安徽、江西、贵州、广西7个省份（中国第四批高考综合改革省份）的第一届落地实施的新高考。 [3]', metadata={'source': './examples/rag.txt'})]

# MultiVector Retriever
## Smaller chunks

In [2]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore

import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Metadata key under which each child chunk stores the id of its parent chunk.
id_key = "doc_id"
# One random UUID per parent chunk, in the same order as `documents`.
doc_ids = [str(uuid.uuid4()) for _ in documents]

# Splitter that cuts every parent chunk into smaller child chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, documents):
    children = child_text_splitter.split_documents([parent_doc])
    for child in children:
        # Tag the child so it can be mapped back to its parent later.
        child.metadata[id_key] = parent_id
    sub_docs += children

# (number of parent chunks, number of child chunks)
len(documents), len(sub_docs)

(15, 70)

In [3]:
# The vectorstore to use to index the child chunks
vectorstore = FAISS.from_documents(sub_docs, TongyiEmbeddings())
# The storage layer for the parent documents
store = InMemoryByteStore()
# Search runs over the child chunks in `vectorstore`; `id_key` names the
# metadata field that links a child to its parent. This rebinds `retriever`.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
# Register the parent chunks under the ids that the child chunks reference.
retriever.docstore.mset(list(zip(doc_ids, documents)))

In [4]:
# Vectorstore alone retrieves the small chunks; [0] shows the top hit, which
# carries the `doc_id` of its parent in its metadata.
retriever.vectorstore.similarity_search("哪里可以了解高考成绩")[0]

Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？', metadata={'source': './examples/rag.txt', 'doc_id': 'd855063c-ad52-4a09-a304-aa9d2b2ebd17'})

In [6]:
# Retriever returns larger chunks: the top result is the full parent chunk
# rather than the short child chunk that matched.
retriever.invoke("哪里可以了解高考成绩")[0]

Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'})

In [27]:
# Preview the first five child chunks; children of the same parent share a `doc_id`.
sub_docs[:5]

[Document(page_content='2024年普通高等学校招生全国统一考试（简称：2024年全国高考），是中华人民共和国合格的高中毕业生或具有同等学力的考生参加的选拔性考试', metadata={'source': './examples/rag.txt', 'doc_id': 'ffccfda6-67bb-4454-b4a1-c81c011f1512'}),
 Document(page_content='[1-2]。2024年报名人数1342万人，比2023年增加51万人 [21]。', metadata={'source': './examples/rag.txt', 'doc_id': 'ffccfda6-67bb-4454-b4a1-c81c011f1512'}),
 Document(page_content='2024年高考是黑龙江、甘肃、吉林、安徽、江西、贵州、广西7个省份（中国第四批高考综合改革省份）的第一届落地实施的新高考。 [3]', metadata={'source': './examples/rag.txt', 'doc_id': 'f64eaa30-9981-4433-96ed-2c454a9249ce'}),
 Document(page_content='2024年高考全国统考于2024年6月7日开始举行，部分省份考试时间为2天，实行新高考的省份为3-4天 [29]。5月31日，2024年高考试卷从北京发往全国', metadata={'source': './examples/rag.txt', 'doc_id': '43267b94-dff0-4524-a210-ad70dcee0ed8'}),
 Document(page_content='[22]。6月5日，2024年高考举报电话已开通，教育部教育考试院的举报电话为：010-62790357 [53] [77]。', metadata={'source': './examples/rag.txt', 'doc_id': '43267b94-dff0-4524-a210-ad70dcee0ed8'})]

## Parent Document Retriever

In [29]:
from langchain.retrievers import ParentDocumentRetriever
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore

# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
# The vectorstore to use to index the child chunks (empty to start).
# Chroma is used here because, per the original author's note, FAISS does not
# support being initialized empty.
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=TongyiEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryStore()
# This rebinds `retriever`, `vectorstore` and `store` from the previous section.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

# Hand the parent chunks to the retriever; ids=None leaves id assignment to it.
retriever.add_documents(documents, ids=None)

In [None]:
# Search the Chroma index directly and show the top hit.
# Note: this rebinds `sub_docs` from the "Smaller chunks" section.
sub_docs = vectorstore.similarity_search("哪里可以了解高考成绩")
sub_docs[0]

In [None]:
# Run the same query through the ParentDocumentRetriever and show the top result.
retrieved_docs = retriever.invoke("哪里可以了解高考成绩")
retrieved_docs[0]

## Summary

In [10]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatTongyi

# Summarisation chain: take a Document's page_content, insert it into a
# "summarise the following content" prompt, call ChatTongyi (max_retries=0)
# and return the reply as a plain string.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("概括以下内容:\n\n{doc}")
    | ChatTongyi(max_retries=0)
    | StrOutputParser()
)

# One summary per chunk, with at most 5 requests in flight at a time.
summaries = chain.batch(documents, {"max_concurrency": 5})
# Show the first two summaries.
summaries[:2]

['2024年全国高考是中国的一项重要考试，用于选拔高中毕业生和具备同等学历的考生，2024年的报名人数达到了1342万人，相比上一年增长了51万人。',
 '2024年，中国有7个省份（黑龙江、甘肃、吉林、安徽、江西、贵州和广西）将首次实施新的高考制度，作为第四批改革省份。']

In [17]:
# Metadata key linking each summary to the chunk it was generated from.
id_key = "doc_id"
# A fresh random UUID for every original chunk, aligned with `documents`.
doc_ids = [str(uuid.uuid4()) for _ in documents]

# Wrap each summary in a Document that points back to its source chunk.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Index the summaries (not the original text) in the vector store.
vectorstore = FAISS.from_documents(summary_docs, TongyiEmbeddings())
# Byte store holding the original chunks.
store = InMemoryByteStore()
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)
# Register the original chunks under the ids referenced by the summaries.
retriever.docstore.mset(list(zip(doc_ids, documents)))

In [18]:
# Searching the vector store directly returns the indexed summary (top hit shown).
sub_docs = vectorstore.similarity_search("哪里可以了解高考成绩")
sub_docs[0]

Document(page_content='高考相关信息可以在各省级教育行政部门或招生考试机构的官方网站、微信公众号等权威渠道获取，包括成绩公布时间、查询方式、志愿填报时间、高校招生计划和历年录取参考等。考生和家长需密切关注官方发布的信息。志愿填报至关重要，考生必须遵循省级招生考试机构的规定。教育部已与相关部门合作确保官方渠道的权威性，提醒大家在查询时要识别官方标识，避免相信非官方的不实信息。', metadata={'doc_id': 'd5f6fbc3-3425-4c3e-914a-f10669c9ae53'})

In [19]:
# Going through the retriever returns the original chunk behind the matching summary.
retrieved_docs = retriever.invoke("哪里可以了解高考成绩")
retrieved_docs[0]

Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'})

## Hypothetical Queries

In [31]:
from langchain_core.messages import AIMessage
from langchain_core.exceptions import OutputParserException

def custom_parse(ai_message: AIMessage) -> list[str]:
    """Split the model's reply into a list of individual questions.

    The prompt asks the model to separate questions with a blank line, so a
    double newline is the preferred delimiter; a single newline is accepted as
    a fallback for replies that ignore that instruction.

    Args:
        ai_message: Chat model reply whose ``content`` holds the questions.

    Returns:
        The non-empty, whitespace-stripped questions in their original order.

    Raises:
        OutputParserException: If the reply contains no newline at all, or if
            splitting it yields no non-empty question.
    """
    content = ai_message.content
    if '\n\n' in content:
        separator = '\n\n'
    elif '\n' in content:
        separator = '\n'
    else:
        raise OutputParserException("Badly formed question!")

    # Leading/trailing separators and runs of blank lines would otherwise leave
    # empty strings behind, which would later be embedded as bogus questions.
    questions = [part.strip() for part in content.split(separator) if part.strip()]
    if not questions:
        raise OutputParserException("Badly formed question!")
    return questions

# Question-generation chain: take a Document's page_content, ask ChatTongyi
# (max_retries=0) for three questions separated by blank lines, then split the
# reply into a list with custom_parse.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("为下面内容生成3个合适的提问问题：\n\n{doc}\n\n#限制\n生成的3个问题使用两个换行符，即```\n\n```符号进行隔开")
    | ChatTongyi(max_retries=0)
    | custom_parse
)

# One list of questions per chunk, with at most 5 requests in flight at a time.
hypothetical_questions = chain.batch(documents, {"max_concurrency": 5})
# Show the questions generated for the first chunk.
hypothetical_questions[0]

['1. 2024年全国高考的全称是什么？',
 '2. 与2023年相比，2024年全国高考的报名人数有何变化？',
 '3. 能否提供2023年全国高考的报名人数数据作为对比？']

In [33]:
# Metadata key linking each generated question to its source chunk.
id_key = "doc_id"
# A fresh random UUID for every original chunk, aligned with `documents`.
doc_ids = [str(uuid.uuid4()) for _ in documents]

# Flatten the per-chunk question lists into Documents that point at their chunk.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_id})
    for doc_id, questions in zip(doc_ids, hypothetical_questions)
    for question in questions
]

# Index the generated questions (not the original text) in the vector store.
vectorstore = FAISS.from_documents(question_docs, TongyiEmbeddings())
# Byte store holding the original chunks.
store = InMemoryByteStore()
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)
# Register the original chunks under the ids referenced by the questions.
retriever.docstore.mset(list(zip(doc_ids, documents)))

In [34]:
# Searching the vector store directly returns the matching generated question.
sub_docs = vectorstore.similarity_search("哪里可以了解高考成绩")
sub_docs[0]

Document(page_content='1. 从哪里可以获得高考成绩查询的具体时间和方式，以及志愿填报的详细指导？', metadata={'doc_id': '120780d1-f9f2-4ee4-ac03-d806343878b8'})

In [35]:
# Going through the retriever returns the original chunk behind the matching question.
retrieved_docs = retriever.invoke("哪里可以了解高考成绩")
retrieved_docs[0]

Document(page_content='一、在哪里可以了解高考成绩、志愿填报时间和方式、各高校招生计划、往年录取参考等志愿填报权威信息？\n各省级教育行政部门或招生考试机构官方网站、微信公众号等权威渠道都会公布今年高考各阶段工作时间安排，包括高考成绩公布时间和查询方式、志愿填报时间，以及今年各高校招生计划、往年录取情况参考等权威信息。考生和家长要及时关注本地官方权威渠道发布的消息内容。\n考生高考志愿是高校录取的重要依据，请广大考生务必按照省级招生考试机构相关要求按时完成志愿填报。前期，教育部已会同有关部门协调互联网平台对省级招生考试机构和高校的官方网站、微信公众号等进行了权威标识，请广大考生在信息查询时认准官方权威渠道，切勿轻信网络不实信息。', metadata={'source': './examples/rag.txt'})

## Self-querying

In [None]:
from langchain_chroma import Chroma
from langchain_core.documents import Document

# Toy movie corpus for the self-query demo: each Document pairs a one-line plot
# summary with metadata (always `year`; `director`, `genre` and `rating` are
# present only on some entries). This rebinds `docs`.
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]
# Embed the movie documents with TongyiEmbeddings and index them in Chroma.
# This rebinds `vectorstore`.
vectorstore = Chroma.from_documents(docs, TongyiEmbeddings())

In [2]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Describe each metadata field (name, natural-language description, type) so
# that the LLM knows which attributes it may filter on.
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="A 1-10 rating for the movie", type="float"
    ),
]
# Short description of what each document's page_content holds.
document_content_description = "Brief summary of a movie"
llm = ChatTongyi()
# Build the self-querying retriever over the movie vector store from the LLM
# and the two descriptions above. This rebinds `retriever`.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)

In [None]:
# This example only specifies a filter (rating above 8.5) and no content query.
#
# Expected result, kept as comments so that the retriever call stays the cell's
# last expression and its real output is displayed (a trailing string literal
# would be shown instead of the retrieved documents):
#   [Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'year': 1979}),
#    Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006})]
retriever.invoke("I want to watch a movie rated higher than 8.5")

### 原理

In [3]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain.retrievers.self_query.chroma import ChromaTranslator
from langchain_community.chat_models import ChatTongyi

# Assemble a query-construction chain by hand: prompt -> LLM -> parser.
# Presumably this mirrors what SelfQueryRetriever.from_llm builds internally --
# confirm against the LangChain source.
llm = ChatTongyi()

# The prompt is restricted to the comparators and operators that the Chroma
# translator declares as allowed.
prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
    allowed_comparators=ChromaTranslator.allowed_comparators,
    allowed_operators=ChromaTranslator.allowed_operators
)
# Parses the LLM reply into a StructuredQuery (see the invoke output below).
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | llm | output_parser

In [7]:
# Comparison operators the Chroma translator accepts in filters.
ChromaTranslator.allowed_comparators

[<Comparator.EQ: 'eq'>,
 <Comparator.NE: 'ne'>,
 <Comparator.GT: 'gt'>,
 <Comparator.GTE: 'gte'>,
 <Comparator.LT: 'lt'>,
 <Comparator.LTE: 'lte'>]

In [8]:
# Logical operators the Chroma translator accepts for combining comparisons.
ChromaTranslator.allowed_operators

[<Operator.AND: 'and'>, <Operator.OR: 'or'>]

In [4]:
# Render the query-constructor prompt with a placeholder question to inspect its text.
prompt.format(query="dummy question")

'Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):  name of attribute to apply the comparison to\n- `val` (string): is the comparison value\n\nA logical operation statement takes the form `op(statement1, statement2, ...)`:\n- `op` (and | or): logic

In [6]:
# The recorded output splits the request into a content query ('dinosaurs')
# and a metadata filter (rating > 8.5).
query_constructor.invoke("I want to watch a movie about dinosaurs rated higher than 8.5")

StructuredQuery(query='dinosaurs', filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5), limit=None)

# 2024高考RAG应用
## without rag

In [9]:
from langchain_community.chat_models import ChatTongyi

# Baseline without retrieval: ask the chat model directly. `chat` is reused by
# the RAG chain later in this section.
chat = ChatTongyi()

# "How many people registered for the 2024 gaokao?" -- in the recorded output
# the model says it cannot provide the figure.
chat.invoke('2024年高考报名人数是多少')

AIMessage(content='对不起，我无法提供具体的2024年高考报名人数信息，因为这些数据通常由各省份的教育考试机构或政府部门发布，而且会在高考报名开始前公布。对于这类实时数据，建议你关注当地教育部门或考试院的官方通知，或者在高考报名开始时查询相关公告。', response_metadata={'model_name': 'qwen-turbo', 'finish_reason': 'stop', 'request_id': '2cffd503-04a3-96e6-bed6-efd912311105', 'token_usage': {'input_tokens': 16, 'output_tokens': 67, 'total_tokens': 83}}, id='run-293addeb-4aa2-4ea9-b53a-861483c0114c-0')

In [10]:
# "When is gaokao registration in Guangdong for 2024?" -- the recorded answer
# is generic rather than a concrete date.
chat.invoke('2024年高考，广东的报名时间是什么时候')

AIMessage(content='高考报名时间每年可能会有所变动，具体以官方发布的通知为准。一般来说，广东省的高考报名时间通常在每年的11月份进行，持续一周左右。建议你关注广东省教育考试院或当地教育局的官方网站，他们会发布最准确的高考报名通知和时间安排。同时，也要注意报名截止日期，不要错过报名时间。', response_metadata={'model_name': 'qwen-turbo', 'finish_reason': 'stop', 'request_id': 'a2e34ac6-4188-9626-a93d-cfea2347401d', 'token_usage': {'input_tokens': 20, 'output_tokens': 74, 'total_tokens': 94}}, id='run-5a200266-3e07-4271-9f7e-a74f4b7156d9-0')

## with rag

In [11]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

from tongyi.embeddings import TongyiEmbeddings
from langchain_community.vectorstores import FAISS

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
# (Same steps as the first cell of the notebook, repeated for the RAG application.)
raw_documents = TextLoader('./examples/rag.txt').load()
# The text is cut on triple newlines. chunk_size=50 is smaller than almost every
# section, so the splitter logs a "Created a chunk of size ..." warning for each
# oversized section (see the output of this cell).
text_splitter = CharacterTextSplitter(separator='\n\n\n', chunk_size=50, chunk_overlap=4)
documents = text_splitter.split_documents(raw_documents)
# Embed every chunk with TongyiEmbeddings and index the vectors in FAISS.
db = FAISS.from_documents(documents, TongyiEmbeddings())

Created a chunk of size 109, which is longer than the specified 50
Created a chunk of size 65, which is longer than the specified 50
Created a chunk of size 143, which is longer than the specified 50
Created a chunk of size 833, which is longer than the specified 50
Created a chunk of size 263, which is longer than the specified 50
Created a chunk of size 304, which is longer than the specified 50
Created a chunk of size 609, which is longer than the specified 50
Created a chunk of size 174, which is longer than the specified 50
Created a chunk of size 449, which is longer than the specified 50
Created a chunk of size 280, which is longer than the specified 50
Created a chunk of size 325, which is longer than the specified 50
Created a chunk of size 560, which is longer than the specified 50
Created a chunk of size 318, which is longer than the specified 50
Created a chunk of size 68, which is longer than the specified 50


In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import AIMessage
from langchain_core.exceptions import OutputParserException

def custom_parse(ai_message: AIMessage) -> list[str]:
    """Split the model's reply into a list of individual questions.

    The prompt asks the model to separate questions with a blank line, so a
    double newline is the preferred delimiter; a single newline is accepted as
    a fallback for replies that ignore that instruction.

    Args:
        ai_message: Chat model reply whose ``content`` holds the questions.

    Returns:
        The non-empty, whitespace-stripped questions in their original order.

    Raises:
        OutputParserException: If the reply contains no newline at all, or if
            splitting it yields no non-empty question.
    """
    content = ai_message.content
    if '\n\n' in content:
        separator = '\n\n'
    elif '\n' in content:
        separator = '\n'
    else:
        raise OutputParserException("Badly formed question!")

    # Leading/trailing separators and runs of blank lines would otherwise leave
    # empty strings behind, which would later be embedded as bogus questions.
    questions = [part.strip() for part in content.split(separator) if part.strip()]
    if not questions:
        raise OutputParserException("Badly formed question!")
    return questions

# Question-generation chain: take a Document's page_content, ask ChatTongyi
# (max_retries=0) for three questions separated by blank lines, then split the
# reply into a list with custom_parse.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("为下面内容生成3个合适的提问问题：\n\n{doc}\n\n#限制\n生成的3个问题使用两个换行符，即```\n\n```符号进行隔开")
    | ChatTongyi(max_retries=0)
    | custom_parse
)

# One list of questions per chunk, with at most 5 requests in flight at a time.
hypothetical_questions = chain.batch(documents, {"max_concurrency": 5})
# Show the questions generated for the first chunk.
hypothetical_questions[0]

['1. 2024年全国高考的全称是什么？',
 '2. 与2023年相比，2024年全国高考的报名人数有何变化？',
 '3. 能否提供2023年全国高考的报名人数数据作为对比？']

In [17]:
import uuid
from langchain_core.documents import Document
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Metadata key linking each generated question to its source chunk.
id_key = "doc_id"
# A fresh random UUID for every original chunk, aligned with `documents`.
doc_ids = [str(uuid.uuid4()) for _ in documents]

# Flatten the per-chunk question lists into Documents that point at their chunk.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_id})
    for doc_id, questions in zip(doc_ids, hypothetical_questions)
    for question in questions
]

# Index the generated questions (not the original text) in the vector store.
vectorstore = FAISS.from_documents(question_docs, TongyiEmbeddings())
# Byte store holding the original chunks.
store = InMemoryByteStore()
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)
# Register the original chunks under the ids referenced by the questions.
retriever.docstore.mset(list(zip(doc_ids, documents)))

In [18]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Answer prompt: the retrieved text is placed in {context} (inside a fenced
# block) as the knowledge base, followed by the user's question in {query}.
template = """回答用户的问题，下面的内容可以作为你的知识依据：
```
{context}
```

用户的问题：{query}
"""
# This rebinds `prompt` from the self-query section.
prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Concatenate the page contents of `docs`, separated by one blank line."""
    contents = (document.page_content for document in docs)
    return "\n\n".join(contents)


# RAG chain: the incoming question is passed through unchanged as {query} and
# also sent to the retriever, whose documents are joined by format_docs into
# {context}; the filled prompt goes to `chat` (the ChatTongyi instance from the
# "without rag" cell) and the reply is returned as a plain string.
rag_chain = (
    {"context": retriever | format_docs, "query": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)

# Same question as in the "without rag" baseline; the recorded answer now
# gives a concrete number.
rag_chain.invoke("2024年高考报名人数是多少")

'2024年全国高考的报名人数达到1342万人。'

In [19]:
# Second baseline question (Guangdong registration dates), now answered with retrieval.
rag_chain.invoke("2024年高考，广东的报名时间是什么时候")

'2024年广东的高考报名时间是2023年11月1日至10日。'