In [1]:
from langchain.retrievers.multi_vector import MultiVectorRetriever

In [7]:
import os
os.environ["OPENAI_API_KEY"]=""

In [2]:
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
loaders = [
    TextLoader("./data/paul_graham_essay.txt"),
    TextLoader("./data/state_of_the_union.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

### Smaller chunks

In [8]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
import uuid

doc_ids = [str(uuid.uuid4()) for _ in docs]

In [9]:
# The splitter to use to create smaller chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [10]:
sub_docs = []
for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
    sub_docs.extend(_sub_docs)

In [15]:
print(sub_docs[1])

page_content='The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking' metadata={'source': './data/paul_graham_essay.txt', 'doc_id': '3f6f82f5-4b66-439f-900e-787548cb42af'}


In [11]:
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [12]:
# Vectorstore alone retrieves the small chunks
retriever.vectorstore.similarity_search("justice breyer")[0]

Document(metadata={'doc_id': '28446e8c-961e-4927-9558-0d85ee02422c', 'source': './data/state_of_the_union.txt'}, page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.')

In [16]:
retriever.get_relevant_documents("justice breyer")[0]

  warn_deprecated(


Document(metadata={'source': './data/state_of_the_union.txt'}, page_content='But in my administration, the watchdogs have been welcomed back. \n\nWe’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.  \n\nAnd tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. \n\nBy the end of this year, the deficit will be down to less than half what it was before I took office.  \n\nThe only president ever to cut the deficit by more than one trillion dollars in a single year. \n\nLowering your costs also means demanding more competition. \n\nI’m a capitalist, but capitalism without competition isn’t capitalism. \n\nIt’s exploitation—and it drives up prices. \n\nWhen corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. \n\nWe see it happening with ocean carriers moving goods in and out of America. \n\n

In [30]:
retriever.vectorstore.similarity_search("justice breyer")#小文档

[Document(metadata={'doc_id': '87fd15c2-5eda-4663-a6e3-038dde012838'}, page_content="President Biden has nominated Judge Ketanji Brown Jackson to serve on the United States Supreme Court, emphasizing her qualifications and broad support. He also emphasizes the need to secure the border and fix the immigration system, providing a pathway to citizenship for various groups. The President calls for advancing liberty and justice by protecting women's rights, including access to healthcare and the right to choose, as well as supporting LGBTQ+ Americans. He also outlines a Unity Agenda for the Nation, focusing on addressing the opioid epidemic, mental health, supporting veterans, and ending cancer. President Biden expresses optimism about America's future and concludes by affirming the strength of the nation and its people."),
 Document(metadata={'doc_id': '9d33281e-122b-48bc-95cf-a36b537ddc97'}, page_content="The document outlines the actions taken by the current administration to combat pan

In [31]:
retriever.get_relevant_documents("justice breyer")#大文档

[Document(metadata={'source': './data/state_of_the_union.txt'}, page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n\nA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  \n\nWe’ve set up joi

In [13]:
# Retriever returns larger chunks
len(retriever.invoke("justice breyer")[0].page_content)

9874

In [14]:
from langchain.retrievers.multi_vector import SearchType

retriever.search_type = SearchType.mmr

len(retriever.invoke("justice breyer")[0].page_content)

9874

In [19]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [20]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever=retriever,memory=memory)

  warn_deprecated(


In [None]:
result=qa({"question": "Why the NATO Alliance was created?"})
result

### Summary

In [22]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [23]:
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)

In [24]:
summaries = chain.batch(docs, {"max_concurrency": 5})

In [25]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [26]:
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

In [27]:
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [28]:
# # We can also add the original chunks to the vectorstore if we so want
# for i, doc in enumerate(docs):
#     doc.metadata[id_key] = doc_ids[i]
# retriever.vectorstore.add_documents(docs)
sub_docs = vectorstore.similarity_search("justice breyer")
sub_docs[0]

Document(metadata={'doc_id': '87fd15c2-5eda-4663-a6e3-038dde012838'}, page_content="President Biden has nominated Judge Ketanji Brown Jackson to serve on the United States Supreme Court, emphasizing her qualifications and broad support. He also emphasizes the need to secure the border and fix the immigration system, providing a pathway to citizenship for various groups. The President calls for advancing liberty and justice by protecting women's rights, including access to healthcare and the right to choose, as well as supporting LGBTQ+ Americans. He also outlines a Unity Agenda for the Nation, focusing on addressing the opioid epidemic, mental health, supporting veterans, and ending cancer. President Biden expresses optimism about America's future and concludes by affirming the strength of the nation and its people.")

In [29]:
retrieved_docs = retriever.invoke("justice breyer")
len(retrieved_docs[0].page_content)

9194

### Hypothetical Queries

In [32]:
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

In [36]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 3 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o-mini").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [37]:
chain.invoke(docs[0])

['What if the author had continued to pursue philosophy instead of switching to AI?',
 'What if the author had access to modern programming tools during their early programming experiences?',
 'What if the author had written a book on programming in a different language instead of Lisp?']

In [38]:
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

In [39]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="hypo-questions", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [40]:
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    question_docs.extend(
        [Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list]
    )

In [44]:
question_docs

[Document(metadata={'doc_id': 'dc34e4cf-4d69-49fb-aaee-241d87d7a425'}, page_content='What if the author had decided to pursue a career in programming instead of switching to artificial intelligence?'),
 Document(metadata={'doc_id': 'dc34e4cf-4d69-49fb-aaee-241d87d7a425'}, page_content="How might the author's perspective on AI have changed if he had access to modern programming languages and tools during his studies?"),
 Document(metadata={'doc_id': 'dc34e4cf-4d69-49fb-aaee-241d87d7a425'}, page_content="What if the author had never been exposed to influential works like 'The Moon is a Harsh Mistress' or Terry Winograd's SHRDLU?"),
 Document(metadata={'doc_id': '29fa0e67-edec-49fe-a070-6a5af0d6884f'}, page_content='What if the author had chosen to pursue a full-time career in computer science instead of art?'),
 Document(metadata={'doc_id': '29fa0e67-edec-49fe-a070-6a5af0d6884f'}, page_content='What if the author had received an acceptance letter from the Accademia di Belli Arti before d

In [41]:
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [42]:
sub_docs = vectorstore.similarity_search("justice breyer")
sub_docs

[Document(metadata={'doc_id': 'f98dcae3-fe0f-4291-9995-a91f3340a4d9'}, page_content="What impact might Judge Ketanji Brown Jackson's nomination have on the future direction of the U.S. Supreme Court?"),
 Document(metadata={'doc_id': 'f98dcae3-fe0f-4291-9995-a91f3340a4d9'}, page_content='How could proposed immigration reforms affect the economy and labor market in the United States?'),
 Document(metadata={'doc_id': '2b6c8e16-c547-42d7-942d-340c67cf52ff'}, page_content="How might the trajectory of the author's career have changed if Robert had agreed to work with him on the new web app project?"),
 Document(metadata={'doc_id': '42a1e762-7157-4012-a701-a69e8204c63f'}, page_content='What might happen to the job market in America if the Bipartisan Infrastructure Law is not implemented?')]

In [43]:
retrieved_docs = retriever.invoke("justice breyer")
len(retrieved_docs[0].page_content)

9194