# Step-by-step implementation
Follow these steps to implement multi-representation indexing:
  1. Import necessary modules
  2. Set up the OpenAI API key
  3. Load documents and split text
  4. Generate document summaries with LLM
  5. Index with multi-representations
  6. Retrieve documents based on query


## 1. Import necessary modules


In [0]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
import uuid
from langchain_core.documents import Document

## 2. Set up the OpenAI API key

In [0]:
# Read the key from the environment instead of overwriting it: assigning ""
# to os.environ["OPENAI_API_KEY"] would erase a key that was already exported
# and make this cell fail unconditionally.
# If you prefer to paste a key here, replace the "" fallback below (avoid
# committing a notebook that contains a real key).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # Add your OpenAI API key
if not OPENAI_API_KEY:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")
# Write the validated key back so a pasted fallback value is also visible to
# libraries that read the environment variable.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

## 3. Load documents and split text


In [0]:
# One loader per blog post; the paths are relative, so the text files are
# expected to sit in the current working directory.
loaders = [
    TextLoader(path)
    for path in (
        "blog.langchain.dev_announcing-langsmith_.txt",
        "blog.langchain.dev_automating-web-research_.txt",
    )
]

# Load every file and flatten the per-loader results into a single list.
docs = [doc for loader in loaders for doc in loader.load()]

# Re-split the loaded documents with chunk_size=10000; from here on `docs`
# refers to the resulting chunks rather than the raw files.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

## 4. Generate document summaries with LLM


In [0]:
# Stages of the summarization pipeline, named individually for readability.
doc_to_prompt_vars = {"doc": lambda document: document.page_content}
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document:\n\n{doc}"
)
summary_llm = ChatOpenAI(model="gpt-3.5-turbo", max_retries=0)

# Document -> {"doc": text} -> prompt -> chat model -> plain string.
chain = doc_to_prompt_vars | summary_prompt | summary_llm | StrOutputParser()

In [0]:
# Run the summarization chain over every chunk in one batch call, passing
# max_concurrency=3 in the run config.
summaries = chain.batch(docs, config={"max_concurrency": 3})

## 5. Index with multi-representations


In [0]:
# Metadata field used to tie each summary back to its source chunk.
id_key = "doc_id"

# Byte store handed to the retriever; it lives in memory only.
store = InMemoryByteStore()

# Chroma collection that will hold the embedded summaries.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings(),
)

retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

# One random UUID string per chunk, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [0]:
# Wrap each summary in a Document whose metadata carries the id of the chunk
# at the same position (`doc_ids` and `summaries` are both ordered like `docs`).
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

In [0]:
# Pair every generated id with its full chunk for the document store.
id_doc_pairs = list(zip(doc_ids, docs))

# Summaries go into the vector store; the full chunks go into the docstore
# under the same ids.
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(id_doc_pairs)

## 6. Retrieve documents based on query

In [0]:
# Query the vector store directly. Only the summary documents were added to
# it above, so the hits here are summaries rather than full chunks.
query = "What is LangSmith?"
sub_docs = vectorstore.similarity_search(query=query)
sub_docs[0]

In [0]:
# Run the same query through the multi-vector retriever instead of the raw
# vector store.
# NOTE(review): presumably this matches against the indexed summaries and
# returns the parent chunks stored under the same doc_id; confirm against
# the MultiVectorRetriever documentation.
retrieved_docs = retriever.invoke(query)

In [0]:
# Preview the first 500 characters of the first retrieved document.
retrieved_docs[0].page_content[:500]

In [0]:
# Character count of the first retrieved document's text, useful for
# comparing its size against the summary returned by the direct search.
len(retrieved_docs[0].page_content)