In [127]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.schema import Document

from abc import ABC, abstractmethod
from typing import List

## What are Indexes

Indexes refer to ways to structure `documents` so that LLMs can best interact with them. The most common way that indexes are used in chains is in a `retrieval` step. This step refers to taking a user's query and returning the most relevant documents.

Most of the time when we talk about indexes and retrieval we are talking about indexing and retrieving `unstructured data`. 

In [3]:
## The `BaseRetriever` interface is as simple as the following class

class BaseRetriever(ABC):
    """Abstract interface for retrievers.

    Subclasses must implement `get_relevant_documents`. Because that method is
    marked `@abstractmethod` on an `ABC`, a subclass that does not override it
    cannot be instantiated.
    """

    @abstractmethod
    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the documents relevant to a query.

        Args:
            query: The string to find relevant documents for.

        Returns:
            A list of `Document` objects relevant to `query`.
        """

where the `get_relevant_documents` method can be implemented as we like.

The main type of Retriever is a `Vectorstore` retriever. To understand what this type of retriever is, we first need to look at Vectorstores in more detail. To showcase Vectorstores, we are going to create a simple `question answering` system over a document.

QA over a document consists of the following steps:
1. Create an `Index`.
2. Create a `Retriever` from that Index.
3. Create a QA `Chain`.
4. `Ask` Questions.

In [34]:
## Loading the document into LangChain

# Configure a loader for the local text file, decoded as UTF-8. The documents
# themselves are obtained from it by the `loader.load()` call in the splitting cell.
loader = TextLoader("state_of_the_union.txt", encoding="utf-8")
# Display the loader object as the cell output.
loader

<langchain.document_loaders.text.TextLoader at 0x7f06703e7f40>

In [39]:
## Splitting Text into Chunks

# Splitter configured with a chunk size of 1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# Load the file through the loader and split the loaded documents in one step.
texts = text_splitter.split_documents(loader.load())

# Sanity check: show the container type and the type of its first element.
type(texts), type(texts[0])

(list, langchain.schema.Document)

In [36]:
## Setting an Embedding Model

# Read the API key from the local key file instead of hard-coding it in the notebook.
# `with` closes the file handle deterministically, and `.strip()` removes the
# trailing newline that editors commonly append, which would otherwise be
# passed along as part of the key.
with open("openai_api.txt") as key_file:
    openai_api_key = key_file.read().strip()

# Embedding model used later to build the vector store.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [42]:
## Creating a Vectorstore using an Index

# Build a Chroma vector store from the chunks in `texts`, using `embeddings`
# as the embedding model. No persistence location is passed here.
# NOTE(review): presumably this makes the store in-memory only — confirm if it must survive a kernel restart.
db = Chroma.from_documents(texts, embeddings)
# Display the store object as the cell output.
db

<langchain.vectorstores.chroma.Chroma at 0x7f06767dbaf0>

In [None]:
## Performing Queries

# NOTE(review): "Advengers" looks like a typo for "Avengers" — confirm whether the misspelling is intentional.
query = "What is Advengers"
# Ask the vector store for 2 results (k=2) and display the first one.
docs = db.similarity_search(query, k=2)
docs[0]

In [None]:
## Adding text to the Vectorstore

# Add one extra raw string to the existing store; the next cell queries for it.
# NOTE(review): re-running this cell calls `add_texts` again with the same string —
# presumably that stores a duplicate entry; confirm.
db.add_texts(["Ankush went to Princeton College"])

In [None]:
# Query for the sentence added in the previous cell. No `k` is passed, so the
# library's default number of results is used.
query = "Where did Ankush go to college?"
docs = db.similarity_search(query)

# Display every returned document.
docs

You can see more about `Vectorstores` here: https://python.langchain.com/docs/modules/data_connection/vectorstores/

In [43]:
## Creating a Retriever from that Index

# Wrap the vector store in a retriever. No arguments are passed, so the
# defaults apply (the repr below shows search_type='similarity' and empty search_kwargs).
retriever = db.as_retriever()
# Display the retriever object as the cell output.
retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f06767dbaf0>, search_type='similarity', search_kwargs={})

In [46]:
## Creating the QA Chain

# Read the API key from the local key file. `with` closes the file handle
# deterministically, and `.strip()` removes the trailing newline that editors
# commonly append, which would otherwise be passed along as part of the key.
with open("openai_api.txt") as key_file:
    openai_api_key = key_file.read().strip()

# Build a retrieval QA chain over the vector-store retriever.
# chain_type="stuff": per the LangChain documentation, the retrieved documents
# are all inserted into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=openai_api_key),
    chain_type="stuff",
    retriever=retriever,
)

In [47]:
## Asking Questions

# Run the QA chain on a single question; the returned answer string is the cell output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

" The president said that Ketanji Brown Jackson is one of the nation's top legal minds and will continue Justice Breyer's legacy of excellence."