# LangChain QA

All code comes from the [LangChain docs](https://langchain.readthedocs.io).

In [None]:
# Install the packages this notebook imports or relies on: LangChain (plus the
# community integrations), the OpenAI client, the Chroma vector store, the
# tiktoken tokenizer and the pypdf PDF reader.
!pip install langchain langchain_community openai chromadb tiktoken pypdf

In [None]:
import os
from getpass import getpass

# Only ask for a key when none is configured. Unconditionally assigning an
# empty string would clobber a key already exported in the environment, and
# pasting the key into the cell would store the secret in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [None]:
# Smoke test: create an OpenAI completion model with its default settings and
# send it a single prompt to confirm the API key works.
llm = OpenAI()
# `invoke` is the supported entry point; calling the model object directly
# (`llm("...")`) is deprecated in LangChain. Both return the completion string.
print(llm.invoke("tell me a joke"))

# load_qa_chain

`load_qa_chain` loads a chain that you can use to do QA over a set of documents. Note that it passes ALL of those documents to the language model.

With these PDFs, `chain_type="stuff"` will not work: it puts every document into a single prompt, so the number of tokens exceeds the model's limit. We can try other chain types such as `"map_reduce"` instead.

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain_community.document_loaders import PyPDFDirectoryLoader


### Load one document at a time
# loader = PyPDFLoader("PDF/ai1.pdf")
# documents = loader.load()

### Load multiple documents at a time
# (each entry must be a loader object, not a bare path string, so that
#  `.load()` exists on it)
# loaders = [PyPDFLoader("PDF/ai1.pdf"), PyPDFLoader("PDF/ai2.pdf")]
# documents = []
# for loader in loaders:
#     documents.extend(loader.load())

### Load multiple documents from a directory
directory_path = "PDF/"
loader = PyPDFDirectoryLoader(directory_path)
documents = loader.load()

# Every loaded document is handed to the chain, so "map_reduce" is used here
# rather than "stuff" to stay within the model's token limit.
chain = load_qa_chain(llm=OpenAI(), chain_type="map_reduce")
query = "what is Artificial intelligence?"
# `invoke` replaces the deprecated `chain.run(...)`. It returns a dict, so the
# answer string is read from the chain's "output_text" key to display the same
# value `run` used to return.
chain.invoke({"input_documents": documents, "question": query})["output_text"]

# RetrievalQA 

RetrievalQA chain uses load_qa_chain under the hood. We retrieve the most relevant chunks of text and feed them to the language model.


#### Options: 
- [embeddings](https://python.langchain.com/v0.2/docs/how_to/#embedding-models)
- [TextSplitter](https://python.langchain.com/v0.2/docs/how_to/#text-splitters)
- [VectorStore](https://python.langchain.com/v0.2/docs/how_to/#vector-stores)
- [Retrievers](https://python.langchain.com/v0.2/docs/how_to/#retrievers)
  - [search_type](https://python.langchain.com/v0.1/docs/modules/data_connection/vectorstores/#similarity-search): "similarity" or "mmr"
- [Chain Type](https://api.python.langchain.com/en/latest/chains/langchain.chains.qa_with_sources.loading.load_qa_with_sources_chain.html#langchain.chains.qa_with_sources.loading.load_qa_with_sources_chain): "stuff", "map_reduce", "refine", "map_rerank"


In [None]:
# load document
loader = PyPDFLoader("PDF/ai1.pdf")
documents = loader.load()

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorstore to use as the index
db = Chroma.from_documents(texts, embeddings)

# expose this index in a retriever interface
# (similarity search, returning the 2 best-matching chunks per query)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# create a chain to answer questions; return_source_documents=True keeps the
# retrieved chunks in the result next to the answer
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
query = "what is Artificial intelligence?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict
# is the same.
result = qa.invoke({"query": query})

In [None]:
# Show the chunks the retriever selects for the current query.
# `invoke` replaces the deprecated `get_relevant_documents` and returns the
# same list of documents.
retriever.invoke(query)

In [None]:
# Display the full output dict of the most recent chain call.
result

# VectorstoreIndexCreator

VectorstoreIndexCreator is a wrapper for the above logic. 

Source: 
- https://python.langchain.com/v0.1/docs/modules/chains/
- https://python.langchain.com/v0.2/docs/how_to/#vector-stores

In [None]:
# Bundle the three building blocks (text splitter, embedding model and
# vector store class) into one index builder.
index_creator = VectorstoreIndexCreator(
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
    embedding=OpenAIEmbeddings(),
    vectorstore_cls=Chroma,
)

# Build the index from `loader`, a variable left over from an earlier cell
# (the PyPDFLoader for PDF/ai1.pdf if the cells were run in order).
index = index_creator.from_loaders([loader])

# Ask the same question through the index wrapper.
query = "what is Artificial intelligence?"
index.query(llm=OpenAI(), question=query, chain_type="stuff")

# ConversationalRetrievalChain

ConversationalRetrievalChain = conversation memory + RetrievalQA chain

Allow for passing in chat history which can be used for follow up questions.

Source: https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html


In [None]:
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Read the PDF into LangChain documents.
loader = PyPDFLoader("PDF/ai1.pdf")
documents = loader.load()

# Cut the documents into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in Chroma.
# NOTE(review): this indexes the same PDF as the RetrievalQA cell; if both
# cells run in one session the default Chroma collection may end up holding
# duplicate chunks. Confirm, and pass a distinct collection_name if so.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Wrap the index as a retriever that returns the 2 most similar chunks.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Build the question-answering chain that also accepts a chat history.
qa = ConversationalRetrievalChain.from_llm(llm=OpenAI(), retriever=retriever)

In [None]:
# First turn: there is no prior conversation, so the history starts empty.
chat_history = []
query = "what is Artificial intelligence?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict
# is the same and holds the reply under "answer".
result = qa.invoke({"question": query, "chat_history": chat_history})

In [None]:
# Display the chain's answer to the first question.
result["answer"]

In [None]:
# Follow-up turn: pass the previous (question, answer) pair as chat history so
# the chain can use it when answering the new question.
chat_history = [(query, result["answer"])]
query = "What is machine learning?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict
# is the same.
result = qa.invoke({"question": query, "chat_history": chat_history})

In [None]:
# Display the (question, answer) history that was passed to the chain.
chat_history

In [None]:
# Display the chain's answer to the follow-up question.
result["answer"]