# Document Question Answering

In [38]:
from langchain.vectorstores import Chroma, DocArrayInMemorySearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [39]:
import glob
import os
!pip install panel
import panel as pn
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

open.api_key = os.environ['OPENAI_API_KEY']

[0m

## Load documents

In [40]:
glob.glob("data/*")

['data/silverman-openai-complaint.pdf',
 'data/doc-5.txt',
 'data/The_Effect_of_Student_Teacher_Ratio_on_Truancy.pdf',
 'data/doc-3.txt',
 'data/Question_Generation.pdf',
 'data/doc-2.txt',
 'data/state_of_the_union.txt',
 'data/fec_2016_EDA.v2.pdf',
 'data/doc-4.txt',
 'data/2023-08-01_Trump_Indictment.pdf',
 'data/exploring-ggplot.pdf',
 'data/doc-6.txt',
 'data/doc-1.txt']

In [41]:
loaders = [PyPDFLoader(pdf) for pdf in glob.glob("data/*.pdf")] + \
          [TextLoader (txt) for txt in glob.glob("data/*.txt")]

In [42]:
docs = []
for loader in loaders:
    docs.extend(loader.load())
len(docs)

123

## Split documents

In [43]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)
splits = text_splitter.split_documents(docs)
len(splits)

257

## Initialize ChromaDB

Create embeddings for each chunk and insert into the Chroma vector database.

In [44]:
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(splits, embeddings)
persist_directory = 'chroma/'

!rm -rf chroma/
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)
print(vectordb._collection.count())
vectordb.persist()

257


## Create the chain

Initialize the chain and prompt we will use for question answering.

In [45]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

qa = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
)

In [37]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

query = "What did the president say about large corporations and the wealthy?"
result = qa_chain({"query": query})
result["result"]

"The president said that when corporations don't have to compete, their profits go up, prices go up, and small businesses and family farmers and ranchers go under. He also mentioned closing loopholes so that the very wealthy don't pay a lower tax rate than a teacher or a firefighter."

## Memory

In [29]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

## Conversational Retrieval Chain

In [30]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [31]:
query = "Who is Sarah Silverman"
result = qa_chain({"query": query})
result["result"]

'Sarah Silverman is a writer and performer who lives in California.'

In [32]:
query = "Why is she suing OpenAI?"
result = qa_chain({"query": query})
result["result"]

'She is suing OpenAI for direct copyright infringement, vicarious copyright infringement, violations of section 1202(b) of the Digital Millennium Copyright Act, unjust enrichment, violations of the California and common law unfair competition laws, and negligence.'

In [33]:
query = "What is the name of her book?"
result = qa_chain({"query": query})
result["result"]

'The name of her book is "The Bedwetter."'

In [34]:
query = "Who are the plaintiffs in the lawsuit against OpenAI?"
result = qa_chain({"query": query})
result["result"]

'The plaintiffs in the lawsuit against OpenAI are persons or entities domiciled in the United States that own a United States copyright in any work that was used as training data for the OpenAI Language Models during the Class Period.'

In [35]:
query = "What are their names?"
result = qa_chain({"query": query})
result["result"]

"We don't know their names."

In [36]:
query = "Why didn't you mention Richard Kadrey?"
result = qa_chain({"query": query})
result["result"]

'Richard Kadrey is not mentioned in the provided context.'

## Ask questions!

Now we can use the chain to ask questions!

In [10]:
query = "What did the president say about large corporations and the wealthy?"
qa.run(query)

"The president mentioned that when corporations don't have to compete, their profits go up, which in turn drives up prices for consumers. He also stated that the previous administration's tax cuts for the wealthy and corporations led to weaker economic growth, lower wages, and a wider wealth gap. The president proposed closing loopholes to ensure that the very wealthy don't pay a lower tax rate than teachers or firefighters. Additionally, he emphasized the importance of competition in capitalism and highlighted the need to lower costs for families."

In [11]:
query = "What does Sarah Silverman allege against OpenAI?"
qa.run(query)

'Sarah Silverman alleges that OpenAI, without her permission, made copies of her book "The Bedwetter" during the training process of their language models. She claims that the language models are infringing derivative works of her book and that OpenAI\'s actions constitute direct copyright infringement. She seeks damages and other remedies for the unauthorized use of her work.'

In [None]:
query = "What is the essence of Uncle Tom's Cabin?"
qa.run(query)

In [None]:
query = "Did Richard Robbins and his colleagues reach a conclusion about whether semantic metrics are better than lexical metrics for text generation tasks?"
qa.run(query)