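# Document Question Answering

Load the PDF and text files in `data/`, split them into chunks, embed the chunks into a Chroma vector store, and answer questions about the documents with a LangChain `RetrievalQA` chain.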

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader

In [2]:
import glob
import os
import shutil

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Fail fast if the key is missing. LangChain's OpenAI wrappers
# (OpenAIEmbeddings, ChatOpenAI) read OPENAI_API_KEY from the environment,
# so it only has to be present; it must never be assigned onto the builtin
# `open`, which raises AttributeError. The check is written as a statement
# (not a bare expression) so the secret is never echoed as cell output.
if 'OPENAI_API_KEY' not in os.environ:
    raise KeyError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment"
    )

## Load documents

In [3]:
# List the contents of the data directory to see which source files exist.
glob.glob(pathname="data/*")

['data/silverman-openai-complaint.pdf',
 'data/doc-5.txt',
 'data/The_Effect_of_Student_Teacher_Ratio_on_Truancy.pdf',
 'data/doc-3.txt',
 'data/Question_Generation.pdf',
 'data/doc-2.txt',
 'data/state_of_the_union.txt',
 'data/fec_2016_EDA.v2.pdf',
 'data/doc-4.txt',
 'data/2023-08-01_Trump_Indictment.pdf',
 'data/exploring-ggplot.pdf',
 'data/doc-6.txt',
 'data/doc-1.txt']

In [4]:
# One loader per source file: every PDF first, then every plain-text file.
loaders = [
    *(PyPDFLoader(path) for path in glob.glob("data/*.pdf")),
    *(TextLoader(path) for path in glob.glob("data/*.txt")),
]

In [5]:
# Run every loader and flatten the per-file results into one list of documents.
docs = [document for loader in loaders for document in loader.load()]
len(docs)

123

## Split documents

In [6]:
# Break the loaded documents into overlapping chunks (size 1500, overlap 150)
# so each piece can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)
splits = text_splitter.split_documents(docs)
len(splits)

257

## Initialize ChromaDB

Create an embedding for each chunk and insert them into the Chroma vector database.

In [7]:
persist_directory = 'chroma/'
embeddings = OpenAIEmbeddings()

# Remove any store left behind by a previous run so the index is rebuilt from
# scratch. ignore_errors=True mirrors `rm -rf`: a missing directory is fine.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed the chunks exactly once, directly into the on-disk store. Building a
# separate in-memory store first would send every chunk to the embeddings API
# a second time for a result that is immediately discarded.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)
vectordb.persist()

# Sanity check: the number of stored vectors should equal len(splits).
vectordb._collection.count()

257


## Create the chain

Initialize the chain we will use for question answering.

In [8]:
# Chat model that writes the final answer; temperature is pinned to 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Retrieval-backed QA chain: the Chroma store supplies the retriever and the
# chat model above produces the answer.
qa = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

## Ask questions!

Now we can use the chain to ask questions!

In [9]:
# Question about the president's remarks; the chain's answer is the cell output.
query = (
    "What did the president say about large corporations and the wealthy?"
)
qa.run(query)

"The president mentioned that when corporations don't have to compete, their profits go up, which in turn drives up prices for consumers. He also stated that the previous administration gave tax cuts to the very wealthy and corporations, leading to a wider gap between the top earners and everyone else. The president proposed closing loopholes to ensure that the wealthy don't pay a lower tax rate than teachers or firefighters."

In [10]:
# Question about the Silverman complaint; the chain's answer is the cell output.
query = (
    "What does Sarah Silverman allege against OpenAI?"
)
qa.run(query)

'Sarah Silverman alleges that OpenAI, without her permission, made copies of her book "The Bedwetter" during the training process of their language models. She claims that the language models are infringing derivative works of her book and that OpenAI\'s actions constitute direct copyright infringement. She seeks damages and other remedies for the unauthorized use of her work.'

In [11]:
# Question about Uncle Tom's Cabin; the chain's answer is the cell output.
query = (
    "What is the essence of Uncle Tom's Cabin?"
)
qa.run(query)

"The essence of Uncle Tom's Cabin is a story that portrays the harsh realities of slavery and the moral dilemmas faced by enslaved individuals. It highlights the noble and steadfast character of Uncle Tom, who remains faithful to his beliefs and exhibits Christian virtues even in the face of extreme suffering. The novel also explores themes of friendship, compassion, and the destructive nature of greed and brutality represented by characters like Simon Legree. Overall, Uncle Tom's Cabin aims to expose the injustices of slavery and advocate for its abolition."

In [12]:
# Question about the semantic-vs-lexical metrics conclusion. The literal is
# split across two lines for readability only; adjacent string literals are
# concatenated into the same single query string.
query = (
    "Did Richard Robbins and his colleagues reach a conclusion about whether "
    "semantic metrics are better than lexical metrics for text generation tasks?"
)
qa.run(query)

'Yes, Richard Robbins and his colleagues concluded that semantic metrics are more robust indicators of success at question generation than lexical metrics. They found that metrics like METEOR, BERTScore, and USE, which measure semantic similarity, produced scores that were more reflective of successful question generation compared to metrics based on lexical similarity like BLEU and ROUGE-L. They also found that high scores in one metric but not the other often indicated incomplete semantic alignment. Therefore, they recommend using both semantic and lexical metrics together for a more comprehensive evaluation of text generation tasks.'