In [77]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter 
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain


In [78]:
loader = PyPDFLoader("/Users/rouzbeh/Downloads/intel_10k_2022.pdf")
documents = loader.load()
# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
texts = text_splitter.split_documents(documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorestore to use as the index
db = Chroma.from_documents(texts, embeddings)
# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":4})
# create a chain to answer questions 
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-4", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    )
chat_history = []
query = "intel revenue in 2022?"
result = qa({"question": query, "chat_history": chat_history})
result["answer"]


' Yes, the word Gore appears in the document.\nSOURCES: /Users/rouzbeh/Downloads/apple_10k_2022.pdf'

In [82]:
chat_history = [(query, result["answer"])]
query = "Can you say apple's revenue in 2022 again with the unit?"
result = qa({"question": query, "chat_history": chat_history})
result['answer']

" Apple's revenue in 2022 expressed in units is $394,328.\nSOURCES: /Users/rouzbeh/Downloads/apple_10k_2022.pdf"