In [27]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.retrievers import SelfQueryRetriever
from langchain_openai import OpenAI, ChatOpenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import os
from getpass import getpass
import openai
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

In [8]:
# Resolve the OpenAI API key once for the whole notebook.
# A key already exported as OPENAI_API_KEY is reused so that
# "Restart Kernel -> Run All" does not block on an interactive prompt;
# otherwise the user is asked for it via getpass (input is not echoed).
api_key = os.environ.get("OPENAI_API_KEY") or getpass("Please provide your OpenAI Key: ").strip()
if not api_key:
    # Fail here with a clear message instead of a later authentication error.
    raise ValueError("An OpenAI API key is required to run this notebook.")

# Expose the key both through the environment variable and the openai module attribute,
# exactly as the original cell did.
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [10]:
# Read the 10-K PDF from the working directory into LangChain Document objects.
loader = PyMuPDFLoader(file_path="meta_10k.pdf")
documents = loader.load()

In [11]:
# Sanity check: how many Document objects loader.load() returned
# (presumably one per PDF page -- confirm against the loader's behavior).
len(documents)

147

In [12]:
# Break the loaded documents into overlapping chunks before embedding them.
# Sizes are in the splitter's default length unit (character count).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4250,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

In [13]:
# Sanity check: number of chunks produced by text_splitter.split_documents.
len(texts)

208

In [14]:
# Embedding model that Qdrant.from_documents uses further down to vectorize the chunks.
embeddings_model = OpenAIEmbeddings(model = "text-embedding-3-small")

In [15]:
# Embed every chunk and index it in an in-memory Qdrant collection.
# force_recreate=True is passed so re-running the cell rebuilds the collection.
qdrant = Qdrant.from_documents(
    documents=texts,
    embedding=embeddings_model,
    location=":memory:",
    collection_name="10K_RAG",
    force_recreate=True,
)
qdrant

<langchain_community.vectorstores.qdrant.Qdrant at 0x284036510>

In [16]:
# The two evaluation questions that are later sent through rag_qa_chain_sqr:
# one numeric lookup and one list-of-names lookup.
query1 = "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"
query2 = "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"

In [18]:
# Plain retriever over the Qdrant store, created with as_retriever()'s default settings.
# NOTE(review): the chain built later in this notebook retrieves through
# self_query_retriever, so this object is not used by any cell visible here.
qdrant_retriever = qdrant.as_retriever()
qdrant_retriever

VectorStoreRetriever(tags=['Qdrant', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.qdrant.Qdrant object at 0x284036510>)

In [19]:
# Chat model shared by the self-query retriever and the answer-generation step.
# The model is pinned explicitly (it was previously the library default) so that
# results do not silently change if the default moves in a future release.
# temperature=0 requests the least-random sampling for repeatable answers.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
llm.model_name

'gpt-3.5-turbo'

In [22]:
# No metadata attributes are described, so the self-query step has no
# declared fields to build filters from.
metadata_field_info = []

# Natural-language description of the indexed content, handed to the query-constructing LLM.
document_content_desc = "Form 10-K annual report required by the U.S. Securities and Exchange Commission (SEC), that gives a comprehensive summary of a company's financial performance for company Meta for year 2023"

# Retriever that lets the LLM turn the user's question into a structured query
# against the Qdrant store.
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=qdrant,
    document_contents=document_content_desc,
    metadata_field_info=metadata_field_info,
)

In [26]:
# Prompt for the answer-generation step. It has two placeholders:
#   {context}  - filled with the retrieved documents
#   {question} - filled with the user's question
# The model is told to answer from the context and to admit when it does not know.
template = """You are an helpful assistant for question-answering tasks, specifically you are an expert in answering SEC 10-K report questions.
If you  don't know the answer, just say that you don't know.
Answer based on the context given to you, for a given question.

Context:
{context}

Question:
{question}

Answer:"""

# Wrap the template as a chat prompt, then echo the stored template text as a check.
rag_chat_prompt = ChatPromptTemplate.from_template(template)
print(rag_chat_prompt.messages[0].prompt.template)

You are an helpful assistant for question-answering tasks, specifically you are an expert in answering SEC 10-K report questions.
If you  don't know the answer, just say that you don't know.
Answer based on the context given to you, for a given question.

Context:
{context}

Question:
{question}

Answer:


In [37]:
# Self-query RAG chain.
# Input:  {"question": <str>}
# Output: {"response": <LLM message>, "context": <retrieved documents>}
rag_qa_chain_sqr = (
    # Step 1: retrieve documents for the question and carry the question forward.
    {"context": itemgetter("question") | self_query_retriever, "question": itemgetter("question")}
    # Step 2: fill the prompt and call the LLM, also returning the retrieved
    # documents so callers can inspect what the answer was grounded on.
    # (The former intermediate RunnablePassthrough.assign(context=itemgetter("context"))
    # step only re-assigned "context" to itself, so it was removed.)
    | {"response": rag_chat_prompt | llm, "context": itemgetter("context")}
)

In [39]:
# Run both evaluation questions through the chain, one after the other, in order.
query1_result, query2_result = (
    rag_qa_chain_sqr.invoke({"question": query})
    for query in (query1, query2)
)

In [40]:
# Show the two generated answers, separated by a line of asterisks.
print(
    query1_result['response'].content,
    "*******",
    query2_result['response'].content,
    sep="\n",
)

The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion.
*******
The directors of Meta, as listed in the SEC 10-K report, are:
1. Peggy Alford
2. Marc L. Andreessen
3. Andrew W. Houston
4. Nancy Killefer
5. Robert M. Kimmitt
6. Sheryl K. Sandberg
7. Tracey T. Travis
8. Tony Xu


In [61]:
# Inspect the documents the chain returned under "context" for query2,
# i.e. what the answer above was generated from.
# NOTE(review): this displays the full text of every retrieved chunk; consider
# showing a truncated view to keep the notebook output readable.
query2_result['context']

[Document(page_content='Table of Contents\nPOWER OF ATTORNEY\nKNOW ALL PERSONS BY THESE PRESENTS, that each person whose signature appears below constitutes and appoints Susan Li and Katherine R.\nKelly, and each of them, as his or her true and lawful attorneys-in-fact and agents, with full power of substitution and resubstitution, for him or her and in his or\nher name, place and stead, in any and all capacities, to sign any and all amendments to this Annual Report on Form 10-K, and to file the same, with all exhibits\nthereto, and other documents in connection therewith, with the Securities and Exchange Commission, granting unto said attorneys-in-fact and agents, and each\nof them, full power and authority to do and perform each and every act and thing requisite and necessary to be done in connection therewith, as fully to all\nintents and purposes as he or she might or could do in person, hereby ratifying and confirming that all said attorneys-in-fact and agents, or any of them or t

In [62]:
# Inspect the complete chain output for query1: the "response" message
# (including its metadata) together with the retrieved "context" documents.
query1_result

{'response': AIMessage(content="The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion.", response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 4496, 'total_tokens': 4523}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3b956da36b', 'finish_reason': 'stop', 'logprobs': None}, id='run-7cde556e-69b7-4c7c-be16-1df5d6d35111-0'),
 'context': [Document(page_content='Table of Contents\nNote\xa05. Financial Instruments\nInstruments Measured at Fair Value\nWe classify our cash equivalents and marketable debt securities within Level\xa01 or Level\xa02 because we use quoted market prices or alternative pricing\nsources and models utilizing market observable inputs to determine their fair value. Certain other assets are classified within Level\xa03 because factors used to\ndevelop the estimated fair value are unobservable inputs that are not supported by market activity.\nThe following tables summarize our assets measured at fai