In [1]:
from unstructured.partition.pdf import partition_pdf
from pydantic import BaseModel
from typing import Any

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.retrievers import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
  RunnableLambda,
  RunnablePassthrough
)
from langchain_core.documents import Document
from langchain.output_parsers import JsonOutputToolsParser

import uuid
from typing import Union
from operator import itemgetter
import pickle
from itertools import chain

In [2]:
import os
from dotenv import load_dotenv

# Merge variables from a local .env file into the process environment.
load_dotenv()

# The key is None when OPENAI_API_KEY is not set.
# NOTE(review): presumably the OpenAI clients created later read the same
# environment variable themselves - confirm.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [3]:
# File names of the 10-K reports handled by this notebook.
pdfs = ["AMD.10K.2023.pdf", "IBM.10K.2023.pdf", "AAPL.10K.2023.pdf"]

# The same reports as paths relative to the working directory, in the same order.
pdf_paths = ["./" + pdf_name for pdf_name in pdfs]

In [4]:
# !! Think twice before running this cell: the partitioned elements are already pickled, and re-running it consumes API calls !!
# raw_pdfs_elements = []
# for i,pdf_path in enumerate(pdf_paths):
#   raw_pdfs_elements.append(
#     partition_pdf(
#       filename=pdf_path,
#       extract_images_in_pdf=False,
#       infer_table_structure=True,
#       chunking_strategy="by_title",
#       max_characters=1800,
#       new_after_n_chars=1500,
#       combine_text_under_n_chars=1000,
#       image_output_dir_path="./",
#       url=<your_api_base_url>,
#       token=<your_api_key>,
#     )
#   )
  
#   with open(f'{pdf_path}-{i}.pkl', 'wb') as f:
#     pickle.dump(raw_pdfs_elements[i], f)


In [5]:
# Previously pickled partition results, one file per report.
pickle_paths = ["./AMD.10K.2023.pdf-0.pkl", "./IBM.10K.2023.pdf-2.pkl", "./AAPL.10K.2023.pdf-4.pkl"]

# raw_pdf_elements[i] holds whatever object was pickled for pickle_paths[i].
# pickle.load can execute arbitrary code: only load files you created yourself.
raw_pdf_elements = []
for pickle_path in pickle_paths:
    with open(pickle_path, 'rb') as pickle_file:
        raw_pdf_elements.append(pickle.load(pickle_file))

In [6]:
class Element(BaseModel):
    """A partitioned PDF element reduced to a type tag and its text payload."""

    type: str  # "table" or "text" in this notebook
    text: Any  # stringified HTML for tables, stringified element otherwise


def _categorize(element):
    """Wrap one raw element as a "table" or "text" `Element`.

    An element counts as a table when the class path
    "unstructured.documents.elements.Table" occurs in the repr of its type;
    in that case the HTML rendering from its metadata is kept as the text.
    """
    type_repr = str(type(element))
    if "unstructured.documents.elements.Table" in type_repr:
        return Element(type="table", text=str(element.metadata.text_as_html))
    return Element(type="text", text=str(element))


# One list of categorized elements per entry of raw_pdf_elements, same order.
categorized_elements = [
    [_categorize(element) for element in per_pdf_elements]
    for per_pdf_elements in raw_pdf_elements
]

In [7]:
def _select_by_type(kind):
    """Return, for each PDF, the categorized elements whose `type` equals `kind`."""
    return [
        [item for item in per_pdf if item.type == kind]
        for per_pdf in categorized_elements
    ]


# Both results keep the per-PDF nesting of categorized_elements.
table_elements = _select_by_type("table")
text_elements = _select_by_type("text")

In [8]:
def get_docs(text_ele, pdf_titles=None):
    """Flatten per-PDF element lists into one list of `Document`s.

    Args:
        text_ele: One list of elements per PDF (text or table elements alike);
            every element must expose a `.text` attribute.
        pdf_titles: Title used to tag the documents of each PDF, aligned
            position-by-position with `text_ele`. Defaults to the
            notebook-level `pdfs` list.

    Returns:
        A flat list of `Document`s in input order. Each one has the element's
        text as `page_content` and its source title under the `pdf_title`
        metadata key.

    Raises:
        ValueError: If `text_ele` and `pdf_titles` differ in length. Pairing
            them anyway would silently drop the unmatched PDFs.
    """
    if pdf_titles is None:
        pdf_titles = pdfs
    if len(text_ele) != len(pdf_titles):
        raise ValueError(
            f"got {len(text_ele)} element lists but {len(pdf_titles)} pdf titles"
        )
    return [
        Document(page_content=ele.text, metadata={"pdf_title": title})
        for elements, title in zip(text_ele, pdf_titles)
        for ele in elements
    ]

In [9]:
# Flat Document lists (each tagged with its pdf_title) for tables and for text.
table_docs = get_docs(table_elements)
text_docs = get_docs(text_elements)

In [10]:
len(text_docs)

626

In [11]:
# tables = [table.page_content for table in table_docs]
# table_summaries = table_summarize_chain.batch(tables, {"max_concurrency": 5})
# with open("table_summaries.pkl", 'wb') as f:
#     pickle.dump(table_summaries, f)

In [12]:
# texts = [text.page_content for text in text_docs]
# text_summaries = text_summarize_chain.batch(texts, {"max_concurrency": 5})
# with open("text_summaries-3.pkl", 'wb') as f:
#   pickle.dump(text_summaries, f)

In [13]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    pickle.load can execute arbitrary code: only use it on trusted files.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Cached summaries; indexed position-by-position against table_docs / text_docs.
table_summaries = _load_pickle("./table_summaries-3.pkl")
text_summaries = _load_pickle("./text_summaries.pkl")

In [14]:
# Metadata key under which a summary stores the id of its original document.
id_key = "doc_id"

# One fresh random id per original document (new values on every run).
text_ids = [str(uuid.uuid4()) for _ in range(len(text_docs))]
table_ids = [str(uuid.uuid4()) for _ in range(len(table_docs))]


def _summary_documents(summaries, ids, source_docs):
    """Build one summary `Document` per source document.

    The summary at position i is paired with `ids[i]` and inherits the
    `pdf_title` metadata of `source_docs[i]`.
    """
    return [
        Document(
            page_content=summaries[pos],
            metadata={id_key: ids[pos], "pdf_title": source.metadata['pdf_title']},
        )
        for pos, source in enumerate(source_docs)
    ]


text_summaries_docs = _summary_documents(text_summaries, text_ids, text_docs)
table_summaries_docs = _summary_documents(table_summaries, table_ids, table_docs)

In [15]:
# Vector index over the summary documents, stored in a local Elasticsearch
# instance and embedded with OpenAI's text-embedding-3-small model.
vectorstore = ElasticsearchStore(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    es_url="http://localhost:9200",
    index_name="summaries_index",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy()
)

# Index under deterministic Elasticsearch ids so that re-running this cell
# overwrites the existing entries instead of appending duplicates. Without
# explicit ids each run adds a new copy of every summary, and the copies from
# earlier runs carry doc_ids that no longer resolve to an original document.
# An index already polluted by earlier runs must be deleted once by hand, and
# entries beyond the current document count are not removed automatically.
vectorstore.add_documents(
    text_summaries_docs,
    ids=[f"text-summary-{pos}" for pos in range(len(text_summaries_docs))],
);
vectorstore.add_documents(
    table_summaries_docs,
    ids=[f"table-summary-{pos}" for pos in range(len(table_summaries_docs))],
);



In [16]:
# (doc_id, original document) pairs: all text documents first, then all tables.
# Each id list was built with exactly one id per document, so pairing the two
# groups separately gives the same result as zipping the concatenated lists.
docs_w_ids = [*zip(text_ids, text_docs), *zip(table_ids, table_docs)]

In [17]:
# Probe the summary index: the two nearest summaries for the question,
# restricted by a term filter to entries whose pdf_title is the Apple report.
vectorstore.similarity_search("How much is Apple investing in R&D?", k=2, filter=[{"term": {"metadata.pdf_title.keyword": "AAPL.10K.2023.pdf"}}])



[Document(page_content='In 2023, Apple Inc. saw a significant increase in R&D expenses due to higher headcount-related costs, while selling, general, and administrative expenses remained relatively unchanged. The provision for income taxes and effective tax rates for 2023, 2022, and 2021 are also detailed in the report.', metadata={'doc_id': 'e920174a-bea0-4485-bffb-05858c3d8e0d', 'pdf_title': 'AAPL.10K.2023.pdf'}),
 Document(page_content='In 2023, Apple Inc. saw a significant increase in R&D expenses due to higher headcount-related costs, while selling, general, and administrative expenses remained relatively unchanged. The provision for income taxes and effective tax rates for 2023, 2022, and 2021 are also detailed in the report.', metadata={'doc_id': 'a2ce90ec-225f-4081-927e-fdf5e2955096', 'pdf_title': 'AAPL.10K.2023.pdf'})]

In [18]:
def get_orig(summary_docs, docs_with_ids=None, key=None):
    """Map retrieved summary documents back to their original documents.

    Args:
        summary_docs: Documents returned by the summary index; each must carry
            the original document's id in `metadata[key]`.
        docs_with_ids: Iterable of `(doc_id, original_document)` pairs to look
            the ids up in. Defaults to the notebook-level `docs_w_ids`.
        key: Metadata key holding the id. Defaults to the notebook-level
            `id_key`.

    Returns:
        The matching original documents, ordered by `summary_docs`. A summary
        whose id has no match contributes nothing. An id listed several times
        in `docs_with_ids` contributes every matching document, in pair order.
    """
    if docs_with_ids is None:
        docs_with_ids = docs_w_ids
    if key is None:
        key = id_key

    # Group the originals by id once, rather than rescanning every pair for
    # every retrieved summary.
    originals_by_id = {}
    for doc_id, original in docs_with_ids:
        originals_by_id.setdefault(doc_id, []).append(original)

    return [
        original
        for summary_doc in summary_docs
        for original in originals_by_id.get(summary_doc.metadata[key], ())
    ]

In [19]:
# Same probe as above, but resolved to the original chunks behind the summaries.
get_orig(vectorstore.similarity_search("How much is Apple investing in R&D?", k=2, filter=[{"term": {"metadata.pdf_title.keyword": "AAPL.10K.2023.pdf"}}]))



[Document(page_content='Research and Development\n\nThe year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.\n\nSelling, General and Administrative\n\nSelling, general and administrative expense was relatively flat in 2023 compared to 2022.\n\nApple Inc. | 2023 Form 10-K | 23\n\nProvision for Income Taxes\n\nProvision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):', metadata={'pdf_title': 'AAPL.10K.2023.pdf'})]

In [20]:
# Chat model shared by the query-decomposition chain and the RAG chain;
# temperature 0 keeps the "question, report" output format as stable as possible.
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Few-shot prompt that rewrites a user query into one line per company, each
# formatted as "<question>, <report file name>". Both examples use the same
# "Query:" / "Answer:" layout as the real query at the bottom. The second
# example previously lacked its "Query:" label, which made the demonstrated
# format inconsistent.
get_pdf_query = """You are an assistant tasked with generating additional questions from the given query. \
Given a set of questions, give the relevant questions (in the format as shown) pertaining to each individual company \
in the query IF there are more than one. Also give the report name it corresponds to.
Report names:
AMD.10K.2023.pdf
AAPL.10K.2023.pdf
IBM.10K.2023.pdf
CSCO.10K.2023.pdf
UBER.10K.2023.pdf

<--example start-->
Query: What are the equity compensation plans of AMD and Cisco?
Answer:
What are the equity compensation plans of AMD?, AMD.10K.2023.pdf
What are the equity compensation plans of Cisco?, CSCO.10K.2023.pdf
<--example end-->

<--example start-->
Query: Are there any ongoing legal disputes with Uber?
Answer:
Are there any ongoing legal disputes with Uber?, UBER.10K.2023.pdf
<--example end-->

Query: {user_query}
Answer:
"""
get_pdf_query_prompt = ChatPromptTemplate.from_template(get_pdf_query)

# The chain input is passed through unchanged as `user_query`; the model reply
# is returned as a plain string.
get_pdf_query_chain = {"user_query": RunnablePassthrough()} | get_pdf_query_prompt | model | StrOutputParser()

In [21]:
# Two-company question: the prompt asks for one "<question>, <report>" line per company.
pdf_resp = get_pdf_query_chain.invoke("How much are apple and AMD investing in R&D?")

In [22]:
pdf_resp

'How much is Apple investing in R&D?, AAPL.10K.2023.pdf\nHow much is AMD investing in R&D?, AMD.10K.2023.pdf'

In [23]:
# Single-company question, to check the one-line case of the same format.
pdf_resp2 = get_pdf_query_chain.invoke("How much is AMD investing in R&D?")

In [24]:
# Show how each response line breaks into [question, report name] on commas.
# Note that a question containing a comma would split into more than two parts.
for p in pdf_resp2.split('\n'):
    print(p.split(','))

['How much is AMD investing in R&D?', ' AMD.10K.2023.pdf']


In [25]:
# Report name of the first response line: the text after the first comma, trimmed.
pdf_resp.split('\n')[0].split(',')[1].strip()

'AAPL.10K.2023.pdf'

In [26]:
def get_context(pdf_response):
    """Retrieve the original chunks for every "<question>, <report>" line.

    Args:
        pdf_response: Text with one "<question>, <report file name>" entry per
            line, as produced by `get_pdf_query_chain`.

    Returns:
        A list with one entry per non-blank line. Each entry is the list of
        original documents behind the three summaries most similar to that
        line's question, restricted to that line's report.

    Raises:
        ValueError: If a non-blank line contains no comma and therefore names
            no report.
    """
    context_out = []
    for line in pdf_response.splitlines():
        line = line.strip()
        if not line:
            # Blank lines (for example a trailing newline) carry no query.
            continue

        # Split on the LAST comma only: the question itself may contain
        # commas, while the report name is always the final field.
        question, separator, report_name = line.rpartition(',')
        if not separator:
            raise ValueError(
                f"expected '<question>, <report name>' but got: {line!r}"
            )

        summaries = vectorstore.similarity_search(
            question.strip(),
            k=3,
            filter=[{"term": {"metadata.pdf_title.keyword": report_name.strip()}}],
        )
        context_out.append(get_orig(summaries))

    return context_out

In [27]:
# End-to-end check: decompose the question, then fetch the original chunks per line.
context_resp = get_context(get_pdf_query_chain.invoke("How much is apple and AMD investing in R&D?"))



In [28]:
context_resp

[[Document(page_content='Research and Development\n\nThe year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.\n\nSelling, General and Administrative\n\nSelling, general and administrative expense was relatively flat in 2023 compared to 2022.\n\nApple Inc. | 2023 Form 10-K | 23\n\nProvision for Income Taxes\n\nProvision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):', metadata={'pdf_title': 'AAPL.10K.2023.pdf'})],
 [Document(page_content='During the twelve months ended December 31, 2022, we returned a total of $3.7 billion to shareholders through the repurchase of 36.3 million shares of common stock under our stock repurchase program. As of December 31, 2022, $6.5 billion remained available for future stock repurchases under this program. The repurchase program does not obligate us to acquire any common stock, has no termination date and may

In [29]:
def parse_context(contexts):
    """Render nested retrieval results as one prompt-ready string.

    Args:
        contexts: Iterable of document lists. Each document needs a
            `pdf_title` entry in `.metadata` and a `.page_content` string.

    Returns:
        For every document, in order, a "CONTEXT FROM <pdf_title>" header line,
        the page content, and a blank line. An empty string when there are no
        documents.
    """
    flattened = chain.from_iterable(contexts)
    return "".join(
        "CONTEXT FROM " + doc.metadata['pdf_title'] + "\n" + doc.page_content + "\n\n"
        for doc in flattened
    )

In [30]:
# Question string -> "<question>, <report>" lines -> original chunks -> context text.
# The two plain functions are piped after the runnable chain with `|`.
context_chain = get_pdf_query_chain | get_context | parse_context

In [31]:
# Run the full context pipeline on a two-company question.
ctx_chain_resp = context_chain.invoke("How much is apple and AMD investing in R&D?")



In [32]:
print(ctx_chain_resp)

CONTEXT FROM AAPL.10K.2023.pdf
Research and Development

The year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.

Selling, General and Administrative

Selling, general and administrative expense was relatively flat in 2023 compared to 2022.

Apple Inc. | 2023 Form 10-K | 23

Provision for Income Taxes

Provision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):

CONTEXT FROM AMD.10K.2023.pdf
During the twelve months ended December 31, 2022, we returned a total of $3.7 billion to shareholders through the repurchase of 36.3 million shares of common stock under our stock repurchase program. As of December 31, 2022, $6.5 billion remained available for future stock repurchases under this program. The repurchase program does not obligate us to acquire any common stock, has no termination date and may be suspended or discontinued at any time.

We co

In [33]:
# Answering prompt: the question plus the retrieved context, with an explicit
# "I don't know." fallback.
rag_prompt_text = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question \
in as many words as required.
Feel free to go into the details of what's presented in the context down below.
If you don't know the answer, just say "I don't know."
Question: {question}
Context: {context}
Answer: 
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_text)


def _extract_question(chain_input):
    """Return the question text from a plain string or a {"question": ...} dict."""
    if isinstance(chain_input, dict):
        return chain_input["question"]
    return chain_input


# The chain is invoked with {"question": "..."}. Passing that input through
# unchanged would insert the dict's repr (braces, quotes and all) into both the
# query-decomposition prompt and the answering prompt. Extracting the text
# first gives both prompts the bare question, and plain-string input still works.
rag_chain = (
  {
    "question": RunnableLambda(_extract_question),
    "context": RunnableLambda(_extract_question) | context_chain,
  }
  | rag_prompt
  | model
  | StrOutputParser()
)

In [34]:
print(rag_prompt_text)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question in as many words as required.
Feel free to go into the details of what's presented in the context down below.
If you don't know the answer, just say "I don't know."
Question: {question}
Context: {context}
Answer: 



In [35]:
# Ask the RAG chain about Apple; the answer is displayed in the next cell.
rag_resp = rag_chain.invoke({"question":"What is apple's approach to sustainability and environmental impact?"})



In [36]:
rag_resp

"Apple's approach to sustainability and environmental impact is addressed through compliance with complex and changing laws and regulations worldwide. This includes adherence to environmental, health, and safety regulations, as well as addressing electronic waste, recycling, product design, and climate change. The company's global operations are subject to a wide range of laws and regulations related to environmental impact, and it is committed to meeting these requirements to minimize its environmental footprint."

In [37]:
# Same question for IBM; `rag_resp` is overwritten with the new answer.
rag_resp = rag_chain.invoke({"question":"What is IBM's approach to sustainability and environmental impact?"})



In [38]:
rag_resp

"IBM's approach to sustainability and environmental impact is not explicitly mentioned in the provided context. The document primarily focuses on the company's brand reputation, revenue growth, market share gains, and potential factors that could impact its business operations. Therefore, the specific details of IBM's approach to sustainability and environmental impact are not addressed in the given context."

In [44]:
# Ask about IBM's competitors; `rag_resp` is overwritten again.
rag_resp = rag_chain.invoke({"question":"Who are IBM’s main competitors?"})



In [45]:
print(rag_resp)

IBM's main competitors vary by industry segment and range from large multinational enterprises to smaller, more narrowly focused entities. In the software segment, IBM's principal competitors include Alphabet (Google), Amazon, BMC, Broadcom, Cisco Systems, Informatica, Microsoft, Oracle, Palo Alto Networks, Salesforce, SAP, Splunk, and VMware. Additionally, IBM competes with smaller, niche competitors in specific geographic regions or product segments.


In [49]:
# Ask about AMD's outlook and print the answer in the same cell.
rag_resp = rag_chain.invoke({"question":"What is AMD’s guidance or outlook for future performance?"})
print(rag_resp)



AMD’s guidance or outlook for future performance is based on current expectations and beliefs, and involves numerous risks and uncertainties that could cause actual results to differ materially from expectations. The forward-looking statements relate to factors such as demand for AMD’s products, the growth and competitive landscape of the markets, international sales, and the sufficiency of AMD’s cash, cash equivalents, and short-term investment balances to fund operations over the next 12 months and beyond. However, it is important to note that these forward-looking statements should not be relied upon as predictions of future events, as there are no assurances that the events or circumstances reflected in these statements will be achieved or will occur.
