In [13]:
from unstructured.partition.pdf import partition_pdf
from pydantic import BaseModel
from typing import Any

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.retrievers import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
  RunnableLambda,
  RunnablePassthrough
)
from langchain_core.documents import Document
from langchain.output_parsers import JsonOutputToolsParser

import uuid
from typing import Union
from operator import itemgetter
import pickle
from itertools import chain

In [2]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv), then read the OpenAI key;
# the lookup yields None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [3]:
# File names of the 10-K reports; get_docs records these as "pdf_title" metadata.
pdfs = ["AMD.10K.2023.pdf", "IBM.10K.2023.pdf", "AAPL.10K.2023.pdf"]
# Every report sits in the working directory, so its path is "./" + file name.
pdf_paths = ["./" + pdf_name for pdf_name in pdfs]

In [4]:
# !! Think twice before running this cell: the partitioned elements are already pickled, and re-running it uses up API calls !!
# raw_pdfs_elements = []
# for i,pdf_path in enumerate(pdf_paths):
#   raw_pdfs_elements.append(
#     partition_pdf(
#       filename=pdf_path,
#       extract_images_in_pdf=False,
#       infer_table_structure=True,
#       chunking_strategy="by_title",
#       max_characters=1800,
#       new_after_n_chars=1500,
#       combine_text_under_n_chars=1000,
#       image_output_dir_path="./",
#       url=<your_api_base_url>,
#       token=<your_api_key>,
#     )
#   )
  
#   with open(f'{pdf_path}-{i}.pkl', 'wb') as f:
#     pickle.dump(raw_pdfs_elements[i], f)


In [5]:
# Read back the partitioned elements cached for each PDF, one pickle per report.
# pickle.load can execute arbitrary code, so only load files produced locally.
# NOTE(review): the commented-out partition cell names its pickles
# f'{pdf_path}-{i}.pkl' with i = 0, 1, 2, but the suffixes here are -0, -2, -4;
# confirm these are the intended files.
pickle_paths = ["./AMD.10K.2023.pdf-0.pkl", "./IBM.10K.2023.pdf-2.pkl", "./AAPL.10K.2023.pdf-4.pkl"]
raw_pdf_elements = []
for pickle_path in pickle_paths:
    with open(pickle_path, 'rb') as pickle_file:
        raw_pdf_elements.append(pickle.load(pickle_file))

In [6]:
class Element(BaseModel):
    """A partitioned PDF element reduced to a category label and its content."""
    # Category label; this notebook assigns either "table" or "text".
    type: str
    # Element content; declared Any, although the notebook always stores a str.
    text: Any


def _to_element(raw_element):
    """Wrap one partitioned element as an Element tagged "table" or "text"."""
    # Substring test on the class path, so any class whose qualified name starts
    # with "unstructured.documents.elements.Table" is treated as a table.
    is_table = "unstructured.documents.elements.Table" in str(type(raw_element))
    if is_table:
        # Tables keep their HTML rendering rather than their plain text
        return Element(type="table", text=str(raw_element.metadata.text_as_html))
    return Element(type="text", text=str(raw_element))


# One list of Element records per PDF, in the same order as raw_pdf_elements
categorized_elements = [
    [_to_element(raw_element) for raw_element in pdf_elements]
    for pdf_elements in raw_pdf_elements
]

In [7]:
def _elements_of_type(wanted_type):
    """Return, per PDF, only the Element records whose type equals wanted_type."""
    return [
        [record for record in pdf_records if record.type == wanted_type]
        for pdf_records in categorized_elements
    ]


# Split the categorized records into table-only and text-only views (still per PDF)
table_elements = _elements_of_type("table")
text_elements = _elements_of_type("text")

In [8]:
def get_docs(text_ele, pdf_titles=None):
    """Flatten per-PDF element groups into one list of Documents tagged by PDF.

    Args:
        text_ele: One sequence of elements per PDF; each element exposes its
            content as ``.text``.
        pdf_titles: File name stored as ``pdf_title`` metadata for each group,
            in the same order as ``text_ele``. Defaults to the notebook-level
            ``pdfs`` list.

    Returns:
        A flat list of Documents, grouped by PDF in input order. Groups and
        titles are paired positionally; unpaired extras on either side are
        ignored.
    """
    if pdf_titles is None:
        # Resolved at call time so the notebook's current `pdfs` is used
        pdf_titles = pdfs
    return [
        Document(page_content=ele.text, metadata={"pdf_title": pdf_title})
        for group, pdf_title in zip(text_ele, pdf_titles)
        for ele in group
    ]

In [26]:
# Flatten the per-PDF groups into Document lists: tables first, then text
table_docs, text_docs = [
    get_docs(element_groups) for element_groups in (table_elements, text_elements)
]

In [27]:
# Total number of text Documents across all PDFs
len(text_docs)

626

In [11]:
# tables = [table.page_content for table in table_docs]
# table_summaries = table_summarize_chain.batch(tables, {"max_concurrency": 5})
# with open("table_summaries.pkl", 'wb') as f:
#     pickle.dump(table_summaries, f)

In [12]:
# texts = [text.page_content for text in text_docs]
# text_summaries = text_summarize_chain.batch(texts, {"max_concurrency": 5})
# with open("text_summaries-3.pkl", 'wb') as f:
#   pickle.dump(text_summaries, f)

In [11]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code, so only load files produced locally
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Cached summaries, loaded instead of re-running the summarization chains.
# NOTE(review): the commented-out cells that create these caches write
# "table_summaries.pkl" and "text_summaries-3.pkl", while this cell reads
# "table_summaries-3.pkl" and "text_summaries.pkl"; confirm the files match.
table_summaries = _load_pickle("./table_summaries-3.pkl")
text_summaries = _load_pickle("./text_summaries.pkl")

In [30]:
# Summaries are matched to their source documents purely by position, so a stale
# or mismatched cache would attach summaries to the wrong chunks. Fail loudly.
assert len(text_summaries) == len(text_docs), (
    f"{len(text_summaries)} text summaries for {len(text_docs)} text docs"
)
assert len(table_summaries) == len(table_docs), (
    f"{len(table_summaries)} table summaries for {len(table_docs)} table docs"
)

# One fresh UUID per original document; a summary carries the same id in its
# metadata so that the original can be looked up from a retrieved summary.
text_ids = [str(uuid.uuid4()) for _ in text_docs]
table_ids = [str(uuid.uuid4()) for _ in table_docs]

# Metadata key under which each summary stores the id of its original document
id_key = "doc_id"

text_summaries_docs = [
    Document(
        page_content=summary,
        metadata={id_key: doc_id, "pdf_title": text_doc.metadata['pdf_title']},
    )
    for summary, doc_id, text_doc in zip(text_summaries, text_ids, text_docs)
]
table_summaries_docs = [
    Document(
        page_content=summary,
        metadata={id_key: doc_id, "pdf_title": table_doc.metadata['pdf_title']},
    )
    for summary, doc_id, table_doc in zip(table_summaries, table_ids, table_docs)
]

In [36]:
# Elasticsearch-backed vector index holding the summary Documents
summary_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = ElasticsearchStore(
    embedding=summary_embeddings,
    es_url="http://localhost:9200",
    index_name="summaries_index",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(),
)

# Index the text summaries first, then the table summaries.
# NOTE(review): every run adds documents under newly generated ids; presumably
# re-running against an existing index leaves older copies behind, so confirm.
for summary_batch in (text_summaries_docs, table_summaries_docs):
    vectorstore.add_documents(summary_batch)

In [85]:
# Pair every generated id with its original Document (text first, then tables)
all_doc_ids = text_ids + table_ids
all_original_docs = text_docs + table_docs
docs_w_ids = list(zip(all_doc_ids, all_original_docs))

In [89]:
# Retrieve the 2 best-matching summaries for the question, restricted by a
# term filter on pdf_title to the Apple report
vectorstore.similarity_search("How much is Apple investing in R&D?", k=2, filter=[{"term": {"metadata.pdf_title.keyword": "AAPL.10K.2023.pdf"}}])



[Document(page_content='In 2023, Apple Inc. saw a significant increase in R&D expenses due to higher headcount-related costs, while selling, general, and administrative expenses remained relatively unchanged. The provision for income taxes and effective tax rates for 2023, 2022, and 2021 are also detailed in the report.', metadata={'doc_id': 'a2ce90ec-225f-4081-927e-fdf5e2955096', 'pdf_title': 'AAPL.10K.2023.pdf'}),
 Document(page_content="Apple Inc. has invested in new business strategies and acquisitions, which come with significant risks and uncertainties. These include distraction of management, unexpected liabilities and expenses, economic and regulatory challenges, inadequate return on capital, potential impairment of assets, and significant write-offs. There is also the risk of failing to obtain required regulatory approvals or facing onerous conditions that could delay or prevent a transaction. These new ventures are inherently risky and may not be successful, potentially impac

In [91]:
def get_orig(summary_docs, id_doc_pairs=None, key=None):
    """Map retrieved summary documents back to the original documents.

    Args:
        summary_docs: Objects with a ``metadata`` mapping that holds the id of
            the original document under ``key``.
        id_doc_pairs: Iterable of ``(doc_id, original_doc)`` pairs. Defaults to
            the notebook-level ``docs_w_ids``.
        key: Metadata key that stores the id. Defaults to the notebook-level
            ``id_key``.

    Returns:
        The original documents, in the order of ``summary_docs``. Summaries
        whose id has no matching pair contribute nothing; an id that appears
        in several pairs contributes all of them, in pair order.

    Raises:
        KeyError: If a summary's metadata has no ``key`` entry.
    """
    if id_doc_pairs is None:
        id_doc_pairs = docs_w_ids
    if key is None:
        key = id_key

    # Index the pairs once, instead of rescanning the full list for every summary
    originals_by_id = {}
    for doc_id, original_doc in id_doc_pairs:
        originals_by_id.setdefault(doc_id, []).append(original_doc)

    out_docs = []
    for summary_doc in summary_docs:
        out_docs.extend(originals_by_id.get(summary_doc.metadata[key], []))
    return out_docs

In [90]:
# Same filtered search as above, with each summary hit mapped back to its
# original document
get_orig(vectorstore.similarity_search("How much is Apple investing in R&D?", k=2, filter=[{"term": {"metadata.pdf_title.keyword": "AAPL.10K.2023.pdf"}}]))



[Document(page_content='Research and Development\n\nThe year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.\n\nSelling, General and Administrative\n\nSelling, general and administrative expense was relatively flat in 2023 compared to 2022.\n\nApple Inc. | 2023 Form 10-K | 23\n\nProvision for Income Taxes\n\nProvision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):', metadata={'pdf_title': 'AAPL.10K.2023.pdf'}),
 Document(page_content='Apple Inc. | 2023 Form 10-K | 11\n\nInvestment in new business strategies and acquisitions could disrupt the Company’s ongoing business, present risks not originally contemplated and materially adversely affect the Company’s business, reputation, results of operations and financial condition.\n\nThe Company has invested, and in the future may invest, in new business strategies or acquisitions. Such endeavors m

In [93]:
# Chat model (temperature=0) used to split a user query into per-company questions
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Few-shot prompt. The model is asked to answer with one line per company in the
# form "<question>, <report name>"; get_context parses exactly that shape.
# Both examples use the same "Query:" / "Answer:" layout as the real query slot.
# NOTE(review): the report-name list includes CSCO and UBER, but this notebook
# only builds documents for the titles in `pdfs` (AMD, IBM, AAPL). Confirm
# whether the extra names are intended.
get_pdf_query = """You are an assistant tasked with generating additional questions from the given query. \
Given a set of questions, give the relevant questions (in the format as shown) pertaining to each individual company \
in the query IF there are more than one. Also give the report name it corresponds to.
Report names:
AMD.10K.2023.pdf
AAPL.10K.2023.pdf
IBM.10K.2023.pdf
CSCO.10K.2023.pdf
UBER.10K.2023.pdf

<--example start-->
Query: What are the equity compensation plans of AMD and Cisco?
Answer:
What are the equity compensation plans of AMD?, AMD.10K.2023.pdf
What are the equity compensation plans of Cisco?, CSCO.10K.2023.pdf
<--example end-->

<--example start-->
Query: Are there any ongoing legal disputes with Uber?
Answer:
Are there any ongoing legal disputes with Uber?, UBER.10K.2023.pdf
<--example end-->

Query: {user_query}
Answer:
"""
get_pdf_query_prompt = ChatPromptTemplate.from_template(get_pdf_query)
# The raw input string is passed through as `user_query`; the chain returns plain text
get_pdf_query_chain = {"user_query": RunnablePassthrough()} | get_pdf_query_prompt | model | StrOutputParser()

In [96]:
# Two-company query: the prompt asks for one "<question>, <report name>" line per company
pdf_resp = get_pdf_query_chain.invoke("How much is apple and AMD investing in R&D?")

In [105]:
# Single-company query, for comparison with the two-company response
pdf_resp2 = get_pdf_query_chain.invoke("How much is AMD investing in R&D?")

In [112]:
# Show how each answer line breaks apart at its commas
for answer_line in pdf_resp2.split('\n'):
    fields = answer_line.split(',')
    print(fields)

['How much is AMD investing in R&D?', ' AMD.10K.2023.pdf']


In [117]:
# Second comma-separated field of the first answer line, whitespace-stripped
# (the report name when the question itself contains no comma)
pdf_resp.split('\n')[0].split(',')[1].strip()

'AAPL.10K.2023.pdf'

In [121]:
def get_context(pdf_response, k=2):
    """Fetch the original documents for each line of a query-rewrite response.

    Args:
        pdf_response: Text with one ``<question>, <report name>`` pair per
            line, in the format requested by the ``get_pdf_query`` prompt.
        k: Number of summaries to retrieve per question.

    Returns:
        One list of original documents per non-blank response line, in order.

    Raises:
        ValueError: If a non-blank line has no comma separating the question
            from the report name.
    """
    context_out = []
    for resp in pdf_response.split('\n'):
        if not resp.strip():
            # Blank lines (e.g. a trailing newline in the model output) hold no query
            continue

        # Split at the LAST comma so that commas inside the question are preserved
        question, separator, pdf_title = resp.rpartition(',')
        if not separator:
            raise ValueError(
                f"Expected '<question>, <report name>' but got: {resp!r}"
            )

        # Restrict the search to summaries that belong to the named report
        pdf_filter = [{"term": {"metadata.pdf_title.keyword": pdf_title.strip()}}]
        summary_hits = vectorstore.similarity_search(question.strip(), k=k, filter=pdf_filter)
        context_out.append(get_orig(summary_hits))

    return context_out

In [122]:
# End to end: rewrite the query per company, then collect the original documents
# for each rewritten question (this invokes the chain again rather than reusing pdf_resp)
context_resp = get_context(get_pdf_query_chain.invoke("How much is apple and AMD investing in R&D?"))



In [123]:
# One inner list of original documents per rewritten question
context_resp

[[Document(page_content='Research and Development\n\nThe year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.\n\nSelling, General and Administrative\n\nSelling, general and administrative expense was relatively flat in 2023 compared to 2022.\n\nApple Inc. | 2023 Form 10-K | 23\n\nProvision for Income Taxes\n\nProvision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):', metadata={'pdf_title': 'AAPL.10K.2023.pdf'}),
  Document(page_content='Apple Inc. | 2023 Form 10-K | 11\n\nInvestment in new business strategies and acquisitions could disrupt the Company’s ongoing business, present risks not originally contemplated and materially adversely affect the Company’s business, reputation, results of operations and financial condition.\n\nThe Company has invested, and in the future may invest, in new business strategies or acquisitions. Such endeavors