In [2]:
import unstructured
from pydantic import BaseModel
from typing import Any
import pickle

import uuid
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [4]:
# Chat model used by the table-summarization chain defined further down.
model = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0.2,
)

In [3]:
# Source filings; the order here is the order used for every per-document list below.
pdfs = [
    "AMD.10K.2023.pdf",
    "BABA.10K.2023.pdf",
    "IBM.10K.2023.pdf",
    "UBER.10K.2023.pdf",
    "AAPL.10K.2023.pdf",
]

In [4]:
# Pickled element lists, one file per filing, listed in the same order as `pdfs`.
# NOTE: pickle.load executes arbitrary code on load; only use pickles you produced yourself.
pkls = [
    "./AMD.10K.2023.pdf-0.pkl",
    "./BABA.10K.2023.pdf-1.pkl",
    "./IBM.10K.2023.pdf-2.pkl",
    "./UBER.10K.2023.pdf-3.pkl",
    "AAPL.10K.2023.pdf-4.pkl",
]

raw_pdf_elements = []
for pkl_path in pkls:
    with open(pkl_path, 'rb') as pkl_file:
        raw_pdf_elements.append(pickle.load(pkl_file))

In [5]:
class Element(BaseModel):
    """A parsed PDF element reduced to a type tag plus its textual content."""

    # Category label; the cells in this notebook use "table" and "text".
    type: str
    # Element content; this notebook always fills it with a str(...) value
    # (table HTML for tables, the element's string form otherwise).
    text: Any


# Categorize every parsed element by type, one inner list per PDF (same order
# as `raw_pdf_elements`). Tables keep their HTML rendering; every other
# element is treated as plain text.
def _to_element(element):
    """Convert one parsed `unstructured` element into an `Element` record.

    Tables are stored as their HTML rendering. When a table carries no HTML
    (`metadata.text_as_html` is None) the table's plain-text form is used
    instead, so the literal string "None" never ends up as table content.
    Any non-table element is stored as text via `str(element)`.
    """
    if isinstance(element, unstructured.documents.elements.Table):
        html = element.metadata.text_as_html
        # str(None) would silently produce "None" and later be summarized as if
        # it were a real table, so fall back to the element's own text.
        table_text = str(html) if html is not None else str(element)
        return Element(type="table", text=table_text)
    return Element(type="text", text=str(element))


categorized_elements = [
    [_to_element(element) for element in raw_pdf_element]
    for raw_pdf_element in raw_pdf_elements
]


In [6]:
# Spot check: fourth-from-last element of the document at index 3
# (the UBER filing, per the order of `pkls`).
categorized_elements[3][-4]

Element(type='table', text='<table><tr><td>Wan Ling Martello</td><td></td><td></td></tr><tr><td>/s/ H.E. Yasir Al-Rumayyan</td><td>Director</td><td>Fe bruary 2 , 20:</td></tr><tr><td colspan="3">H.E. Yasir Al-Rumayyan</td></tr><tr><td>/s/ John Thain</td><td>Director</td><td>Fe bruary 2 , 20:</td></tr><tr><td colspan="3">John Thain</td></tr><tr><td>/s/ David Trujillo</td><td>Director</td><td>Fe bruary 2 , 20:</td></tr><tr><td colspan="3">David Trujillo</td></tr><tr><td>/s/ Alexander Wynaendts</td><td>Director</td><td>Fe bruary 2 , 20:</td></tr></table>')

In [7]:
def _elements_of_type(kind):
    """Return, for each PDF, the categorized elements whose `type` equals `kind`."""
    return [
        [element for element in doc_elements if element.type == kind]
        for doc_elements in categorized_elements
    ]


# Tables: one inner list per PDF, in the same order as `categorized_elements`.
table_elements = _elements_of_type("table")

# Text: one inner list per PDF, in the same order as `categorized_elements`.
text_elements = _elements_of_type("text")


In [10]:
# How many text elements of the first PDF contain the substring 'None'?
count = sum(1 for ele in text_elements[0] if 'None' in ele.text)

count

3

In [11]:
# Prompt template with a single placeholder, `table_element`.
summarize_prompt_text = (
    "You are an assistant tasked with summarizing tables.\n"
    "Give a concise summary of the table. Table chunk: {table_element}"
)
summarize_prompt = ChatPromptTemplate.from_template(summarize_prompt_text)

# Pipeline: the chain's input is passed through as `table_element`, the prompt
# is rendered, sent to the chat model, and the reply is parsed to a string.
summarize_chain = (
    {"table_element": RunnablePassthrough()}
    | summarize_prompt
    | model
    | StrOutputParser()
)

In [50]:
# Smoke test: summarize a single table (second table of the first PDF)
# before running the chain over every table.
test_table = table_elements[0][1].text
test_resp = summarize_chain.invoke(test_table)

In [51]:
# Display the summary returned for the sample table.
test_resp

'The table shows the repurchases of shares during each fiscal quarter of 2022, along with the total number of shares repurchased and the average price paid per share. It also includes the total number of shares repurchased as part of the publicly announced program and the maximum dollar value of shares that may yet be purchased under the program. The last fiscal quarter of 2022 is also included in the table.'

In [8]:
# Content string of every table element (stored in `.text`), grouped per PDF.
tables_html = [
    [table.text for table in doc_tables]
    for doc_tables in table_elements
]

In [13]:
# DO NOT RE-RUN THIS CELL: the summaries were already computed and pickled to table_summaries.pkl.
# table_summaries = [ [summarize_chain.invoke(table_html) for table_html in table] for table in tables_html ]
# with open("table_summaries.pkl", 'wb') as f:
#   pickle.dump(table_summaries, f)

In [9]:
# Load the previously pickled table summaries from disk.
# NOTE: pickle.load executes arbitrary code on load; only use a file you produced yourself.
with open("./table_summaries.pkl", 'rb') as summaries_file:
    table_summaries = pickle.load(summaries_file)

In [11]:
# Number of top-level entries in the loaded summaries (expected: one per PDF).
len(table_summaries)

5

In [10]:
# Show the first table of the first PDF followed by its summary.
print(tables_html[0][0] + '\nSummary:\n' + table_summaries[0][0])

<table><tr><td>Business</td></tr><tr><td>Risk Factors</td></tr><tr><td>Unresolved Staff Comments</td></tr><tr><td>Properties</td></tr><tr><td>Legal Proceedings</td></tr><tr><td>Mine Safety Disclosures</td></tr><tr><td>Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity. Securities</td></tr><tr><td>Reserved]</td></tr><tr><td>Management's Discussion and Analysis of Financial Condition and Results of Operations</td></tr><tr><td>Quantitative and Qualitative Disclosure About Market Risk</td></tr><tr><td>Financial Statements and Supplementary Data</td></tr><tr><td>Changes in and Disagreements with Accountants on Accounting_and Financial Disclosure</td></tr><tr><td>Controls and Procedures</td></tr><tr><td>Other Information</td></tr><tr><td>Disclosures Regarding Foreign Jurisdictions that Prevent Inspections</td></tr><tr><td>Directors, Executive Officers and Governance</td></tr><tr><td>Corporate Executive Compensation</td></tr><tr><td>Security Owne

In [16]:
# Pair each table's HTML with its summary as "<html>\nSummary:\n<summary>", grouped per PDF.
table_summary_p_html = [
    [html + '\nSummary:\n' + summary for html, summary in zip(doc_tables, doc_summaries)]
    for doc_tables, doc_summaries in zip(tables_html, table_summaries)
]

In [14]:
# Replace each text Element with its raw string. The name `text_elements` is
# rebound in place because later cells expect plain strings under this name.
# Entries that are already strings are passed through unchanged, so re-running
# this cell is a no-op instead of failing with AttributeError on `str.text`.
text_elements = [
    [item if isinstance(item, str) else item.text for item in doc_elements]
    for doc_elements in text_elements
]

In [17]:
# Per PDF: the text chunks followed by the "table + summary" strings.
pdf_wise_text = [
    doc_texts + doc_table_summaries
    for doc_texts, doc_table_summaries in zip(text_elements, table_summary_p_html)
]

In [19]:
# Print the number of chunks (text + table summaries) collected for each PDF.
for p in pdf_wise_text:
  print(len(p))

412
1187
90
692
279


In [20]:
from itertools import chain


def get_docs(raw_pdfs, pdf_titles=None):
    """Wrap per-PDF text chunks in `Document` objects tagged with their source PDF.

    Args:
        raw_pdfs: Iterable with one entry per PDF; each entry is an iterable of
            strings, each of which becomes the `page_content` of one Document.
        pdf_titles: Titles matched positionally with `raw_pdfs`. Defaults to
            the notebook-level `pdfs` list when omitted.

    Returns:
        A flat list of `Document` objects in input order, each carrying
        `metadata={"pdf_title": <title>}`.

    Raises:
        ValueError: If the number of PDFs and the number of titles differ.
            A plain `zip` would silently drop the unmatched entries.
    """
    if pdf_titles is None:
        pdf_titles = pdfs

    # Materialize both sides so their lengths can be compared up front.
    raw_pdfs = list(raw_pdfs)
    pdf_titles = list(pdf_titles)
    if len(raw_pdfs) != len(pdf_titles):
        raise ValueError(
            f"Got {len(raw_pdfs)} PDFs but {len(pdf_titles)} titles; "
            "they must match one-to-one."
        )

    return list(
        chain.from_iterable(
            (Document(page_content=text, metadata={"pdf_title": title}) for text in texts)
            for texts, title in zip(raw_pdfs, pdf_titles)
        )
    )

In [22]:
# Total number of Documents built from the combined text + table-summary chunks.
len(get_docs(pdf_wise_text))

2660

In [31]:
# Spot check: print one text chunk (index 5) from the first PDF.
print(text_elements[0][5])

The statements in this report include forward-looking statements within the meaning of the Private Securities Litigation Reform Act of 1995. These forward- looking statements are based on current expectations and beliefs and involve numerous risks and uncertainties that could cause actual results to differ materially from expectations. These forward-looking statements speak only as of the date hereof or as of the dates indicated in the statements and should not be relied upon as predictions of future events, as we cannot assure you that the events or circumstances reflected in these statements will be achieved or will occur. You can identify forward-looking statements by the use of forward-looking terminology including “believes,” “expects,” “may,” “will,” “should,” “seeks,” “intends,” “plans,” “pro forma,” “estimates,” “anticipates,” or the negative of these words and phrases, other variations of these words and phrases or comparable terminology. The forward-looking statements relate 

In [37]:
# Documents for the plain text chunks. Note this is built from `text_elements`
# only, not from `pdf_wise_text`, so table summaries are not included here.
all_pdf_docs = get_docs(text_elements)
# Documents for the raw table HTML, kept as a separate collection.
all_pdf_tables = get_docs(tables_html)

In [43]:
# Check the metadata attached to the last Document of each collection.
all_pdf_docs[-1].metadata, all_pdf_tables[-1].metadata

({'pdf_title': 'AAPL.10K.2023.pdf'}, {'pdf_title': 'AAPL.10K.2023.pdf'})

In [23]:
# Subset of Documents that come from the Apple filing.
test_docs = [
    doc for doc in all_pdf_docs
    if doc.metadata['pdf_title'] == 'AAPL.10K.2023.pdf'
]
len(test_docs)  # not echoed: only a cell's last expression is displayed

# BM25 retriever over all Documents (not just `test_docs`), created with k=2.
bm = BM25Retriever.from_documents(all_pdf_docs, k=2)

In [44]:
# Sanity check: filtering Documents by a substring of `pdf_title` yields a plain list.
type([all_pdf_doc for all_pdf_doc in all_pdf_docs if "AMD" in all_pdf_doc.metadata['pdf_title']])

list

In [24]:
# Try the BM25 retriever on a sample query.
bm.invoke("financial holdings")

[Document(page_content='domiciled and operate in countries with particular economic, tax, political, legal, safety, regulatory and public health risks, including the extent of the impact of the COVID-19 pandemic on their business; are domiciled or operate in countries that may become subject to economic sanctions or foreign investment restrictions; depend on the management talents and efforts of a small group of individuals, and, as a result, the death, disability, resignation, or termination of one or more of these individuals could have an adverse effect on the relevant company’s operations; and will likely require substantial additional capital to support their operations and expansion and to maintain their competitive positions. For example, in light of the conflict between Russia and Ukraine, members of our management team resigned from the board of our Yandex.Taxi joint venture, and we announced that we are actively looking for opportunities to accelerate the sale of our remainin

In [25]:
# Build a Chroma vector store from every text Document, embedding with OpenAI.
test_vec = Chroma.from_documents(
    documents=all_pdf_docs,
    embedding=OpenAIEmbeddings(
        model="text-embedding-3-small",
        show_progress_bar=True,
    ),
)



  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
# Retriever view over the vector store, configured with k=3.
test_vec_retr = test_vec.as_retriever(search_kwargs={"k":3})

In [38]:
# MMR search restricted by a metadata filter to the Apple filing.
aapl_only_filter = {"pdf_title": "AAPL.10K.2023.pdf"}
aapl_mmr_retriever = test_vec.as_retriever(
    search_type="mmr",
    search_kwargs={"filter": aapl_only_filter},
)
aapl_mmr_retriever.invoke("factors affecting brand and reputation")



  0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content='The Company’s products and services may be affected from time to time by design and manufacturing defects that could materially adversely affect the Company’s business and result in harm to the Company’s reputation.', metadata={'doc_id': '6d7636eb-ab12-4578-9eca-ad202f9b9f50', 'pdf_title': 'AAPL.10K.2023.pdf'}),
 Document(page_content='rsely affect the Company’s business. Political uncertainty surrounding trade and other international disputes could also have a negative effect on consumer confidence and spending, which could adversely affect the Company’s business.', metadata={'doc_id': '6d7636eb-ab12-4578-9eca-ad202f9b9f50', 'pdf_title': 'AAPL.10K.2023.pdf'}),
 Document(page_content='The Company’s retail stores are subject to numerous risks and uncertainties.\n\nThe Company’s retail operations are subject to many factors that pose risks and uncertainties and could adversely impact the Company’s business, results of operations and financial condition, including 

In [30]:
# Hybrid retriever: combines the BM25 retriever and the vector-store retriever,
# giving each an equal weight of 0.5.
ens = EnsembleRetriever(
  retrievers=[bm, test_vec_retr], weights=[0.5, 0.5]
)

In [31]:
# Smoke-test the ensemble retriever with a placeholder query.
ens.invoke("test")



  0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content='Note:\n\n16\n\n17', metadata={'doc_id': '1a3245eb-c8b7-462a-8075-e4d334959ce3', 'pdf_title': 'BABA.10K.2023.pdf'}),
 Document(page_content='Goodwill Impairment Assessment\n\nWe review goodwill for impairment annually (in the fourth quarter) and whenever events or changes in circumstances indicate that goodwill might be impaired. We make certain judgments and assumptions to determine our reporting units and in allocating shared assets and liabilities to determine the carrying values for each of our reporting units. Determination of reporting units is based on a judgmental evaluation of the level at which our segment managers review financial results, evaluate performance, and allocate resources.\n\nJudgment in the assessment of qualitative factors of impairment include, among other factors: financial performance; legal, regulatory, contractual, political, business, and other factors; entity specific factors; industry and market considerations, macroeconomic condi

In [44]:
# Persist both Document collections to disk with pickle.
for pickle_path, payload in (
    ('all_pdf_docs.pkl', all_pdf_docs),
    ('all_pdf_tables.pkl', all_pdf_tables),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [48]:
# Combined number of text chunks for the PDFs at indices 0, 4 and 2.
sum(len(text_elements[idx]) for idx in (0, 4, 2))

626