In [1]:
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

import uuid
from langchain.retrievers import BM25Retriever, EnsembleRetriever, MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import (
  Runnable,
  RunnableLambda,
  RunnablePassthrough
)
from langchain_core.documents import Document
from langchain.output_parsers import JsonOutputToolsParser

from typing import Union
from operator import itemgetter
import pickle
from itertools import chain

In [2]:
# Chat model (temperature 0) used by the summarisation and query-splitting chains.
model = ChatOpenAI(
  model="gpt-3.5-turbo-1106",
  temperature=0,
)

In [3]:
# File names of the 10-K filings used to filter documents and tables by `pdf_title`.
pdfs = [
  'AAPL.10K.2023.pdf',
  'AMD.10K.2023.pdf',
  'IBM.10K.2023.pdf',
]

In [4]:
def _read_pickle(path):
  """Return the object stored in the pickle file at `path`."""
  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# Pre-computed artefacts: document chunks, extracted tables and table summaries.
all_pdf_docs = _read_pickle('all_pdf_docs.pkl')
all_pdf_tables = _read_pickle('all_pdf_tables.pkl')
table_summaries = _read_pickle('table_summaries.pkl')

In [5]:
# vectorstore = Chroma.from_documents(documents=all_pdf_docs, embedding=OpenAIEmbeddings(model="text-embedding-3-small"))

In [6]:
# Flatten one level of nesting so `table_summaries` is a flat list of summaries
# (the pickle presumably holds one list per PDF -- TODO confirm).
# Only list/tuple elements are expanded; strings are kept whole. Unpacking into
# `chain(*...)` iterates strings character by character, so running this cell a
# second time on the already-flat list would shred every summary.
table_summaries = [
  summary
  for group in table_summaries
  for summary in (group if isinstance(group, (list, tuple)) else [group])
]

In [7]:
# Prompt that asks the model for a concise summary of one text chunk.
# Built from adjacent literals rather than a backslash continuation: a space
# after the trailing backslash stops it from joining the lines, which leaves a
# literal backslash plus a line break in the prompt (and "\ " is an invalid
# escape sequence).
text_summarize_prompt_text = (
  "You are an assistant tasked with summarizing text. "
  "Give a concise summary of the text. Text chunk: {element} "
)
text_summarize_prompt = ChatPromptTemplate.from_template(text_summarize_prompt_text)

# Pass the raw input through as `element`, render the prompt, call the model
# and return the reply as a plain string.
text_summarize_chain = (
  {"element": lambda x: x}
  | text_summarize_prompt
  | model
  | StrOutputParser()
)

In [8]:
# texts = [text.page_content for text in all_pdf_docs if text.metadata['pdf_title'] in ["AAPL.10K.2023.pdf", "AMD.10K.2023.pdf", "IBM.10K.2023.pdf"]]
# text_summaries = text_summarize_chain.batch(texts, {"max_concurrency": 5})

In [9]:
# with open("text_summaries.pkl", 'wb') as f:
#   pickle.dump(text_summaries, f)

In [10]:
# Load the chunk summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("text_summaries.pkl", mode='rb') as summaries_file:
  text_summaries = pickle.load(summaries_file)

In [11]:
text_summaries[57]

"Intel's dominant position in the market could negatively impact our business due to their business practices, including pricing actions, product bundling, and marketing strategies. They have greater financial resources and invest heavily in marketing and research and development, making us vulnerable to their aggressive marketing and pricing strategies for microprocessor products."

In [13]:
# Metadata key that links an embedded summary to its parent document in the docstore.
id_key = "doc_id"

# Parent documents are kept in memory; their summaries are embedded into Chroma.
store = InMemoryStore()
multi_vectorstore = Chroma(
  collection_name="summaries_docs",
  embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
)

# Multi-vector retriever configured for MMR search with k=2.
retriever = MultiVectorRetriever(
  vectorstore=multi_vectorstore,
  docstore=store,
  id_key=id_key,
  search_type="mmr",
  search_kwargs={"k": 2},
)


In [14]:
# Chunks that belong to the selected filings, each given a fresh random id.
temp_docs = [doc for doc in all_pdf_docs if doc.metadata['pdf_title'] in pdfs]
doc_ids = [str(uuid.uuid4()) for _ in temp_docs]

# One summary Document per chunk; summaries are matched to chunks by position.
text_summaries_docs = [
  Document(
    page_content=text_summaries[position],
    metadata={id_key: summary_id, "pdf_title": parent_doc.metadata['pdf_title']},
  )
  for position, (summary_id, parent_doc) in enumerate(zip(doc_ids, temp_docs))
]

# Embed the summaries and store the full chunks under the same ids.
retriever.vectorstore.add_documents(text_summaries_docs)
retriever.docstore.mset(list(zip(doc_ids, temp_docs)))



In [15]:
# Tables that belong to the selected filings, each given a fresh random id.
temp_tables = [table for table in all_pdf_tables if table.metadata['pdf_title'] in pdfs]
table_ids = [str(uuid.uuid4()) for _ in temp_tables]

# One summary Document per table; summaries are matched to tables by position.
table_summaries_docs = [
  Document(
    page_content=table_summaries[position],
    metadata={id_key: summary_id, "pdf_title": parent_table.metadata['pdf_title']},
  )
  for position, (summary_id, parent_table) in enumerate(zip(table_ids, temp_tables))
]

# Embed the summaries and store the full tables under the same ids.
retriever.vectorstore.add_documents(table_summaries_docs)
retriever.docstore.mset(list(zip(table_ids, temp_tables)))



In [16]:
retriever.invoke("How does the AMD\'s debt and equity position look?")



[Document(page_content='ternal financing on favorable terms, or at all; AMD’s expectation that based on management’s current knowledge, the potential liability related to AMD’s current litigation will not have a material adverse effect on its financial position, results of operation or cash flows; anticipated ongoing and increased costs related to enhancing and implementing information security controls; all unbilled accounts receivables are expected to be billed and collected within 12 months; revenue allocated to remaining performance obligations that are unsatisfied which will be recognized in the next 12 months; and a small number of customers will continue to account for a substantial part of AMD’s revenue in the future. For a discussion of the factors that could cause actual results to differ materially from the forward-looking statements, see “Part I, Item 1A-Risk Factors” and the “Financial Condition” section set forth in “Part II, Item 7-Management’s Discussion and Analysis of

In [17]:
# @tool
# def context_retriever(pdf_name: str, query: str) -> str:
#   """Function to get information about the exact user query"""
#   vec_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"filter":{"pdf_title": pdf_name}, "k":2})
#   bm25_retriever = BM25Retriever.from_documents([all_pdf_doc for all_pdf_doc in all_pdf_docs if pdf_name in all_pdf_doc.metadata['pdf_title']], k=2)
#   ens_retriever = EnsembleRetriever(
#     retrievers=[vec_retriever, bm25_retriever]
#   )
#   _context = ens_retriever.invoke(query)
#   context = '\n'.join(c.page_content for c in _context)
#   return f"Context from {pdf_name} report:\n {context}"


In [18]:
# tools = [context_retriever]
# model_with_tools = model.bind_tools(tools)
# tool_map = {tool.name: tool for tool in tools}

# def call_tool(tool_invocation: dict) -> Union[str, Runnable]:
#   """Function for dynamically constructing the end of the chain based on the model-selected tool."""
#   tool = tool_map[tool_invocation["type"]]
#   return RunnablePassthrough.assign(output=itemgetter("args") | tool)

# call_tool_list = RunnableLambda(call_tool).map()
# tool_chain = model_with_tools | JsonOutputToolsParser() | call_tool_list

In [19]:
# tool_input = """What is the revenue of amd for the year 2022?, AMD.10K.2023.pdf
# What is the revenue of alibaba for the year 2022?, BABA.10K.2023.pdf"""
# args_resp = tool_chain.invoke(tool_input)

In [20]:
# Few-shot prompt that rewrites a question mentioning several companies into one
# question per company (a single-company question is returned unchanged).
# Both examples use the same "Query:" / "Answer:" layout as the live slot at the end.
get_pdf_query = """You are an assistant tasked with generating additional questions from the given query. \
Given a set of questions, give the relevant questions (in the format as shown) pertaining to each individual company \
in the query IF there are more than one.
<--example start-->
Query: What are the equity compensation plans of AMD and Cisco?
Answer:
What are the equity compensation plans of AMD?
What are the equity compensation plans of Cisco?
<--example end-->

<--example start-->
Query: Are there any ongoing legal disputes with Intel?
Answer:
Are there any ongoing legal disputes with Intel?
<--example end-->

Query: {user_query}
Answer:
"""
get_pdf_query_prompt = ChatPromptTemplate.from_template(get_pdf_query)


def _user_query_text(payload):
  """Return the question text for the prompt's `user_query` slot.

  A {"user_query": ...} dict is unwrapped to its value so the prompt receives
  the question itself rather than the dict's repr; any other input (for
  example a plain string) is returned unchanged.
  """
  if isinstance(payload, dict) and "user_query" in payload:
    return payload["user_query"]
  return payload


# Render the prompt, call the model and return its reply as a plain string
# (one question per line, as laid out in the examples).
get_pdf_query_chain = (
  {"user_query": RunnableLambda(_user_query_text)}
  | get_pdf_query_prompt
  | model
  | StrOutputParser()
)

In [21]:
# test_dict = {"user_query":"How is alibaba's revenue for the year of 2023"}
# test_resp = get_pdf_query_chain.invoke(test_dict)
# print(test_resp)

In [22]:
def parse_context(contexts):
  """Render groups of retrieved documents as one prompt-ready text block.

  Args:
    contexts: iterable of document lists, one list per retriever call. Each
      document must expose `page_content` and `metadata['pdf_title']`.

  Returns:
    A string in which every non-empty group contributes a
    "CONTEXT FROM <pdf_title>" header (title taken from the group's first
    document) followed by each document's text and a blank line. Empty groups
    contribute nothing; an empty `contexts` yields "".
  """
  sections = []
  for context in contexts:
    # An empty retrieval result has no document to name in the header, so skip
    # it rather than fail on context[0].
    if not context:
      continue
    sections.append("CONTEXT FROM " + context[0].metadata['pdf_title'] + "\n")
    # Emit every document, including the only one of a single-hit group.
    sections.extend(c.page_content + "\n\n" for c in context)
  return "".join(sections)

In [23]:
def _split_questions(text):
  """Split the model's line-separated reply into stripped, non-empty questions."""
  return [line.strip() for line in text.splitlines() if line.strip()]


# Expand the user query into per-company questions, run the retriever once per
# question, then format all hits into a single context string.
# Blank lines in the model reply are dropped so the retriever is never queried
# with an empty string.
context_chain = (
  get_pdf_query_chain
  | RunnableLambda(_split_questions)
  | RunnableLambda(retriever.invoke).map()
  | parse_context
)

In [24]:
context_resp = context_chain.invoke({"user_query":"How much is Apple investing in R&D"})



In [25]:
print(context_resp)

CONTEXT FROM AAPL.10K.2023.pdf
Research and Development

The year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses.

Selling, General and Administrative

Selling, general and administrative expense was relatively flat in 2023 compared to 2022.

Apple Inc. | 2023 Form 10-K | 23

Provision for Income Taxes

Provision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions):

Apple Inc. | 2023 Form 10-K | 11

Investment in new business strategies and acquisitions could disrupt the Company’s ongoing business, present risks not originally contemplated and materially adversely affect the Company’s business, reputation, results of operations and financial condition.

The Company has invested, and in the future may invest, in new business strategies or acquisitions. Such endeavors may involve significant risks and uncertainties, including distraction of managem

In [26]:
# Answering prompt: the model is asked to answer from the retrieved context and
# to say "I don't know." when it cannot.
rag_prompt_text = (
  "You are an assistant for question-answering tasks. "
  "Use the following pieces of retrieved context to answer the question "
  "in as many words as required.\n"
  "If you don't know the answer, just say \"I don't know.\"\n"
  "Question: {question}\n"
  "Context: {context}\n"
  "Answer: \n"
)

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_text)

In [27]:
def _question_text(payload):
  """Return the question string carried by the chain input.

  A {"question": ...} dict is unwrapped to its value; any other input (for
  example a plain string) is returned unchanged. Without this step the whole
  dict would be forwarded, and both the prompt's question slot and the
  retrieval query would receive the dict's repr instead of the question.
  """
  if isinstance(payload, dict) and "question" in payload:
    return payload["question"]
  return payload


# Full RAG pipeline: the question text fills the prompt directly and also
# drives `context_chain`, whose output fills the prompt's context slot; the
# answering model's reply is returned as a plain string.
rag_chain = (
  {
    "question": RunnableLambda(_question_text),
    "context": RunnableLambda(_question_text) | context_chain,
  }
  | rag_prompt
  | ChatOpenAI(temperature=0.4, model="gpt-3.5-turbo-1106")
  | StrOutputParser()
)

In [28]:
rag_resp = rag_chain.invoke({"question":"What is apple's approach to sustainability and environmental impact?"})



In [29]:
print(rag_resp)

Apple's approach to sustainability and environmental impact involves complying with complex and changing laws and regulations worldwide, including those related to environmental, health, and safety, electronic waste, recycling, product design, and climate change. They are committed to minimizing their environmental impact and ensuring compliance with environmental regulations.


In [30]:
rag_resp = rag_chain.invoke({"question":"How does IBM's debt and equity position look?"})



In [31]:
print(context_chain.invoke("How does IBM's debt and equity position look?"))



CONTEXT FROM IBM.10K.2023.pdf
and by Customer Credit Risk on Receivables: The company’s financial performance is exposed to a wide variety of industry sector dynamics worldwide, including sudden shifts in regional or global economic activity. The company’s earnings and cash flows, as well as its access to funding, could be negatively impacted by changes in market liquidity conditions. IBM’s 2022 Annual Report to Stockholders includes information about the company’s liquidity position. The company’s client base includes many enterprises worldwide, from small and medium businesses to the world’s largest organizations and governments, with a significant portion of the company’s revenue coming from global clients across many sectors. Most of the company’s sales are on an open credit basis, and the company performs ongoing credit evaluations of its clients’ financial conditions. If the company becomes aware of information related to the creditworthiness of a major customer, or if future act

In [32]:
print(rag_resp)

Based on the information provided in the context, IBM's debt position includes long-term taxes payable of $15,457 in 2022 and $16,657 in 2023, as well as other non-current liabilities of $34,391 in 2022 and $32,485 in 2023. The company's equity position is not explicitly mentioned in the provided context. Therefore, based on the given information, it is not possible to provide a comprehensive assessment of IBM's debt and equity position.


In [33]:
rag_resp = rag_chain.invoke({"question":"Are there new market or regulatory challenges Apple faces?"})



In [35]:
print(context_chain.invoke("Are there new market or regulatory challenges Apple faces?"))



CONTEXT FROM AAPL.10K.2023.pdf
Apple Inc. | 2023 Form 10-K | 12

The Company is subject to complex and changing laws and regulations worldwide, which exposes the Company to potential liabilities, increased costs and other adverse effects on the Company’s business.

The Company’s global operations are subject to complex and changing laws and regulations on subjects, including antitrust; privacy, data security and data localization; consumer protection; advertising, sales, billing and e-commerce; financial services and technology; product liability; intellectual property ownership and infringement; digital platforms; machine learning and artificial intelligence; internet, telecommunications and mobile communications; media, television, film and digital content; availability of third-party software applications and services; labor and employment; anticorruption; import, export and trade; foreign exchange controls and cash repatriation restrictions; anti–money laundering; foreign ownership

In [36]:
print(rag_resp)

Apple faces new market and regulatory challenges due to the complex and changing laws and regulations worldwide. These challenges include antitrust, privacy, data security, consumer protection, intellectual property ownership and infringement, and environmental, health, and safety regulations. Additionally, Apple expects intense competition from rapid technological changes, frequent product introductions by competitors, aggressive pricing, and competitors with significant marketing and sales resources. The company also faces the potential impact of competitors introducing new products into the market before Apple, which could adversely affect demand for Apple's products. Furthermore, increased adoption of ARM-based semiconductor designs presents a potential challenge for Apple.


In [38]:
rag_resp = rag_chain.invoke({"question":"Are there any ongoing legal disputes or regulatory issues with IBM?"})



In [37]:
print(context_chain.invoke("Are there any ongoing legal disputes or regulatory issues with IBM?"))



CONTEXT FROM IBM.10K.2023.pdf
The Company Is Subject to Legal Proceedings and Investigatory Risks: As a company with a substantial employee population and with clients in more than 175 countries, IBM is or may become involved as a party and/or may be subject to a variety of claims, demands, suits, investigations, tax matters and other proceedings that arise from time to time in the ordinary course of its business. The risks associated with such legal proceedings are described in more detail in note R, “Commitments & Contingencies,” in IBM’s 2022 Annual Report to Stockholders. The company believes it has adopted appropriate risk management and compliance programs. Legal and compliance risks, however, will continue to exist and additional legal proceedings and other contingencies, the outcome of which cannot be predicted with certainty, may arise from time to time.

https://www.sec.gov/Archives/edgar/data/51143/000155837023002376/ibm-20221231x10k.htm

18/51

1/12/24, 6:41 AM

sec.gov/Arc

In [39]:
print(rag_resp)

Yes, IBM is subject to legal proceedings and investigatory risks, as it may become involved in claims, demands, suits, investigations, tax matters, and other proceedings that arise in the ordinary course of its business. The company believes it has adopted appropriate risk management and compliance programs, but legal and compliance risks will continue to exist, and additional legal proceedings and other contingencies may arise from time to time.
