In [13]:
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
import nest_asyncio
nest_asyncio.apply()

In [15]:
from llama_parse import LlamaParse
parser = LlamaParse(
    result_type="markdown",
)

In [12]:
import os
data_path ='./data/sample_excel.xlsx'
documents = parser.load_data(data_path)

Started parsing the file under job_id cd0ab71d-6d8f-4d8f-9eca-f05b56edfa5f


In [18]:
print(documents[0].text)

# 478479 series COVER Packer wor

|serial number|Items to be improved                  |Improvement direction                                             |Improvement plan                                                                      |feasibility assessment|Evaluation indicators|       |       |Program evaluation|Options|
|-------------|--------------------------------------|------------------------------------------------------------------|--------------------------------------------------------------------------------------|----------------------|---------------------|-------|-------|------------------|-------|
|             |                                      |                                                                  |                                                                                      |                      |technology           |economy|society|                  |       |
|1            |Product master mold surface inspection|1:Reduce the number o

In [20]:
## Set OpenAI API Key

import os

from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

llm = OpenAI()
Settings.llm = llm

In [21]:
## Build Index and QueryEngine
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [23]:
response = query_engine.query("What improvement direction was selected for 'Product Wear Rod'?")
print(str(response))

The improvement direction selected for 'Product Wear Rod' was to make changes in rod threading methods.


## Advanced RAG with LlamaParse

In [26]:
import os
data_path = './data/_10-K-2021-(As-Filed).pdf'
os.path.exists(data_path)

True

In [27]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")

Settings.llm = llm
Settings.embed_model = embed_model

In [28]:
from llama_parse import LlamaParse

documents = LlamaParse(result_type="markdown").load_data(data_path)

Started parsing the file under job_id befb89c3-c317-432d-a1d7-2402c408ca43


In [31]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page node, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes

In [32]:
page_nodes = get_page_nodes(documents)

In [34]:
print(len(page_nodes))

82


In [37]:
print(page_nodes[0].text)

# UNITED STATES SECURITIES AND EXCHANGE COMMISSION

# Washington, D.C. 20549

# FORM 10-K

(Mark One)

☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the fiscal year ended September 25, 2021

or

☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from               to          .

Commission File Number: 001-36743

# Apple Inc.

(Exact name of Registrant as specified in its charter)

|California|94-2404110|
|---|---|
|(State or other jurisdiction|(I.R.S. Employer Identification No.)|
|of incorporation or organization)| |

One Apple Park Way

Cupertino, California 95014

(Address of principal executive offices) (Zip Code)

(408) 996-1010

(Registrant’s telephone number, including area code)

# Securities registered pursuant to Section 12(b) of the Act:

|Title of each class|Trading symbol(s)|Name of each exchange on which registered|
|---|---|---|
|Common Stock, $0.00001 par va

In [38]:
from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-3.5-turbo-0125"), num_workers=8
)

In [39]:
nodes = node_parser.get_nodes_from_documents(documents)

2it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
3it [00:00, ?it/s]
2it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
5it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
2it [00:00, ?it/s]
2it [00:00, ?it/s]
3it [00:00, ?it/s]
3it [00:00, ?it/s]
2it [00:00, ?it/s]
1it [00:00, ?it/s]
2it [00:00, ?it/s]
2it [00:00, ?it/s]
1it [00:00, ?it/s]
2it [00:00, ?it/s]
2it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 

In [42]:
print(type(nodes))
print(len(nodes))

<class 'list'>
256


In [44]:
nodes[0].text

'UNITED STATES SECURITIES AND EXCHANGE COMMISSION\n\n Washington, D.C. 20549\n\n FORM 10-K\n\n(Mark One)\n\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the fiscal year ended September 25, 2021\n\nor\n\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the transition period from               to          .\n\nCommission File Number: 001-36743\n\n Apple Inc.\n\n(Exact name of Registrant as specified in its charter)'

In [45]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [48]:
objects[0].get_content()

"This table provides information about a company's state of incorporation or organization and its corresponding I.R.S. Employer Identification Number.,\nwith the following columns:\n- California: None\n- 94-2404110: None\n"

In [49]:
# dump both indexed tables and page text into the vector index
recursive_index = VectorStoreIndex(nodes=base_nodes + objects + page_nodes)

In [50]:
print(page_nodes[31].get_content())

# Apple Inc.

# CONSOLIDATED STATEMENTS OF OPERATIONS

(In millions, except number of shares which are reflected in thousands and per share amounts)

| |Years ended|September 25, 2021|September 26, 2020|September 28, 2019|
|---|---|---|---|---|
|Net sales:|Products|$ 297,392|$ 220,747|$ 213,883|
| |Services|$ 68,425|$ 53,768|$ 46,291|
| |Total net sales|$ 365,817|$ 274,515|$ 260,174|
|Cost of sales:|Products|$ 192,266|$ 151,286|$ 144,996|
| |Services|$ 20,715|$ 18,273|$ 16,786|
| |Total cost of sales|$ 212,981|$ 169,559|$ 161,782|
| |Gross margin|$ 152,836|$ 104,956|$ 98,392|
|Operating expenses:|Research and development|$ 21,914|$ 18,752|$ 16,217|
| |Selling, general and administrative|$ 21,973|$ 19,916|$ 18,245|
| |Total operating expenses|$ 43,887|$ 38,668|$ 34,462|
|Operating income| |$ 108,949|$ 66,288|$ 63,930|
|Other income/(expense), net| |$ 258|$ 803|$ 1,807|
|Income before provision for income taxes| |$ 109,207|$ 67,091|$ 65,737|
|Provision for income taxes| |$ 14,527|$ 9,680

In [53]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)
recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=5, node_postprocessors=[reranker], verbose=True
)

  for b in cls.__mro__[-1:0:-1]:


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [54]:
## Setup Baseline
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_files=[data_path])
base_docs = reader.load_data()
raw_index = VectorStoreIndex.from_documents(base_docs)
raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=5, node_postprocessors=[reranker]
)

In [None]:
## Table Query Task: Queries for Table Question Answering
query = "Purchases of marketable securities in 2020"

response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)

pre tokenize:   0%|                                                                                                          | 0/1 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 124.59it/s]



***********Basic Query Engine***********
The purchases of marketable securities in 2020 amounted to $163.4 billion.
[1;3;38;2;11;159;203mRetrieval entering 7d2d89dc-4eb2-483d-8ccf-d970e9d80d57: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Purchases of marketable securities in 2020
[0m

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 247.20it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********
The purchases of marketable securities in 2020 amounted to $171.886 billion.


In [56]:
print(response_2.source_nodes[2].get_content())

This table provides information on hedged assets and liabilities for the years 2021 and 2020, including current and non-current marketable securities and term debt.,
with the following columns:
- Hedged assets/(liabilities):: None
- 2021: None
- 2020: None

|Hedged assets/(liabilities):|2021|2020|
|---|---|---|
|Current and non-current marketable securities|$ 15,954|$ 16,270|
|Current and non-current term debt|$ (17,857)|$ (21,033)|



In [57]:
query = "effective interest rates of all debt issuances in 2021"

response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 198.62it/s]



***********Basic Query Engine***********
0.03% - 4.78%
[1;3;38;2;11;159;203mRetrieval entering 77263be9-18c7-4697-b733-b1fd6ae6afa5: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query effective interest rates of all debt issuances in 2021
[0m[1;3;38;2;11;159;203mRetrieval entering 0fa72da3-cf43-4c00-8e84-e48953ebaaf5: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query effective interest rates of all debt issuances in 2021
[0m[1;3;38;2;11;159;203mRetrieval entering 6389bddb-3700-427e-8424-dc1af9fd9c9f: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query effective interest rates of all debt issuances in 2021
[0m

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 499.02it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********
0.48% – 0.63%, 0.03% – 4.78%, 0.75% – 2.81%, 1.43% – 2.86%


In [58]:
print(response_1.source_nodes[0].get_content())

Term Debt
As of September 25, 2021, the Company had outstanding floating- and fixed-rate notes with varying maturities for an aggregate 
principal amount of $118.1 billion (collectively the “Notes”). The Notes are senior unsecured obligations and interest is payable in 
arrears. The following table provides a summary of the Company’s term debt as of September  25, 2021 and September  26, 
2020:
Maturities
(calendar year)
2021 2020
Amount
(in millions)
Effective
Interest Rate
Amount
(in millions)
Effective
Interest Rate
2013 – 2020 debt issuances:
Floating-rate notes  2022 $ 1,750 0.48% – 0.63% $ 2,250 0.60% – 1.39%
Fixed-rate 0.000% – 4.650% notes 2022 – 2060  95,813 0.03% – 4.78%  103,828 0.03% – 4.78%
Second quarter 2021 debt issuance:
Fixed-rate 0.700% – 2.800% notes 2026 – 2061  14,000 0.75% – 2.81%  —  — %
Fourth quarter 2021 debt issuance:
Fixed-rate 1.400% – 2.850% notes 2028 – 2061  6,500 1.43% – 2.86%  —  — %
Total term debt  118,063  106,078 
Unamortized premium/(discount) an

In [59]:
query = "Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020"

response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 195.20it/s]



***********Basic Query Engine***********
The U.S. Tax Cuts and Jobs Act of 2017 had an impact on income taxes in 2020, as evidenced by a decrease in the provision for income taxes compared to the previous year.
[1;3;38;2;11;159;203mRetrieval entering 95ddcd5f-dda9-40bd-b44d-03845389270b: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020
[0m[1;3;38;2;11;159;203mRetrieval entering 34e7585a-dedf-4952-b59a-1c43f7e27bbc: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020
[0m[1;3;38;2;11;159;203mRetrieval entering 24ac6da9-f295-4931-8a4a-771e5320d91a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020
[0m[1;3;38;2;11;159;203mRetrieval entering b12a4c4c-6443-48aa-9f25-a68f108e2db4: Text

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 499.32it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********
The U.S. Tax Cuts and Jobs Act of 2017 had a negative impact on income taxes in 2020, as indicated by a figure of ($582) million in the table provided.


In [60]:
query = "current state taxes per year in 2019-2021 (include +/-)"

response_1 = raw_query_engine.query(query)
print("\n***********Basic Query Engine***********")
print(response_1)

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 198.39it/s]



***********Basic Query Engine***********
$1,620 million in 2019, $455 million in 2020, $475 million in 2021.
[1;3;38;2;11;159;203mRetrieval entering 34e7585a-dedf-4952-b59a-1c43f7e27bbc: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)
[0m[1;3;38;2;11;159;203mRetrieval entering 24ac6da9-f295-4931-8a4a-771e5320d91a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)
[0m[1;3;38;2;11;159;203mRetrieval entering 95ddcd5f-dda9-40bd-b44d-03845389270b: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)
[0m[1;3;38;2;11;159;203mRetrieval entering 0596859c-09a1-4173-bae9-42984222abab: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)
[0m[1;3;38;2;11;159;203m

pre tokenize: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 499.62it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********
State current taxes for the years 2019-2021 are as follows:
- 2019: $475
- 2020: $455
- 2021: $1,620


In [61]:
print(response_2.source_nodes[0].get_content())

Summary of Federal, State, and Foreign income taxes for the years 2019, 2020, and 2021,
with the following columns:
- 2021: None
- 2020: None
- 2019: None

| |2021|2020|2019|
|---|---|---|---|
|Federal:| | | |
|Current|$ 8,257|$ 6,306|$ 6,384|
|Deferred|(7,176)|(3,619)|(2,939)|
|Total|1,081|2,687|3,445|
|State:| | | |
|Current|1,620|455|475|
|Deferred|(338)|21|(67)|
|Total|1,282|476|408|
|Foreign:| | | |
|Current|9,424|3,134|3,962|
|Deferred|2,740|3,383|2,666|
|Total|12,164|6,517|6,628|
|Provision for income taxes|$ 14,527|$ 9,680|$ 10,481|



In [63]:
!wget "https://policyholder.gov.in/documents/37343/931203/NBHTGBP22011V012223.pdf/c392bcc1-f6a8-cadd-ab84-495b3273d2c3?version=1.0&t=1669350459879&download=true" -O "./policy.pdf"

--2024-11-14 18:15:55--  https://policyholder.gov.in/documents/37343/931203/NBHTGBP22011V012223.pdf/c392bcc1-f6a8-cadd-ab84-495b3273d2c3?version=1.0&t=1669350459879&download=true
Resolving policyholder.gov.in (policyholder.gov.in)... 13.107.253.51
Connecting to policyholder.gov.in (policyholder.gov.in)|13.107.253.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1341586 (1.3M) [application/pdf]
Saving to: './policy.pdf'

     0K .......... .......... .......... .......... ..........  3%  676K 2s
    50K .......... .......... .......... .......... ..........  7%  430K 2s
   100K .......... .......... .......... .......... .......... 11% 1.59M 2s
   150K .......... .......... .......... .......... .......... 15% 2.93M 1s
   200K .......... .......... .......... .......... .......... 19% 2.95M 1s
   250K .......... .......... .......... .......... .......... 22%  323K 1s
   300K .......... .......... .......... .......... .......... 26% 6.21M 1s
   350K ........

In [64]:
## Vanila approach

from llama_parse import LlamaParse
documents = LlamaParse(result_type="markdown").load_data("./data/policy.pdf")

Started parsing the file under job_id 0620f9e2-4478-4bfa-b924-1f4258fba9f9
....

In [65]:
documents[0].text[:1000]

'# Bupa nivaHealth Insurance\n\n# 1. Preamble\n\nThis ‘Travel Infinity’ Policy is a contract of insurance between You and Us which is subject to payment of full premium in advance and the terms, conditions and exclusions of this Policy. Expense incurred outside the policy period will NOT be covered. Unutilized Sum Insured will expire at the end of policy year. All applicable benefits, details and limits are mentioned in your Certificate of insurance. We will cover only allopathic treatments in this policy.\n\n# 2. Defined Terms\n\nThe terms listed below in this Section and used elsewhere in the Policy in Initial Capitals shall have the meaning set out against them in this Section.\n\n# Standard Definitions\n\n# 2.1\n\nAccident or Accidental means sudden, unforeseen and involuntary event caused by external, visible and violent means.\n\n# 2.2\n\nCo-payment means a cost sharing requirement under a health insurance policy that provides that the policyholder/insured will bear a specified p

In [66]:
## Markdown element node parser
from llama_index.core.node_parser import MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-3.5-turbo-0125"), num_workers=8
)

In [67]:
nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, ?it/s]
2it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 1015.82it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, ?it/s]
2it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
6it [00:00, ?it/s]
0it [0

In [68]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

recursive_index = VectorStoreIndex(nodes=base_nodes + objects)

In [69]:
query_engine = recursive_index.as_query_engine(similarity_top_k=25)

In [70]:
query_1 = "My trip was delay and I paid 45, how much am I cover for?"

response_1 = query_engine.query(query_1)
print(str(response_1))

You are covered for the delay of your trip up to the amount specified in the Certificate of Insurance.


In [72]:
documents_with_instruction = LlamaParse(
    result_type="markdown",
    parsing_instruction="""
This document is an insurance policy.
When a benefits/coverage/exlusion is describe in the document ammend to it add a text in the follwing benefits string format (where coverage could be an exclusion).
For {nameofrisk} and in this condition {whenDoesThecoverageApply} the coverage is {coverageDescription}.                                  
If the document contain a benefits TABLE that describe coverage amounts, do not ouput it as a table, but instead as a list of benefits string.                                 
""",
).load_data("./data/policy.pdf")

Started parsing the file under job_id d527d9fd-22a8-40f0-a07f-172d72e5dcfc
.

In [74]:
target_page = 45
pages_vanilla = documents[0].text.split("\n---\n")
pages_with_instructions = documents_with_instruction[0].text.split("\n---\n")

print(pages_vanilla[target_page])
print("\n\n=========================================================\n\n")
print(pages_with_instructions[target_page])

IndexError: list index out of range

In [90]:
import os
from IPython.display import Markdown, display

In [None]:
## Simple example without parser
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("data_2").load_data()

In [83]:
# create an index from the documents
index = VectorStoreIndex.from_documents(documents)

In [84]:
# create a query engine for the index
query_engine = index.as_query_engine()

In [92]:
# query the engine
query = "What is full form of SMIRL?"
response = query_engine.query(query)

In [93]:
display(Markdown(f"<b>{response}</b>"))

<b>State Machine Integrated Recognition and Localization</b>

In [94]:
## Example with parser
parser = LlamaParse(
    result_type='markdown'
)
# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['data_2/Paper-2.pdf'], file_extractor=file_extractor).load_data()

Started parsing the file under job_id 473d5694-4614-4a30-a5c5-cdcdd957491b
...........

In [95]:
# create an index from the parsed markdown
index = VectorStoreIndex.from_documents(documents)

In [96]:
# create a query engine for the index
query_engine = index.as_query_engine()

In [101]:
# query the engine
query = "What is the NMAE and IoU for 'Attach Bracket'?"
response = query_engine.query(query)
display(Markdown(f"<b>{response}</b>"))

<b>The NMAE for 'Attach Bracket' is 0.12. The context does not provide information about the IoU (Intersection over Union) for 'Attach Bracket'.</b>

In [102]:
## Advanced RAG with Excel File
parser = LlamaParse(
    result_type='markdown'
)
documents = parser.load_data('./data/BP_Excel.xlsx')


Started parsing the file under job_id cdfe9e0a-0217-488c-9d02-768a91bb3146


In [103]:
len(documents)

44

In [109]:
print(documents[3].get_content())

# Summary

|Financial and Operating Information 2020 - 2024                          |           |        |        |        |        |        |        |        |        |        |        |         |        |        |        |        |        |        |        |        |        |        |        |      |      |           |
|-------------------------------------------------------------------------|-----------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|---------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|------|------|-----------|
|Group information                                                        |           |        |        |        |        |        |        |        |        |        |        |         |        |        |        |        |        |        |        |        |        |        |        |      |      |           |
|                                                 

In [132]:
llm_o1 = OpenAI(model="o1-mini")
llm_gpt4o_mini = OpenAI(model="gpt-4o-mini")
llm_o1_preview = OpenAI(model="o1-preview")

In [122]:
node_parser = MarkdownElementNodeParser(llm = llm_gpt4o_mini, num_workers=4)

In [123]:
nodes = node_parser.get_nodes_from_documents(documents[:10])

1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, 995.33it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]


In [124]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [125]:
len(nodes), len(base_nodes), len(objects)

(27, 11, 8)

In [126]:
print(objects[3].get_content())

This table presents the financial and operating information of a group from 2020 to 2024, detailing the condensed group statement of comprehensive income, including profit or loss for the period, other comprehensive income items, and total comprehensive income attributable to shareholders and non-controlling interests.,
with the following table title:
Financial and Operating Information 2020 - 2024,
with the following columns:
- Group information: None
- Condensed group statement of comprehensive income: None
- Profit (loss) for the period: None
- Other comprehensive income: None
- Total comprehensive income: None
- Attributable to: None



In [133]:
# dump both indexed tables and page text into the vector index
recursive_index = VectorStoreIndex(nodes=base_nodes + objects, llm=llm_gpt4o_mini)

recursive_query_engine_o1 = recursive_index.as_query_engine(
    similarity_top_k=5, llm=llm_o1
)

recursive_query_engine_o1_preview = recursive_index.as_query_engine(
    similarity_top_k=5, llm=llm_o1_preview
)

recursive_query_engine_gpt4o_mini = recursive_index.as_query_engine(
    similarity_top_k=5, llm=llm_gpt4o_mini
)

In [135]:
query = "What is the Sales and other operating revenues in 2020?"

# response_recursive_o1 = recursive_query_engine_o1.query(query)
# response_recursive_o1_preview = recursive_query_engine_o1_preview.query(query)
response_recursive_gpt4o_mini = recursive_query_engine_gpt4o_mini.query(query)

In [None]:
response_recursive_gpt4o_mini = recursive_query_engine_gpt4o_mini.query(query)

In [136]:
print("----------------------RESPONSE WITH GPT4O-MINI----------------------")
display(Markdown(f"{response_recursive_gpt4o_mini}"))

----------------------RESPONSE WITH GPT4O-MINI----------------------


The Sales and other operating revenues in 2020 amount to 105,944 million.

In [138]:
print(response_recursive_o1.source_nodes[0].get_content())

NameError: name 'response_recursive_o1' is not defined

In [140]:
query = "In which years the Sales and other operating revenues is greater than $1,50,000 million?"
response_recursive_gpt4o_mini = recursive_query_engine_gpt4o_mini.query(query)

In [141]:
print("----------------------RESPONSE WITH GPT4O-MINI----------------------")
display(Markdown(f"{response_recursive_gpt4o_mini}"))

----------------------RESPONSE WITH GPT4O-MINI----------------------


The Sales and other operating revenues exceed $150,000 million in the year 2022.

In [145]:
from llama_index.core import Document, VectorStoreIndex

text_list = ['text1', 'text2']
documents = [Document(text=t) for t in text_list]

# build index
index = VectorStoreIndex.from_documents(documents)

In [149]:
print(documents[0].get_content())

text1


In [150]:
from llama_index.core.node_parser import SentenceSplitter

# parse nodes
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents)

In [153]:
print(nodes[0].get_content())

text1


In [154]:
index = VectorStoreIndex(nodes)

## Advanced RAG with LLamaParse

In [None]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

In [156]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")

Settings.llm = llm
Settings.embed_model = embed_model

In [157]:
from llama_parse import LlamaParse

documents = LlamaParse(result_type="markdown").load_data("./data/template_2.xlsx")

Started parsing the file under job_id 9d6ef962-aff1-4579-8184-f68963146883


In [158]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page node, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes

In [159]:
page_nodes = get_page_nodes(documents)

In [160]:
from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-3.5-turbo-0125"), num_workers=8
)

In [161]:
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
objects[0].get_content()

1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]


'This table provides detailed information on the analysis and improvement projects related to standard operations in the 1366 CAP Packer. It includes project names, form types, serial numbers, items to be improved, analysis methods, responsible persons, start and completion dates, and status of completion for each project.,\nwith the following table title:\nAnalysis of Projects for Standard Operation Improvement in 1366 CAP Packer,\nwith the following columns:\n- Project: None\n- Project name: None\n- form type: None\n- serial number: None\n- Items to be improved (problems): None\n- Analytical methods(A. Process analysis B. Operation analysis C. Action economic principle D. Action analysis E. 5W1H F.5Why or Wn ): None\n- Analysis process (explained in the attachment): None\n- True Cause(KPIVs): None\n- Responsible person: None\n- start date: None\n- completion date: None\n- illustrate: None\n'

In [162]:
# dump both indexed tables and page text into the vector index
recursive_index = VectorStoreIndex(nodes=base_nodes + objects + page_nodes)

In [163]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)

recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=5, node_postprocessors=[reranker], verbose=True
)

In [164]:
print(len(nodes))

15


In [180]:
from IPython.display import Markdown, display
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [165]:
# Setup Baseline

from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader(input_files=['./data/template_2.xlsx'])
base_docs = reader.load_data()
raw_index = VectorStoreIndex.from_documents(base_docs)
raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=5, node_postprocessors=[reranker]
)

In [182]:
## Worksheet '478479 series COVER Packer work'
query = "What is the true cause for 'Product Manual Threading'?"

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse + Recursive Retriever Query Engine***********")
display(Markdown(f"<b>{response_2}</b>"))

[1;3;38;2;11;159;203mRetrieval entering 0206302d-7f2a-409b-8300-0f0db79b8f8a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What is the true cause for 'Product Manual Threading'?
[0m

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 125.48it/s]



***********New LlamaParse + Recursive Retriever Query Engine***********


<b>The true cause for 'Product Manual Threading' is that assembly requires neatly arranged products.</b>

In [None]:
## Worksheet '478479 series COVER Packer work'
query = "For Product wear rod, please mention the all feasible improvement solutions, \ the approved final solution,\
    and reason behind it in terms of evaluation criteria?"


response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
print(response_2)

[1;3;38;2;11;159;203mRetrieval entering 78ed5ec7-59c7-47e5-82e5-247fe25aeeb8: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query For Product wear rod, please mention the all feasible improvement solutions, the approved final solution,    and reason behind it in terms of evaluation criteria?
[0m

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 124.93it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********


<b>The feasible improvement solutions for Product wear rod are as follows:
1. Changes in rod threading methods: Check all products before threading the rod.
2. Add a jig to replace manual alignment and threading of rods.

The approved final solution for Product wear rod is to add a jig to replace manual alignment and threading of rods. This solution was chosen based on the evaluation criteria of feasibility assessment, technology, economy, and society, where it was deemed to be OK in terms of feasibility, technology, and economy, and OK in terms of society impact.</b>

In [184]:
## Dome Machine
query = "What is true cause for 'Dome Appearance and functional inspection' and final solution for it?"

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
display(Markdown(f"<b>{response_2}</b>"))

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.43it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********


<b>The true cause for 'Dome Appearance and functional inspection' is identified as a high number of self-checks, causing duplication of time and waste. The final solution proposed to address this issue is to reduce the number of appearance and functional tests of self-inspection. The improvement plan suggests conducting 3 appearance and function inspections every 10K units, reduced from the previous 2 appearance and function inspections every 10K units.</b>

In [185]:
## Dome Machine
query = "What should I do when there is a gap between 'CT' and 'TT'?"

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
display(Markdown(f"<b>{response_2}</b>"))

[1;3;38;2;11;159;203mRetrieval entering cd4a5bf2-a37d-4844-a8a5-8c9ba0bcdb98: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What should I do when there is a gap between 'CT' and 'TT'?
[0m

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.55it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********


<b>Increase operator workload and job saturation to address the gap between 'CT' and 'TT'.</b>

In [None]:
## Dome Machine
query = "Please output the implementation countermeasures for \ 
'dome Standardization of machine operation operations'?"

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
display(Markdown(f"<b>{response_2}</b>"))

[1;3;38;2;11;159;203mRetrieval entering 0206302d-7f2a-409b-8300-0f0db79b8f8a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Please output the implementation countermeasures for 'dome Standardization of machine operation operations'?
[0m

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.73it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********


<b>The implementation countermeasures for 'Dome Standardization of Machine Operation Operations' include the following:
1. For the Dome Appearance and Functional Inspection improvement project, the countermeasure implemented was to reduce the number of appearance and functional tests of self-inspection from 3 to 2 every 10K operations.
2. For the CT time and TT time difference improvement project, the countermeasure implemented was to increase operator workload and job saturation by rearranging operator work, resulting in a promotion of operator saturation from 69.4% to 90.5%.
3. For the Dome Functional Testing improvement project, the countermeasure implemented was to reduce the estimated functional measurement time from 4.9 minutes to 4.27 minutes after improvement.</b>

In [None]:
## Dome Machine
query = "Please output the implementation summary of 'dome Standardization' project?"

response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse + Recursive Retriever Query Engine***********")
display(Markdown(f"<b>{response_2}</b>"))

[1;3;38;2;11;159;203mRetrieval entering 0206302d-7f2a-409b-8300-0f0db79b8f8a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Please output the implementation summary of 'dome Standardization' project?
[0m

pre tokenize: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.98it/s]



***********New LlamaParse+ Recursive Retriever Query Engine***********


<b>The implementation summary of the 'Dome Standardization' project includes the following details:
- For the Process Capability Table:
  - Dome Functional testing improvement reduced the estimated functional measurement time from 4.9 minutes to 4.27 minutes.
  - CT time and TT time difference was addressed to reduce waiting time for the machine.
  - Product function self-inspection process was improved to reduce personnel waiting time.
- For the Standard Work Combination Table:
  - Improvement was made to increase operator workload and job saturation by rearranging operator work.
- For the Standard Worksheet:
  - Improvement was made in the appearance and functional inspection process to reduce the number of self-inspections and improve efficiency.</b>