# Using LlamaParse to retrieve text from PDFs stored in either S3 bucket or local computer



In [None]:
# install modules
pip install llama-index s3fs boto3

In [None]:
# Uncomment if you are in a Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

## 1. Creating and uploading files in an S3 bucket in AWS

In [44]:
import boto3

In [None]:
endpoint_url = (
    "http://localhost:4566"  # use this line if you are using S3 via localstack
)
# endpoint_url = None  # use this line if you are using real AWS S3


bucket_name = "llama-index-test-bucket"
s3 = boto3.resource("s3", endpoint_url=endpoint_url) #creates a resource object for interacting with the S3 storage
s3.create_bucket(Bucket=bucket_name) #creates new s3 bucket
bucket = s3.Bucket(bucket_name) # selects a S3 bucket based on its name

#.upload_file() uploads the files into the selected bucket
# put the paul graham essays in the test-bucket in various subdirectories
bucket.upload_file(
    "data/paul_graham/paul_graham_essay1.txt", "essays/paul_graham_essay1.txt"
)
bucket.upload_file(
    "data/paul_graham/paul_graham_essay2.txt",
    "essays/more_essays/paul_graham_essay2.txt",
)
bucket.upload_file(
    "data/paul_graham/paul_graham_essay3.txt",
    "essays/even_more_essays/paul_graham_essay3.txt",
)

## 2. Creating an S3FileSystem Instance

In [None]:
from s3fs import S3FileSystem

In [None]:
# Use this code if you'll only use it for testing
s3_fs = S3FileSystem(key="...", secret="...")

In [None]:
# Use this code instead to reduce security risks
endpoint_url = (
    "http://localhost:4566"  # use this line if you are using S3 via localstack
)

s3_fs = S3FileSystem(anon=False, # disallows anonymous access without credentials
                     endpoint_url=endpoint_url) # should be defined elsewhere in the code

In [None]:
from llama_index.core import SimpleDirectoryReader  # pip install llama-index

## 3. Create reader object

In [None]:
# option A: create reader object that reads into S3 buckets

bucket_name = "my-document-bucket" #S3 bucket name

reader = SimpleDirectoryReader(
    input_dir=bucket_name,
    fs=s3_fs, # fs means file system. use the instance (file system object) created earlier in s3_fs
    recursive=True,  # recursively searches all subdirectories
)

In [97]:
# option B: create reader object that reads into local computer
from llama_parse import LlamaParse

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "text" # "markdown" and "text" are available
)

file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(input_dir="./sample_data", 
                            #    required_exts=[".pdf", ".docx"],
                                file_extractor=file_extractor,
                                recursive=False, # doesn't get files from subdirectories also
                                num_files_limit= 3) #sets number of documents to read


## 4. Loading data

To extract the text data using the reader object. You either use *.iter_data()* or *.load_data()*

*.load_data()* - loads ALL of the documents first (one by one) and you can't do anything but wait til everything is loaded. Outputs a list

*.iter_data()* - You can do anything each time a single document has been loaded. Outputs an object

In [98]:
# Results of using load_data()
documents = reader.load_data()
documents

Started parsing the file under job_id fab74a44-17c5-4432-948f-ab40886cc6a1
Started parsing the file under job_id ba186038-c93c-4157-a659-7e1f91edf18d
Started parsing the file under job_id 770bae17-4f1c-49ae-8d32-1e7ed0df46c6


[Document(id_='11703026-bd5d-42f3-8c80-a76c7cd5f6cc', embedding=None, metadata={'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='GLOBAL LEADER IN\nLIGHTWEIGHT ROOFING                                                                        ONDULINE® Classic\nSOLUTIONS\n\n\n\nONDULINE is well recognized globally in\nthe construction industry with 80 years\nof experience offering guaranteed\nprotection for buil

In [65]:
# Results of using iter_data()
documents = reader.iter_data()
documents

<generator object SimpleDirectoryReader.iter_data at 0x0000021EE3A99900>

In [66]:
# .iter_data() method
documents = []
doc_num = 1
for docs in reader.iter_data():
    print(f"Loading document #{doc_num}...")

    print("unstructured look of the loaded document:")
    print(docs)
    print("|")
    for doc in docs:
        # before modification
        print("Id and text obtained from the loaded document:")
        print(doc)
        print("||")
        
        # do something with the doc
        doc.text = doc.text.upper()

        # after modification
        print("modified text of the loaded document:")
        print(doc)
        print("||||")

        #add doc to all_docs
        documents.append(doc)
        
    doc_num += 1
documents #weird because it's showing docs not doc

Started parsing the file under job_id 37520e1c-e92f-4903-86c8-e19e692a84d6
Loading document #1...
unstructured look of the loaded document:
[Document(id_='b2faefc2-bcb9-4c71-a42f-e79a2ab88f6f', embedding=None, metadata={'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='GLOBAL LEADER IN\nLIGHTWEIGHT ROOFING                                                                        ONDULINE® Classic\nSOLUTIONS\n\n

[Document(id_='b2faefc2-bcb9-4c71-a42f-e79a2ab88f6f', embedding=None, metadata={'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='GLOBAL LEADER IN\nLIGHTWEIGHT ROOFING                                                                        ONDULINE® CLASSIC\nSOLUTIONS\n\n\n\nONDULINE IS WELL RECOGNIZED GLOBALLY IN\nTHE CONSTRUCTION INDUSTRY WITH 80 YEARS\nOF EXPERIENCE OFFERING GUARANTEED\nPROTECTION FOR BUIL

In [74]:
# .iter_data() method
documents = []
doc_num = 1
for idx, docs in reader.iter_data():
    for doc in docs:
        print(f"{idx} - {doc.metadata['file_name']} - {doc.text}")
        print(f"-------------------------------------------------------------------------------------------")
        documents.append(doc)
        
    doc_num += 1
documents #weird because it's showing docs not doc

Started parsing the file under job_id 114e2952-1d3b-4d07-baba-a26d1cee0355
2 - 03-2024-onduline-classic-brochure_compressed_2.pdf - GLOBAL LEADER IN
LIGHTWEIGHT ROOFING                                                                        ONDULINE® Classic
SOLUTIONS



ONDULINE is well recognized globally in
the construction industry with 80 years
of experience offering guaranteed
protection for buildings
and people beneath.



ONDULINE is present in more than100
countries      providing       clients     with
ecologically responsible roofing and
waterproofing solutions.                                Scan this to know more



ONDULINE products are asbestos-free and
are fully committed to customers
satisfaction and planet environment
preservation.



                                                      /ondulinephilippines@onduline_ph
                                                      0917 713 5616  www.ph.onduline.com
In the Philippines, ONDULINE started its
operations in 2012 in

[Document(id_='5e4874c5-f17b-4e7a-bc79-b82da54aef53', embedding=None, metadata={'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='GLOBAL LEADER IN\nLIGHTWEIGHT ROOFING                                                                        ONDULINE® Classic\nSOLUTIONS\n\n\n\nONDULINE is well recognized globally in\nthe construction industry with 80 years\nof experience offering guaranteed\nprotection for buil

In [67]:
# show the metadata of each document
for idx, doc in enumerate(documents):
    print(f"{idx} - {doc.metadata}")

0 - {'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}
1 - {'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduvilla-tile_compressed_0.pdf', 'file_name': '03-2024-onduvilla-tile_compressed_0.pdf', 'file_type': 'application/pdf', 'file_size': 2387566, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}
2 - {'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\092123_ModernBamboo_Catalogue.pdf', 'file_name': '092123_ModernBamboo_Catalogue.pdf', 'file_type': 'application/pdf', 'file_size': 9046826, 'creation_date': '2024-05-10', 'last_modified_

In [None]:
# show the file name of each document
for idx, doc in enumerate(documents):
    print(f"{idx} - {doc.metadata['file_name']}")

In [None]:
# show the file name and the text content of each document
for idx, doc in enumerate(documents):
    print(f"{idx} - {doc.metadata['file_name']} - {doc.text}")

In [68]:
# Asynchronous execution:
documents = await reader.aload_data()

print(len(documents))

Started parsing the file under job_id e883d3d8-5440-494c-85a4-a4bc6ea25e1c
Started parsing the file under job_id 30de12b2-a8fd-4af8-ab44-fbf3ec97632c
Started parsing the file under job_id 67644102-40ba-4b59-8c13-957491d7de76
3


## Others: Using prompts to extract information

You can also retrieve data from PDF using prompt instructions. Risks include returning information not explicitly stated, having misinterpretations, and incorrectly identifying characters.

In [91]:
# with parsing instruction
from llama_parse import LlamaParse

my_instructions = """
The provided document is a product brochure containing images and information about the product. it may or may not contain tables. Try to present these information in a clear and cohesive way, and in text format only.
"""

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "markdown", # "markdown" and "text" are available
    parsing_instruction=my_instructions
)

print(parser.load_data("./sample_data/hd_simplified-installation_onduline_1.pdf")[0].text)


Started parsing the file under job_id 98e541bc-11e5-4b01-a1be-d44aa2f94a77
# Onduline Product Brochure

# Onduline Product Brochure

20 x =

1

21 3 5 9 10 2

9° - 15° (15% - 27%)

2

> 15° (> 27%)

3

12du

15 cm

min

4

Onduline®

35, rue Baudin, 92300 Levallois-Perret, FRANCE

www.onduline.com

#06_FicheInstallation_Onduline.indd 1 09/12/14 09:17


In [94]:
# with parsing instruction
from llama_parse import LlamaParse

my_instructions = """
The provided document is a product brochure containing images and information about the product. Present all information about the product while considering the context of the images and its connection to the text in the document. Don't describe the product unless otherwise stated in the document.
"""

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "markdown", # "markdown" and "text" are available
    parsing_instruction=my_instructions
)

print(parser.load_data("./sample_data/hd_simplified-installation_onduline_1.pdf")[0].text)



Started parsing the file under job_id 1ec804ed-36aa-4a09-b588-f86c0513d09b
# Onduline Product Brochure

Product Information:

Dimensions: 15 cm minimum

Material: Onduline®

Address: 35, rue Baudin, 92300 Levallois-Perret, FRANCE

Website: www.onduline.com

Installation Guidelines:

Angle Range: 9° - 15° (15% - 27%)

For angles greater than 15° (> 27%), refer to section 5b

Document ID: #06_FicheInstallation_Onduline.indd 1

Date: 09/12/14 09:17


## 5. Loading Extracted Text to a Pinecone Server

In [2]:
import nest_asyncio
nest_asyncio.apply()

The extracted text must be in markdown format. Run any one of the three methods:

In [6]:
# Method 1: Loading a single doc without prompt
from llama_parse import LlamaParse

my_instructions = """
The provided document is a product brochure containing images and information about the product. Present all information about the product while considering the context of the images and its connection to the text in the document. Don't describe the product unless otherwise stated in the document.
"""

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "markdown", # "markdown" and "text" are available
    parsing_instruction=None
)

extracted_text = parser.load_data("./sample_data/hd_simplified-installation_onduline_1.pdf")
extracted_text

Started parsing the file under job_id f6c4766c-0911-4fbb-bda3-54f6c72f5e06


[Document(id_='cd6cf9e2-d75e-4add-991a-4e341007fa46', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='|20 x =|\n|---|\n|1|2|3|\n|21|3|5 9 10 2|\n|9° - 15° (15% - 27%)|\n|&gt; 15° (&gt; 27%)|\n|4|5a|5b|\n| | |12du|15 cm min|\n|Onduline®|35, rue Baudin , 92300 Levallois-Perret, FRANCE|www.onduline.com|\n|#06_FicheInstallation_Onduline.indd 1| |09/12/14 09:17|', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [7]:
# Method 2: Loading multiple docs without prompt
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "markdown" # "markdown" and "text" are available
)

file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(input_dir="./sample_data", 
                            #    required_exts=[".pdf", ".docx"],
                                file_extractor=file_extractor,
                                recursive=False, # doesn't get files from subdirectories also
                                num_files_limit= 3) #sets number of documents to read

extracted_text = reader.load_data()
extracted_text


Started parsing the file under job_id 30acbdf8-cbe1-405d-9270-d067023bcbfc
Started parsing the file under job_id 9cfbf61e-dee2-447b-820b-94323568e961
Started parsing the file under job_id 55065124-1ba5-47a6-b756-547b55ff47ad


[Document(id_='831a6990-eff1-4997-8285-febe9bbe6a04', embedding=None, metadata={'file_path': 'c:\\Users\\Shiro\\Document & Files\\Educational Materials\\Programming\\Python Files\\sample_data\\03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_name': '03-2024-onduline-classic-brochure_compressed_2.pdf', 'file_type': 'application/pdf', 'file_size': 1816568, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# GLOBAL LEADER IN LIGHTWEIGHT ROOFING SOLUTIONS\n\nONDULINE® Classic\n\nONDULINE is well recognized globally in the construction industry with 80 years of experience offering guaranteed protection for buildings and people beneath.\n\nONDULINE is present in more than 100 countries

In [3]:
# Method 3: Parsing with prompts
from llama_parse import LlamaParse

my_instructions = """
The provided document is a product brochure containing images and information about the product. Present all information about the product while considering the context of the images and its connection to the text in the document. Don't describe the product unless otherwise stated in the document.
"""

parser = LlamaParse(
    api_key = "", # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type = "markdown", # "markdown" and "text" are available
    parsing_instruction=my_instructions
)

extracted_text = parser.load_data("./sample_data/hd_simplified-installation_onduline_1.pdf")
extracted_text

Started parsing the file under job_id dbd2270b-38ff-4f18-b6e2-ab5ddba72c4d


[Document(id_='57161244-3ab9-450e-9b4c-5e7caa1a70f3', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Onduline Product Brochure\n\nProduct Information:\n\nDimensions: 15 cm minimum\n\nMaterial: Onduline®\n\nAddress: 35, rue Baudin, 92300 Levallois-Perret, FRANCE\n\nWebsite: www.onduline.com\n\nInstallation Guidelines:\n\nAngle Range: 9° - 15° (15% - 27%)\n\nFor angles greater than 15° (> 27%), refer to section 5b\n\nDocument ID: #06_FicheInstallation_Onduline.indd 1\n\nDate: 09/12/14 09:17', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

Then run these code below to obtain the embeddings and send it to the Pinecone server.

In [None]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [None]:
embeddings = OpenAIEmbeddings(openai_api_key="")

In [None]:
#initialize pinecone
pinecone.Pinecone(
    api_key="",
    environment=""
)
index_name = "" # name of index in pinecone

In [None]:
#creates embeddings to the texts and passes them up to pinecone
docsearch = Pinecone.from_texts([t.text for t in extracted_text],
                                embeddings,
                                index_name=index_name)

The index in the pinecone should be filled with vector data after the code above.