# Setup
Install the dependencies you need to run the notebook.

In [9]:
# System packages installed through Homebrew; uncomment and run once if they are missing.
# !brew install poppler tesseract libmagic

## Extract the data

Extract the elements of the PDF that we will be able to use in the retrieval process. These elements can be: Text, Images, Tables, etc.

### Partition PDF tables, text, and images

In [56]:
from unstructured.partition.auto import partition


# Partition the sample invoice and preview its first ten elements as text.
filename = "inputs/invoices/2Modern.pdf"
elements = partition(filename=filename, content_type="application/pdf")

# Slice before converting so only the ten previewed elements are stringified,
# rather than rendering every element and discarding all but ten.
print("\n\n".join(str(el) for el in elements[:10]))

%6 2MODERN —_

Company Address 35 Miller Ave #192 Invoice 263048 Mill Valley, California 94941 Quote Name PO#24-DPA3-023 United States Created Date 3/25/2025 Submit Payment To: 2Modern PO Box 840603 Los Angeles, CA 90084-0603 Prepared By Pondi Sottile Phone Direct: 925.448.9105 Email pondi@2modern.com Customer PO # PO#24-DPA3-023 Ship To Name Sorensen c/o The Advance Group Bill To Name Hollywood FL Owner, LLC c/o Source Ship To 3715 SW 30th Ave Bill To 921 SW Washington Street, Suite 518 Fort Lauderdale, Florida 33312 Portland, Oregon 97205 United States United States sidemark SM: Source / Diplomat Phase 2B / 24-DPA3-023

/ FB-410 & 24-DPA3-023 / FB-441

Sales Discount Total (Percentage) Price

Product Option/Col tit roduct ption/Color Price Quantity

TAG: FB-410 / Small PI Console Table PI Console Table - Default Title (Small / $999.00 3.00 15.00% | $2,547.45 Mahogany Dark Brown

Castle LED Wall Sconce OTS TAG: FB-441 $2,580.00} 2.00 20.00% | $4,128.00 Smoke / Black / 2 Lights

Specia

## From the Multimodal RAG strategy
Partition the PDF into chunks and set a chunking strategy.  
A character-based chunking strategy is helpful for long documents that may need to be split for RAG retrieval,  
or to target specific areas of the document for analysis.  
Studies suggest that chunks of about 512 tokens with 25% chunk overlap are effective.  
TODO: look into chunking strategies and how to avoid splitting elements (tables, etc.) in the middle.  

In [59]:
from unstructured.partition.pdf import partition_pdf

# Location of the sample invoice and of the folder passed along for extracted images.
input_path = "inputs/invoices/"
file_path = input_path + "2Modern.pdf"
image_output_dir_path = "outputs/images/"

# Chunking options are described at
# https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,

    # --- Layout and table detection ---
    strategy="hi_res",                  # required for table inference
    infer_table_structure=True,         # extract tables
    skip_infer_table_types=False,

    # --- Image extraction (replaces the deprecated extract_images_in_pdf=True) ---
    extract_image_block_types=["Image", "Table"],   # "Table" also extracts images of tables
    extract_image_block_to_payload=True,            # carry images as base64 for API usage
    # NOTE(review): recent unstructured releases appear to name this option
    # `extract_image_block_output_dir`; confirm that this keyword is still honoured.
    image_output_dir_path=image_output_dir_path,

    # --- Chunking ---
    chunking_strategy="by_title",       # alternative: "basic"
    max_characters=10000,               # library default is 500
    new_after_n_chars=6000,
    # NOTE(review): an earlier comment said this defaults to 0; the chunking docs
    # seem to give the default as max_characters — confirm.
    combine_text_under_n_chars=2000,
)

### Display information about the elements within the first chunk

In [72]:
# Show the original elements recorded in the first chunk's metadata.
chunks[0].metadata.orig_elements

[<unstructured.documents.elements.Title at 0x30bfda580>,
 <unstructured.documents.elements.Header at 0x30bfd8670>,
 <unstructured.documents.elements.Title at 0x30bfd8a60>,
 <unstructured.documents.elements.Address at 0x17fc70550>,
 <unstructured.documents.elements.Table at 0x30bfda350>,
 <unstructured.documents.elements.Title at 0x30bfd8750>,
 <unstructured.documents.elements.NarrativeText at 0x30bfda200>,
 <unstructured.documents.elements.Title at 0x30bfd9a90>,
 <unstructured.documents.elements.Title at 0x30bfd82f0>,
 <unstructured.documents.elements.Title at 0x30bfd8980>,
 <unstructured.documents.elements.Text at 0x30bfda660>,
 <unstructured.documents.elements.Title at 0x30bfd8f30>,
 <unstructured.documents.elements.EmailAddress at 0x178ca70b0>,
 <unstructured.documents.elements.Title at 0x30bfda2e0>,
 <unstructured.documents.elements.Title at 0x30bfdac80>,
 <unstructured.documents.elements.Title at 0x30bfd8520>,
 <unstructured.documents.elements.Title at 0x30bfdbcb0>,
 <unstructured

### Deeper look at what makes up the chunk

In [87]:
# Serialise the first chunk to a plain dict to inspect its type, element id and text.
chunks[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '2e2c3cdefddf45858264b6e66163f251',
 'text': '%6 2MODERN\n\n—_\n\nCompany Address 35 Miller Ave #192\n\nMill Valley, California 94941 United States\n\nInvoice 263048 Quote Name PO#24-DPA3-023 Created Date 3/25/2025\n\nSubmit Payment\n\nTo: 2Modern PO Box 840603 Los Angeles, CA 90084-0603\n\nPrepared By\n\nPondi Sottile\n\nPhone\n\nDirect: 925.448.9105\n\nEmail\n\npondi@2modern.com\n\nCustomer PO # PO#24-DPA3-023\n\nShip To Name\n\nSorensen c/o The Advance Group\n\nBill To Name\n\nBill To\n\nHollywood FL Owner, LLC c/o Source\n\n921 SW Washington Street, Suite 518 Portland, Oregon 97205 United States\n\nShip To\n\n3715 SW 30th Ave Fort Lauderdale, Florida 33312 United States\n\nsidemark SM: Source / Diplomat Phase 2B / 24-DPA3-023 / FB-410 & 24-DPA3-023 / FB-441\n\nSales Discount Total (Percentage) Price Product Option/Col tit roduct ption/Color Price Quantity TAG: FB-410 / Small PI Console Table PI Console Table - Default Title (Small / $999.

In [88]:
# Distinct element classes present among the chunks.
{str(type(chunk)) for chunk in chunks}

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [104]:
from collections import Counter

from unstructured.partition.auto import partition

fname = "inputs/invoices/2Modern.pdf"

# Partition the invoice without a chunking strategy so every layout element
# keeps its own category (Title, Table, Image, ...).
# `image_output_dir_path` is defined in the earlier partition_pdf cell.
elements = partition(
    filename=fname,
    strategy="hi_res",
    extract_image_block_types=["Image", "Table"],
    image_output_dir_path=image_output_dir_path,
    # Options tried previously and left disabled:
    #   skip_infer_table_types=False
    #   extract_image_block_to_payload=True
)

# Convenience subsets for the two element categories of interest.
tables = [el for el in elements if el.category == "Table"]
images = [el for el in elements if el.category == "Image"]

# Count elements per category. Counter is a dict subclass and preserves
# first-seen order, so the printed report matches a hand-rolled dict.
element_counts = Counter(el.category for el in elements)
for category, count in element_counts.items():
    print(category, count)

# Dump every element. A plain loop is used because a list comprehension
# here would only build a throwaway list of None values.
for el in elements:
    print(el.to_dict())

# Dump again, restricted to a single category.
type_to_print = "Table"

for el in elements:
    if el.category == type_to_print:
        print(el.to_dict())


Title 23
Header 1
Address 1
Table 2
NarrativeText 2
UncategorizedText 8
EmailAddress 1
ListItem 6
{'type': 'Title', 'element_id': 'db9749130b3f9f82e74a0d472ae034b6', 'text': '%6 2MODERN', 'metadata': {'detection_class_prob': 0.715265691280365, 'coordinates': {'points': ((np.float64(99.83700561523438), np.float64(40.807518005371094)), (np.float64(99.83700561523438), np.float64(125.9177474975586)), (np.float64(603.5902709960938), np.float64(125.9177474975586)), (np.float64(603.5902709960938), np.float64(40.807518005371094))), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2025-05-15T17:14:58', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': 'inputs/invoices', 'filename': '2Modern.pdf'}}
{'type': 'Header', 'element_id': 'b90b94c2b6b05848e8120e16804a0cbd', 'text': '—_', 'metadata': {'detection_class_prob': 0.8126082420349121, 'coordinates': {'points': ((np.float64(1459.4019775390625), np.float64(42.6826705932