# unpdf

## Usage Examples

__This notebook demonstrates how to use the `unpdf` package to extract text from PDFs and chunk it into paragraphs and sentences.__

### Libraries

In [1]:
# standard library
import os

# this package
import unpdf

# confirm that the imports above succeeded
print('Loaded!')

Loaded!


## I. Text Extraction from PDFs

In [2]:
# relative path to the sample PDF: <parent directory>/tests/lorem.pdf
file_path = os.path.join(os.pardir, 'tests', 'lorem.pdf')

In [3]:
# extract the text of the PDF; progress_bar=True reports progress while the pages are processed
doc = unpdf.extraction.extract_text(input_data=file_path, progress_bar=True)
# len() of the extracted document is its number of pages
print('Pages:', len(doc))

2it [00:00, 232.26it/s]

Pages: 2





In [4]:
# get the first page; each item of `doc.pages` is a PageEntity holding doc_id, page_id and the page text
doc.pages[0]

PageEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs\r\nLorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore \r\net dolore magna aliqua. Viverra vitae congue eu consequat. Egestas erat imperdiet sed euismod nisi \r\nporta lorem mollis. Viverra justo nec ultrices dui sapien eget mi proin. Vulputate enim nulla aliquet \r\nporttitor lacus. Donec ac odio tempor orci dapibus. Quis hendrerit dolor magna eget est. Pharetra \r\nmassa massa ultricies mi quis hendrerit dolor magna. Mauris ultrices eros in cursus turpis massa \r\ntincidunt. Nibh sit amet commodo nulla facilisi nullam. Ullamcorper eget nulla facilisi etiam \r\ndignissim diam quis enim. Est ultricies integer quis auctor elit. Semper quis lectus nulla at volutpat \r\ndiam ut venenatis. Dapibus ultrices in iaculis nunc sed augue lacus.\r\nQuam id leo in vitae turpis massa sed. Tellus in metus vulputate eu. Tristique risus nec feugiat in \r\nfermentum. Neque laoreet 

In [5]:
# or equivalently, index the document directly: a DocumentEntity is subscriptable and doc[i] is its i-th page
doc[0] == doc.pages[0]

True

In [6]:
# iterating over a DocumentEntity yields its pages, so counting the yielded items gives the page count
sum(1 for page in doc)

2

## II. Text Chunking (Segmentation)

In [7]:
# instantiate a Chunker with its default settings and display its repr
chunker = unpdf.chunking.Chunker()
chunker

Chunker(pipe_batch_size=16, add_sentence_metadata=False)

In [8]:
# unlike the bare repr, printing the chunker also reports the spaCy model and version that power it
print(chunker)

Chunker(pipe_batch_size=16, add_sentence_metadata=False) powered by en core_web_sm model from spaCy v3.5.0.


In [9]:
# you can chunk a DocumentEntity into paragraphs; the result is a sequence of QuasiParagraphEntity objects
paragraphs = chunker.get_quasiparagraphs(doc)
print('Paragraphs:', len(paragraphs))
paragraphs[0]

Paragraphs: 19


QuasiParagraphEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Viverra vitae congue eu consequat. Egestas erat imperdiet sed euismod nisi porta lorem mollis. Viverra justo nec ultrices dui sapien eget mi proin. Vulputate enim nulla aliquet porttitor lacus. Donec ac odio tempor orci dapibus. Quis hendrerit dolor magna eget est. Pharetra massa massa ultricies mi quis hendrerit dolor magna. Mauris ultrices eros in cursus turpis massa tincidunt. Nibh sit amet commodo nulla facilisi nullam. Ullamcorper eget nulla facilisi etiam dignissim diam quis enim. Est ultricies integer quis auctor elit. Semper quis lectus nulla at volutpat diam ut venenatis. Dapibus ultrices in iaculis nunc sed augue lacus.', paragraph_id=0)

In [10]:
# you can also chunk a single PageEntity into paragraphs
# NOTE: this rebinds `paragraphs`, which from here on holds the paragraphs of the first page only
paragraphs = chunker.get_quasiparagraphs(doc[0])
print('Paragraphs:', len(paragraphs))
paragraphs[0]

Paragraphs: 8


QuasiParagraphEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Viverra vitae congue eu consequat. Egestas erat imperdiet sed euismod nisi porta lorem mollis. Viverra justo nec ultrices dui sapien eget mi proin. Vulputate enim nulla aliquet porttitor lacus. Donec ac odio tempor orci dapibus. Quis hendrerit dolor magna eget est. Pharetra massa massa ultricies mi quis hendrerit dolor magna. Mauris ultrices eros in cursus turpis massa tincidunt. Nibh sit amet commodo nulla facilisi nullam. Ullamcorper eget nulla facilisi etiam dignissim diam quis enim. Est ultricies integer quis auctor elit. Semper quis lectus nulla at volutpat diam ut venenatis. Dapibus ultrices in iaculis nunc sed augue lacus.', paragraph_id=0)

In [11]:
# DocumentEntity, PageEntity or QuasiParagraphEntity can all be chunked into sentences
# here: every sentence of the whole document
sentences = chunker.get_sentences(doc)
print('Sentences:', len(sentences))
sentences[0]

Sentences: 137


SentenceEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', sentence_id=0, metadata=None)

In [12]:
# chunk a single PageEntity (the first page) into sentences
sentences = chunker.get_sentences(doc[0])
print('Sentences:', len(sentences))
sentences[0]

Sentences: 77


SentenceEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', sentence_id=0, metadata=None)

In [13]:
# chunk a single QuasiParagraphEntity into sentences
# (`paragraphs` was last assigned from doc[0], so this is the first paragraph of the first page)
sentences = chunker.get_sentences(paragraphs[0])
print('Sentences:', len(sentences))
sentences[0]

Sentences: 14


SentenceEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', sentence_id=0, metadata=None)

In [14]:
# derived sentences are always identical if the entity is a document or page but may be different for paragraphs;
# here the first sentence compares equal across all three sources
chunker.get_sentences(doc)[0] == chunker.get_sentences(doc[0])[0] == chunker.get_sentences(paragraphs[0])[0]

True

In [15]:
# the chunker can add sentence metadata such as tokens, noun chunks and subject/object phrases
# NOTE: this changes the existing chunker in place, so it stays enabled for later calls on this chunker
chunker.add_sentence_metadata = True

In [16]:
# chunk the first paragraph again; with add_sentence_metadata enabled,
# each SentenceEntity now carries a MetadataEntity instead of metadata=None
sentences = chunker.get_sentences(paragraphs[0])
print('Sentences:', len(sentences))
sentences[0]

Sentences: 14


SentenceEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', sentence_id=0, metadata=MetadataEntity(length=27, tokens=['Lorem', 'Ipsum', ':', '10', 'Paragraphs', 'Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', ',', 'sed', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', '.'], noun_chunks=['Lorem Ipsum', '10 Paragraphs Lorem', 'dolor sit amet', 'do eiusmod tempor incididunt', 'labore', 'et dolore magna aliqua'], subject_phrases=['Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '10 Paragraphs Lorem'], dobject_phrases=['dolor sit amet', 'do eiusmod tempor incididunt ut labore et dolore magna aliqua']))

In [17]:
# the metadata is available through the `metadata` attribute of a SentenceEntity
sentences[0].metadata

MetadataEntity(length=27, tokens=['Lorem', 'Ipsum', ':', '10', 'Paragraphs', 'Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', ',', 'sed', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', '.'], noun_chunks=['Lorem Ipsum', '10 Paragraphs Lorem', 'dolor sit amet', 'do eiusmod tempor incididunt', 'labore', 'et dolore magna aliqua'], subject_phrases=['Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '10 Paragraphs Lorem'], dobject_phrases=['dolor sit amet', 'do eiusmod tempor incididunt ut labore et dolore magna aliqua'])

In [18]:
# slice a page into windows returned as QuasiParagraphEntity objects
# judging by the output, window_size counts sentences per slice and step_size is the stride between
# slice starts, so 10/10 gives consecutive, non-overlapping 10-sentence slices -- TODO confirm against the API
chunker.get_slices_from_page(doc[0], window_size=10, step_size=10)

[QuasiParagraphEntity(doc_id='lorem.pdf', page_id=0, text='Lorem Ipsum: 10 Paragraphs Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Viverra vitae congue eu consequat. Egestas erat imperdiet sed euismod nisi porta lorem mollis. Viverra justo nec ultrices dui sapien eget mi proin. Vulputate enim nulla aliquet porttitor lacus. Donec ac odio tempor orci dapibus. Quis hendrerit dolor magna eget est. Pharetra massa massa ultricies mi quis hendrerit dolor magna. Mauris ultrices eros in cursus turpis massa tincidunt. Nibh sit amet commodo nulla facilisi nullam.', paragraph_id=0),
 QuasiParagraphEntity(doc_id='lorem.pdf', page_id=0, text='Ullamcorper eget nulla facilisi etiam dignissim diam quis enim. Est ultricies integer quis auctor elit. Semper quis lectus nulla at volutpat diam ut venenatis. Dapibus ultrices in iaculis nunc sed augue lacus. Quam id leo in vitae turpis massa sed. Tellus in metus vulputate eu. Trist

## III. Performance

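To be completed. For now, the cell below times the full pipeline (text extraction, paragraph chunking and sentence chunking) on a large PDF.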

In [19]:
%%time

file_path = os.path.join(os.pardir, 'tests', '6130-pdf-compressed.pdf')
chunker = unpdf.chunking.Chunker()

doc = unpdf.extraction.extract_text(input_data=file_path)
print(f'Pages:', len(doc))

paragraphs = chunker.get_quasiparagraphs(doc)
print('Paragraphs:', len(paragraphs))

sentences = chunker.get_sentences(doc)
print('Sentences:', len(sentences))

Pages: 838
Paragraphs: 4182
Sentences: 9834
CPU times: user 23.9 s, sys: 455 ms, total: 24.3 s
Wall time: 24.4 s
