# unpdf

## Usage Examples

__This notebook demonstrates how to use the package to extract and chunk text from PDFs.__

### Libraries

In [1]:
# standard library
import os

# this package
import unpdf

# confirmation message: reaching this line means the imports above succeeded
print('Loaded!')

Loaded!


## I. Text Extraction from PDFs

In [2]:
# sample PDF used throughout the notebook: it lives in the `tests` folder
# one level above the current working directory
file_path = os.path.join(
    os.pardir,
    'tests',
    '6130-pdf-compressed-book-i.pdf',
)

In [3]:
# extract the text from the PDF; progress_bar=True reports progress while it runs
doc = unpdf.extraction.extract_text(input_data=file_path, progress_bar=True)
# plain string literal: the label has no placeholders, so no f-prefix is needed
print('Pages:', len(doc))

41it [00:00, 1241.68it/s]

Pages: 41





In [4]:
# get the first page (index 0) via the Document's `pages` attribute
doc.pages[0]

Page(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer\r\nThis eBook is for the use of anyone anywhere at no cost\r\nand with almost no restrictions whatsoever. You may copy\r\nit, give it away or re-use it under the terms of the Project\r\nGutenberg License included with this eBook or online at\r\nhttp://www.gutenberg.org/license\r\nTitle: The Iliad of Homer\r\nAuthor: Homer\r\nRelease Date: September 2006 [Ebook 6130]\r\nLanguage: English\r\n***START OF THE PROJECT GUTENBERG EBOOK\r\nTHE ILIAD OF HOMER***', error=False)

In [5]:
# or equivalently, since Document is subscriptable by default:
# indexing the Document directly compares equal to indexing its `pages`
doc[0] == doc.pages[0]

True

In [6]:
# a Document is iterable and yields its pages; count them by iterating
sum(1 for _ in doc)

41

## II. Text Chunking (Segmentation)

In [7]:
# instantiate a Chunker with its default settings;
# the bare name on the last line displays the object's repr
chunker = unpdf.chunking.Chunker()
chunker

Chunker()

In [8]:
# printing the Chunker (its str form) also reports the spaCy model behind it
print(chunker)

Chunker() powered by en core_web_sm model from spaCy v3.5.0.


In [9]:
# you can chunk a Document into paragraphs:
# report how many were found across the whole document, then show the first one
paragraphs = chunker.get_quasiparagraphs(doc)
print('Paragraphs:', len(paragraphs))
paragraphs[0]

Paragraphs: 175


QuasiParagraph(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at http://www.gutenberg.org/license Title: The Iliad of Homer Author: Homer Release Date: September 2006 [Ebook 6130] Language: English ***START OF THE PROJECT GUTENBERG EBOOK THE ILIAD OF HOMER***', paragraph_id=0)

In [10]:
# you can also chunk a Page into paragraphs
# NOTE: this rebinds `paragraphs` to the result for the first page only
paragraphs = chunker.get_quasiparagraphs(doc[0])
print('Paragraphs:', len(paragraphs))
paragraphs[0]

Paragraphs: 1


QuasiParagraph(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at http://www.gutenberg.org/license Title: The Iliad of Homer Author: Homer Release Date: September 2006 [Ebook 6130] Language: English ***START OF THE PROJECT GUTENBERG EBOOK THE ILIAD OF HOMER***', paragraph_id=0)

In [11]:
# Document, Page or Paragraph can all be chunked into sentences;
# start with the whole Document
sentences = chunker.get_sentences(doc)
print('Sentences:', len(sentences))
sentences[0]

Sentences: 433


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.', sentence_id=0)

In [12]:
# the same call on a single Page (the first one)
sentences = chunker.get_sentences(doc[0])
print('Sentences:', len(sentences))
sentences[0]

Sentences: 4


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.', sentence_id=0)

In [13]:
# the same call on a single paragraph; `paragraphs` currently holds the
# paragraphs of the first page, so this is that page's first paragraph
sentences = chunker.get_sentences(paragraphs[0])
print('Sentences:', len(sentences))
sentences[0]

Sentences: 4


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=0, text='The Project Gutenberg EBook of The Iliad of Homer by Homer This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.', sentence_id=0)

In [14]:
# sentences derived from a Document or from a Page are always identical,
# but sentences derived from a paragraph may differ;
# here the first sentence compares equal across all three sources
chunker.get_sentences(doc)[0] == chunker.get_sentences(doc[0])[0] == chunker.get_sentences(paragraphs[0])[0]

True

In [15]:
# chunker can apply an arbitrary preprocessing function that acts on spaCy docs;
# this cell is the baseline WITHOUT any preprocessing, so that sentence 10
# can be compared against the preprocessed versions in the following cells
sentences = chunker.get_sentences(doc)
print('Sentences:', len(sentences))
sentences[10]

Sentences: 433


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=4, text='Chryses, the father of Chryseis, and priest of Apollo, comes to the Grecian camp to ransom her; with which the action of the poem opens, in the tenth year of the siege.', sentence_id=3)

In [16]:
# pass a preprocessing callable as the second argument, here the package's own
# `simple_preprocess`; the sentence count is unchanged, only the text differs
# NOTE(review): see unpdf.cleaning for what exactly simple_preprocess removes
sentences = chunker.get_sentences(doc, unpdf.cleaning.simple_preprocess)
print('Sentences:', len(sentences))
sentences[10]

Sentences: 433


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=4, text='chryse father priest come grecian camp ransom action poem open tenth year siege', sentence_id=3)

In [17]:
# custom preprocessing: keep only the tokens tagged as proper nouns (PROPN),
# joined by single spaces; the lambda argument is named `spacy_doc` so it is
# not confused with the notebook-level `doc` (the extracted Document)
sentences = chunker.get_sentences(
    doc,
    lambda spacy_doc: ' '.join(tok.text for tok in spacy_doc if tok.pos_ == 'PROPN'),
)
print('Sentences:', len(sentences))
sentences[10]

Sentences: 433


Sentence(doc_id='6130-pdf-compressed-book-i.pdf', page_id=4, text='Chryseis Apollo', sentence_id=3)

## III. Performance

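TBC. For now, the cell below times the full pipeline (text extraction, paragraph chunking and sentence chunking) on the sample PDF.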

In [18]:
%%time

# self-contained rerun of the whole pipeline, so the cell's timing covers
# Chunker construction, text extraction and both chunking steps
file_path = os.path.join(os.pardir, 'tests', '6130-pdf-compressed-book-i.pdf')
chunker = unpdf.chunking.Chunker()

# no progress bar here, so the timed output stays limited to the counts below
doc = unpdf.extraction.extract_text(input_data=file_path)
# plain string literal: the label has no placeholders, so no f-prefix is needed
print('Pages:', len(doc))

paragraphs = chunker.get_quasiparagraphs(doc)
print('Paragraphs:', len(paragraphs))

sentences = chunker.get_sentences(doc)
print('Sentences:', len(sentences))

Pages: 41
Paragraphs: 175
Sentences: 433
CPU times: user 1.37 s, sys: 39.7 ms, total: 1.41 s
Wall time: 1.41 s


## IV. Utilities

In [19]:
# download a PDF from a URL (here, the sample PDF from the package's repository)
# NOTE(review): the call returns True here, presumably signalling success;
# check unpdf.utils.download_pdf for where the file is saved
url = 'https://github.com/UNDP-Data/unpdf/raw/main/tests/6130-pdf-compressed-book-i.pdf'
unpdf.utils.download_pdf(url=url)

True