# Try the PDFMiner.six lib

# Test simple text PDFs

In [1]:
from pdfminer.high_level import extract_text

In [2]:
# Sample PDF for this section; `_uri` is read again by the per-page cell below.
_uri = "../data/__test_pdfs/simple-text.pdf"

# Extract the text of the whole document in a single call and display it.
text = extract_text(_uri)
print(text)

### What is Albert?

Albert is a groundbreaking venture that combines state-of-the-art electric vehicles with a robust
software-as-a-service (SaaS) subscription model. As a pioneer in both automotive
manufacturing and technology solutions, Albert aims to redefine the landscape of transportation
and renewable energy.

#### **Automotive Manufacturing**

Albert designs, engineers, and manufactures a diverse range of electric vehicles that set
industry standards for efficiency, performance, and innovation. From compact sedans to large
commercial trucks, Albert's EVs are built with cutting-edge technology, offering not just a mode
of transportation but a lifestyle choice.

#### **Software Features**

Albert offers a subscription-based service that enhances the capabilities of its electric vehicles,
elevating the user experience to unparalleled levels. Here's a glimpse of the software modules
available:

1. **Albert Drive**: This module provides Level 4 autonomous driving capabilities, allow

In [3]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import open_filename


def iter_text_per_page(pdf_file, password='', page_numbers=None, maxpages=0,
                       caching=True, codec='utf-8', laparams=None):
    """Yield the extracted text of a PDF one page at a time.

    Each page is rendered into its own fresh ``StringIO`` buffer, so the
    text of one page never leaks into the next.

    Args:
        pdf_file: Handed to ``open_filename(pdf_file, "rb")``; presumably a
            path or an already-open binary file object (confirm against
            pdfminer).
        password: Forwarded to ``PDFPage.get_pages``.
        page_numbers: Forwarded to ``PDFPage.get_pages`` as its second
            positional argument.
        maxpages: Forwarded to ``PDFPage.get_pages``.
        caching: Forwarded to both ``PDFResourceManager`` and
            ``PDFPage.get_pages``.
        codec: Forwarded to ``TextConverter``.
        laparams: Layout parameters for ``TextConverter``; a default
            ``LAParams()`` is created when ``None``.

    Yields:
        ``(index, text)`` tuples, where ``index`` starts at 1 and increases
        by one for every page produced by ``PDFPage.get_pages``.

    NOTE(review): ``index`` is a running count of yielded pages. If
    ``page_numbers`` selects a subset it presumably does not equal the
    page's position in the document - confirm before relying on it.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp:
        # One resource manager is shared by all pages of the document.
        rsrcmgr = PDFResourceManager(caching=caching)

        pages = PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for idx, page in enumerate(pages, start=1):
            # A new buffer and converter per page keeps the output separated.
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec,
                                       laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                yield idx, output_string.getvalue()

In [4]:
# Walk the sample PDF page by page; after the loop `page_text` still holds
# the text of the last page, which the next cell displays.
for count, page_text in iter_text_per_page(_uri):
    # A single call writes the page followed by one blank separator line.
    print(f'page# {count}:\n{page_text}', end='\n\n')

page# 1:
### What is Albert?

Albert is a groundbreaking venture that combines state-of-the-art electric vehicles with a robust
software-as-a-service (SaaS) subscription model. As a pioneer in both automotive
manufacturing and technology solutions, Albert aims to redefine the landscape of transportation
and renewable energy.

#### **Automotive Manufacturing**

Albert designs, engineers, and manufactures a diverse range of electric vehicles that set
industry standards for efficiency, performance, and innovation. From compact sedans to large
commercial trucks, Albert's EVs are built with cutting-edge technology, offering not just a mode
of transportation but a lifestyle choice.

#### **Software Features**

Albert offers a subscription-based service that enhances the capabilities of its electric vehicles,
elevating the user experience to unparalleled levels. Here's a glimpse of the software modules
available:

1. **Albert Drive**: This module provides Level 4 autonomous driving capabiliti

In [5]:
page_text

"#### **The Albert Ecosystem**\n\nWhat sets Albert apart is its unique ecosystem that allows both hardware and software to evolve\ntogether. The vehicles are designed to be software-compatible from the get-go, with regular\nover-the-air updates that add new features and improve existing ones.\n\nBy combining top-tier electric vehicle manufacturing with a dynamic software subscription\nmodel, Albert is not just reimagining the automotive industry; it's revolutionizing the way we\ninteract with technology and the world around us. Albert offers a cohesive, scalable, and\nfuture-proof solution for modern transportation and energy needs.\n\n\x0c"

# Test PDFs containing only one table

In [None]:
# pdfplumber is used by every cell from here on but is not imported in any
# earlier cell, so import it at its first point of use.
import pdfplumber

_uri = "../data/__test_pdfs/table.pdf"
# Print the raw text of the first two pages (pdf.pages is indexed from 0).
with pdfplumber.open(_uri) as pdf:
    print("PAGE # 1")
    text = pdf.pages[0].extract_text()
    print(text)
    print("PAGE # 2")
    text = pdf.pages[1].extract_text()
    print(text)

In [None]:
# Try extract_table() (single result per page) on the first two pages of the
# PDF that `_uri` currently points to, printing each result as-is.
with pdfplumber.open(_uri) as pdf:
    print("PAGE # 1")
    table = pdf.pages[0].extract_table()
    print(table)
    print("PAGE # 2")
    table = pdf.pages[1].extract_table()
    print(table)

In [None]:
# Try extract_tables() (plural) on the first two pages of the PDF that `_uri`
# currently points to, printing each page's result as-is.
with pdfplumber.open(_uri) as pdf:
    print("PAGE # 1")
    tables = pdf.pages[0].extract_tables()
    print(tables)
    print("PAGE # 2")
    tables = pdf.pages[1].extract_tables()
    print(tables)

# Test table & text PDFs

In [None]:
# Switch to the mixed table + text sample; later cells reuse `_uri`.
_uri = "../data/__test_pdfs/table-text.pdf"
with pdfplumber.open(_uri) as pdf:
    print("PAGE # 1")
    # Raw text of the first page (pdf.pages is indexed from 0).
    text = pdf.pages[0].extract_text()
    print(text)

In [None]:
# Same first page as the previous cell, this time through extract_table().
with pdfplumber.open(_uri) as pdf:
    print("PAGE # 1")
    table = pdf.pages[0].extract_table()
    print(table)

# Test complex text PDFs

In [None]:
_uri = "../data/__test_pdfs/complex_text.pdf"

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 1
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    # The result is bound to `table`, matching the other EXTRACT TABLE cells,
    # rather than to the misleading name `text`.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 60
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 60
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    # The result is bound to `table`, matching the other EXTRACT TABLE cells,
    # rather than to the misleading name `text`.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

# Test complex_text-tables-images PDFs

In [None]:
_uri = "../data/__test_pdfs/complex_text-tables-images.pdf"

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 1
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    # The result is bound to `table`, matching the other EXTRACT TABLE cells,
    # rather than to the misleading name `text`.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 4
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 4
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    # The result is bound to `table`, matching the other EXTRACT TABLE cells,
    # rather than to the misleading name `text`.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 6
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 6
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    # The result is bound to `table`, matching the other EXTRACT TABLE cells,
    # rather than to the misleading name `text`.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 7
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 8
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # later EXTRACT TABLEs cells in this notebook do.
    if not tables:
        print("No tables found")

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 8
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_table() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 9
task = "EXTRACT TABLE"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    table = pdf.pages[page_no - 1].extract_table()
    print(table)

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 11
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # later EXTRACT TABLEs cells in this notebook do.
    if not tables:
        print("No tables found")

In [None]:
_uri = "../data/__test_pdfs/complex_columns-text-tables-images.pdf"

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 1
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # later EXTRACT TABLEs cells in this notebook do.
    if not tables:
        print("No tables found")

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 4
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # later EXTRACT TABLEs cells in this notebook do.
    if not tables:
        print("No tables found")

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 5
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # later EXTRACT TABLEs cells in this notebook do.
    if not tables:
        print("No tables found")

# Example from DOC

In [None]:
_uri = "../data/__test_pdfs/GeoBase_NHNC1_Data_Model_UML_EN.pdf"


In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 1
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 2
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit; a plain literal is enough because the
    # message has no placeholders.
    if not tables:
        print("No tables found")

In [None]:
# Dump the text of one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TEXT"
with pdfplumber.open(_uri) as pdf:
    
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    text = pdf.pages[page_no - 1].extract_text()
    print(text)

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 3
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit instead of printing nothing, as the
    # neighbouring EXTRACT TABLEs cells in this section do.
    if not tables:
        print("No tables found")

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 5
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit; a plain literal is enough because the
    # message has no placeholders.
    if not tables:
        print("No tables found")

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 7
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit; a plain literal is enough because the
    # message has no placeholders.
    if not tables:
        print("No tables found")

In [None]:
# Run extract_tables() on one page of the PDF at `_uri` (set in an earlier cell).
page_no = 12
task = "EXTRACT TABLEs"
with pdfplumber.open(_uri) as pdf:
    # Banner naming the task and the page being shown.
    print(f"task: {task}\nPAGE # {page_no}\n***\n\n")
    # page_no is 1-based; pdf.pages is indexed from 0.
    tables = pdf.pages[page_no - 1].extract_tables()
    # Print every result, numbered from 0.
    for no, table in enumerate(tables):
        print(f"t_no: {no}")
        print(table)

    # Make an empty result explicit; a plain literal is enough because the
    # message has no placeholders.
    if not tables:
        print("No tables found")