### Copyright (c) 2020  Ming Liu \<Mliu54@sheffield.ac.uk\>
<br><br>
Permission is hereby granted, free of charge, to any person<br>
obtaining a copy of this software and associated documentation<br>
files (the "Software"), to deal in the Software without<br>
restriction, including without limitation the rights to use,<br>
copy, modify, merge, publish, distribute, sublicense, and/or<br>
sell copies of the Software, and to permit persons to whom the<br>
Software is furnished to do so, subject to the following<br>
conditions:<br><br>

The above copyright notice and this permission notice shall be<br>
included in all copies or substantial portions of the Software.<br>
<br>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY<br>
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE<br>
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR<br>
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR<br>
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER<br>
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR<br>
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE<br>
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.<br><br>

Project: Pdfminer.six<br>
Project URL: https://pdfminersix.readthedocs.io/en/latest/<br>
License: MIT License (MIT)  https://github.com/pdfminer/pdfminer.six/blob/develop/LICENSE<br><br>

Code: extract text and text coordinates from a PDF file<br>
Code URL: https://code-examples.net/en/q/15d65e1 <br>

In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

In [2]:
# Open the PDF in binary mode.
# NOTE(review): the handle is deliberately left open here because the page
# loop later in this notebook still reads through `document` (presumably
# pdfminer pulls page data from fp lazily -- confirm). Call fp.close() only
# after every page has been processed.
fp = open('34676-M57-0302_Iss7.pdf', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
# For an encrypted file, the password is passed as the 2nd argument.
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis (constructor defaults are used).
laparams = LAParams()

# Create the PDF page aggregator; it is the only device the interpreter
# uses, so no separate plain PDFDevice is created beforehand.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object that renders pages into `device`.
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [3]:
def parse_obj(lt_objs, start_id=0):
    """Print each horizontal text box found in ``lt_objs`` with its position.

    Objects are visited in ascending order of ``bbox[0]`` and then
    ``bbox[1]``. For every ``LTTextBoxHorizontal`` one row is printed
    containing a running ID, ``bbox[0]``, ``bbox[1]`` and the box's text;
    line breaks inside the text are followed by padding so continuation
    lines line up under the text column. ``LTFigure`` objects are descended
    into recursively. Any other object type is ignored.

    Args:
        lt_objs: Iterable of layout objects, each exposing a ``bbox``
            sequence.
        start_id: ID of the last row already printed; numbering continues
            from ``start_id + 1``. Defaults to 0, so the first row is 1.

    Returns:
        The ID of the last row printed (``start_id`` if nothing was
        printed), so a caller can continue the numbering.
    """
    ordered = sorted(lt_objs, key=lambda item: (item.bbox[0], item.bbox[1]))

    idd = start_id
    # loop over the object list
    for obj in ordered:

        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):

            # increase the running id
            idd += 1
            print("%6d, %6d, %6d, %s" % (idd, obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', "\n                        ")))

        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Carry the counter through the recursion; otherwise the nested
            # rows would start again at 1 and duplicate IDs printed above.
            idd = parse_obj(obj._objs, idd)

    return idd

In [4]:
# Print the column header once; the labels are padded/ordered to match the
# "ID, bbox[0], bbox[1], text" rows that parse_obj prints.
print("%s, %s, %s, %s\n\n" % ('    ID', 'X-axis', 'Y-axis', 'Text'))
# Loop over all pages in the document.
for page in PDFPage.create_pages(document):

    # Run the interpreter on the page, then fetch the result the
    # aggregator device produced for it.
    interpreter.process_page(page)
    layout = device.get_result()

    # Print the text boxes of this page. parse_obj is called without a
    # starting ID, so row numbering begins again at 1 for every page.
    parse_obj(layout._objs)

    ID, X-axis, Y-axis, Text


     1,   -164,    656, SILENTSTUDYSPACE1
                        
     2,   -152,   1386, BREAKOUTROOM3.5
                        
     3,    -95,    596, 30Workplaces
                        
     4,    -67,    627, (cid:20)(cid:20)(cid:22)(cid:17)(cid:24)(cid:19)(cid:3)(cid:80)(cid:240)
                        
     5,      0,      0, t
                        o
                        p
                        d
                        a
                        c
                        m
                        o
                        c
                        .
                        e
                        r
                        a
                        w
                        t
                        f
                        o
                        s
                        -
                        s
                        y
                        s
                        a
                        o
                        .
   