In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator


def extract_layout_by_page(pdf_path):
    """
    Run pdfminer's layout analysis on every page of a PDF.

    Parameters
    ----------
    pdf_path : path of the PDF file; it is opened in binary mode.

    Returns
    -------
    list
        One aggregated layout result (``device.get_result()``) per page,
        in the order ``PDFPage.create_pages`` yields the pages.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    laparams = LAParams()

    # The parser reads from `fp` while pages are being processed, so the whole
    # extraction stays inside the `with` block; the handle is closed afterwards
    # (also when an exception is raised).
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            layouts.append(device.get_result())

    return layouts

# Parse the sample PDF once; the cells below reuse `page_layouts`.
example_file = "StandardTest.pdf"
page_layouts = extract_layout_by_page(example_file)

In [2]:
# Number of page layouts extracted from the PDF.
len(page_layouts)

1

In [3]:
# Distinct layout object types present on the first page.
objects_on_page = {type(item) for item in page_layouts[0]}
objects_on_page

{pdfminer.layout.LTCurve,
 pdfminer.layout.LTLine,
 pdfminer.layout.LTRect,
 pdfminer.layout.LTTextBoxHorizontal}

In [4]:
import pdfminer

# Layout container types that `extract_characters` iterates into in order to
# reach the individual characters they hold.
TEXT_ELEMENTS = [
    pdfminer.layout.LTTextBox,
    pdfminer.layout.LTTextBoxHorizontal,
    pdfminer.layout.LTTextLine,
    pdfminer.layout.LTTextLineHorizontal
]

def flatten(lst):
    """Concatenate the items of every sub-iterable of ``lst`` into one flat list (one level only)."""
    merged = []
    for group in lst:
        merged.extend(group)
    return merged


def extract_characters(element):
    """
    Collect every ``LTChar`` reachable from ``element``.

    An ``LTChar`` yields itself; an instance of one of ``TEXT_ELEMENTS`` or a
    plain ``list`` is walked child by child, recursively; anything else
    contributes nothing.

    Returns a flat list of ``LTChar`` objects in traversal order.
    """
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]

    walkable = isinstance(element, tuple(TEXT_ELEMENTS)) or isinstance(element, list)
    if not walkable:
        return []

    found = []
    for child in element:
        found.extend(extract_characters(child))
    return found

current_page = page_layouts[0]

texts = []
rtLines = []
rects = []

# Separate the page's horizontal text boxes, lines and rectangles.
# Layout objects of any other type are ignored.
for e in current_page:
    if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
        texts.append(e)
    elif isinstance(e, pdfminer.layout.LTLine):
        rtLines.append(e)
    elif isinstance(e, pdfminer.layout.LTRect):
        rects.append(e)

# Reduce every character to a plain record: the four bbox values followed by
# the character's text. Later cells compare indices 0/2 horizontally and 1/3
# vertically, and read the text from index 4.
characters = extract_characters(texts)
characterlist = []
for c in characters:
    # Use the public accessor instead of reaching into the private `_text`.
    characterlist.append([c.bbox[0], c.bbox[1], c.bbox[2], c.bbox[3], c.get_text()])
    
def sortCharacters(listOfChar):
    """
    Return the character records reordered, without modifying the input list.

    Parameters
    ----------
    listOfChar : list of records indexable as ``[x0, y0, x1, y1, text]``;
        only indices 1 and 2 are read here.

    Returns
    -------
    list
        A new list holding the same record objects. Records are first sorted
        by index 1 in descending order (stable, so ties keep their incoming
        order). A second insertion pass then moves a record in front of any
        directly preceding records that have a strictly larger index-1 value
        and an identical index-2 value.
    """
    # Sort a fresh list so the caller's list keeps its original order
    # (the records themselves are shared, not copied).
    ordered = sorted(listOfChar, key=lambda x: x[1], reverse=True)

    for i in range(1, len(ordered)):
        value = ordered[i]
        hole = i - 1
        # Shift preceding records right while they sit higher (larger index 1)
        # and share the same right edge (index 2) as `value`.
        while hole >= 0 and ordered[hole][1] > value[1] and ordered[hole][2] == value[2]:
            ordered[hole + 1] = ordered[hole]
            hole -= 1
        ordered[hole + 1] = value
    return ordered

# Reorder the character records, then split the page's line objects into
# horizontal and vertical separators based on their bounding boxes.
characterlist = sortCharacters(characterlist)

horizontalSeperators = []
verticalSeperators = []
for r in rtLines:
    # Despite the names, x2 and x4 are the second and fourth bbox values
    # (compared to detect a horizontal line); x1 and x3 are the first and third.
    x1, x2, x3, x4 = r.bbox[:4]
    if x2 == x4:
        horizontalSeperators.append([x1, x2, x3, x4])
    elif x1 == x3:
        verticalSeperators.append([x1, x2, x3, x4])

In [5]:
def extractTableData(listOfChar, horizontal_separators=None, vertical_separators=None, tolerance=2):
    """
    Group character records into table cells delimited by separator lines.

    Parameters
    ----------
    listOfChar : list of records indexable as ``[x0, y0, x1, y1, text]``.
    horizontal_separators : list of ``[a, b, c, d]`` records whose index 1 is
        the line's vertical position. Defaults to the notebook-level
        ``horizontalSeperators``.
    vertical_separators : list of ``[a, b, c, d]`` records whose index 0 is
        the line's horizontal position. Defaults to the notebook-level
        ``verticalSeperators``.
    tolerance : slack added on every side of a cell when testing whether a
        character lies inside it (default 2, the value previously hard-coded).

    Returns
    -------
    list
        ``rows[h][v]`` is the list of character records inside the cell
        bounded by separators ``h``/``h+1`` and ``v``/``v+1``.

    Notes
    -----
    The separator lists are used in the order given (no sorting happens here).
    Entry ``h`` is treated as the upper edge and ``h+1`` as the lower edge of a
    row; entry ``v`` as the left edge and ``v+1`` as the right edge of a column.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing single-argument calls keep working.
    if horizontal_separators is None:
        horizontal_separators = horizontalSeperators
    if vertical_separators is None:
        vertical_separators = verticalSeperators

    rowData = []
    for h in range(len(horizontal_separators) - 1):
        top = horizontal_separators[h][1]
        bottom = horizontal_separators[h + 1][1]
        cellData = []
        for v in range(len(vertical_separators) - 1):
            left = vertical_separators[v][0]
            right = vertical_separators[v + 1][0]
            words = [
                char for char in listOfChar
                if left <= char[0] + tolerance
                and right >= char[2] - tolerance
                and top >= char[3] - tolerance
                and bottom <= char[1] + tolerance
            ]
            cellData.append(words)
        rowData.append(cellData)
    return rowData

In [6]:
# Character records grouped per table cell; indexed below as TableData[i][j].
TableData = extractTableData(characterlist)

In [7]:
def returnStringfromCharacterlist(listOfChar):
    """
    Join character records into a string, inserting spaces and line breaks.

    Parameters
    ----------
    listOfChar : list of records indexable as ``[x0, y0, x1, y1, text]``,
        already in reading order.

    Returns
    -------
    str
        The texts (index 4) concatenated in order. After each character except
        the last, a separator is chosen by comparing it with the next one:
        a newline when its index-1 value is greater than the next record's,
        otherwise a space when the rounded gap between its right edge (index 2)
        and the next record's left edge (index 0) exceeds 1, otherwise nothing.
        An empty input yields ``""``.
    """
    # An empty cell has no characters; there is no "last character" to append.
    if not listOfChar:
        return ""

    pieces = []
    for current, following in zip(listOfChar, listOfChar[1:]):
        pieces.append(current[4])
        if current[1] > following[1]:
            pieces.append("\n")
        elif round(current[2]) - round(following[0]) < -1:
            pieces.append(" ")
    pieces.append(listOfChar[-1][4])
    return "".join(pieces)

In [8]:
# Show the text of the table cell at row index 1, column index 1.
print(returnStringfromCharacterlist(TableData[1][1]))

It’s a good product that can
make the life of a person
great.


In [9]:
def returnRelativeCharactersWRTWords(listOfChar, words, operation):
    """
    Return the character records lying on one side of a phrase.

    Parameters
    ----------
    listOfChar : list of records indexable as ``[x0, y0, x1, y1, text]``.
    words : phrase to locate. Spaces are removed before matching, so the
        remaining characters must appear consecutively in ``listOfChar``
        (compared against index 4 of each record).
    operation : one of ``"LeftOf"``, ``"RightOf"``, ``"Below"``, ``"Above"``.

    Returns
    -------
    list
        Records of ``listOfChar``, in their original order, selected relative
        to the FIRST character of the first match of the phrase:
        ``LeftOf``  keeps records whose index 0 is smaller than its index 0,
        ``RightOf`` keeps records whose index 2 is larger than its index 2,
        ``Below``   keeps records whose index 1 is smaller than its index 1,
        ``Above``   keeps records whose index 3 is larger than its index 3.

    Raises
    ------
    ValueError
        If ``operation`` is not recognised, ``words`` holds no non-space
        character, or the phrase does not occur in ``listOfChar``.
        (``ValueError`` is a subclass of ``Exception``, which was raised before.)
    """
    # operation -> (record index used as reference, keep values smaller than it?)
    edges = {
        "LeftOf": (0, True),
        "RightOf": (2, False),
        "Below": (1, True),
        "Above": (3, False),
    }
    if operation not in edges:
        raise ValueError("unknown operation: %r" % (operation,))

    target = list(words.replace(" ", ""))
    if not target:
        raise ValueError("words must contain at least one non-space character")

    index, keep_smaller = edges[operation]
    texts = [char[4] for char in listOfChar]
    width = len(target)

    # Compare the whole phrase (including its last character) at every start
    # position where it still fits, so the scan never reads past the list end.
    for start in range(len(texts) - width + 1):
        if texts[start:start + width] == target:
            # NOTE(review): the reference edge comes from the first matched
            # character only, so "RightOf" also returns the rest of the phrase.
            position = listOfChar[start][index]
            break
    else:
        raise ValueError("%r not found in the character list" % (words,))

    if keep_smaller:
        return [char for char in listOfChar if char[index] < position]
    return [char for char in listOfChar if char[index] > position]

In [10]:
# Print everything located to the left of the phrase "Amrud Bagan".
# NOTE(review): the rendered output of this cell shows a name, phone number and
# e-mail address from the sample PDF; consider clearing it before sharing.
print(returnStringfromCharacterlist(returnRelativeCharactersWRTWords(characterlist,"Amrud Bagan","LeftOf")))

Name : Vivek Kumar
Phone No : +91 8105694842
Email: vivekkumarvik@
gmail.com
Name Description
Bag It’s a good product
make the life of a
great.
Shoes Wear it well
from Main Road withCar Wheeli Wali gaddi
Multiple wheel
Mobile Best Tool
Laptop For Working and
games


In [11]:
# NOTE(review): disabled draft kept as a bare string literal, so nothing here is
# executed. As written it reads `position` and `characterList`, which are not
# defined inside the function, and never uses its `offset` parameter.
"""
def returnRelativeCharacters(listOfChar,offset,operation):
    if(operation == "LeftOf"):
        for k in range(len(listOfChar)):
            if(listOfChar[k][0]< position):
                characterList.append(listOfChar[k])
        return characterList  
    elif(operation == "RightOf"):
        for k in range(len(listOfChar)):
            if(listOfChar[k][2] > position):
                characterList.append(listOfChar[k])
        return characterList 
    elif(operation == "Below"):
        for k in range(len(listOfChar)):
            if(listOfChar[k][1]< position):
                characterList.append(listOfChar[k])
        return characterList 
    elif(operation == "Above"):
        for k in range(len(listOfChar)):
            if(listOfChar[k][3]> position):
                characterList.append(listOfChar[k])
        return characterList
"""

'\ndef returnRelativeCharacters(listOfChar,offset,operation):\n    if(operation == "LeftOf"):\n        for k in range(len(listOfChar)):\n            if(listOfChar[k][0]< position):\n                characterList.append(listOfChar[k])\n        return characterList  \n    elif(operation == "RightOf"):\n        for k in range(len(listOfChar)):\n            if(listOfChar[k][2] > position):\n                characterList.append(listOfChar[k])\n        return characterList \n    elif(operation == "Below"):\n        for k in range(len(listOfChar)):\n            if(listOfChar[k][1]< position):\n                characterList.append(listOfChar[k])\n        return characterList \n    elif(operation == "Above"):\n        for k in range(len(listOfChar)):\n            if(listOfChar[k][3]> position):\n                characterList.append(listOfChar[k])\n        return characterList\n'