Skip to content

Commit

Permalink
Delete - Deleting section extraction methods
Browse files Browse the repository at this point in the history
- Delete separate_section_blocks, clean_section_boxes and _fuse_blocks
  • Loading branch information
Lary15 committed Nov 1, 2021
1 parent 25f6e43 commit 1ac4262
Showing 1 changed file with 0 additions and 83 deletions.
83 changes: 0 additions & 83 deletions dodfminer/extract/pure/utils/box_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,65 +85,6 @@ def compare_blocks(block1, block2):
else:
return b1_x0-b2_x0


def clean_section_boxes(page):
    """Isolate section titles into blocks of their own.

    Every text block of the page whose text starts with a section heading
    ("SEÇÃO I", "SEÇÃO II" or "SEÇÃO III") is handed to
    ``separate_section_blocks`` so that the title ends up in a block that
    holds no additional text.

    Args:
        page: page object exposing ``getTextBlocks()`` (documented by the
            original author as a ``fitz.fitz.Page``).

    Returns:
        A flat list of block tuples: first the blocks that do not start
        with a section heading, in their original order, followed by the
        non-empty blocks produced by splitting the section blocks.
    """
    heading_pattern = "SEÇÃO (I|II|III)"
    all_blocks = page.getTextBlocks()

    # re.match anchors at the start of the text, so only blocks that BEGIN
    # with a section heading are selected here.
    with_heading = [
        blk for blk in all_blocks if re.match(heading_pattern, blk[4])
    ]

    if with_heading:
        # Each split yields a list; concatenate them and drop the empty
        # placeholders returned when a block held nothing but its title.
        pieces = reduce(
            operator.add,
            (separate_section_blocks(page, blk) for blk in with_heading),
        )
        with_heading = [piece for piece in pieces if piece]

    without_heading = [
        blk for blk in all_blocks if not re.match(heading_pattern, blk[4])
    ]

    return without_heading + with_heading


def separate_section_blocks(page, box):
    """Split a text box into its section-title span and the remaining text.

    The area covered by ``box`` is re-read from ``page`` span by span. Spans
    whose text is in ``SECTION_TITLES`` are treated as section titles; every
    other span is collected and fused into one block with ``_fuse_blocks``.

    Args:
        page: page object exposing ``getText('dict', clip=...)`` (documented
            by the original author as a ``fitz.fitz.Page``).
        box: Box tuple representing an area of text. Items 0-3 are used as
            the clip rectangle; items 5 and 6 are copied onto every span box
            built here.

    Returns:
        ``[rest, title]`` where ``rest`` is the fused non-title text (an
        empty tuple when there is none) and ``title`` is the first
        section-title span found. If the clipped area yields no block, or
        no span text is in ``SECTION_TITLES``, there is nothing to separate
        and ``[box]`` is returned unchanged.
    """
    rect = fitz.Rect(box[:4])

    def extract_box(span):
        # Mirror the block tuple layout: bbox, text, then box's items 5 and 6.
        return span['bbox'] + (span['text'], box[5], box[6])

    clipped_blocks = page.getText('dict', clip=rect)['blocks']
    if not clipped_blocks:
        # Nothing was extracted inside the rectangle; hand the box back
        # instead of failing on the [0] lookup.
        return [box]

    section_box = []
    rest_box = []

    # Only the first block inside the clip is inspected, as before.
    for line in clipped_blocks[0]['lines']:
        for span_box in map(extract_box, line['spans']):
            if span_box[4] in SECTION_TITLES:
                section_box.append(span_box)
            else:
                rest_box.append(span_box)

    if not section_box:
        # No span is exactly a section title (e.g. the title shares a span
        # with other text), so keep the box as received rather than raising
        # IndexError on section_box[0].
        return [box]

    return [_fuse_blocks(rest_box) if rest_box else (), section_box[0]]


def draw_doc_text_boxes(doc: fitz.Document, doc_boxes, save_path=None):
"""Draw extracted text blocks rectangles.
In result, a pdf file with rectangles shapes added, representing the extracted blocks,
Expand Down Expand Up @@ -175,30 +116,6 @@ def draw_doc_text_boxes(doc: fitz.Document, doc_boxes, save_path=None):
doc.save(f"{doc_path}{'/' if len(doc_path) else ''}BOXES_{doc_name}")


def _fuse_blocks(blocks):
    """Fuse a list of blocks into a single block.

    The fused block's rectangle is the bounding box of all given blocks and
    its text is their texts joined with single spaces, in the given order.

    Args:
        blocks: non-empty sequence of block tuples laid out as
            ``(x0, y0, x1, y1, text, ...)``.

    Returns:
        One tuple ``(x0, y0, x1, y1, fused_text, ...)``; the items after the
        text are copied from the first block.

    Raises:
        IndexError: if ``blocks`` is empty.
    """
    # Index first so an empty input still fails with IndexError, as before.
    trailing_fields = blocks[0][5:]

    fused_text = " ".join(block[4] for block in blocks)

    # Take the extremes over every block, not just the first and last one:
    # a block in the middle may be the widest and must stay inside the
    # fused rectangle.
    x0 = min(block[0] for block in blocks)
    y0 = min(block[1] for block in blocks)
    x1 = max(block[2] for block in blocks)
    y1 = max(block[3] for block in blocks)

    return (x0, y0, x1, y1, fused_text) + trailing_fields


def get_doc_text_lines(doc: fitz.Document):
"""Returns list of list of extracted text lines.
Expand Down

0 comments on commit 1ac4262

Please sign in to comment.