Delete - Deleting section extraction methods

- Delete separate_section_blocks, clean_section_boxes and _fuse_blocks
UnB-KnEDLe · Nov 1, 2021 · 1ac4262 · 1ac4262
1 parent 25f6e43
commit 1ac4262
Showing 1 changed file with 0 additions and 83 deletions.
diff --git a/dodfminer/extract/pure/utils/box_extractor.py b/dodfminer/extract/pure/utils/box_extractor.py
@@ -85,65 +85,6 @@ def compare_blocks(block1, block2):
     else:
         return b1_x0-b2_x0
 
-
-def clean_section_boxes(page):
-    """Makes sure that a section title text is always separated into
-       a single block, which does not contain additional text.
-
-    Args:
-        page: fitz.fitz.Page object to have its text blocks extracted.
-
-    Returns:
-        List[List[tuple(float, float, float, float, str, int, int, int)]]
-    """
-    blocks = page.getTextBlocks()
-
-    section_blocks = list(filter(lambda x: re.match("SEÇÃO (I|II|III)", x[4]), blocks))
-
-    if section_blocks:
-        separated_section_blocks = map(
-            lambda x: separate_section_blocks(page, x), section_blocks)
-        joined_section_blocks = reduce(operator.add, separated_section_blocks)
-        section_blocks = list(filter(None, joined_section_blocks))
-
-    not_section_blocks = list(filter(lambda x: not re.match("SEÇÃO (I|II|III)", x[4]), blocks))
-
-    return not_section_blocks + section_blocks
-
-
-def separate_section_blocks(page, box):
-    """Separates, within a single box text, a section title text from any 
-       other kind of text into different boxes.
-
-    Args:
-        page: fitz.fitz.Page object to have its bold content extracted.
-        box: Box tuple, representing an area of text.
-
-    Returns:
-        Separated boxes if necessary. If not, the same box received.
-        List[tuple(float, float, float, float, str, int, int, int)]
-    """
-    rect = fitz.Rect(box[:4])
-
-    def extract_box(x): return x['bbox'] + (x['text'], box[5], box[6])
-
-    section_box = []
-    rest_box = []
-
-    box_dict = page.getText('dict', clip=rect)['blocks'][0]['lines']
-
-    for span_dict in box_dict:
-        span_list = span_dict['spans']
-        boxes = list(map(extract_box, span_list))
-
-        section_box = section_box + \
-            list(filter(lambda x: x[4] in SECTION_TITLES, boxes))
-        rest_box = rest_box + \
-            list(filter(lambda x: not x[4] in SECTION_TITLES, boxes))
-
-    return [_fuse_blocks(rest_box) if rest_box else (), section_box[0]]
-
-
 def draw_doc_text_boxes(doc: fitz.Document, doc_boxes, save_path=None):
     """Draw extracted text blocks rectangles.
        In result, a pdf file with rectangles shapes added, representing the extracted blocks,
@@ -175,30 +116,6 @@ def draw_doc_text_boxes(doc: fitz.Document, doc_boxes, save_path=None):
         doc.save(f"{doc_path}{'/' if len(doc_path) else ''}BOXES_{doc_name}")
 
 
-def _fuse_blocks(blocks):
-    """Transform a list of blocks into one fused block.
-       The block coordinates and text are changed to represent the
-       multiple blocks as a single one.
-
-    Args:
-        blocks: a list of blocks to be fused.
-
-    Returns:
-        A single fused block tuple: (x0, y0, x1, y1, fused_text) followed by
-        the remaining fields (index 5 onward) of the first block.
-    """
-    texts = list(map(lambda x: x[4], blocks))
-    fused_text = " ".join(texts)
-
-    x0_1, y0, x1_1, *_ = blocks[0]
-    x0_2, _, x1_2, y1, *_ = blocks[-1]
-
-    x0 = min([x0_1, x0_2])
-    x1 = max([x1_1, x1_2])
-
-    fused_block = (x0, y0, x1, y1, fused_text) + blocks[0][5:]
-    return fused_block
-
-
 def get_doc_text_lines(doc: fitz.Document):
     """Returns list of list of extracted text lines.