In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import re
from collections import namedtuple
from typing import List, Dict, Union, NamedTuple

import prextract.title_filter as title_filter
import fitz

In [95]:
# Input PDF to analyse. Alternatives are kept commented out for quick switching.
# PATH='2.pdf'
# PATH='25_paginas.pdf'
# PATH='2020.pdf'
PATH = 'novo-2020.pdf'
doc = fitz.open(PATH)

# Sample page (0-based index into the document) used by the exploratory cells.
pnum = 1
p = doc[pnum]
# Third MediaBox component, used as the page width throughout this notebook.
p_width = p.MediaBox[2]


class TextBlockTrans(NamedTuple):
    """Text block record extended with page information.

    Holds the block bounding box (``x0, y0, x1, y1``), its text, its block
    number, the page it is assigned to and, optionally, the page width.
    """
    x0: float
    y0: float
    x1: float
    y1: float
    text: str
    block_no: int
    page: int
    pwidth: float = None

    def __repr__(self):
        """Multi-line, tab-indented summary of the record."""
        left, top, right, bottom = self[:4]
        parts = (
            'TextBlockTrans',
            '\tbbox: ({}, {}, {}, {})'.format(left, top, right, bottom),
            '\ttext: {}'.format(self[4]),
            '\tblock_no: {}'.format(self.block_no),
            # Formatting a plain 2-tuple renders as "(page, pwidth)".
            '\tpage, pwidth: {}'.format((self.page, self.pwidth)),
        )
        return '\n'.join(parts)


In [96]:
# Two extractions of the sample page `p`: the block list from getTextBlocks()
# and the 'blocks' entry of the text page's dict extraction.
text_blocks = p.getTextBlocks()
extract = p.getTextPage().extractDICT()['blocks']
# text_blocks[4:6]

In [97]:
# Regex fragments for text that must not be accepted as a title/subtitle
# (checked against span text in the title detection code below).
_TRASH_EXPRESSIONS = [
    "SUMÁRIO",
    "DIÁRIO OFICIAL",
    "SEÇÃO (I|II|III)",
]

# All fragments joined into a single alternation pattern.
_TRASH_COMPILED = re.compile('|'.join(_TRASH_EXPRESSIONS))


def is_bold(flags):
    """Mask ``flags`` with bit 4: returns 16 when that bit is set, else 0."""
    bold_bit = 1 << 4
    return flags & bold_bit


def textBlock_topage(lis, page_width, pnum):
    """Wrap raw text-block tuples into ``TextBlockTrans`` records.

    The last element of every input tuple is dropped; the page number and
    the page width are appended after the remaining values.

    Args:
        lis: iterable of text-block tuples.
        page_width: width stored in each record's ``pwidth`` field.
        pnum: page number stored in each record's ``page`` field.
    Returns:
        A new list with one ``TextBlockTrans`` per input tuple.
    """
    tagged = []
    for raw_block in lis:
        tagged.append(TextBlockTrans(*raw_block[:-1], pnum, page_width))
    return tagged

def text_blocks_transform(text_blocks: List,
                          keep_page_width=True):
    """Renumber pages so that each half of a page gets its own number.

    A block on page ``n`` is moved to page ``2*n`` when its ``x0`` is not
    beyond half of the page width, and to ``2*n + 1`` otherwise.

    Args:
        text_blocks: ``TextBlockTrans`` records with ``page`` and ``pwidth`` set.
        keep_page_width: when false, the new records are built without a
            page width (their ``pwidth`` falls back to the field default).
    Returns:
        A new list of ``TextBlockTrans`` records; the input is not modified.
    """
    transformed = []
    for tb in text_blocks:
        width = tb.pwidth
        in_right_half = tb.x0 > (width / 2)
        new_page = tb.page * 2 + int(in_right_half)
        # tb[:-2] is everything but the trailing (page, pwidth) pair.
        leading = tb[:-2]
        if keep_page_width:
            record = TextBlockTrans(*leading, new_page, width)
        else:
            record = TextBlockTrans(*leading, new_page)
        transformed.append(record)
    return transformed

def page_transform(blocks, keep_page_width=True, inplace=False):
    """Renumber pages so that each half of a page gets its own number.

    Every dict must have at least 'page', 'page_width' and 'bbox' keys.
    Its 'page' entry becomes ``2 * page``, plus one when ``bbox[0]`` is
    greater than half of 'page_width'. Text is thereby "stacked" first by
    page and then by the left/right half it starts in.

    Args:
        blocks: List[Dict]
        keep_page_width: when false, the 'page_width' entry is removed
            from each returned dict.
        inplace: when true, the given dicts (and list) are modified and
            returned; otherwise shallow copies of the dicts are modified
            and the input is left as it was.
    Returns:
        The list of transformed dicts.
    """
    targets = blocks if inplace else [entry.copy() for entry in blocks]
    for entry in targets:
        page_no = entry['page']
        if keep_page_width:
            width = entry['page_width']
        else:
            width = entry.pop('page_width')
        left, _top, _right, _bottom = entry['bbox']
        # Does the block start in the right half of the page?
        starts_in_right_half = left > (width / 2)
        entry['page'] = page_no * 2 + int(starts_in_right_half)
    return targets


def is_title_subtitle(span):
    """Tell whether a span qualifies as a title/subtitle.

    A span qualifies when ``title_filter.BoldUpperCase.dict_text`` accepts
    it (presumably a bold/upper-case text check -- defined outside this
    notebook), its flags have the bold bit set, its text contains none of
    the trash expressions and its font name does not contain 'calibri'.
    The result is the value of the short-circuit ``and`` chain.
    """
    return (
        title_filter.BoldUpperCase.dict_text(span)
        and is_bold(span['flags'])
        and _TRASH_COMPILED.search(span['text']) is None
        and 'calibri' not in span['font'].lower()
    )

def are_title_subtitle(span_list):
    """Apply ``is_title_subtitle`` to every span, returning a list of results."""
    return list(map(is_title_subtitle, span_list))

In [98]:
# Tag the sample page's blocks with page number and width, then renumber
# pages by left/right half (page width is not kept in the result).
tb_paged = textBlock_topage(text_blocks, p_width, pnum)
tb_trans = text_blocks_transform(tb_paged, keep_page_width=False)

## VERIFICAR SE UM BLOCO É CANDIDATO A TÍTULO

In [99]:
import re
def is_upper(text: str):
    """True when upper-casing ``text`` leaves it unchanged.

    Unlike ``str.isupper``, text without any cased character (digits,
    punctuation, the empty string) also counts as upper.
    """
    uppercased = text.upper()
    return uppercased == text

In [100]:
def get_block_spans(block):
    """Flatten the spans of all lines of ``block`` into one list, in order."""
    return [
        span
        for line in block['lines']
        for span in line['spans']
    ]

# Indices of the sample page's blocks whose text passes is_upper
# (title candidates).
titles_idx = [idx for (idx, bl) in enumerate(tb_trans)
          if is_upper(bl.text)]


In [34]:
def reading_sort_tuple(lis):
    """Return a new list of blocks sorted by page, truncated ``y0``, then ``x0``.

    Items are accessed by attribute (``page``, ``y0``, ``x0``).
    """
    def reading_key(block):
        return (block.page, int(block.y0), block.x0)

    ordered = list(lis)
    ordered.sort(key=reading_key)
    return ordered

def reading_sort_dict(lis):
    """Return a new list of dicts sorted by page, truncated ``bbox[1]``, then ``bbox[0]``.

    Items are accessed by key ('page' and 'bbox').
    """
    def reading_key(item):
        bbox = item['bbox']
        return (item['page'], int(bbox[1]), bbox[0])

    ordered = list(lis)
    ordered.sort(key=reading_key)
    return ordered


def drop_dup_tbt(lis: List[TextBlockTrans]):
    """Drop blocks that start at the same (truncated) point as another one.

    Sometimes a span text appears multiple times, as if there were several
    spans starting at the same point. Blocks are keyed by the integer parts
    of their first two fields; for every key only the last block seen is
    kept, at the position where the key first appeared.
    """
    latest_by_origin = {
        tuple(int(coord) for coord in block[:2]): block
        for block in lis
    }
    return [*latest_by_origin.values()]
# Deduplicate the sample page's blocks and put them in reading order.
ret = reading_sort_tuple(drop_dup_tbt(tb_trans))
# print(ret[0])


def drop_header_footer(lis: List[tuple]):
    """Remove the top-most and bottom-most blocks of a page, in place.

    The block with the smallest ``y0`` and the block with the largest ``y0``
    are taken to be the page header and footer and are deleted from ``lis``.
    On ties, the first block with that ``y0`` is the one removed.

    Args:
        lis: list of blocks exposing a ``y0`` attribute. It is modified.
    Returns:
        The same list object, without the header/footer blocks. An empty
        list is returned unchanged; when one block is both the minimum and
        the maximum (single block, or all blocks share the same ``y0``)
        only that block is removed.
    """
    if not lis:
        return lis
    y0l = [x.y0 for x in lis]
    idx_top = y0l.index(min(y0l))
    idx_bottom = y0l.index(max(y0l))
    # A set collapses the case where both indices coincide; deleting from the
    # highest index down keeps the remaining index valid.
    for idx in sorted({idx_top, idx_bottom}, reverse=True):
        del lis[idx]
    return lis


In [35]:
from prextract.dodf_hierarchy import get_spans_by_page

def get_first_title_cands(blocks, page_width):
    """Returns first-title candidates found among the given blocks.

    Every block is tagged with page 0 and the given page width, renumbered
    by ``page_transform`` and put in reading order. The spans of all blocks
    are then flattened into a single list, and every span whose text starts
    with 'SEÇÃO I' and whose flags are bold is reported as a candidate.

    Args:
        blocks: list of block dicts with 'bbox' and 'lines' entries.
        page_width: width of the page the blocks come from.
    Returns:
        ``(sps, cands)``: the flattened span list and a list of
        ``(index_in_sps, span)`` pairs, one per candidate.
    """
    tagged = [{**b, 'page': 0, 'page_width': page_width} for b in blocks]
    ordered_blocks = reading_sort_dict(page_transform(tagged))
    sps = [
        sp
        for block in ordered_blocks
        for line in block['lines']
        for sp in line['spans']
    ]
    cands = []
    for idx, sp in enumerate(sps):
        if sp['text'].startswith('SEÇÃO I') and is_bold(sp['flags']):
            cands.append((idx, sp))
    # NOTE: sps is returned for DEBUG only; ideally it should not be needed.
    # If len(cands) > 1, we have a problem!
    return sps, cands
# Look for the first-title marker among the blocks of the document's first page.
sps, cands = get_first_title_cands(
    doc[0].getTextPage().extractDICT()['blocks'],
    doc[0].MediaBox[2],
)

In [36]:
# sps[cands[0][0]-2:cands[0][0]+3]
# Show the first candidate span and the span that immediately follows it.
print('SEÇÃO I --> ', sps[cands[0][0]])
print('TÍTULO I --> ', sps[cands[0][0]+1])

SEÇÃO I -->  {'size': 12.951430320739746, 'flags': 16, 'font': 'Arial-BoldMT', 'color': 2236191, 'text': 'SEÇÃO I', 'bbox': (202.99237060546875, 600.3304443359375, 257.645751953125, 614.7996215820312)}
TÍTULO I -->  {'size': 12.951430320739746, 'flags': 16, 'font': 'Arial-BoldMT', 'color': 2236191, 'text': 'PODER EXECUTIVO', 'bbox': (168.16795349121094, 638.8291015625, 293.4197082519531, 653.2982788085938)}


In [37]:
# Recompute the first-page spans and candidates; the raw block list is only
# needed for the call and is deleted afterwards.
bls = doc[0].getTextPage().extractDICT()['blocks']
page_width = doc[0].MediaBox[2]
sps, cands = get_first_title_cands(bls, page_width)
del bls

In [38]:
# Display the candidates together with the span following the first one.
cands, sps[cands[0][0]+1]

([(92,
   {'size': 12.951430320739746,
    'flags': 16,
    'font': 'Arial-BoldMT',
    'color': 2236191,
    'text': 'SEÇÃO I',
    'bbox': (202.99237060546875,
     600.3304443359375,
     257.645751953125,
     614.7996215820312)})],
 {'size': 12.951430320739746,
  'flags': 16,
  'font': 'Arial-BoldMT',
  'color': 2236191,
  'text': 'PODER EXECUTIVO',
  'bbox': (168.16795349121094,
   638.8291015625,
   293.4197082519531,
   653.2982788085938)})

In [90]:
def mount_doc_hierarchy(doc: fitz.Document):
    """Split the document text into (title, body texts) sections.

    The span following the first bold 'SEÇÃO I' marker of the first page is
    taken as the first title; its font size becomes the reference title
    size. Every page is then walked in reading order (duplicates, header
    and footer removed). A block is a title when all of its spans pass
    ``are_title_subtitle``, none of them matches the trash expressions and
    its first span has the reference title size. A title block that
    directly follows another block of the same size extends the current
    title (multi-line titles); otherwise it opens a new section. Any other
    block is appended to the body of the current section.

    Args:
        doc: the opened document.
    Returns:
        ``(hier, _dbg)``: ``hier`` is a list of ``(title, [block texts])``
        tuples, starting with the 'preambulo' section; ``_dbg`` holds, for
        every page, the list of span dicts that were inspected.
    Raises:
        ValueError: if a page yields a different number of blocks from
            ``getTextBlocks`` and from the dict extraction.
        IndexError: if no first-title candidate is found on the first page.
    """
    blocks_p0 = doc[0].getTextPage().extractDICT()['blocks']

    spans, candidates = get_first_title_cands(
        blocks_p0,
        doc[0].MediaBox[2]
    )

    # The span right after the first section marker is the first title.
    first_title = spans[candidates[0][0] + 1]
    print('first title: ', first_title['text'])

    TITLE_SIZE = first_title['size']

    prev_font_size = 0
    last_title = 'preambulo'
    # While a section is open its title is a list of lines; it is joined
    # into a single string once the next section starts.
    hier = [([last_title], [])]

    _dbg = []
    prev_spans = []
    for idx, page in enumerate(doc):
        _dbg.append([])
        p_width = page.MediaBox[2]

        text_blocks = page.getTextBlocks()
        extract = page.getTextPage().extractDICT()['blocks']

        if len(text_blocks) != len(extract):
            raise ValueError("different blocks len! {} vs {}".format(
                len(text_blocks), len(extract)))

        tb_paged = textBlock_topage(text_blocks, p_width, idx)
        tb_trans = text_blocks_transform(tb_paged, keep_page_width=False)

        cleaned_and_sorted = drop_header_footer(
            reading_sort_tuple(
                drop_dup_tbt(tb_trans)))

        for text_block in cleaned_and_sorted:
            spans = get_block_spans(extract[text_block.block_no])

            for sp in spans:
                sp['page'] = idx
                # Same value as doc[idx].MediaBox[2], without reloading the page.
                sp['page_width'] = p_width

            spans = reading_sort_dict(page_transform(spans))

            _dbg[-1].extend(spans)
            assert text_block[:4] == extract[text_block.block_no]['bbox']

            # A block without spans cannot be a title: keep its text as body
            # and leave the previous font size untouched.
            if not spans:
                hier[-1][1].append(text_block.text)
                prev_spans = []
                continue

            first = spans[0]

            # False-positive title: the section marker itself.
            if first['text'].startswith('SEÇÃO I') and is_bold(first['flags']):
                continue

            first_size = first['size']
            # Do we have a title?
            has_title_size = first_size == TITLE_SIZE
            not_fake = [not re.match(_TRASH_COMPILED, sp['text']) for sp in spans]
            if all(are_title_subtitle(spans)) and has_title_size and all(not_fake):
                # Check whether it extends the previous title (multiple lines).
                if first_size == prev_font_size and hier[-1][0][0] != 'preambulo':
                    print("EXTENDING {} BY {}".format(hier[-1][0], text_block.text))
                    print("PREV_SPANS: ", prev_spans)
                    print("CURR_SPANS: ", spans)
                    print("---------------")
                    hier[-1][0].append(text_block.text)
                else:
                    # Close the current section title and open a new section.
                    last = hier[-1]
                    hier[-1] = ('\n'.join(last[0]), last[1])
                    hier.append(([text_block.text], []))
            else:  # Definitely not a title/subtitle
                hier[-1][1].append(text_block.text)
            prev_font_size = first_size
            prev_spans = spans.copy()

    # Close the title of the last section.
    hier[-1] = ('\n'.join(hier[-1][0]), hier[-1][1])
    return hier, _dbg

In [101]:
# Build the section hierarchy for the whole document; _dbg holds the spans
# inspected on each page.
h, _dbg = mount_doc_hierarchy(doc)

first title:  PODER LEGISLATIVO

EXTENDING ['SECRETARIA DE ESTADO DE AGRICULTURA, '] BY ABASTECIMENTO E DESENVOLVIMENTO RURAL
PREV_SPANS:  [{'size': 10.5, 'flags': 20, 'font': 'Times-Bold', 'color': 16777215, 'text': 'SECRETARIA DE ESTADO DE AGRICULTURA, ', 'bbox': (61.051998138427734, 310.989990234375, 275.7125244140625, 325.5325012207031), 'page': 20, 'page_width': 605.0}]
CURR_SPANS:  [{'size': 10.5, 'flags': 20, 'font': 'Times-Bold', 'color': 16777215, 'text': 'ABASTECIMENTO E DESENVOLVIMENTO RURAL', 'bbox': (51.2599983215332, 322.12799072265625, 284.3059997558594, 336.6705017089844), 'page': 20, 'page_width': 605.0}]
---------------
EXTENDING ['SECRETARIA DE ESTADO DE '] BY CULTURA E ECONOMIA CRIATIVA
PREV_SPANS:  [{'size': 10.5, 'flags': 20, 'font': 'Times-Bold', 'color': 16777215, 'text': 'SECRETARIA DE ESTADO DE ', 'bbox': (357.57598876953125, 168.20602416992188, 509.259033203125, 182.74853515625), 'page': 25, 'page_width': 605.0}]
CURR_SPANS:  [{'size': 10.5, 'flags': 20, 'fon

In [92]:
# Inspect spans collected for individual pages: a slice of page index 10
# and the first span of page index 11.
_dbg[10][155:160];
_dbg[11][0]

{'size': 12.951430320739746,
 'flags': 16,
 'font': 'Arial-BoldMT',
 'color': 2236191,
 'text': 'SEÇÃO III',
 'bbox': (200.13706970214844,
  76.4350814819336,
  260.5020446777344,
  90.90425872802734),
 'page': 22,
 'page_width': 814.9600219726562}

In [102]:
# Print the document name followed by every section title, each followed by
# a separator line. Plain print() is used instead of input() so the cell
# also finishes under "Restart & Run All".
print(doc.name)
for title, _body in h:
    print(title)
    print('------')

novo-2020.pdf
preambulo
------
PODER LEGISLATIVO
------
PODER EXECUTIVO
------
SECRETARIA DE ESTADO DE ECONOMIA
------
SECRETARIA DE ESTADO DE SAÚDE
------
SECRETARIA DE ESTADO 
DE SEGURANÇA PÚBLICA
------
SECRETARIA DE ESTADO DE 
TRANSPORTE E MOBILIDADE
------
SECRETARIA DE ESTADO 
DE JUSTIÇA E CIDADANIA
------
SECRETARIA DE ESTADO DE AGRICULTURA, 
ABASTECIMENTO E DESENVOLVIMENTO RURAL
------
SECRETARIA DE ESTADO DE 
CULTURA E ECONOMIA CRIATIVA
------
SECRETARIA DE ESTADO DE 
DESENVOLVIMENTO ECONÔMICO
------
SECRETARIA DE ESTADO DE 
DESENVOLVIMENTO SOCIAL
------
SECRETARIA DE ESTADO DE MEIO AMBIENTE
------
DEFENSORIA PÚBLICA
------
TRIBUNAL DE CONTAS
------
PODER EXECUTIVO
------
SECRETARIA DE ESTADO DE ECONOMIA
------
SECRETARIA DE ESTADO DE SAÚDE
------
SECRETARIA DE ESTADO DE EDUCAÇÃO
------
SECRETARIA DE ESTADO 
DE SEGURANÇA PÚBLICA
------
SECRETARIA DE ESTADO DE 
TRANSPORTE E MOBILIDADE
------
SECRETARIA DE ESTADO 
DE JUSTIÇA E CIDADANIA
------
SECRETARIA DE ESTADO DE AGRICULTURA

In [None]:



def get_page_blocks(page: fitz.Page):
    """Return the 'blocks' entry of the dict extraction of ``page``'s text page."""
    text_page = page.getTextPage()
    page_dict = text_page.extractDICT()
    return page_dict['blocks']

# Debug walk-through of page index 20: collect all of its spans ...
p20_blocks = get_page_blocks(doc[20])
sps=[]
for lis in p20_blocks:
    sps.extend(get_block_spans(lis))

# ... tag every span with its page number and the page width ...
for sp in sps:
    sp['page'] = 20
    sp['page_width'] = doc[20].MediaBox[2]

# ... then renumber pages by left/right half and sort in reading order.
sps = page_transform(sps)
ordered = reading_sort_dict(sps)