In [1]:
import re
from bisect import bisect_right

import pandas as pd

In [2]:
def rem_texts_begin_end_page(text, p_begin_page1, p_begin_page2, p_end_page):
    """Strip the page header and footer from one page of text.

    Both header patterns are searched in ``text``. When both match, everything
    up to the later of the two match ends is dropped. The footer pattern is
    then searched in what remains and, when found, everything from the start
    of that match onwards is dropped as well.

    Args:
        text: Page text to clean.
        p_begin_page1: First header pattern (string or compiled regex).
        p_begin_page2: Second header pattern (string or compiled regex).
        p_end_page: Footer pattern (string or compiled regex).

    Returns:
        ``text`` unchanged when either header pattern has no match; otherwise
        the text after the header, cut before the footer when the footer
        pattern matches.
    """
    header_hits = (re.search(p_begin_page1, text), re.search(p_begin_page2, text))

    # Both header rules must match; otherwise the page is returned as received.
    if any(hit is None for hit in header_hits):
        return text

    # The header ends where the later of the two matches ends.
    body = text[max(hit.end() for hit in header_hits):]

    footer = re.search(p_end_page, body)
    return body if footer is None else body[:footer.start()]

In [3]:
def extracted_texts_to_df(texts_indexes, base_df, column_names):
    """Build a dataframe pairing each extracted text with its document row.

    Args:
        texts_indexes: Iterable of ``[index_label, text]`` items, where
            ``index_label`` is a label of ``base_df``'s index.
        base_df: Dataframe whose row values are copied in front of each text.
        column_names: Column names of the result: one per column of
            ``base_df`` plus one for the text.

    Returns:
        A ``pd.DataFrame`` with one row per item of ``texts_indexes``.
    """
    records = [
        # All values of the source row, followed by the extracted text.
        [*base_df.loc[item[0], base_df.columns].values, item[1]]
        for item in texts_indexes
    ]
    return pd.DataFrame(records, columns=column_names)

In [4]:
def row_list_regex(index, text, pattern_ext, pattern_blk):
    """Run the extract and block regexes over one document.

    Args:
        index: Identifier of the document, returned as the first list item.
        text: Document text to search.
        pattern_ext: Pattern for extract headers (string or compiled regex).
        pattern_blk: Pattern for any block header (string or compiled regex).

    Returns:
        ``[index, findall(pattern_ext), finditer(pattern_ext),
        finditer(pattern_blk)]`` when ``pattern_ext`` matches at least once;
        ``None`` otherwise.
    """
    # findall is only used to tell whether the document has any extract at all.
    ext_found = re.findall(pattern_ext, text)
    ext_matches = re.finditer(pattern_ext, text)
    blk_matches = re.finditer(pattern_blk, text)

    if not ext_found:
        # Nothing of interest in this document.
        return None

    return [index, ext_found, ext_matches, blk_matches]

In [5]:
def mapped_positions_regex(matched_texts):
    """Collect the start offsets of the regex matches of each document.

    Args:
        matched_texts: Iterable of items shaped like the lists returned by
            ``row_list_regex``: item ``[0]`` is the document identifier,
            ``[2]`` an iterable of extract matches and ``[3]`` an iterable of
            block matches.

    Returns:
        One ``[identifier, extract_starts, block_starts]`` list per item,
        where both position lists hold the ``start()`` of every match.
    """
    def start_offsets(matches):
        return [found.start() for found in matches]

    return [
        [entry[0], start_offsets(entry[2]), start_offsets(entry[3])]
        for entry in matched_texts
    ]

In [6]:
def extract_texts_from_mapped_positions(mapped_positions, base_df, text_column):
    """Slice every extract text out of its source document.

    An extracted text starts at an extract position and runs up to the next
    block position after it, or to the end of the document when no block
    follows.

    The next block is located with a binary search instead of
    ``list.index``: an extract start that is absent from the block list no
    longer raises ``ValueError``, and each lookup is logarithmic rather than
    a linear scan.

    Args:
        mapped_positions: Iterable of ``[index_label, extract_starts,
            block_starts]`` items. ``block_starts`` must be in ascending
            order (as produced by iterating ``re.finditer`` matches).
        base_df: Dataframe holding the document texts, indexed by the labels
            used in ``mapped_positions``.
        text_column: Name of the column of ``base_df`` containing the text.

    Returns:
        A list of ``[index_label, extracted_text]`` pairs, in the order the
        extract positions were given.
    """
    extracted_texts = []

    for mapped_position in mapped_positions:
        doc_index = mapped_position[0]
        ext_starts = mapped_position[1]
        blk_starts = mapped_position[2]

        mapped_text = base_df.loc[doc_index, text_column]

        for ia in ext_starts:
            # First block start strictly greater than the extract start.
            next_blk = bisect_right(blk_starts, ia)
            # A None upper bound slices to the end of the document.
            ib = blk_starts[next_blk] if next_blk < len(blk_starts) else None

            extracted_texts.append([doc_index, mapped_text[ia:ib]])

    return extracted_texts

In [7]:
# Regex sources for the page header (two rules) and the page footer.
pattern_new_page1 = r"ÁGINA\s([0-9]{1,5})"
pattern_new_page2 = r"N(.+?)20([0-9]{2})"
pattern_end_page = r"\nEste documento pode ser verificado no"

# Compiled once here and reused for every page.
p_new_page1, p_new_page2, p_end_page = (
    re.compile(source)
    for source in (pattern_new_page1, pattern_new_page2, pattern_end_page)
)

In [8]:
# Header line of a contract extract. Raw strings keep the regex escapes
# (\s, \/, \-, ...) from being read as invalid Python string escapes, which
# emit DeprecationWarning/SyntaxWarning; the compiled regexes match exactly
# the same text as before.
pattern_extrato = r"(\nxx([a-z]{0,10})\sEXTRAT([A-Z]{0,3})\sD([A-Z]{0,3})\sCONTRAT([A-Z]{0,3}))"
p_ext = re.compile(pattern_extrato)

# Header line of any block: an upper-case title between two "xx..." tags,
# or an extract header.
pattern_bloco = r"(\nxx([a-z]{0,10})\s([A-ZÀ-Ú0-9º\/x\-\.\,\(\)\|\*&\s\']{5,}) xx([a-z]{0,10}))"
pattern_bloco = pattern_bloco + "|" + pattern_extrato

p_blk = re.compile(pattern_bloco)

In [9]:
# Load the source table from a local parquet file. The cells below read the
# columns "year", "file_name", "page", "text", "number", "day" and "month";
# presumably there is one row per page (TODO confirm against the file).
data = pd.read_parquet("df.parquet.gzip")

In [10]:
# Keep only the rows of 2021; the copy detaches the selection from `data`.
dodf_selected = data.loc[data["year"] == 2021].copy()

In [11]:
# Drop the reference to the full table; the cells below only use the filtered copy.
del data

In [13]:
# Order the rows by file and page so that pages are later joined in reading order.
dodf_selected = dodf_selected.sort_values(by=["file_name", "page"])

In [12]:
# Strip the page header and footer from every page. Only the "text" column is
# needed, so map over that Series instead of building a full row object per
# record with DataFrame.apply(axis=1); this also yields a Series (not a
# DataFrame) when the selection is empty.
dodf_selected["text_corr"] = dodf_selected["text"].map(
    lambda page_text: rem_texts_begin_end_page(page_text, p_new_page1, p_new_page2, p_end_page)
)

In [14]:
# Join all pages of each document into a single row, separated by newlines.
dodf_selected = (
    dodf_selected
    .groupby(["file_name", "number", "day", "month", "year"])["text_corr"]
    .apply("\n".join)
    .reset_index()
)

In [15]:
# Run the regexes over every document and keep only those with at least one
# extract. Iterating the text column directly avoids a row-wise
# DataFrame.apply (which returns a DataFrame on an empty frame), and the
# None check uses identity rather than `!=`.
matched_texts = [
    matched_text
    for matched_text in (
        row_list_regex(doc_index, doc_text, p_ext, p_blk)
        for doc_index, doc_text in dodf_selected["text_corr"].items()
    )
    if matched_text is not None
]

In [16]:
# Turn the match iterators into lists of start offsets: one
# [index, extract_starts, block_starts] entry per matched document.
ext_blk_list = mapped_positions_regex(matched_texts)

In [17]:
# Slice each extract out of the joined document text ("text_corr"), giving a
# list of [index, extracted_text] pairs.
textos_extraidos = extract_texts_from_mapped_positions(ext_blk_list, dodf_selected, "text_corr")

In [18]:
# Attach the document metadata columns to every extracted text.
ext_df = extracted_texts_to_df(
    textos_extraidos,
    dodf_selected[["file_name", "number", "day", "month", "year"]],
    ["file_name", "number", "day", "month", "year", "text"],
)

In [19]:
# Preview the first five extracted texts together with their document metadata.
ext_df.head(5)

Unnamed: 0,file_name,number,day,month,year,text
0,DODF 001 04-01-2021,1,4,1,2021,\nxxbcet EXTRATO DO CONTRATO DE EXECUÇÃO DE OB...
1,DODF 001 04-01-2021,1,4,1,2021,\nxxbcet EXTRATO DO CONTRATO DE EXECUÇÃO DE OB...
2,DODF 001 04-01-2021,1,4,1,2021,\nxxbcet EXTRATO DO CONTRATO DE EXECUÇÃO DE OB...
3,DODF 001 04-01-2021,1,4,1,2021,\nxxbcet EXTRATO DO CONTRATO Nº41470/2020 xxec...
4,DODF 001 04-01-2021,1,4,1,2021,\nxxbcet EXTRATO DO CONTRATO Nº42437/2020 xxec...
