In [1]:
import requests
import ipykernel
import re
from notebook.notebookapp import list_running_servers
from notebook import notebookapp

servers = list(notebookapp.list_running_servers())

TOKEN = servers[0]['token']

base_url = next(list_running_servers())['url']
r = requests.get(
    url=base_url + 'api/sessions',
    headers={'Authorization': 'token {}'.format(TOKEN),})

r.raise_for_status()
response = r.json()

kernel_id = re.search(
    'kernel-(.*).json',
    ipykernel.connect.get_connection_file()
).group(1)

NOTEBOOK_PATH = {
    r['kernel']['id']: r['notebook']['path']
    for r in response
}[kernel_id]
# print(NOTEBOOK_PATH)

In [2]:
%%time
import os
import re
import glob
from itertools import chain

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import fitz
from dodfminer.extract.polished.acts.aposentadoria import Retirements

import unidecode
import unicodedata    # normalização igual à do DODFMiner

from utils import get_dodf_key
from sklearn.model_selection import train_test_split
from paragraph_segmentation import FORBIDDEN
from fuzzywuzzy import fuzz

PATH_PREDICT = 'marked_pdf/predict/'
PATH_REGEX = 'marked_pdf/regex/'

CPU times: user 1.48 s, sys: 499 ms, total: 1.98 s
Wall time: 1.65 s


# Preprocessamento do DODF para segmentacao 

Separa sentencas e as rotula no padrao IOB (B=sentenca de inicio do ato, I=sentenca continuacao do ato, O=sentenca nao faz parte do ato)

In [3]:
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
import glob
import re
import sys
sys.path.append('../')
import numpy as np
from itertools import chain

## Parte 1 - Identificando os DODFs com anotacao de aposentadoria

In [4]:
df = pd.read_csv("labeled.csv")
df.columns

Index(['DATA_DODF', 'NUM_DODF', 'ATO', 'EMPRESA_ATO', 'COD_MATRICULA_ATO',
       'NOME_ATO', 'CARGO', 'CLASSE', 'PADRAO', 'QUADRO', 'PROCESSO',
       'FUND_LEGAL', 'text', 'labels'],
      dtype='object')

In [5]:
from utils import get_dodf_key
from utils import get_dodf_num
from utils import get_dodf_reverse_date

def get_cands(num, date):
    date = date.replace('/', '-')
    snum = str(num)
    snum = (2 - int(np.floor(np.log10(num))))*'0' + snum 
    p = f'DODF {snum} {date}'
#     print("CAND-PATH-DOC:", p)
    return glob.glob(
        f'data/aposentadoria-ouro/pdfs/results/json/{p}*.json'
    )
    

In [6]:
%%time
dodfs = []
dic = {}
err = []
sups = []
for i in range(len(df)):
    num, data = df.loc[i, ['NUM_DODF', 'DATA_DODF']]
    if df.loc[i, 'text']:
        if (num, data) not in dic:
            dic[(num, data)] = -1
            dia, mes, ano = data.split('/')
            cands = get_cands(num, data)
            if not cands:
                err.append((num, data))
                continue
            for i in cands:
                if 'SUPLEMENTO' not in i:
                    dic[(num, data)] = i
                    dodfs.append(i)
                else:
                    print("SUP-cand:", i)
                    sups.append(i)

SUP-cand: data/aposentadoria-ouro/pdfs/results/json/DODF 030 12-02-2019 SUPLEMENTOpdf.json
SUP-cand: data/aposentadoria-ouro/pdfs/results/json/DODF 065 05-04-2019 SUPLEMENTOpdf.json
CPU times: user 4.77 s, sys: 41.2 ms, total: 4.81 s
Wall time: 4.81 s


In [7]:
len(dodfs), len(dic), len(err), len(sups)

(225, 227, 1, 2)

In [8]:
err, sups

([(76, '20/04/2018')],
 ['data/aposentadoria-ouro/pdfs/results/json/DODF 030 12-02-2019 SUPLEMENTOpdf.json',
  'data/aposentadoria-ouro/pdfs/results/json/DODF 065 05-04-2019 SUPLEMENTOpdf.json'])

In [9]:
# Caso problemático resolvido manualmente
dic[(76, '20/04/2018')] = \
    'data/aposentadoria-ouro/pdfs/DODF 076 20-04-2018 SECAO1.json'
# Os outros dois são complicados, portanto serão ignorados.

In [10]:
for i in dic:
    if dic[i] not in dodfs:
        print("Missing DODF:", i)

Missing DODF: (76, '20/04/2018')
Missing DODF: (30, '12/02/2019')


In [11]:
dodfs.append(
    'data/aposentadoria-ouro/pdfs/DODF 076 20-04-2018 SECAO1.json'
)

## Parte 2 - Anotando os documentos no padrao IOB

In [12]:
# path = "../dodfs/results/json/2018/04_Abril/DODF 019 04-04-2018 EDICAO EXTRA.json"
# f = open(path)
# data = json.load(f)

In [13]:
%%time
full_base = []
not_found = []
cnt = 0
temp = -1
for dodf in dodfs:
    f = open(dodf)
    data = json.load(f)
    aux = df.loc[(df['NUM_DODF'] == int(dodf.split('/')[-1].split()[1])) & (df['DATA_DODF'] == dodf.split('/')[-1].split()[2].replace('-', '/'))]
    for j in range(len(data)):
        data[j].append([])
    for i in aux.index:
        cnt += 1
        if not df.loc[i, 'text'] or pd.isna(df.loc[i, 'text']):
            continue
        find = False
        for j in range(len(data)):
            entity = re.search(df.loc[i, 'text'].replace('(', '\(').replace(')', '\)'), data[j][4].replace('\n', ' '))
            if entity:
                temp = entity
                find = True
                if entity.span() not in data[j][5]:
                    data[j][5].append(entity.span())
        if not find:
            not_found.append((dodf, df.loc[i, 'text'], i))
    full_base.append(data)


CPU times: user 1min 15s, sys: 213 ms, total: 1min 15s
Wall time: 1min 16s


In [14]:
flat_base =list(chain(*full_base))
com_entidade = [i for i in flat_base if len(i[-1]) > 0]
com_entidade

[[418.1085510253906,
  202.36849975585938,
  765.5305786132812,
  874.0826416015625,
  'PORTARIA No 266, DE 28 DE DEZEMBRO DE 2018\nO DIRETOR PRESIDENTE DO INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO DISTRITO\nFEDERAL, no uso das atribuicoes conferidas pelos artigos 3o, 4o e 93 da Lei Complementar no 769 de\n30 de junho de 2008, c/c o inciso I, art. 5o do anexo ao Decreto no 37.166, de 08 de marco de 2016,\nbem como pelo art. 1o do Decreto no 38.649, de 27 de novembro de 2017, resolve:\nCONCEDER, aposentadoria voluntaria integral, a servidora LUZIA APARECIDA ALVES AZEVEDO,\nmatricula n.o 42.992-9, no cargo de Inspetor Fiscal, Classe Especial, Padrao III, do Quadro de Pessoal\nda Agencia de Fiscalizacao do Distrito Federal, nos termos do artigo 3o, incisos I, II e III, e Paragrafo\nunico da Emenda Constitucional n.o 47 de 05/07/2005, combinado com o artigo 44 da Lei Complementar\nn.o 769, de 30/06/2008. Lotacao: Agencia de Fiscalizacao do Distrito Federal. Processo SEI n.o 00361-\n000232

In [15]:
df = pd.DataFrame(
    flat_base, columns=['x0', 'x', 'y0', 'y1', 'text', 'ents']
)
df['qtd_ents'] = df.ents.map(len)
df.head()

Unnamed: 0,x0,x,y0,y1,text,ents,qtd_ents
0,144.846329,190.436508,659.465149,208.861298,"ANO XLVII EDICAO No- 248\nBRASILIA - DF, TERCA...",[],0
1,202.992294,594.359863,257.645691,608.829041,SECAO I,[],0
2,141.144211,225.327972,401.498444,245.675476,SUMARIO\nSECAO I\nPAG.\nSECAO II\nPAG.\nSECAO ...,[],0
3,56.702831,253.438995,344.70282,262.265259,Poder Executivo .................................,[],0
4,56.702831,270.028778,344.85025,278.855042,Governadoria ....................................,[],0


In [16]:
def is_forbidden(s):
    for forb in FORBIDDEN:
        if s.startswith(forb):
            return True
    return False

def cond(s):
    return (len(s) > 40 
            and s.upper() != s
            and '\n' in s
            and not is_forbidden(s)
           )

df_filter = df.copy()
print(df_filter.shape)
df_filter = df_filter[df_filter.text.map(cond)]
print(df_filter.shape)

(214621, 7)
(104023, 7)


In [17]:
df.qtd_ents[df.qtd_ents > 0].sum()

5049

In [18]:
df_filter.qtd_ents[df_filter.qtd_ents > 0].sum()

5049

Portanto esse filtro não perdeu nenhum caso de aposentadoria; porém, reduziu consideravelmente a base removendo MUITOS **casos inúteis (mais de 50%)**

In [19]:
df_filter.head(2)

Unnamed: 0,x0,x,y0,y1,text,ents,qtd_ents
0,144.846329,190.436508,659.465149,208.861298,"ANO XLVII EDICAO No- 248\nBRASILIA - DF, TERCA...",[],0
9,56.702831,353.227753,344.850311,370.024231,"Secretaria de Estado de Trabalho, Desenvolvime...",[],0


In [20]:
df_filter.to_csv('blocos_com_entides.csv', index=False)
df_filter.drop('ents', axis=1).to_csv('blocos_qtd_entidades.csv',
    index=False
)