In [1]:
import requests
import ipykernel
import re
from notebook.notebookapp import list_running_servers
from notebook import notebookapp

# Discover the path of THIS notebook by asking the Jupyter server which
# session owns the kernel we are running in. NOTEBOOK_PATH is later written
# into the README files produced next to each CSV.
servers = list(notebookapp.list_running_servers())
if not servers:
    raise RuntimeError('No running Jupyter notebook server was found.')

# Take the token and the URL from the SAME server entry. Enumerating the
# servers twice (once for the token, once for the URL) could pair values
# coming from two different servers.
# NOTE(review): only the first running server is queried; if several servers
# are running this may not be the one hosting this notebook.
server = servers[0]
TOKEN = server['token']
base_url = server['url']

r = requests.get(
    url=base_url + 'api/sessions',
    headers={'Authorization': 'token {}'.format(TOKEN)},
    timeout=10,  # never hang the notebook on an unresponsive server
)
r.raise_for_status()
response = r.json()

# The connection file is named kernel-<kernel id>.json; the dot is escaped
# so that it only matches a literal '.'.
kernel_id = re.search(
    r'kernel-(.*)\.json',
    ipykernel.connect.get_connection_file()
).group(1)

# Map kernel id -> notebook path. Sessions without a 'notebook' entry are
# skipped instead of raising KeyError.
NOTEBOOK_PATH = {
    session['kernel']['id']: session['notebook']['path']
    for session in response
    if 'notebook' in session
}[kernel_id]

In [3]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter
import unidecode

import glob
import re
import gc

from dodfminer.extract.polished.acts.aposentadoria import Retirements
from drawBoxes.drawBoxes import draw2

import fitz
from utils import get_dodf_num
from utils import get_dodf_tipo
from utils import get_dodf_reverse_date
from utils import get_dodf_key
import pickle

In [4]:
def get_doc_blocks(doc):
    """Return every text block of `doc` as one flat list.

    `doc` is iterated page by page; for each page the tuples returned by
    `page.getTextBlocks()` are copied with the 1-based page number appended
    as an extra last element.
    """
    return [
        (*block, page_number)
        for page_number, page in enumerate(doc, start=1)
        for block in page.getTextBlocks()
    ]

def get_dodfs_path():
    """Return the paths of all PDFs under data/aposentadoria-ouro/pdfs, sorted.

    `glob.glob` yields files in an arbitrary, filesystem-dependent order, so
    the result is sorted to make any slice of it (e.g. the first N files)
    reproducible between runs and machines.
    """
    return sorted(glob.glob('data/aposentadoria-ouro/pdfs/*.pdf'))

def wRetirements(s):
    """Wrap `Retirements` so it can be fed a raw text string.

    The text is first transliterated with `unidecode.unidecode_expect_ascii`
    and then handed to `Retirements` through its `txt` keyword, with `None`
    and `'regex'` as the two positional arguments.
    NOTE(review): presumably these are the file path and the extraction
    backend; confirm against the dodfminer signature.
    """
    ascii_text = unidecode.unidecode_expect_ascii(s)
    return Retirements(None, 'regex', txt=ascii_text)

# Columns used together as the (multi-)index of every count table.
INDEX_COL = ['data', 'num', 'tipo']

# How many PDFs to process. None keeps every file (the slice becomes [:None]);
# use a small integer such as 3 for a quick trial run.
NUM_FILES = None

PATHS = get_dodfs_path()[:NUM_FILES]

### Construção do DataFrame ouro e DataFrame com nomes dos arquivos

In [5]:
def load_ouro(path, sep):
    """Load the gold-standard CSV and index it by `INDEX_COL`.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must provide a `DATA_DODF` column
        (parsed as dates) plus the columns that, once lower-cased and
        stripped of the `_dodf` suffix, are named `num`, `pagina`, `tipo`
        and `nome_ato`.
    sep : str
        Field separator of the CSV file. It is forwarded to
        `pandas.read_csv` (previously it was ignored and '®' was always used).

    Returns
    -------
    pandas.DataFrame
        Rows sorted by (data, num, pagina, tipo) and indexed by `INDEX_COL`.
        The remaining columns are `pagina` (int16), `nome_ato` and `index`,
        the latter holding the row label each record had in the CSV.
    """
    douro = pd.read_csv(path, sep=sep, engine='python',
                        parse_dates=['DATA_DODF'])
    douro = douro.rename(
        lambda x: x.lower().replace('_dodf', ''), axis=1
    )
    # Keep only the last whitespace-separated word of `tipo`.
    douro['tipo'] = douro.tipo.map(lambda x: x.split()[-1])

    columns = ['data', 'num', 'pagina', 'tipo', 'nome_ato']
    # Explicit copy: the columns assigned below must not be written to a
    # view of the frame read from disk (avoids SettingWithCopyWarning).
    douro = douro[columns].copy()

    # `pagina` is text whose last word is the page number.
    douro['pagina'] = douro.pagina.map(
        lambda x: x.split()[-1]
    ).astype(np.int16)
    douro['num'] = douro.num.astype(np.int16)

    # Sort while `data` is still a datetime, then store it as text.
    douro = douro.sort_values(by=['data', 'num', 'pagina', 'tipo'])
    douro['data'] = douro.data.astype(str)

    # Preserve the original row labels before they are replaced by the
    # (data, num, tipo) index.
    douro['index'] = douro.index
    return douro.set_index(INDEX_COL, drop=True)


def build_file_df():
    """Build a table with one row per PDF returned by `get_dodfs_path`.

    Columns, in order: `data`, `num`, `files`, `tipo`.
    `files` is the file name (text after the last '/'); `num` and `data` are
    whatever `get_dodf_num` and `get_dodf_reverse_date` return for that name.
    `tipo` is the file name with the "DODF <3 digits> ..." prefix and the
    ".pdf" suffix removed, where 'SECAO1' and 'INTEGRA' are mapped to
    'NORMAL' and only the last word is kept.
    Rows are sorted by (data, num, tipo), comparing `data` as datetimes and
    `num` as int16.
    """
    def sort_key(column):
        # `sort_values` calls the key once for each sort column.
        if column.name == 'data':
            return pd.to_datetime(column)
        if column.name == 'num':
            return column.astype(np.int16)
        return column

    def file_tipo(file_name):
        tipo = re.sub(r'DODF \d{3} [^A-Z]+|[.]pdf', '', file_name)
        if tipo == 'SECAO1':
            tipo = 'INTEGRA'
        return tipo.replace('INTEGRA', 'NORMAL').split()[-1]

    df = pd.DataFrame()
    df['files'] = [
        file_path.split('/')[-1] for file_path in get_dodfs_path()
    ]
    df['num'] = df.files.map(get_dodf_num)
    df['data'] = df.files.map(get_dodf_reverse_date)

    # Same order as reversing the three columns created above.
    df = df[['data', 'num', 'files']]

    df['tipo'] = df.files.map(file_tipo)
    return df.sort_values(['data', 'num', 'tipo'], key=sort_key)

In [6]:
# Gold-standard CSV; its fields are separated by the '®' character.
path = 'data/aposentadoria-ouro/aposentaria-ouro-®.csv'
douro = load_ouro(path=path, sep='®')

# Table of the PDF files on disk, built only as a sanity check.
dfiles = build_file_df()

In [7]:
# Peek at the first rows of the gold-standard table.
douro.head(n=22)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pagina,nome_ato,index
data,num,tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-02,62,NORMAL,35,ANDREIA MARIA ALVES DE BARROS,964
2018-04-02,62,NORMAL,35,MARLENE DELFINO DA ROCHA,966
2018-04-02,62,NORMAL,35,EFREM MARQUES MOREIRA,967
2018-04-02,62,NORMAL,35,ADEMAR ALVES BEZERRA,968
2018-04-02,62,NORMAL,35,MARCIO ROBERTO FREITAS DA SILVA,969
2018-04-02,62,NORMAL,35,LUIZ CARLOS PEREIRA DA CUNHA,970
2018-04-02,62,NORMAL,35,DERCIVAL ANDRADE CARVALHO,3845
2018-04-02,62,NORMAL,35,ZELMA APARECIDA DOS REIS SOARES,3847
2018-04-02,62,NORMAL,35,MAGDA MARGARIDA DA MOTA,3848
2018-04-02,62,NORMAL,35,ELISETE DE SOUZA CARDOZO,3849


In [8]:
# Number of gold-standard acts per (data, num, tipo).
# After `count()` every column holds the per-group number of non-null
# values, so only the first column ('pagina') is kept and renamed.
douro_ctr = (
    douro.groupby(INDEX_COL)
    .count()
    .iloc[:, :1]
    .rename({'pagina': 'qtd_ouro'}, axis=1)
)

prf = 'douro_ctr'
douro_ctr.to_csv(f'{prf}.csv')
# Write the README with a context manager so the file is flushed and closed.
with open(f'{prf}.txt', 'w', encoding='utf-8') as readme:
    readme.write(
        f"notebook de origem: {NOTEBOOK_PATH}\n\n"
        f"Arquivo `{prf}.csv` contém a quantidade de\n"
        "atos de aposentadoria encontrados em arquivos correspondentes\n"
        f"aos DODFs que aparecem na BASE OURO \n"
        f"`{path}`.\n"
    )

# Only meaningful on a full run: the PDFs on disk must line up, position by
# position, with the DODFs listed in the gold standard.
if NUM_FILES is None:
    assert np.all(
        dfiles.tipo
        ==
        douro_ctr.index.get_level_values(2)
    )
print(douro_ctr.shape)
douro_ctr.head(3)

(228, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qtd_ouro
data,num,tipo,Unnamed: 3_level_1
2018-04-02,62,NORMAL,21
2018-04-03,63,NORMAL,2
2018-04-04,64,NORMAL,16


### Carregando blocos e texto

In [9]:
%%time
# Map each DODF key (as returned by `get_dodf_key` for the file name) to the
# text blocks of its PDF. Every document is closed right after its blocks
# are read, so file handles do not pile up while looping over all PDFs.
dodf2blocks = {}
for file in PATHS:
    doc = fitz.open(file)
    try:
        blocks = get_doc_blocks(doc)
    finally:
        doc.close()
    dodf2blocks[get_dodf_key(file.split('/')[-1])] = blocks

CPU times: user 1min 39s, sys: 2.18 s, total: 1min 41s
Wall time: 1min 41s


### Extraindo a nível de bloco

In [10]:
%%time
# For every DODF, run the retirement extractor on each text block
# (element 4 of a block tuple) and remember the page number (last element).
# Blocks whose extractor produced an empty data frame are dropped as they
# are generated, so the unfiltered list is never held in memory.
map_blocks = {}
for key, blocks in dodf2blocks.items():
    extracted = ((wRetirements(bl[4]), bl[-1]) for bl in blocks)
    map_blocks[key] = [
        pair for pair in extracted if not pair[0].data_frame.empty
    ]

# Persist the result; the context manager guarantees the file is closed.
with open('map_blocks.pkl', 'wb') as pkl_file:
    pickle.dump(map_blocks, pkl_file)

CPU times: user 1min 42s, sys: 564 ms, total: 1min 42s
Wall time: 1min 42s


In [11]:
def f(x):
    """Sort key for the (data, num, tipo) index levels / columns.

    `x` is dispatched on its `name`: 'data' is converted with
    `pd.to_datetime`, 'num' is cast to int16, anything else is returned
    unchanged.
    """
    converters = {
        'data': pd.to_datetime,
        'num': lambda values: values.astype(np.int16),
    }
    convert = converters.get(x.name)
    return x if convert is None else convert(x)

# Total number of acts extracted at block level for each DODF key: the sum
# of the row counts of every extractor data frame kept in `map_blocks`.
miner_dict = {
    key: sum(item[0].data_frame.shape[0] for item in items)
    for key, items in map_blocks.items()
}

# Each key is unpacked into the INDEX_COL columns, followed by its total.
dminer_blocks_ctr = pd.DataFrame.from_records(
    [(*key, total) for key, total in miner_dict.items()],
    columns=INDEX_COL + ['qtd_miner_blocklevel']
)
dminer_blocks_ctr = dminer_blocks_ctr.set_index(INDEX_COL).sort_index(key=f)
dminer_blocks_ctr.to_csv('dminer_blocks_ctr.csv')

# Write the README with a context manager so the file is flushed and closed.
with open('dminer_blocks_ctr.txt', 'w', encoding='utf-8') as readme:
    readme.write(
        f"notebook de origem: {NOTEBOOK_PATH}\n\n"
        "Arquivo com a quantidade de atos de aposentadoria encontrados\n"
        "NOS BLOCOS, levando em consideração os arquivos que aparecem\n"
        f"em {path}.\n"
        "O arquivo map_blocks.pkl é o dicionário a partir do qual\n"
        "gerou-se dminer_blocks_ctr.csv"
    )

del miner_dict
gc.collect()
dminer_blocks_ctr.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qtd_miner_blocklevel
data,num,tipo,Unnamed: 3_level_1
2018-04-02,62,NORMAL,10


In [12]:
# On a full run both tables must cover exactly the same set of DODFs.
if NUM_FILES is None:
    assert douro_ctr.shape == dminer_blocks_ctr.shape, (
        douro_ctr.shape, dminer_blocks_ctr.shape
    )

In [13]:
# douro_ctr.join(dminer_blocks_ctr)

In [14]:
# Reload the per-DODF count tables from disk so that all three share the
# same CSV-parsed index before they are joined.
# NOTE(review): 'dminer_ctr.csv' is not written anywhere in this notebook;
# presumably it is produced by a sibling notebook. Confirm it exists.
dminer_ctr = pd.read_csv('dminer_ctr.csv', index_col=INDEX_COL)
douro_ctr = pd.read_csv('douro_ctr.csv', index_col=INDEX_COL)

# Bind the reloaded block-level table to `dminer_blocks_ctr`, the name the
# following join uses. It used to be assigned only to the misspelled
# `dminer_block_ctr`, so the reloaded frame was never actually used.
dminer_blocks_ctr = pd.read_csv(
    'dminer_blocks_ctr.csv',
    index_col=INDEX_COL
)
dminer_block_ctr = dminer_blocks_ctr  # old misspelled name kept as an alias

In [15]:
# One row per DODF holding the three counts side by side: block-level miner,
# `dminer_ctr` and gold standard. The block-level table is the left side of
# the join.
dfinal = dminer_blocks_ctr.join([dminer_ctr, douro_ctr])
dfinal.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qtd_miner_blocklevel,qtd_miner,qtd_ouro
data,num,tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-02,62,NORMAL,10,10,21
2018-04-03,63,NORMAL,0,1,2
2018-04-04,64,NORMAL,15,15,16
2018-04-06,66,NORMAL,51,51,51
2018-04-09,67,NORMAL,1,1,1


In [16]:
# Standard deviation of the per-DODF gap between the `qtd_miner` and
# `qtd_miner_blocklevel` counts.
dfinal['qtd_miner'].sub(dfinal['qtd_miner_blocklevel']).std()

0.8660477139642268

In [17]:
# Standard deviation of the per-DODF gap between the gold-standard and
# block-level counts.
dfinal['qtd_ouro'].sub(dfinal['qtd_miner_blocklevel']).std()

10.63565173714661

In [18]:
# Standard deviation of the per-DODF gap between the gold-standard and
# `qtd_miner` counts.
dfinal['qtd_ouro'].sub(dfinal['qtd_miner']).std()

10.521801383374727

In [22]:
# Save the joined counts together with a README describing the columns.
prf = 'qtd_qtd_qtd'
dfinal.to_csv(f'{prf}.csv')

# Context manager so the README is flushed and closed deterministically.
with open(f'{prf}.txt', 'w', encoding='utf-8') as readme:
    chars_written = readme.write(
        f"notebook de origem: {NOTEBOOK_PATH}\n\n"
        f"`{prf}.csv`contém quantidades de :\n"
        "\t- atos detectados a nível de blocos de texto\n"
        "\t- atos detectados a nível do texto completo\n"
        "\t- atos detectados segundo base ouro\n"
    )

# Still show the number of characters written, as this cell did before.
chars_written

226

In [29]:
# Short alias for the joined table (same object, not a copy).
df = dfinal

# Column totals: acts found at block level, by `qtd_miner`, and in the gold
# standard.
df.sum()

qtd_miner_blocklevel    4472
qtd_miner               4611
qtd_ouro                5516
dtype: int64

In [30]:
# DODFs where the gold standard lists more acts than `qtd_miner`.
df.loc[df['qtd_ouro'].gt(df['qtd_miner'])]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qtd_miner_blocklevel,qtd_miner,qtd_ouro
data,num,tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-02,62,NORMAL,10,10,21
2018-04-03,63,NORMAL,0,1,2
2018-04-04,64,NORMAL,15,15,16
2018-04-16,72,NORMAL,32,33,40
2018-04-19,75,NORMAL,0,0,1
...,...,...,...,...,...
2019-08-01,144,NORMAL,35,38,99
2019-08-27,162,NORMAL,33,35,36
2019-09-02,166,NORMAL,0,0,49
2019-09-05,169,NORMAL,124,127,131


In [31]:
# DODFs where the gold standard lists more acts than the block-level count.
df.loc[df['qtd_ouro'].gt(df['qtd_miner_blocklevel'])]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,qtd_miner_blocklevel,qtd_miner,qtd_ouro
data,num,tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-02,62,NORMAL,10,10,21
2018-04-03,63,NORMAL,0,1,2
2018-04-04,64,NORMAL,15,15,16
2018-04-10,68,NORMAL,42,44,43
2018-04-16,72,NORMAL,32,33,40
...,...,...,...,...,...
2019-09-13,175,NORMAL,63,66,66
2019-09-17,177,NORMAL,14,15,15
2019-09-20,180,NORMAL,61,63,63
2019-09-25,183,NORMAL,10,10,18
