In [1]:
import requests
import ipykernel
import re
from notebook.notebookapp import list_running_servers
from notebook import notebookapp

servers = list(notebookapp.list_running_servers())

TOKEN = servers[0]['token']

base_url = next(list_running_servers())['url']
r = requests.get(
    url=base_url + 'api/sessions',
    headers={'Authorization': 'token {}'.format(TOKEN),})

r.raise_for_status()
response = r.json()

kernel_id = re.search(
    'kernel-(.*).json',
    ipykernel.connect.get_connection_file()
).group(1)

NOTEBOOK_PATH = {
    r['kernel']['id']: r['notebook']['path']
    for r in response
}[kernel_id]
# print(NOTEBOOK_PATH)

In [22]:
%%time
import os
import glob
from itertools import chain

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import fitz
from dodfminer.extract.polished.acts.aposentadoria import Retirements
import unidecode
from utils import get_dodf_key

METRICS = [
    'f1_macro',
    'f1_micro',
    'f1_weighted',
    'accuracy',
    'balanced_accuracy'
]
PATH_PREDICT = 'marked_pdf/predict/'
PATH_REGEX = 'marked_pdf/regex/'

CPU times: user 2.53 ms, sys: 0 ns, total: 2.53 ms
Wall time: 1.36 ms


In [3]:
def get_doc_blocks(doc):
    lis = []
    for idx, page in enumerate(doc, start=1):
        bls = page.getTextBlocks()
        bls = [(*i, idx) for i in bls]
        lis.extend(bls)
    return lis

def get_dodfs_path():
    return glob.glob('data/aposentadoria-ouro/pdfs/*.pdf')

def wRetirements(s):
    return Retirements(None, 'regex',
                       txt= unidecode.unidecode_expect_ascii(s))

INDEX_COL = ['data', 'num', 'tipo']
NUM_FILES = 3
NUM_FILES = len(get_dodfs_path())
PATHS = np.random.choice(get_dodfs_path(), NUM_FILES)

## Carrega os dados em DataFrames e os rotula

In [5]:
len(PATHS)

228

In [6]:
files_train, files_test, _, _ = train_test_split(
    np.array(PATHS).reshape(-1, 1), range(len(PATHS)), test_size=.20
)

In [38]:
%%time

def cond(s):
    return len(s) > 40 and '\n' in s


def build_from_files(files):
    data = {}
    for fname in files:
        doc = fitz.open(fname)
        blocks = list(
            chain(
                *[ [(*i, pnum) for i in p.getTextBlocks()]
                  for (pnum, p) in enumerate(doc, start=1)]
            )
        )
        # dropa casos 99% probab. não ser aposentadoria
        blocks = [i for i in blocks if cond(i[4])]
        pars = [i[4] for i in blocks]
        retirements = [wRetirements(i) for i in pars]
        dt, num, tipo = get_dodf_key(fname)
        df = pd.DataFrame({
            'data':[dt] *len(pars),
            'num':[num] *len(pars),
            'tipo':[tipo] *len(pars),
            'text': pars,
            'pnum':[int(i[-1]) for i in blocks],
            'x0': [i[0] for i in blocks],
            'y0': [i[1] for i in blocks],
            'x1': [i[2] for i in blocks],
            'y1': [i[3] for i in blocks],
            'y': [not i.data_frame.empty for i in retirements],
            'qtd': [i.data_frame.shape[0] for i in retirements],
        })
        data[fname] = df
    return data



CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 14.1 µs


In [39]:
%%time
train_data = build_from_files(chain.from_iterable(files_train))
test_data = build_from_files(chain.from_iterable(files_test))

CPU times: user 3min, sys: 1.77 s, total: 3min 2s
Wall time: 3min 2s


In [41]:
%%time
df_train = pd.concat(train_data.values(), axis=0)
df_test = pd.concat(test_data.values(), axis=0)

CPU times: user 120 ms, sys: 7.95 ms, total: 128 ms
Wall time: 125 ms


In [43]:
df_train.sample()

Unnamed: 0,data,num,tipo,text,pnum,x0,y0,x1,y1,y,qtd
458,2018-11-12,215,NORMAL,POLÍCIA MILITAR DO DISTRITO FEDERAL\nEDITAL Nº...,30,418.110535,443.719147,765.502808,874.081909,False,0


In [44]:
%%time
df_train.to_csv('xgb_train.csv', sep='®', index=False)
df_test.to_csv('xgb_test.csv', sep='®', index=False)

prf = 'xgb_train'
open(f'{prf}.txt', 'w').write(
    f"notebook de origem: {NOTEBOOK_PATH}\n\n"
    f"O arquivo {prf}.csv contém a rotulação dos blocos de texto\n"
    "a respeito de aposentadoria, segundo a classe `Retirements` do"
    "dodfminer. Espera-se sua utilização para TREINAMENTO."
)

prf = 'xgb_test'
open(f'{prf}.txt', 'w').write(
    f"notebook de origem: {NOTEBOOK_PATH}\n\n"
    f"O arquivo {prf}.csv contém a rotulação dos blocos de texto\n"
    "a respeito de aposentadoria, segundo a classe `Retirements` do"
    "dodfminer. Espera-se sua utilização para TESTES."
)


CPU times: user 3.64 s, sys: 136 ms, total: 3.77 s
Wall time: 3.77 s


213

In [19]:
df_train.shape, df_test.shape

((83269, 8), (26036, 8))

In [20]:
df_train[df_train.y].qtd.sum(), df_test[df_test.y].qtd.sum()

(2724, 826)

## Dump predictions to csv and marked PDFs

In [45]:
# %%time
# def dump_predict(lis, dist_dir, c=(.23, .41, .88)):
#     for fname in lis:    
#         doc = fitz.open(fname)
#         df = file_data[fname]
#         clf = pipes[fname]
#         X, y = df['text'], df['y']
#         predict = clf.predict(X)
#         trues = df[predict == True]
#         [doc[int(i.pnum)].drawRect(i[2:6], color=c, width=1)
#             for i in trues.iloc];
#         doc.save(dist_dir + fname.split('/')[-1][:-4] + '_predict.pdf');

# def dump_regex(lis, dist_dir, c=(0, .5, .26)):
#     for fname in lis:    
#         doc = fitz.open(fname)
#         df = file_data[fname]
#         clf = pipes[fname]
#         trues = df[df.y == True]
#         if not any(trues):
#             print("skip", fname)
#         [doc[int(i.pnum)].drawRect(i[2:6], color=c, width=1)
#             for i in trues.iloc];
#         doc.save(dist_dir + fname.split('/')[-1][:-4] + '_regex.pdf');

# def dump_csv():
#     def pdf_csv(s):
#         return s.split('/')[-1][:-3]+'csv'

#     for fname in file_lis:    
#         df = file_data[fname]
#         prediction = preds[fname]

#         df.to_csv(PATH_REGEX[:-1] + '_csv/' + pdf_csv(fname), index=False)
#         pd.concat([df.iloc[:, :-1], pd.Series(prediction, name='y')], axis=1).to_csv(
#             PATH_PREDICT[:-1] + '_csv/' + pdf_csv(fname), index=False
#         )
# dump_regex(file_lis, PATH_REGEX)
# dump_predict(file_lis, PATH_PREDICT)
# dump_csv()