Skip to content

Commit

Permalink
Merge branch 'dev' into update_polished_docs
Browse files Browse the repository at this point in the history
  • Loading branch information
lacwerda committed Aug 13, 2021
2 parents a3df1ac + 87cb849 commit 8f2a4fe
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 40 deletions.
21 changes: 12 additions & 9 deletions dodfminer/extract/polished/create_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,24 @@

class XMLFy:

def __init__(self, file, acts_ids, i):
file_nums = list((map(int, re.findall(r'\d+', file))))
file_nums = file_nums[2:]
print(file_nums)
file_id = f"{i}_"
for s in file_nums:
file_id += str(s) + "."
file_id = file_id[:-1]
def __init__(self, file, acts_ids, id):
self._file = file
self._acts_ids = acts_ids
self._xml_id = file_id
self._xml_id = self.build_xml_id(id)
self._annotation_id = 1
self._relations_id = 1
self.xml = self._create_xml()

def build_xml_id(self, id):
    """Build the XML identifier from ``id`` and the numbers in the file name.

    Only the last path component of ``self._file`` is inspected. Every run
    of digits found in it is passed through ``int`` (which drops leading
    zeros), the first run is discarded, and the remaining ones are joined
    with dots.

    Args:
        id: Value placed before the underscore in the returned identifier.

    Returns:
        A string shaped like ``"<id>_<n2>.<n3>..."``. The part after the
        underscore is empty when the file name holds fewer than two runs
        of digits.
    """
    # Split on both separators: with a plain split('/') a Windows-style
    # path keeps its directories, and digits in a parent folder name
    # would leak into the identifier.
    file_name = re.split(r"[\\/]", self._file)[-1]

    # int() round-trip normalises "001" to "1".
    numbers = [str(int(digits)) for digits in re.findall(r'\d+', file_name)]

    file_id = ".".join(numbers[1:])
    return f"{id}_{file_id}"

def print_tree(self):
    """Print ``self.xml`` to standard output, serialised with pretty-printing."""
    serialized = etree.tostring(self.xml, pretty_print=True)
    print(serialized.decode())

Expand Down
9 changes: 4 additions & 5 deletions dodfminer/extract/pure/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ class ContentExtractor:
"""

@classmethod
def extract_text(cls, file, single=False, block=False, json=True, sep=' ', norm='NFKD'):
def extract_text(cls, file, single=False, block=False, is_json=True, sep=' ', norm='NFKD'):
"""Extract block of text from file
Args:
file: The DODF to extract the titles.
file: The DODF to extract titles from.
single: output content in a single file in the file directory.
block: Extract the text as a list of text blocks.
json: The list of text blocks are written as a json file.
Expand Down Expand Up @@ -100,18 +100,17 @@ def extract_text(cls, file, single=False, block=False, json=True, sep=' ', norm=
if int(text[1]) != 55 and int(text[1]) != 881:
if block:
norm_text = cls._normalize_text(text[4], norm)
if json:
if is_json:
list_of_boxes.append((text[0], text[1], text[2],
text[3], norm_text))
else:
drawboxes_text += (norm_text + sep)
else:
drawboxes_text += (text[4] + sep)

if block:
if not single:
return list_of_boxes
elif json:
elif is_json:
cls._save_single_file(file, 'json', json.dumps(list_of_boxes))
else:
cls._save_single_file(file, 'txt', drawboxes_text)
Expand Down
8 changes: 4 additions & 4 deletions tests/support/dodf_pdfs/cessoes.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
,nome,matricula,cargo_efetivo,classe,padrao,orgao_cedente,orgao_cessionario,onus,fundamento legal,processo_SEI,vigencia,matricula_SIAPE,cargo_orgao_cessionario,simbolo,hierarquia_lotacao
0,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
1,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
2,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
3,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
0,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
1,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
2,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
de origem, conforme Decisao da Diretoria Executiva, exarada pela Sessao no 4.",,00112-00037276/2019-21.,,, usando das atribuicoes conferidas pelo Art. 25,,
3,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
10 changes: 5 additions & 5 deletions tests/support/dodf_pdfs/sem_efeito_aposentadoria.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
,tipo_ato,tipo_documento,numero_documento,data_documento,numero_dodf,data_dodf,pagina_dodf,nome,matricula,matricula_SIAPE,cargo_efetivo,classe,padrao,quadro,orgao,processo_SEI,tipo_edicao
0,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
1,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
2,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
3,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
0,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
1,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
dezembro de 2017", 39.,"MARCO
ANTONIO CATTANI FRANCA","129661-2,",, 129.661-2,,,,,271.000.680/2017,normal
2,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
3,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
4 changes: 2 additions & 2 deletions tests/test_extract_polished_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ def test_helper_xml_multiple():
dir = ""+os.path.dirname(__file__)+"/support/support_supporter"
try:
xml_multiple(dir, "regex")
assert "1_1.2019.xml" in os.listdir(dir)
os.remove(os.path.join(dir, "1_1.2019.xml"))
assert "1_1.1.2019.xml" in os.listdir(dir)
os.remove(os.path.join(dir, "1_1.1.2019.xml"))
except:
assert False

Expand Down
13 changes: 0 additions & 13 deletions tests/test_extract_pure_utils_title_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,19 +255,15 @@ def test_extract_bold_upper_page():
""" '_extract_bold_upper_page' will not be tested directly"""
pass

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_1(doc_2001):
functions_whole_doc('_extract_bold_upper_pdf', doc_2001)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_2(doc_2017):
functions_whole_doc('_extract_bold_upper_pdf', doc_2017)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_3(doc_2018):
functions_whole_doc('_extract_bold_upper_pdf', doc_2018)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_4(doc_2020):
functions_whole_doc('_extract_bold_upper_pdf', doc_2020)

Expand All @@ -283,25 +279,21 @@ def test_get_titles_subtitles():
"""Hard to test. It's indirectly tested by `test_get_titles_subtitles_smart_X`"""
pass

@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_1(doc_2001):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2001_PATH, doc=doc_2001,
width_lis = [p.MediaBox[2] for p in doc_2001],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_2(doc_2017):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2017_PATH, doc=doc_2017,
width_lis = [p.MediaBox[2] for p in doc_2017],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_3(doc_2018):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2018_PATH, doc=doc_2018,
width_lis = [p.MediaBox[2] for p in doc_2018],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_4(doc_2020):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2020_PATH, doc=doc_2020,
Expand All @@ -318,7 +310,6 @@ def test_extract_titles_subtitles_2():
def test_extract_titles_subtitles_3():
functions_with_path('extract_titles_subtitles', PDF_2018_PATH)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_titles_subtitles_4():
functions_with_path('extract_titles_subtitles', PDF_2020_PATH)

Expand All @@ -333,7 +324,6 @@ def test_titles_2(extractor_2017):
def test_titles_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'titles')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_titles_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'titles')

Expand All @@ -347,7 +337,6 @@ def test_subtitles_2(extractor_2017):
def test_subtitles_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'subtitles')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_subtitles_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'subtitles')

Expand All @@ -361,7 +350,6 @@ def test_json_2(extractor_2017):
def test_json_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'json')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_json_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'json')

Expand All @@ -385,7 +373,6 @@ def test_titles_subtitles_hierarchy_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'titles_subtitles_hierarchy')


@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_titles_subtitles_hierarchy_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'titles_subtitles_hierarchy')

Expand Down
4 changes: 2 additions & 2 deletions tests/test_run_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def test_run_extract_input_folder_xml():
targets = ["cmd", "extract", "-i", folder, "-x"]
with patch.object(sys, 'argv', targets):
run()
assert os.path.isfile(os.path.join(folder, '1_1.10.1.2020.xml'))
os.remove(folder+'/1_1.10.1.2020.xml')
assert os.path.isfile(os.path.join(folder, '1_10.1.2020.xml'))
os.remove(folder+'/1_10.1.2020.xml')

# def test_run_extract_input_single_xml():
# file = ""+os.path.dirname(__file__)+"/support/xml_extract/DODF 001 02-01-2020 INTEGRA.pdf"
Expand Down

0 comments on commit 8f2a4fe

Please sign in to comment.