Skip to content

Commit

Permalink
Merge pull request #151 from UnB-KnEDLe/fix_failing_tests
Browse files Browse the repository at this point in the history
Fix failing tests
  • Loading branch information
Lary15 committed Aug 11, 2021
2 parents 36655d2 + e19456e commit 87cb849
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 44 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,6 @@ dmypy.json
.vscode/

# temporary local files
tmp/
tmp/

dodfs/
21 changes: 12 additions & 9 deletions dodfminer/extract/polished/create_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,24 @@

class XMLFy:

def __init__(self, file, acts_ids, i):
file_nums = list((map(int, re.findall(r'\d+', file))))
file_nums = file_nums[2:]
print(file_nums)
file_id = f"{i}_"
for s in file_nums:
file_id += str(s) + "."
file_id = file_id[:-1]
def __init__(self, file, acts_ids, id):
self._file = file
self._acts_ids = acts_ids
self._xml_id = file_id
self._xml_id = self.build_xml_id(id)
self._annotation_id = 1
self._relations_id = 1
self.xml = self._create_xml()

def build_xml_id(self, id):
file_name = self._file.split('/')[-1]

str2int2str = lambda x : str(int(x))
file_numbers_list = map(str2int2str, re.findall(r'\d+', file_name))

file_id = ".".join(list(file_numbers_list)[1:])

return f"{id}_{file_id}"

def print_tree(self):
    """Print ``self.xml`` serialized by ``etree.tostring`` with pretty printing."""
    serialized = etree.tostring(self.xml, pretty_print=True)
    print(serialized.decode())

Expand Down
22 changes: 15 additions & 7 deletions dodfminer/extract/pure/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@ class ContentExtractor:
"""

@classmethod
def extract_text(cls, file, single=False, block=False, sep=' ', norm='NFKD'):
def extract_text(cls, file, single=False, block=False, is_json=True, sep=' ', norm='NFKD'):
"""Extract block of text from file
Args:
file: The DODF to extract the titles.
single: output content in a single file in the file directory
file: The DODF to extract the titles.
single: output content in a single file in the file directory
block: Extract the text as a list of text blocks
is_json: the list of text blocks are written as a json file.
sep: The separator character between each block of text
norm: Type of normalization applied to the text
Expand All @@ -67,15 +68,22 @@ def extract_text(cls, file, single=False, block=False, sep=' ', norm='NFKD'):
for textboxes in get_doc_text_boxes(pymu_file):
for text in textboxes:
if int(text[1]) != 55 and int(text[1]) != 881:
if block:
if block:
norm_text = cls._normalize_text(text[4], norm)
list_of_boxes.append((text[0], text[1], text[2],
if is_json:
list_of_boxes.append((text[0], text[1], text[2],
text[3], norm_text))
else:
drawboxes_text += (norm_text + sep)
else:
drawboxes_text += (text[4] + sep)

if block:
return list_of_boxes if not single else cls._save_single_file(file, 'json', json.dumps(list_of_boxes))
if not single:
return list_of_boxes
elif is_json:
cls._save_single_file(file, 'json', json.dumps(list_of_boxes))
else:
cls._save_single_file(file, 'txt', drawboxes_text)

drawboxes_text = cls._normalize_text(drawboxes_text, norm)
return drawboxes_text if not single else cls._save_single_file(file, 'txt', drawboxes_text)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ certifi==2020.4.5.2
chardet==3.0.4
idna==2.9
numpy==1.18.5
pandas==1.3.0
pandas==1.1.5
PyMuPDF==1.17.0
python-dateutil==2.8.1
pytz==2020.1
Expand Down
8 changes: 4 additions & 4 deletions tests/support/dodf_pdfs/cessoes.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
,nome,matricula,cargo_efetivo,classe,padrao,orgao_cedente,orgao_cessionario,onus,fundamento legal,processo_SEI,vigencia,matricula_SIAPE,cargo_orgao_cessionario,simbolo,hierarquia_lotacao
0,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
1,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
2,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
3,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
0,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
1,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
2,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
de origem, conforme Decisao da Diretoria Executiva, exarada pela Sessao no 4.",,00112-00037276/2019-21.,,, usando das atribuicoes conferidas pelo Art. 25,,
3,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
10 changes: 5 additions & 5 deletions tests/support/dodf_pdfs/sem_efeito_aposentadoria.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
,tipo_ato,tipo_documento,numero_documento,data_documento,numero_dodf,data_dodf,pagina_dodf,nome,matricula,matricula_SIAPE,cargo_efetivo,classe,padrao,quadro,orgao,processo_SEI,tipo_edicao
0,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
1,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
2,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
3,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
0,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
1,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
dezembro de 2017", 39.,"MARCO
ANTONIO CATTANI FRANCA","129661-2,",, 129.661-2,,,,,271.000.680/2017,normal
2,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
3,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
4 changes: 2 additions & 2 deletions tests/test_extract_polished_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ def test_helper_xml_multiple():
dir = ""+os.path.dirname(__file__)+"/support/support_supporter"
try:
xml_multiple(dir, "regex")
assert "1_1.2019.xml" in os.listdir(dir)
os.remove(os.path.join(dir, "1_1.2019.xml"))
assert "1_1.1.2019.xml" in os.listdir(dir)
os.remove(os.path.join(dir, "1_1.1.2019.xml"))
except:
assert False

Expand Down
13 changes: 0 additions & 13 deletions tests/test_extract_pure_utils_title_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,19 +255,15 @@ def test_extract_bold_upper_page():
""" '_extract_bold_upper_page' will not be tested directly"""
pass

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_1(doc_2001):
functions_whole_doc('_extract_bold_upper_pdf', doc_2001)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_2(doc_2017):
functions_whole_doc('_extract_bold_upper_pdf', doc_2017)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_3(doc_2018):
functions_whole_doc('_extract_bold_upper_pdf', doc_2018)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_bold_upper_pdf_4(doc_2020):
functions_whole_doc('_extract_bold_upper_pdf', doc_2020)

Expand All @@ -283,25 +279,21 @@ def test_get_titles_subtitles():
"""Hard to test. It's indirectly tested by `test_get_titles_subtitles_smart_X`"""
pass

@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_1(doc_2001):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2001_PATH, doc=doc_2001,
width_lis = [p.MediaBox[2] for p in doc_2001],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_2(doc_2017):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2017_PATH, doc=doc_2017,
width_lis = [p.MediaBox[2] for p in doc_2017],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_3(doc_2018):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2018_PATH, doc=doc_2018,
width_lis = [p.MediaBox[2] for p in doc_2018],
)
@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
def test_get_titles_subtitles_smart_4(doc_2020):
functions_with_kargs(
'_get_titles_subtitles_smart', PDF_2020_PATH, doc=doc_2020,
Expand All @@ -318,7 +310,6 @@ def test_extract_titles_subtitles_2():
def test_extract_titles_subtitles_3():
functions_with_path('extract_titles_subtitles', PDF_2018_PATH)

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_extract_titles_subtitles_4():
functions_with_path('extract_titles_subtitles', PDF_2020_PATH)

Expand All @@ -333,7 +324,6 @@ def test_titles_2(extractor_2017):
def test_titles_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'titles')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_titles_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'titles')

Expand All @@ -347,7 +337,6 @@ def test_subtitles_2(extractor_2017):
def test_subtitles_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'subtitles')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_subtitles_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'subtitles')

Expand All @@ -361,7 +350,6 @@ def test_json_2(extractor_2017):
def test_json_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'json')

@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_json_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'json')

Expand All @@ -385,7 +373,6 @@ def test_titles_subtitles_hierarchy_3(extractor_2018):
wrapper_extractor_props(extractor_2018, 'titles_subtitles_hierarchy')


@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
def test_titles_subtitles_hierarchy_4(extractor_2020):
wrapper_extractor_props(extractor_2020, 'titles_subtitles_hierarchy')

Expand Down
4 changes: 2 additions & 2 deletions tests/test_run_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def test_run_extract_input_folder_xml():
targets = ["cmd", "extract", "-i", folder, "-x"]
with patch.object(sys, 'argv', targets):
run()
assert os.path.isfile(os.path.join(folder, '1_1.10.1.2020.xml'))
os.remove(folder+'/1_1.10.1.2020.xml')
assert os.path.isfile(os.path.join(folder, '1_10.1.2020.xml'))
os.remove(folder+'/1_10.1.2020.xml')

# def test_run_extract_input_single_xml():
# file = ""+os.path.dirname(__file__)+"/support/xml_extract/DODF 001 02-01-2020 INTEGRA.pdf"
Expand Down

0 comments on commit 87cb849

Please sign in to comment.