Merge pull request #151 from UnB-KnEDLe/fix_failing_tests

Fix failing tests
UnB-KnEDLe · Aug 11, 2021 · 87cb849 · 87cb849
2 parents 36655d2 + e19456e
commit 87cb849
Show file tree

Hide file tree

Showing 9 changed files with 44 additions and 44 deletions.
diff --git a/.gitignore b/.gitignore
@@ -140,4 +140,6 @@ dmypy.json
 .vscode/
 
 # temporary local files
-tmp/
+tmp/
+
+dodfs/
diff --git a/dodfminer/extract/polished/create_xml.py b/dodfminer/extract/polished/create_xml.py
@@ -8,21 +8,24 @@
 
 class XMLFy:
 
-    def __init__(self, file, acts_ids, i):
-        file_nums = list((map(int, re.findall(r'\d+', file))))
-        file_nums = file_nums[2:]
-        print(file_nums)
-        file_id = f"{i}_"
-        for s in file_nums:
-            file_id += str(s) + "."
-        file_id = file_id[:-1]
+    def __init__(self, file, acts_ids, id):
         self._file = file
         self._acts_ids = acts_ids
-        self._xml_id = file_id
+        self._xml_id = self.build_xml_id(id)
         self._annotation_id = 1
         self._relations_id = 1
         self.xml = self._create_xml()
 
+    def build_xml_id(self, id):
+        file_name = self._file.split('/')[-1]
+
+        str2int2str = lambda x : str(int(x))
+        file_numbers_list = map(str2int2str, re.findall(r'\d+', file_name))
+
+        file_id = ".".join(list(file_numbers_list)[1:])
+
+        return f"{id}_{file_id}"
+
     def print_tree(self):
         print(etree.tostring(self.xml, pretty_print=True).decode())
 

diff --git a/dodfminer/extract/pure/core.py b/dodfminer/extract/pure/core.py
@@ -42,13 +42,14 @@ class ContentExtractor:
     """
 
     @classmethod
-    def extract_text(cls, file, single=False, block=False, sep=' ', norm='NFKD'):
+    def extract_text(cls, file, single=False, block=False, is_json=True, sep=' ', norm='NFKD'):
         """Extract block of text from file
 
         Args:
-            file: The DODF to extract the titles.
-            single: output content in a single file in the file directory
+            file: The DODF file to extract the text from.
+            single: output content in a single file in the file directory            
             block: Extract the text as a list of text blocks
+            is_json: the list of text blocks is written as a JSON file.
             sep: The separator character between each block of text
             norm: Type of normalization applied to the text
 
@@ -67,15 +68,22 @@ def extract_text(cls, file, single=False, block=False, sep=' ', norm='NFKD'):
         for textboxes in get_doc_text_boxes(pymu_file):
             for text in textboxes:
                 if int(text[1]) != 55 and int(text[1]) != 881:
-                    if block:
+                    if block:                        
                         norm_text = cls._normalize_text(text[4], norm)
-                        list_of_boxes.append((text[0], text[1], text[2],
+                        if is_json:
+                            list_of_boxes.append((text[0], text[1], text[2],
                                               text[3], norm_text))
+                        else:
+                            drawboxes_text += (norm_text + sep)    
                     else:
                         drawboxes_text += (text[4] + sep)
-
         if block:
-            return list_of_boxes if not single else cls._save_single_file(file, 'json', json.dumps(list_of_boxes))
+            if not single:
+                return list_of_boxes 
+            elif is_json:
+                cls._save_single_file(file, 'json', json.dumps(list_of_boxes))            
+            else:
+                cls._save_single_file(file, 'txt', drawboxes_text)
 
         drawboxes_text = cls._normalize_text(drawboxes_text, norm)
         return drawboxes_text if not single else cls._save_single_file(file, 'txt', drawboxes_text)

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,7 @@ certifi==2020.4.5.2
 chardet==3.0.4
 idna==2.9
 numpy==1.18.5
-pandas==1.3.0
+pandas==1.1.5
 PyMuPDF==1.17.0
 python-dateutil==2.8.1
 pytz==2020.1

diff --git a/tests/support/dodf_pdfs/cessoes.csv b/tests/support/dodf_pdfs/cessoes.csv
@@ -1,6 +1,6 @@
 ,nome,matricula,cargo_efetivo,classe,padrao,orgao_cedente,orgao_cessionario,onus,fundamento legal,processo_SEI,vigencia,matricula_SIAPE,cargo_orgao_cessionario,simbolo,hierarquia_lotacao
-0,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
-1,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
-2,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
-3,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
+0,ARLETE OLIVEIRA SANTOS GONDAR,124.604-6,,,,,,ONUS FINANCEIRO: orgao cedente.,,00138-00007294/2019-45,,, com alicerce no art. 2o,,
+1,ROBERT WAGNER DE SANTANA,1.430.783-9,,,,,,ONUS FINANCEIRO: orgao cedente.,,04019-00000669/2019-17.,,, com alicerce no art. 2o,,
+2,JULIO CESAR MENEGOTTO,74.682-7,,,,,,"onus para o orgao
 de origem, conforme Decisao da Diretoria Executiva, exarada pela Sessao no 4.",,00112-00037276/2019-21.,,, usando das atribuicoes conferidas pelo Art. 25,,
+3,SADI PERES MARTINS,79.206-3,,,,,,"ONUS FINANCEIRO: orgao cessionario, com ressarcimento mensal a origem.",,-,,, resolve: SUSPENDER,,
diff --git a/tests/support/dodf_pdfs/sem_efeito_aposentadoria.csv b/tests/support/dodf_pdfs/sem_efeito_aposentadoria.csv
@@ -1,8 +1,8 @@
 ,tipo_ato,tipo_documento,numero_documento,data_documento,numero_dodf,data_dodf,pagina_dodf,nome,matricula,matricula_SIAPE,cargo_efetivo,classe,padrao,quadro,orgao,processo_SEI,tipo_edicao
-0,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
-DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
-1,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
-2,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
-3,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
+0,Atos tornados sem efeito - aposentadoria,,,,137,19/07/2017,. 42, DELFINO BERNARDES RABELO,100.652-5. ,,,,,,,0070-001865/2016,normal
+1,Atos tornados sem efeito - aposentadoria,,,05 de fevereiro de 1990,248,"29 de
 dezembro de 2017", 39.,"MARCO
 ANTONIO CATTANI FRANCA","129661-2,",, 129.661-2,,,,,271.000.680/2017,normal
+2,Atos tornados sem efeito - aposentadoria,Ordem de Servico,,03 de dezembro de 2019,04,04 de dezembro de 2019,," FRANCINEIDE
+DANIEL DE LIMA",66.260-7,, no Cargo de Professor de Educacao Basica,,,,,00040-00024368/2019-32,normal
+3,Atos tornados sem efeito - aposentadoria,,,26 de fevereiro de 2019,42,28 de fevereiro 2019, 29,GENI TEREZINHA SPIES DA SILVEIRA,30735-1,, totalizando 731 dias,,,,,00401.00003406/2019-59,normal
diff --git a/tests/test_extract_polished_helper.py b/tests/test_extract_polished_helper.py
@@ -7,8 +7,8 @@ def test_helper_xml_multiple():
     dir = ""+os.path.dirname(__file__)+"/support/support_supporter"
     try:
         xml_multiple(dir, "regex")
-        assert "1_1.2019.xml" in os.listdir(dir)
-        os.remove(os.path.join(dir, "1_1.2019.xml"))
+        assert "1_1.1.2019.xml" in os.listdir(dir)
+        os.remove(os.path.join(dir, "1_1.1.2019.xml"))
     except:
         assert False
 

diff --git a/tests/test_extract_pure_utils_title_extractor.py b/tests/test_extract_pure_utils_title_extractor.py
@@ -255,19 +255,15 @@ def test_extract_bold_upper_page():
     """ '_extract_bold_upper_page' will not be tested directly"""
     pass
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_extract_bold_upper_pdf_1(doc_2001):
     functions_whole_doc('_extract_bold_upper_pdf', doc_2001)
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_extract_bold_upper_pdf_2(doc_2017):
     functions_whole_doc('_extract_bold_upper_pdf', doc_2017)
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_extract_bold_upper_pdf_3(doc_2018):
     functions_whole_doc('_extract_bold_upper_pdf', doc_2018)
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_extract_bold_upper_pdf_4(doc_2020):
     functions_whole_doc('_extract_bold_upper_pdf', doc_2020)
 
@@ -283,25 +279,21 @@ def test_get_titles_subtitles():
     """Hard to test. It's indirectly tested by `test_get_titles_subtitles_smart_X`"""
     pass
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
 def test_get_titles_subtitles_smart_1(doc_2001):
     functions_with_kargs(
         '_get_titles_subtitles_smart', PDF_2001_PATH, doc=doc_2001,
         width_lis = [p.MediaBox[2] for p in doc_2001],
     )
-@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
 def test_get_titles_subtitles_smart_2(doc_2017):
     functions_with_kargs(
         '_get_titles_subtitles_smart', PDF_2017_PATH, doc=doc_2017,
         width_lis = [p.MediaBox[2] for p in doc_2017],
     )
-@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
 def test_get_titles_subtitles_smart_3(doc_2018):
     functions_with_kargs(
         '_get_titles_subtitles_smart', PDF_2018_PATH, doc=doc_2018,
         width_lis = [p.MediaBox[2] for p in doc_2018],
     )
-@pytest.mark.xfail(reason="Bug #45 not yet fixed (will change function signature).", run=False)
 def test_get_titles_subtitles_smart_4(doc_2020):
     functions_with_kargs(
         '_get_titles_subtitles_smart', PDF_2020_PATH, doc=doc_2020,
@@ -318,7 +310,6 @@ def test_extract_titles_subtitles_2():
 def test_extract_titles_subtitles_3():
     functions_with_path('extract_titles_subtitles', PDF_2018_PATH)
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_extract_titles_subtitles_4():
     functions_with_path('extract_titles_subtitles', PDF_2020_PATH)
 
@@ -333,7 +324,6 @@ def test_titles_2(extractor_2017):
 def test_titles_3(extractor_2018):
     wrapper_extractor_props(extractor_2018, 'titles')
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_titles_4(extractor_2020):
     wrapper_extractor_props(extractor_2020, 'titles')
 
@@ -347,7 +337,6 @@ def test_subtitles_2(extractor_2017):
 def test_subtitles_3(extractor_2018):
     wrapper_extractor_props(extractor_2018, 'subtitles')
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_subtitles_4(extractor_2020):
     wrapper_extractor_props(extractor_2020, 'subtitles')
 
@@ -361,7 +350,6 @@ def test_json_2(extractor_2017):
 def test_json_3(extractor_2018):
     wrapper_extractor_props(extractor_2018, 'json')
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_json_4(extractor_2020):
     wrapper_extractor_props(extractor_2020, 'json')
 
@@ -385,7 +373,6 @@ def test_titles_subtitles_hierarchy_3(extractor_2018):
     wrapper_extractor_props(extractor_2018, 'titles_subtitles_hierarchy')
 
 
-@pytest.mark.xfail(reason="Bug #45 not yet fixed.", run=False)
 def test_titles_subtitles_hierarchy_4(extractor_2020):
     wrapper_extractor_props(extractor_2020, 'titles_subtitles_hierarchy')
 

diff --git a/tests/test_run_extract.py b/tests/test_run_extract.py
@@ -150,8 +150,8 @@ def test_run_extract_input_folder_xml():
   targets = ["cmd", "extract", "-i", folder, "-x"]
   with patch.object(sys, 'argv', targets):
     run()
-    assert os.path.isfile(os.path.join(folder, '1_1.10.1.2020.xml'))
-    os.remove(folder+'/1_1.10.1.2020.xml')
+    assert os.path.isfile(os.path.join(folder, '1_10.1.2020.xml'))
+    os.remove(folder+'/1_10.1.2020.xml')
 
 # def test_run_extract_input_single_xml():
 #   file = ""+os.path.dirname(__file__)+"/support/xml_extract/DODF 001 02-01-2020 INTEGRA.pdf"