Skip to content

Commit

Permalink
Merge pull request #327 from UnB-KnEDLe/highlight_patch
Browse files (browse the repository at this point in the history)
Normalize text in json personal acts
  • Loading branch information
andlq committed Mar 18, 2023
2 parents 702b4bf + 22e48be commit 99b9c71
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 11 deletions.
2 changes: 1 addition & 1 deletion dodfminer/__version__.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
-version_info = (1, 4, 5)
+version_info = (1, 4, 6)
# format:
# ('dodf_major', 'dodf_minor', 'dodf_patch')

Expand Down
8 changes: 5 additions & 3 deletions dodfminer/extract/polished/acts/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@

import re
import json
+import unicodedata
import pandas as pd

from dodfminer.extract.polished.backend.regex import ActRegex
Expand Down Expand Up @@ -204,9 +205,8 @@ def highlight_dataframe(self):
return
self._data_frame = []
for IOB, text in zip(self._preds, self._acts_str):
-ent_dict = {
-'text': '',
-}
+ent_dict = dict()
+ent_dict['titulo'] = None
ent_dict['text'] = ""

text_split = self._split_sentence(text) + ["O"]
Expand Down Expand Up @@ -295,6 +295,8 @@ def read_json(self, file_name):
txt = re.sub('<[^<]+?>', ' ', txt).replace('&nbsp', ' ')
all_txt.append(txt)
self._text = ''.join(all_txt)
+self._text = unicodedata.normalize('NFKD', self._text).encode(
+'ascii', 'ignore').decode('utf8')

def read_txt(self, file_name):
"""Reads a .txt file of a DODF.
Expand Down
10 changes: 3 additions & 7 deletions dodfminer/extract/polished/acts/base_contratos.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -124,13 +124,9 @@ def highlight_dataframe(self):
if len(self.atos_encontrados) == 0:
return
self.data_frame = []
-for IOB, text, numdodf, titulo in zip(self.predicted, self.atos_encontrados['texto'], self.atos_encontrados['numero_dodf'], self.atos_encontrados['titulo']):
-ent_dict = {
-'numero_dodf': '',
-'titulo': '',
-'text': '',
-}
-ent_dict['numero_dodf'] = numdodf
+for IOB, text, _, titulo in zip(self.predicted, self.atos_encontrados['texto'], self.atos_encontrados['numero_dodf'], self.atos_encontrados['titulo']):
+ent_dict = dict()
+# ent_dict['numero_dodf'] = numdodf
ent_dict['titulo'] = titulo
ent_dict['text'] = ""

Expand Down

0 comments on commit 99b9c71

Please sign in to comment.