Skip to content

Commit

Permalink
Forçando build
Browse files Browse the repository at this point in the history
  • Loading branch information
notopoloko committed Nov 8, 2021
1 parent 1bc35b9 commit d9d7ee1
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 24 deletions.
42 changes: 21 additions & 21 deletions dodfminer/extract/pure/utils/title_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ def group_by_column(elements, width):
"""
left_right = [[], []]
MID_W = width / 2
mid_width = width / 2
for i in elements:
if i.bbox.x0 <= MID_W:
if i.bbox.x0 <= mid_width:
left_right[0].append(i)
else:
left_right[1].append(i)
Expand All @@ -91,8 +91,8 @@ def group_by_page(elements):
page_elements = {}
for page_num in set(map(lambda x: x.page, elements)):
page_elements[page_num] = []
for el in elements:
page_elements[el.page].append(el)
for element in elements:
page_elements[element.page].append(element)
return page_elements


Expand Down Expand Up @@ -121,7 +121,7 @@ def sort_by_column(elements, width):
return reduce(operator.add, ordenado)


def _invertTextTypeBboxPageTuple(textTypeBboxPageTuple):
def invert_text_type_bbox_page_tuple(text_type_bbox_page_tuple):
"""Reverses the type between _TYPE_TITLE and _TYPE_SUBTITLE.
Args:
Expand All @@ -131,7 +131,7 @@ def _invertTextTypeBboxPageTuple(textTypeBboxPageTuple):
copy of text_type_bbox_page_tuple with its type field reversed.
"""
text, _type, bbox, page = textTypeBboxPageTuple
text, _type, bbox, page = text_type_bbox_page_tuple
return TextTypeBboxPageTuple(text, _TYPE_TITLE if _type is _TYPE_SUBTITLE
else _TYPE_SUBTITLE, bbox, page)

Expand All @@ -148,8 +148,8 @@ def _extract_bold_upper_page(page):
"""
lis = []
for bl in page.getTextPage().extractDICT()['blocks']:
for line in bl['lines']:
for block in page.getTextPage().extractDICT()['blocks']:
for line in block['lines']:
for span in line['spans']:
flags = span['flags']
txt: str = span['text']
Expand Down Expand Up @@ -281,7 +281,7 @@ def _get_titles_subtitles(elements, width_lis):
# Happens mostly when there is only one title plus other, non-title content.

if not titles and sub_titles:
return TitlesSubtitles([_invertTextTypeBboxPageTuple(i) for i in sub_titles], titles)
return TitlesSubtitles([invert_text_type_bbox_page_tuple(i) for i in sub_titles], titles)
else:
return TitlesSubtitles(titles, sub_titles)

Expand Down Expand Up @@ -523,9 +523,9 @@ def dump_json(self, path):
be done. It is suffixed with ".json" if it is not already.
"""
with open("{}{}".format(path, (not path.endswith(".json")) * ".json"), 'w', encoding='utf-8') as jsonFile:
with open(f"{path}{(not path.endswith('.json')) * '.json'}", 'w', encoding='utf-8') as json_file:
json.dump(self.json,
jsonFile,
json_file,
ensure_ascii=False, indent=' ')

def reset(self):
Expand All @@ -550,25 +550,25 @@ def gen_title_base(dir_path=".", base_name="titles", indent=4, forced=False):
dict containing "titles" as key and a list of titles,
the same stored at base_name[.json]
"""
base_name = "{}/{}".format(
dir_path, base_name + (not base_name.endswith(".json")) * ".json")
base_name = f"{dir_path}/{base_name + (not base_name.endswith('.json')) * '.json'}"
if os.path.exists(base_name) and not forced:
print(f"Error: {base_name} already exists")
return None
elif os.path.isdir(base_name):
print("Error: {} ir a directory".format(base_name))
print(f"Error: {base_name} ir a directory")
return None

titles = set()
for file in filter(lambda x: not os.path.isdir(x) and x.endswith('.pdf'), os.listdir(dir_path)):
et = ExtractorTitleSubtitle(file)
titles_text = map(lambda x: x.text, et.titles)
extractor = ExtractorTitleSubtitle(file)
titles_text = map(lambda x: x.text, extractor.titles)
titles.update(titles_text)
js = {"titles": list(titles)}
json.dump(js, open("{}".format(base_name), 'w'),
ensure_ascii=False, indent=indent*' ')
json_content = {"titles": list(titles)}
with open(f"{base_name}", 'w', encoding='uft-8') as json_file:
json.dump(json_content, json_file,
ensure_ascii=False, indent=indent*' ')

return js
return json_content


def gen_hierarchy_base(dir_path=".",
Expand Down Expand Up @@ -606,7 +606,7 @@ def gen_hierarchy_base(dir_path=".",
dir_path = "."
try:
os.makedirs(folder, exist_ok=forced)
except Exception as error:
except OSError as error:
print(error)
return None

Expand Down
6 changes: 3 additions & 3 deletions tests/test_extract_pure_utils_title_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,19 +231,19 @@ def test_sort_by_column(elements_and_width, elements_expected_sort):
def test_invert_text_type_bbox_page_tuple_1():
tup = Tuple('texto', title_extractor._TYPE_SUBTITLE, 123, 34)
inv_tup = Tuple('texto', title_extractor._TYPE_TITLE, 123, 34)
assert inv_tup == title_extractor._invertTextTypeBboxPageTuple(tup)
assert inv_tup == title_extractor.invert_text_type_bbox_page_tuple(tup)


def test_invert_text_type_bbox_page_tuple_2():
tup = Tuple('nadaver', title_extractor._TYPE_TITLE, 123, 34)
inv_tup = Tuple('nadaver', title_extractor._TYPE_SUBTITLE, 123, 34)
assert inv_tup == title_extractor._invertTextTypeBboxPageTuple(tup)
assert inv_tup == title_extractor.invert_text_type_bbox_page_tuple(tup)


def test_invert_text_type_bbox_page_tuple_3():
tup = Tuple('nadaver', title_extractor._TYPE_TITLE, 123, 34)
inv_tup = Tuple('nadaperder', title_extractor._TYPE_SUBTITLE, 123, 34)
assert inv_tup != title_extractor._invertTextTypeBboxPageTuple(tup)
assert inv_tup != title_extractor.invert_text_type_bbox_page_tuple(tup)


def test_extract_bold_upper_page():
Expand Down

0 comments on commit d9d7ee1

Please sign in to comment.