From 5e149e75d48280754bf8e31f916f8970c1d17415 Mon Sep 17 00:00:00 2001 From: Evildoor Date: Tue, 5 Jun 2018 17:56:37 +0300 Subject: [PATCH] Table work update. This includes: - the program now tries to determine whether the table captions are above or below the tables on each page with tables - changes to manager.Manager debug functions which work with tables - term "(table )header" was changed to "caption" per various sources (Elsevier, for example) --- Utils/Dataflow/030_PDFAnalyzer/manager.py | 107 +++++----- Utils/Dataflow/030_PDFAnalyzer/xmltable.py | 223 ++++++++++++++------- 2 files changed, 205 insertions(+), 125 deletions(-) diff --git a/Utils/Dataflow/030_PDFAnalyzer/manager.py b/Utils/Dataflow/030_PDFAnalyzer/manager.py index 62b5b7161..f76e4ce5b 100644 --- a/Utils/Dataflow/030_PDFAnalyzer/manager.py +++ b/Utils/Dataflow/030_PDFAnalyzer/manager.py @@ -252,8 +252,8 @@ def find(self, text, intervals, datasets): dataset_categories = [montecarlo, physcont, calibration, realdata, database] # Path must have / as separator, not \. re_pdfname = re.compile(r"/([^./]+)\.pdf$") -re_table_header = re.compile(r"Table \d+:.*?\n\n", re.DOTALL) -re_table_header_short = re.compile(r"Table (\d+):") +re_table_caption = re.compile(r"Table \d+:.*?\n\n", re.DOTALL) +re_table_caption_short = re.compile(r"Table (\d+):") re_table_datasets = re.compile("(?:sample|dataset|run)") re_column_with_datasets = re.compile("^(?:d[cs]?[-_ ]?|mc[-_ ]?|data ?" "|dataset ?" @@ -671,24 +671,24 @@ def find_datasets(self): def find_datatables(self): """ Find tables in the document which may contain datasets. """ pages_with_tables = [] - headers_data = {} + captions_data = {} n = 1 - # Find pages containing table headers. + # Find pages containing table captions. while n <= self.num_pages: text = self.get_txt_page(n, True) -# print n, re_table_header.findall(text.lower()) - page_headers = re_table_header.findall(text) - page_headers_data = {} - # Among the headers find ones which may hint that their - # tables contain datasets. Store these headers, their +# print n, re_table_caption.findall(text.lower()) + page_captions = re_table_caption.findall(text) + page_captions_data = {} + # Among the captions find ones which may hint that their + # tables contain datasets. Store these captions, their # numbers and their pages. - for h in page_headers: + for h in page_captions: if re_table_datasets.search(h.lower()): - num = int(re_table_header_short.match(h).group(1)) - page_headers_data[num] = h - if page_headers_data: + num = int(re_table_caption_short.match(h).group(1)) + page_captions_data[num] = h + if page_captions_data: pages_with_tables.append(n) - headers_data.update(page_headers_data) + captions_data.update(page_captions_data) n += 1 # print "PAGES WITH DATASETS TABLES", pages_with_tables @@ -698,16 +698,16 @@ def find_datatables(self): for n in pages_with_tables: text = self.get_xml_page(n, True) tables = xmltable.get_tables_from_text(text) - # Save headers and tables matching selected numbers and + # Save captions and tables matching selected numbers and # having dataset-related columns. for table in tables: - num = int(re_table_header_short.match(table.header).group(1)) - if num in headers_data: - # print "TABLE WITH HEADER", headers_data[num].strip(),\ + num = int(re_table_caption_short.match(table.caption).group(1)) + if num in captions_data: + # print "TABLE WITH CAPTION", captions_data[num].strip(),\ # "MAY CONTAIN DATASETS" data_column = -1 skip_first = False - # Save headers and tables matching selected numbers + # Save captions and tables matching selected numbers # and having dataset-related columns. for rnum in range(0, min(2, len(table.rows))): for i in range(0, len(table.rows[rnum])): @@ -718,7 +718,7 @@ def find_datatables(self): data_column = i if rnum == 1: # This means that first row contains - # some kind of header, or rubbish, + # some kind of caption, or rubbish, # or something else, and columns are # defined in the second one. First # one must be skipped in such case. @@ -779,7 +779,7 @@ def find_datatables(self): data = " ".join([i for i in ids]) else: data = rows - datatables[num] = (headers_data[num], data) + datatables[num] = (captions_data[num], data) # elif coef < 0.7: # print "COEFFICIENT IS LOWER THAN 0.7.\ # SKIPPING TABLE", num @@ -854,8 +854,8 @@ def export(self, quick=False, outf=False): for num in self.datatables: if isinstance(self.datatables[num][1], str)\ or isinstance(self.datatables[num][1], unicode): - header, ids = self.datatables[num] - data = [header, [i for i in ids.split()]] + caption, ids = self.datatables[num] + data = [caption, [i for i in ids.split()]] else: data = self.datatables[num] outp["content"]["table_" + str(num)] = data @@ -864,8 +864,8 @@ def export(self, quick=False, outf=False): for num in tables: if isinstance(tables[num][1], str)\ or isinstance(tables[num][1], unicode): - header, ids = tables[num] - data = [header, [i for i in ids.split()]] + caption, ids = tables[num] + data = [caption, [i for i in ids.split()]] else: data = tables[num] outp["content"]["table_" + str(num)] = data @@ -1343,12 +1343,12 @@ def update_paper_parameter(self, window, paper, param, value): self.show_paper_datasets(window, paper) elif param == "datatables": paper.datatables = {} - for [num, header, data, selected] in value: + for [num, caption, data, selected] in value: if selected.get(): if isinstance(data, list): - paper.datatables[num] = (header, data) + paper.datatables[num] = (caption, data) else: - paper.datatables[num] = (header, + paper.datatables[num] = (caption, data.get("0.0", "end").strip()) self.show_paper_datatables(window, paper) @@ -1518,11 +1518,11 @@ def show_paper_datatables(self, window, paper): keys.sort() datatables_s = [] for k in keys: - (header, data) = datatables[k] + (caption, data) = datatables[k] t_frame = Tkinter.Frame(frame) selected = Tkinter.IntVar() selected.set(1) - lbl = Tkinter.Label(t_frame, text=header, + lbl = Tkinter.Label(t_frame, text=caption, font=HEADING_FONT) b = Tkinter.Checkbutton(t_frame, var=selected) if isinstance(data, str) or isinstance(data, unicode): @@ -1532,7 +1532,7 @@ def show_paper_datatables(self, window, paper): height=data.count(" ") // 5 + 2) t.insert(Tkinter.END, data) t.grid(row=1, column=0) - datatables_s.append([k, header, t, selected]) + datatables_s.append([k, caption, t, selected]) else: rows = data lbl.grid(row=0, column=0, columnspan=len(rows[0])) @@ -1551,7 +1551,7 @@ def show_paper_datatables(self, window, paper): lbl = Tkinter.Label(t_frame, text=msg) lbl.grid(row=r, columnspan=c) break - datatables_s.append([k, header, rows, selected]) + datatables_s.append([k, caption, rows, selected]) t_frame.grid(row=num, column=0) # TO DO: checkbuttons for "(un)select all". num += 1 @@ -1596,9 +1596,9 @@ def show_paper_datatables(self, window, paper): keys = paper.datatables.keys() keys.sort() for k in keys: - (header, data) = paper.datatables[k] + (caption, data) = paper.datatables[k] t_frame = Tkinter.Frame(frame) - lbl = Tkinter.Label(t_frame, text=header, + lbl = Tkinter.Label(t_frame, text=caption, font=HEADING_FONT) if isinstance(data, str) or isinstance(data, unicode): lbl.grid(row=0, column=0) @@ -1673,7 +1673,7 @@ def show_paper_page_tables(self, window, paper, e=False): tables = xmltable.get_tables_from_text(text) for table_num in range(0, len(tables)): frame = Tkinter.Frame(window) - lbl = Tkinter.Label(frame, text="Table %d" % table_num) + lbl = Tkinter.Label(frame, text=tables[table_num].caption) lbl.grid(row=0, column=0, columnspan=len(tables[table_num].rows[0])) r = 1 @@ -1736,25 +1736,36 @@ def show_paper_visual(self, window, paper, e=False): text = paper.get_xml_page(number, True) rows = xmltable.analyze_page(text) - max_width = max([row[-1].right - row[0].left for row in rows]) - header_row = False - for row in rows: - if len(row) == 1 and row[0].text.startswith("Table "): - header_row = row - color = "red" - elif header_row and len(row) == 1 and\ - abs(row[0].left - header_row[0].left) < 1.0: + caption_row = False + caption_rows = [] + for (i, row) in list(enumerate(rows)): + if row[0].text.startswith("Table ") and\ + re_table_caption_short.match(row[0].text): + caption_row = row + caption_rows.append(i) + elif caption_row and len(row) == 1 and\ + abs(row[0].left - caption_row[0].left) < 1.0: + caption_rows.append(i) + else: + caption_row = False + for (i, row) in list(enumerate(rows)): + if i in caption_rows: color = "red" - elif abs(row[-1].right - row[0].left - max_width) < 1.0: - color = "blue" else: - header_row = False - color = "black" + if len(row) > 1 and row[0].right - row[0].left < 10.0: + f = row[1] + else: + f = row[0] + if f.left - 76.0 < 1.0 and len(row) < 3: + color = "green" + else: + color = "black" for line in row: cnvs.create_rectangle((line.left, line.top + 10, line.right, line.bottom + 10), outline=color) - + cnvs.create_text((row[0].left - 30, row[0].top + 10), + text=i) b = Tkinter.Button(window, text="Back", command=lambda window=window, paper=paper: self.show_paper_info(window, paper)) diff --git a/Utils/Dataflow/030_PDFAnalyzer/xmltable.py b/Utils/Dataflow/030_PDFAnalyzer/xmltable.py index 631644715..727ec0879 100644 --- a/Utils/Dataflow/030_PDFAnalyzer/xmltable.py +++ b/Utils/Dataflow/030_PDFAnalyzer/xmltable.py @@ -9,6 +9,7 @@ re_textline = re.compile(".+?", re.DOTALL) +re_table_caption_short = re.compile(r"Table (\d+):") class TextLine: @@ -64,22 +65,35 @@ def __init__(self, params=False, text_symbols=None): self.top = float(self.top) self.bottom = float(self.bottom) + fonts = {} for line in lines: - f = self.re_text_symbol.match(line) + f = self.re_text_symbol_params.match(line) if f: - self.text += f.group(1) + self.text += f.group(3) coords = self.re_bbox.search(line) if coords: [l0, t0, r0, b0] = coords.group(1).split(",") if after_space: self.spaces_coords[-1][1] = float(l0) after_space = False + + font = (f.group(1), f.group(2)) + if font in fonts: + fonts[font] += 1 + else: + fonts[font] = 1 continue f = self.re_text_space.match(line) if f: self.text += " " self.spaces_coords.append([float(r0), 0]) after_space = True + if fonts: + m = max(fonts.values()) + for f in fonts: + if fonts[f] == m: + self.font = f + break elif isinstance(params, list): [self.left, self.top, self.right, self.bottom, @@ -148,23 +162,11 @@ class Table: re_month = re.compile("(january|february|march|april|may|june|july|august" "|september|october|november|december)") - def __init__(self, header, lines): + def __init__(self, caption, rows, caption_position): # table description - self.header = header - # table text lines - self.lines = lines - -# print "\nTABLE WITH HEADER", header + self.caption = caption - rows = [] - t = [] - # Construct rows out of lines - for line in self.lines: - if line not in t: - row = self.construct_row(line, t) - rows.append(row) - t += row - rows.sort(key=lambda row: row_centery(row)) +# print "\nTABLE WITH CAPTION", caption # Remove rows which contain date - this is used because # sometimes date stamped on a page gets caught while we are @@ -180,27 +182,43 @@ def __init__(self, header, lines): row.sort(key=lambda line: line.center[0]) self.rows.append(row) - r = len(self.rows) - 1 - # Separate table lines from text above table. This is done by + # Separate table lines from text below/above table. This is done by # looking for a space between rows which is too large. max_diff = False - n = 1 - while r > 0: - if not max_diff: - max_diff = row_centery(self.rows[r])\ - - row_centery(self.rows[r - 1]) - else: - diff = row_centery(self.rows[r]) - \ - row_centery(self.rows[r - 1]) -# print "DIFF BETWEEN", self.row_text(num = r), "AND",\ -# self.row_text(num = r - 1), ":", diff - if diff > 1.4 * max_diff: - del self.rows[0:r] - break - elif diff > max_diff: - max_diff = diff - n += 1 - r -= 1 + if caption_position < 0: + r = 0 + while r < len(self.rows) - 1: + if not max_diff: + max_diff = row_centery(self.rows[r + 1])\ + - row_centery(self.rows[r]) + else: + diff = row_centery(self.rows[r + 1]) - \ + row_centery(self.rows[r]) +# print "DIFF BETWEEN", self.row_text(num = r), "AND",\ +# self.row_text(num = r - 1), ":", diff + if diff > 1.4 * max_diff: + del self.rows[r + 1:] + break + elif diff > max_diff: + max_diff = diff + r += 1 + else: + r = len(self.rows) - 1 + while r > 0: + if not max_diff: + max_diff = row_centery(self.rows[r])\ + - row_centery(self.rows[r - 1]) + else: + diff = row_centery(self.rows[r]) - \ + row_centery(self.rows[r - 1]) +# print "DIFF BETWEEN", self.row_text(num = r), "AND",\ +# self.row_text(num = r - 1), ":", diff + if diff > 1.4 * max_diff: + del self.rows[:r] + break + elif diff > max_diff: + max_diff = diff + r -= 1 # print "ROWS" # Find overlapping (by X coordinate) lines and merge them. So @@ -361,47 +379,6 @@ def break_short_rows(self, max_elements): self.rows.sort(key=lambda row: row_centery(row)) -def get_tables_from_text(text): - """ Get tables from a xml page text. """ - re_textbox = re.compile(r"", - re.DOTALL) - re_table_header = re.compile(r"Table \d+:") - tlines = re_textline.findall(text) - lines = [] - table_headers = [] - for line in tlines: - tl = TextLine(line) - if re_table_header.match(tl.text): - table_headers.append(tl) - else: - lines.append(tl) - - # Find the highest top coordinate possible and use it as a zero - # point for new Y axis. - top = max(table_headers + lines, key=lambda line: line.top).top - for line in table_headers + lines: - line.swap_y(top) - - table_headers.sort(key=lambda x: x.center[1]) - - table_lines = [] - tables = [] - for header in table_headers: - table_lines = [] - remaining_lines = [] - for line in lines: - if line.center[1] < header.center[1]: - table_lines.append(line) - else: - remaining_lines.append(line) - - table = Table(header.text, table_lines) - if table.rows: - tables.append(table) - lines = remaining_lines - return tables - - def analyze_page(text): tlines = re_textline.findall(text) lines = [] @@ -428,5 +405,97 @@ def analyze_page(text): rows.append(row) t += row rows.sort(key=lambda row: row_centery(row)) + for line in rows[0]: + if line.text == "DRAFT": + rows = rows[1:] + break return rows + + +def get_tables_from_text(text): + """ Get tables from a xml page text. """ + rows = analyze_page(text) + caption_start = False + caption_rows = [] + text_rows = [] + for (i, row) in list(enumerate(rows)): + if row[0].text.startswith("Table ") and\ + re_table_caption_short.match(row[0].text): + caption_start = row + caption_rows.append(i) + elif caption_start and len(row) == 1 and\ + abs(row[0].left - caption_start[0].left) < 1.0: + caption_rows.append(i) + else: + caption_start = False + if len(row) > 1 and row[0].right - row[0].left < 10.0: + f = row[1] + else: + f = row[0] + if f.left - 76.0 < 1.0 and len(row) < 3: + text_rows.append(i) + # Tables' captions position on the page + # -1 - above tables + # 0 - unknown + # 1 - below tables + caption_position = 0 + if 0 in caption_rows: + caption_position = -1 + elif len(rows) - 1 in caption_rows: + caption_position = 1 + else: + for i in caption_rows: + if i - 1 in text_rows: + caption_position = -1 + break + elif i + 1 in text_rows: + caption_position = 1 + break + tables = [] + if caption_position < 0: + i = len(rows) - 1 + last_unused = i + while i >= 0: + if i in caption_rows: + table_rows = rows[i + 1:last_unused + 1] + while i - 1 in caption_rows: + i -= 1 + caption = "".join([line.text for line in rows[i]]) + while i + 1 in caption_rows: + i += 1 + caption += "".join([line.text for line in rows[i]]) + while i - 1 in caption_rows: + i -= 1 + # Row after table cannot be another caption and is first unused + # row for the next table. + i -= 1 + last_unused = i + i -= 1 + table = Table(caption, table_rows, caption_position) + if table.rows: + tables.append(table) + else: + i -= 1 + tables = list(reversed(tables)) + else: + i = 0 + last_unused = i + while i < len(rows): + if i in caption_rows: + table_rows = rows[last_unused:i] + caption = "".join([line.text for line in rows[i]]) + while i + 1 in caption_rows: + i += 1 + caption += "".join([line.text for line in rows[i]]) + # Row after table cannot be another caption and is first unused + # row for the next table. + i += 1 + last_unused = i + i += 1 + table = Table(caption, table_rows, caption_position) + if table.rows: + tables.append(table) + else: + i += 1 + return tables