Skip to content

Commit

Permalink
Table work update.
Browse files Browse the repository at this point in the history
This includes:
- the program now tries to determine whether table captions are placed
above or below the tables on each page that contains tables
- changes to manager.Manager debug functions which work with tables
- the term "(table) header" was renamed to "caption", following various
sources (Elsevier, for example)
  • Loading branch information
Evildoor committed Jun 5, 2018
1 parent 7d497b5 commit 5e149e7
Show file tree
Hide file tree
Showing 2 changed files with 205 additions and 125 deletions.
107 changes: 59 additions & 48 deletions Utils/Dataflow/030_PDFAnalyzer/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,8 @@ def find(self, text, intervals, datasets):
dataset_categories = [montecarlo, physcont, calibration, realdata, database]
# Path must have / as separator, not \.
re_pdfname = re.compile(r"/([^./]+)\.pdf$")
re_table_header = re.compile(r"Table \d+:.*?\n\n", re.DOTALL)
re_table_header_short = re.compile(r"Table (\d+):")
re_table_caption = re.compile(r"Table \d+:.*?\n\n", re.DOTALL)
re_table_caption_short = re.compile(r"Table (\d+):")
re_table_datasets = re.compile("(?:sample|dataset|run)")
re_column_with_datasets = re.compile("^(?:d[cs]?[-_ ]?|mc[-_ ]?|data ?"
"|dataset ?"
Expand Down Expand Up @@ -671,24 +671,24 @@ def find_datasets(self):
def find_datatables(self):
""" Find tables in the document which may contain datasets. """
pages_with_tables = []
headers_data = {}
captions_data = {}
n = 1
# Find pages containing table headers.
# Find pages containing table captions.
while n <= self.num_pages:
text = self.get_txt_page(n, True)
# print n, re_table_header.findall(text.lower())
page_headers = re_table_header.findall(text)
page_headers_data = {}
# Among the headers find ones which may hint that their
# tables contain datasets. Store these headers, their
# print n, re_table_caption.findall(text.lower())
page_captions = re_table_caption.findall(text)
page_captions_data = {}
# Among the captions find ones which may hint that their
# tables contain datasets. Store these captions, their
# numbers and their pages.
for h in page_headers:
for h in page_captions:
if re_table_datasets.search(h.lower()):
num = int(re_table_header_short.match(h).group(1))
page_headers_data[num] = h
if page_headers_data:
num = int(re_table_caption_short.match(h).group(1))
page_captions_data[num] = h
if page_captions_data:
pages_with_tables.append(n)
headers_data.update(page_headers_data)
captions_data.update(page_captions_data)
n += 1

# print "PAGES WITH DATASETS TABLES", pages_with_tables
Expand All @@ -698,16 +698,16 @@ def find_datatables(self):
for n in pages_with_tables:
text = self.get_xml_page(n, True)
tables = xmltable.get_tables_from_text(text)
# Save headers and tables matching selected numbers and
# Save captions and tables matching selected numbers and
# having dataset-related columns.
for table in tables:
num = int(re_table_header_short.match(table.header).group(1))
if num in headers_data:
# print "TABLE WITH HEADER", headers_data[num].strip(),\
num = int(re_table_caption_short.match(table.caption).group(1))
if num in captions_data:
# print "TABLE WITH CAPTION", captions_data[num].strip(),\
# "MAY CONTAIN DATASETS"
data_column = -1
skip_first = False
# Save headers and tables matching selected numbers
# Save captions and tables matching selected numbers
# and having dataset-related columns.
for rnum in range(0, min(2, len(table.rows))):
for i in range(0, len(table.rows[rnum])):
Expand All @@ -718,7 +718,7 @@ def find_datatables(self):
data_column = i
if rnum == 1:
# This means that first row contains
# some kind of header, or rubbish,
# some kind of caption, or rubbish,
# or something else, and columns are
# defined in the second one. First
# one must be skipped in such case.
Expand Down Expand Up @@ -779,7 +779,7 @@ def find_datatables(self):
data = " ".join([i for i in ids])
else:
data = rows
datatables[num] = (headers_data[num], data)
datatables[num] = (captions_data[num], data)
# elif coef < 0.7:
# print "COEFFICIENT IS LOWER THAN 0.7.\
# SKIPPING TABLE", num
Expand Down Expand Up @@ -854,8 +854,8 @@ def export(self, quick=False, outf=False):
for num in self.datatables:
if isinstance(self.datatables[num][1], str)\
or isinstance(self.datatables[num][1], unicode):
header, ids = self.datatables[num]
data = [header, [i for i in ids.split()]]
caption, ids = self.datatables[num]
data = [caption, [i for i in ids.split()]]
else:
data = self.datatables[num]
outp["content"]["table_" + str(num)] = data
Expand All @@ -864,8 +864,8 @@ def export(self, quick=False, outf=False):
for num in tables:
if isinstance(tables[num][1], str)\
or isinstance(tables[num][1], unicode):
header, ids = tables[num]
data = [header, [i for i in ids.split()]]
caption, ids = tables[num]
data = [caption, [i for i in ids.split()]]
else:
data = tables[num]
outp["content"]["table_" + str(num)] = data
Expand Down Expand Up @@ -1343,12 +1343,12 @@ def update_paper_parameter(self, window, paper, param, value):
self.show_paper_datasets(window, paper)
elif param == "datatables":
paper.datatables = {}
for [num, header, data, selected] in value:
for [num, caption, data, selected] in value:
if selected.get():
if isinstance(data, list):
paper.datatables[num] = (header, data)
paper.datatables[num] = (caption, data)
else:
paper.datatables[num] = (header,
paper.datatables[num] = (caption,
data.get("0.0",
"end").strip())
self.show_paper_datatables(window, paper)
Expand Down Expand Up @@ -1518,11 +1518,11 @@ def show_paper_datatables(self, window, paper):
keys.sort()
datatables_s = []
for k in keys:
(header, data) = datatables[k]
(caption, data) = datatables[k]
t_frame = Tkinter.Frame(frame)
selected = Tkinter.IntVar()
selected.set(1)
lbl = Tkinter.Label(t_frame, text=header,
lbl = Tkinter.Label(t_frame, text=caption,
font=HEADING_FONT)
b = Tkinter.Checkbutton(t_frame, var=selected)
if isinstance(data, str) or isinstance(data, unicode):
Expand All @@ -1532,7 +1532,7 @@ def show_paper_datatables(self, window, paper):
height=data.count(" ") // 5 + 2)
t.insert(Tkinter.END, data)
t.grid(row=1, column=0)
datatables_s.append([k, header, t, selected])
datatables_s.append([k, caption, t, selected])
else:
rows = data
lbl.grid(row=0, column=0, columnspan=len(rows[0]))
Expand All @@ -1551,7 +1551,7 @@ def show_paper_datatables(self, window, paper):
lbl = Tkinter.Label(t_frame, text=msg)
lbl.grid(row=r, columnspan=c)
break
datatables_s.append([k, header, rows, selected])
datatables_s.append([k, caption, rows, selected])
t_frame.grid(row=num, column=0)
# TO DO: checkbuttons for "(un)select all".
num += 1
Expand Down Expand Up @@ -1596,9 +1596,9 @@ def show_paper_datatables(self, window, paper):
keys = paper.datatables.keys()
keys.sort()
for k in keys:
(header, data) = paper.datatables[k]
(caption, data) = paper.datatables[k]
t_frame = Tkinter.Frame(frame)
lbl = Tkinter.Label(t_frame, text=header,
lbl = Tkinter.Label(t_frame, text=caption,
font=HEADING_FONT)
if isinstance(data, str) or isinstance(data, unicode):
lbl.grid(row=0, column=0)
Expand Down Expand Up @@ -1673,7 +1673,7 @@ def show_paper_page_tables(self, window, paper, e=False):
tables = xmltable.get_tables_from_text(text)
for table_num in range(0, len(tables)):
frame = Tkinter.Frame(window)
lbl = Tkinter.Label(frame, text="Table %d" % table_num)
lbl = Tkinter.Label(frame, text=tables[table_num].caption)
lbl.grid(row=0, column=0,
columnspan=len(tables[table_num].rows[0]))
r = 1
Expand Down Expand Up @@ -1736,25 +1736,36 @@ def show_paper_visual(self, window, paper, e=False):

text = paper.get_xml_page(number, True)
rows = xmltable.analyze_page(text)
max_width = max([row[-1].right - row[0].left for row in rows])
header_row = False
for row in rows:
if len(row) == 1 and row[0].text.startswith("Table "):
header_row = row
color = "red"
elif header_row and len(row) == 1 and\
abs(row[0].left - header_row[0].left) < 1.0:
caption_row = False
caption_rows = []
for (i, row) in list(enumerate(rows)):
if row[0].text.startswith("Table ") and\
re_table_caption_short.match(row[0].text):
caption_row = row
caption_rows.append(i)
elif caption_row and len(row) == 1 and\
abs(row[0].left - caption_row[0].left) < 1.0:
caption_rows.append(i)
else:
caption_row = False
for (i, row) in list(enumerate(rows)):
if i in caption_rows:
color = "red"
elif abs(row[-1].right - row[0].left - max_width) < 1.0:
color = "blue"
else:
header_row = False
color = "black"
if len(row) > 1 and row[0].right - row[0].left < 10.0:
f = row[1]
else:
f = row[0]
if f.left - 76.0 < 1.0 and len(row) < 3:
color = "green"
else:
color = "black"
for line in row:
cnvs.create_rectangle((line.left, line.top + 10,
line.right, line.bottom + 10),
outline=color)

cnvs.create_text((row[0].left - 30, row[0].top + 10),
text=i)
b = Tkinter.Button(window, text="Back",
command=lambda window=window, paper=paper:
self.show_paper_info(window, paper))
Expand Down
Loading

0 comments on commit 5e149e7

Please sign in to comment.