From 5e149e75d48280754bf8e31f916f8970c1d17415 Mon Sep 17 00:00:00 2001
From: Evildoor <evildoor256@gmail.com>
Date: Tue, 5 Jun 2018 17:56:37 +0300
Subject: [PATCH] Table work update.

This includes:
- the program now tries to determine, for each page containing tables,
whether the table captions are above or below the tables
- changes to the manager.Manager debug functions which work with tables
- the term "(table )header" was changed to "caption", following various
sources (Elsevier, for example)
---
 Utils/Dataflow/030_PDFAnalyzer/manager.py  | 107 +++++-----
 Utils/Dataflow/030_PDFAnalyzer/xmltable.py | 223 ++++++++++++++-------
 2 files changed, 205 insertions(+), 125 deletions(-)

diff --git a/Utils/Dataflow/030_PDFAnalyzer/manager.py b/Utils/Dataflow/030_PDFAnalyzer/manager.py
index 62b5b7161..f76e4ce5b 100644
--- a/Utils/Dataflow/030_PDFAnalyzer/manager.py
+++ b/Utils/Dataflow/030_PDFAnalyzer/manager.py
@@ -252,8 +252,8 @@ def find(self, text, intervals, datasets):
 dataset_categories = [montecarlo, physcont, calibration, realdata, database]
 # Path must have / as separator, not \.
 re_pdfname = re.compile(r"/([^./]+)\.pdf$")
-re_table_header = re.compile(r"Table \d+:.*?\n\n", re.DOTALL)
-re_table_header_short = re.compile(r"Table (\d+):")
+re_table_caption = re.compile(r"Table \d+:.*?\n\n", re.DOTALL)
+re_table_caption_short = re.compile(r"Table (\d+):")
 re_table_datasets = re.compile("(?:sample|dataset|run)")
 re_column_with_datasets = re.compile("^(?:d[cs]?[-_ ]?|mc[-_ ]?|data ?"
                                      "|dataset ?"
@@ -671,24 +671,24 @@ def find_datasets(self):
     def find_datatables(self):
         """ Find tables in the document which may contain datasets. """
         pages_with_tables = []
-        headers_data = {}
+        captions_data = {}
         n = 1
-        # Find pages containing table headers.
+        # Find pages containing table captions.
         while n <= self.num_pages:
             text = self.get_txt_page(n, True)
-#            print n, re_table_header.findall(text.lower())
-            page_headers = re_table_header.findall(text)
-            page_headers_data = {}
-            # Among the headers find ones which may hint that their
-            # tables contain datasets. Store these headers, their
+#            print n, re_table_caption.findall(text.lower())
+            page_captions = re_table_caption.findall(text)
+            page_captions_data = {}
+            # Among the captions find ones which may hint that their
+            # tables contain datasets. Store these captions, their
             # numbers and their pages.
-            for h in page_headers:
+            for h in page_captions:
                 if re_table_datasets.search(h.lower()):
-                    num = int(re_table_header_short.match(h).group(1))
-                    page_headers_data[num] = h
-            if page_headers_data:
+                    num = int(re_table_caption_short.match(h).group(1))
+                    page_captions_data[num] = h
+            if page_captions_data:
                 pages_with_tables.append(n)
-                headers_data.update(page_headers_data)
+                captions_data.update(page_captions_data)
             n += 1
 
 #        print "PAGES WITH DATASETS TABLES", pages_with_tables
@@ -698,16 +698,16 @@ def find_datatables(self):
         for n in pages_with_tables:
             text = self.get_xml_page(n, True)
             tables = xmltable.get_tables_from_text(text)
-            # Save headers and tables matching selected numbers and
+            # Save captions and tables matching selected numbers and
             # having dataset-related columns.
             for table in tables:
-                num = int(re_table_header_short.match(table.header).group(1))
-                if num in headers_data:
-                    # print "TABLE WITH HEADER", headers_data[num].strip(),\
+                num = int(re_table_caption_short.match(table.caption).group(1))
+                if num in captions_data:
+                    # print "TABLE WITH CAPTION", captions_data[num].strip(),\
                     #                          "MAY CONTAIN DATASETS"
                     data_column = -1
                     skip_first = False
-                    # Save headers and tables matching selected numbers
+                    # Save captions and tables matching selected numbers
                     # and having dataset-related columns.
                     for rnum in range(0, min(2, len(table.rows))):
                         for i in range(0, len(table.rows[rnum])):
@@ -718,7 +718,7 @@ def find_datatables(self):
                                 data_column = i
                                 if rnum == 1:
                                     # This means that first row contains
-                                    # some kind of header, or rubbish,
+                                    # some kind of caption, or rubbish,
                                     # or something else, and columns are
                                     # defined in the second one. First
                                     # one must be skipped in such case.
@@ -779,7 +779,7 @@ def find_datatables(self):
                                 data = " ".join([i for i in ids])
                             else:
                                 data = rows
-                            datatables[num] = (headers_data[num], data)
+                            datatables[num] = (captions_data[num], data)
 # elif coef < 0.7:
 # print "COEFFICIENT IS LOWER THAN 0.7.\
 # SKIPPING TABLE", num
@@ -854,8 +854,8 @@ def export(self, quick=False, outf=False):
             for num in self.datatables:
                 if isinstance(self.datatables[num][1], str)\
                    or isinstance(self.datatables[num][1], unicode):
-                    header, ids = self.datatables[num]
-                    data = [header, [i for i in ids.split()]]
+                    caption, ids = self.datatables[num]
+                    data = [caption, [i for i in ids.split()]]
                 else:
                     data = self.datatables[num]
                 outp["content"]["table_" + str(num)] = data
@@ -864,8 +864,8 @@ def export(self, quick=False, outf=False):
             for num in tables:
                 if isinstance(tables[num][1], str)\
                    or isinstance(tables[num][1], unicode):
-                    header, ids = tables[num]
-                    data = [header, [i for i in ids.split()]]
+                    caption, ids = tables[num]
+                    data = [caption, [i for i in ids.split()]]
                 else:
                     data = tables[num]
                 outp["content"]["table_" + str(num)] = data
@@ -1343,12 +1343,12 @@ def update_paper_parameter(self, window, paper, param, value):
             self.show_paper_datasets(window, paper)
         elif param == "datatables":
             paper.datatables = {}
-            for [num, header, data, selected] in value:
+            for [num, caption, data, selected] in value:
                 if selected.get():
                     if isinstance(data, list):
-                        paper.datatables[num] = (header, data)
+                        paper.datatables[num] = (caption, data)
                     else:
-                        paper.datatables[num] = (header,
+                        paper.datatables[num] = (caption,
                                                  data.get("0.0",
                                                           "end").strip())
             self.show_paper_datatables(window, paper)
@@ -1518,11 +1518,11 @@ def show_paper_datatables(self, window, paper):
                 keys.sort()
                 datatables_s = []
                 for k in keys:
-                    (header, data) = datatables[k]
+                    (caption, data) = datatables[k]
                     t_frame = Tkinter.Frame(frame)
                     selected = Tkinter.IntVar()
                     selected.set(1)
-                    lbl = Tkinter.Label(t_frame, text=header,
+                    lbl = Tkinter.Label(t_frame, text=caption,
                                         font=HEADING_FONT)
                     b = Tkinter.Checkbutton(t_frame, var=selected)
                     if isinstance(data, str) or isinstance(data, unicode):
@@ -1532,7 +1532,7 @@ def show_paper_datatables(self, window, paper):
                                          height=data.count(" ") // 5 + 2)
                         t.insert(Tkinter.END, data)
                         t.grid(row=1, column=0)
-                        datatables_s.append([k, header, t, selected])
+                        datatables_s.append([k, caption, t, selected])
                     else:
                         rows = data
                         lbl.grid(row=0, column=0, columnspan=len(rows[0]))
@@ -1551,7 +1551,7 @@ def show_paper_datatables(self, window, paper):
                                 lbl = Tkinter.Label(t_frame, text=msg)
                                 lbl.grid(row=r, columnspan=c)
                                 break
-                        datatables_s.append([k, header, rows, selected])
+                        datatables_s.append([k, caption, rows, selected])
                     t_frame.grid(row=num, column=0)
                     # TO DO: checkbuttons for "(un)select all".
                     num += 1
@@ -1596,9 +1596,9 @@ def show_paper_datatables(self, window, paper):
                 keys = paper.datatables.keys()
                 keys.sort()
                 for k in keys:
-                    (header, data) = paper.datatables[k]
+                    (caption, data) = paper.datatables[k]
                     t_frame = Tkinter.Frame(frame)
-                    lbl = Tkinter.Label(t_frame, text=header,
+                    lbl = Tkinter.Label(t_frame, text=caption,
                                         font=HEADING_FONT)
                     if isinstance(data, str) or isinstance(data, unicode):
                         lbl.grid(row=0, column=0)
@@ -1673,7 +1673,7 @@ def show_paper_page_tables(self, window, paper, e=False):
                 tables = xmltable.get_tables_from_text(text)
                 for table_num in range(0, len(tables)):
                     frame = Tkinter.Frame(window)
-                    lbl = Tkinter.Label(frame, text="Table %d" % table_num)
+                    lbl = Tkinter.Label(frame, text=tables[table_num].caption)
                     lbl.grid(row=0, column=0,
                              columnspan=len(tables[table_num].rows[0]))
                     r = 1
@@ -1736,25 +1736,36 @@ def show_paper_visual(self, window, paper, e=False):
 
                 text = paper.get_xml_page(number, True)
                 rows = xmltable.analyze_page(text)
-                max_width = max([row[-1].right - row[0].left for row in rows])
-                header_row = False
-                for row in rows:
-                    if len(row) == 1 and row[0].text.startswith("Table "):
-                        header_row = row
-                        color = "red"
-                    elif header_row and len(row) == 1 and\
-                            abs(row[0].left - header_row[0].left) < 1.0:
+                caption_row = False
+                caption_rows = []
+                for (i, row) in list(enumerate(rows)):
+                    if row[0].text.startswith("Table ") and\
+                       re_table_caption_short.match(row[0].text):
+                        caption_row = row
+                        caption_rows.append(i)
+                    elif caption_row and len(row) == 1 and\
+                            abs(row[0].left - caption_row[0].left) < 1.0:
+                        caption_rows.append(i)
+                    else:
+                        caption_row = False
+                for (i, row) in list(enumerate(rows)):
+                    if i in caption_rows:
                         color = "red"
-                    elif abs(row[-1].right - row[0].left - max_width) < 1.0:
-                        color = "blue"
                     else:
-                        header_row = False
-                        color = "black"
+                        if len(row) > 1 and row[0].right - row[0].left < 10.0:
+                            f = row[1]
+                        else:
+                            f = row[0]
+                        if f.left - 76.0 < 1.0 and len(row) < 3:
+                            color = "green"
+                        else:
+                            color = "black"
                     for line in row:
                         cnvs.create_rectangle((line.left, line.top + 10,
                                                line.right, line.bottom + 10),
                                               outline=color)
-
+                    cnvs.create_text((row[0].left - 30, row[0].top + 10),
+                                     text=i)
                 b = Tkinter.Button(window, text="Back",
                                    command=lambda window=window, paper=paper:
                                    self.show_paper_info(window, paper))
diff --git a/Utils/Dataflow/030_PDFAnalyzer/xmltable.py b/Utils/Dataflow/030_PDFAnalyzer/xmltable.py
index 631644715..727ec0879 100644
--- a/Utils/Dataflow/030_PDFAnalyzer/xmltable.py
+++ b/Utils/Dataflow/030_PDFAnalyzer/xmltable.py
@@ -9,6 +9,7 @@
 
 re_textline = re.compile("<textline bbox=\"[0-9.,]+\">.+?</textline>",
                          re.DOTALL)
+re_table_caption_short = re.compile(r"Table (\d+):")
 
 
 class TextLine:
@@ -64,22 +65,35 @@ def __init__(self, params=False, text_symbols=None):
             self.top = float(self.top)
             self.bottom = float(self.bottom)
 
+            fonts = {}
             for line in lines:
-                f = self.re_text_symbol.match(line)
+                f = self.re_text_symbol_params.match(line)
                 if f:
-                    self.text += f.group(1)
+                    self.text += f.group(3)
                     coords = self.re_bbox.search(line)
                     if coords:
                         [l0, t0, r0, b0] = coords.group(1).split(",")
                         if after_space:
                             self.spaces_coords[-1][1] = float(l0)
                             after_space = False
+
+                    font = (f.group(1), f.group(2))
+                    if font in fonts:
+                        fonts[font] += 1
+                    else:
+                        fonts[font] = 1
                     continue
                 f = self.re_text_space.match(line)
                 if f:
                     self.text += " "
                     self.spaces_coords.append([float(r0), 0])
                     after_space = True
+            if fonts:
+                m = max(fonts.values())
+                for f in fonts:
+                    if fonts[f] == m:
+                        self.font = f
+                        break
 
         elif isinstance(params, list):
             [self.left, self.top, self.right, self.bottom,
@@ -148,23 +162,11 @@ class Table:
     re_month = re.compile("(january|february|march|april|may|june|july|august"
                           "|september|october|november|december)")
 
-    def __init__(self, header, lines):
+    def __init__(self, caption, rows, caption_position):
         # table description
-        self.header = header
-        # table text lines
-        self.lines = lines
-
-#        print "\nTABLE WITH HEADER", header
+        self.caption = caption
 
-        rows = []
-        t = []
-        # Construct rows out of lines
-        for line in self.lines:
-            if line not in t:
-                row = self.construct_row(line, t)
-                rows.append(row)
-                t += row
-        rows.sort(key=lambda row: row_centery(row))
+#        print "\nTABLE WITH CAPTION", caption
 
         # Remove rows which contain date - this is used because
         # sometimes date stamped on a page gets caught while we are
@@ -180,27 +182,43 @@ def __init__(self, header, lines):
                 row.sort(key=lambda line: line.center[0])
                 self.rows.append(row)
 
-        r = len(self.rows) - 1
-        # Separate table lines from text above table. This is done by
+        # Separate table lines from text below/above table. This is done by
         # looking for a space between rows which is too large.
         max_diff = False
-        n = 1
-        while r > 0:
-            if not max_diff:
-                max_diff = row_centery(self.rows[r])\
-                    - row_centery(self.rows[r - 1])
-            else:
-                diff = row_centery(self.rows[r]) - \
-                    row_centery(self.rows[r - 1])
-#                print "DIFF BETWEEN", self.row_text(num = r), "AND",\
-#                      self.row_text(num = r - 1), ":", diff
-                if diff > 1.4 * max_diff:
-                    del self.rows[0:r]
-                    break
-                elif diff > max_diff:
-                    max_diff = diff
-            n += 1
-            r -= 1
+        if caption_position < 0:
+            r = 0
+            while r < len(self.rows) - 1:
+                if not max_diff:
+                    max_diff = row_centery(self.rows[r + 1])\
+                        - row_centery(self.rows[r])
+                else:
+                    diff = row_centery(self.rows[r + 1]) - \
+                        row_centery(self.rows[r])
+#                    print "DIFF BETWEEN", self.row_text(num = r + 1), "AND",\
+#                          self.row_text(num = r), ":", diff
+                    if diff > 1.4 * max_diff:
+                        del self.rows[r + 1:]
+                        break
+                    elif diff > max_diff:
+                        max_diff = diff
+                r += 1
+        else:
+            r = len(self.rows) - 1
+            while r > 0:
+                if not max_diff:
+                    max_diff = row_centery(self.rows[r])\
+                        - row_centery(self.rows[r - 1])
+                else:
+                    diff = row_centery(self.rows[r]) - \
+                        row_centery(self.rows[r - 1])
+#                    print "DIFF BETWEEN", self.row_text(num = r), "AND",\
+#                          self.row_text(num = r - 1), ":", diff
+                    if diff > 1.4 * max_diff:
+                        del self.rows[:r]
+                        break
+                    elif diff > max_diff:
+                        max_diff = diff
+                r -= 1
 
 #        print "ROWS"
         # Find overlapping (by X coordinate) lines and merge them. So
@@ -361,47 +379,6 @@ def break_short_rows(self, max_elements):
         self.rows.sort(key=lambda row: row_centery(row))
 
 
-def get_tables_from_text(text):
-    """ Get tables from a xml page text. """
-    re_textbox = re.compile(r"<textbox id=\"\d+\" bbox=\"([0-9.,]+)\">",
-                            re.DOTALL)
-    re_table_header = re.compile(r"Table \d+:")
-    tlines = re_textline.findall(text)
-    lines = []
-    table_headers = []
-    for line in tlines:
-        tl = TextLine(line)
-        if re_table_header.match(tl.text):
-            table_headers.append(tl)
-        else:
-            lines.append(tl)
-
-    # Find the highest top coordinate possible and use it as a zero
-    # point for new Y axis.
-    top = max(table_headers + lines, key=lambda line: line.top).top
-    for line in table_headers + lines:
-        line.swap_y(top)
-
-    table_headers.sort(key=lambda x: x.center[1])
-
-    table_lines = []
-    tables = []
-    for header in table_headers:
-        table_lines = []
-        remaining_lines = []
-        for line in lines:
-            if line.center[1] < header.center[1]:
-                table_lines.append(line)
-            else:
-                remaining_lines.append(line)
-
-        table = Table(header.text, table_lines)
-        if table.rows:
-            tables.append(table)
-        lines = remaining_lines
-    return tables
-
-
 def analyze_page(text):
     tlines = re_textline.findall(text)
     lines = []
@@ -428,5 +405,97 @@ def analyze_page(text):
             rows.append(row)
             t += row
     rows.sort(key=lambda row: row_centery(row))
+    for line in rows[0]:
+        if line.text == "DRAFT":
+            rows = rows[1:]
+            break
 
     return rows
+
+
+def get_tables_from_text(text):
+    """ Get tables from the text of an XML page. """
+    rows = analyze_page(text)
+    caption_start = False
+    caption_rows = []
+    text_rows = []
+    for (i, row) in list(enumerate(rows)):
+        if row[0].text.startswith("Table ") and\
+           re_table_caption_short.match(row[0].text):
+            caption_start = row
+            caption_rows.append(i)
+        elif caption_start and len(row) == 1 and\
+                abs(row[0].left - caption_start[0].left) < 1.0:
+            caption_rows.append(i)
+        else:
+            caption_start = False
+            if len(row) > 1 and row[0].right - row[0].left < 10.0:
+                f = row[1]
+            else:
+                f = row[0]
+            if f.left - 76.0 < 1.0 and len(row) < 3:
+                text_rows.append(i)
+    # Position of the table captions on the page:
+    # -1 - above tables
+    # 0 - unknown (processed the same way as 1)
+    # 1 - below tables
+    caption_position = 0
+    if 0 in caption_rows:
+        caption_position = -1
+    elif len(rows) - 1 in caption_rows:
+        caption_position = 1
+    else:
+        for i in caption_rows:
+            if i - 1 in text_rows:
+                caption_position = -1
+                break
+            elif i + 1 in text_rows:
+                caption_position = 1
+                break
+    tables = []
+    if caption_position < 0:
+        i = len(rows) - 1
+        last_unused = i
+        while i >= 0:
+            if i in caption_rows:
+                table_rows = rows[i + 1:last_unused + 1]
+                while i - 1 in caption_rows:
+                    i -= 1
+                caption = "".join([line.text for line in rows[i]])
+                while i + 1 in caption_rows:
+                    i += 1
+                    caption += "".join([line.text for line in rows[i]])
+                while i - 1 in caption_rows:
+                    i -= 1
+                # Row above the caption cannot be another caption; it is the
+                # last unused row for the next table found going upwards.
+                i -= 1
+                last_unused = i
+                i -= 1
+                table = Table(caption, table_rows, caption_position)
+                if table.rows:
+                    tables.append(table)
+            else:
+                i -= 1
+        tables = list(reversed(tables))
+    else:
+        i = 0
+        last_unused = i
+        while i < len(rows):
+            if i in caption_rows:
+                table_rows = rows[last_unused:i]
+                caption = "".join([line.text for line in rows[i]])
+                while i + 1 in caption_rows:
+                    i += 1
+                    caption += "".join([line.text for line in rows[i]])
+                # Row after the caption cannot be another caption; it is the
+                # first unused row for the next table.
+                i += 1
+                last_unused = i
+                i += 1
+                table = Table(caption, table_rows, caption_position)
+                if table.rows:
+                    tables.append(table)
+            else:
+                i += 1
+    return tables