Skip to content

Commit

Permalink
Trying to get context around PDF fields
Browse files Browse the repository at this point in the history
Harder than anticipated, for a few reasons:

* can't get things in order. I guess that's the point of PDFminer, but...
* PDF miner doesn't give you AcroForms at all. It has a completley hardcoded
  way of getting them, outside the context of the page.
* We can do these two things:
    * kinda put all of the fields back in the original text (see replace_original_text).
      Doesn't work too well though, lots of duplicate pieces of text that put many of the fields
      in the same place, when they should be in a different place.
        * could gather fields with the same adjacent text, and get all parts of that text in the PDF.
          Not guaranteed to be in order tho.
    * for each field, get all of the surrounding context. Is okay! But consistently gets too much text for GPT4.
      Even if we make it smaller, sometimes the surrounding context isn't the full sentence, or gets too much from other
      fields (will have too much shared / confusing the two fields).

TBH next goal is to try the PDFPageAndFieldInterpreter approach, notes in there.
  • Loading branch information
BryceStevenWilley committed Oct 27, 2023
1 parent 5616b72 commit 2c01ae0
Showing 1 changed file with 151 additions and 27 deletions.
178 changes: 151 additions & 27 deletions formfyxer/pdf_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from copy import copy
from typing import (
Any,
Callable,
Dict,
Iterable,
Optional,
Expand All @@ -30,8 +31,22 @@
from reportlab.pdfgen import canvas
from reportlab.lib.colors import magenta, pink, blue

from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LAParams, LTPage, LTTextBoxHorizontal, LTChar, LTContainer
from pdfminer.converter import PDFLayoutAnalyzer, TextConverter
from pdfminer.layout import (
LAParams,
LTPage,
LTTextBoxHorizontal,
LTChar,
LTContainer,
LTAnno,
LTText,
LTTextBox,
LTTextBoxVertical,
LTTextGroup,
LTTextLine,
LTImage,
LTItem,
)
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
Expand Down Expand Up @@ -689,6 +704,33 @@ def get_result(self) -> List[LTPage]:
return self.results


class PDFPageAndFieldInterpreter(PDFPageInterpreter):
# TODO: keep track of all of the fields per page, insert them when rendering the page
pass


class TextAndFieldConverter(TextConverter):
def receive_layout(self, ltpage: LTPage) -> None:
def render(item: LTItem) -> None:
if isinstance(item, LTContainer):
for child in item:
render(child)
elif isinstance(item, LTText):
self.write_text(item.get_text())
if isinstance(item, LTTextBox):
self.write_text("\n")
elif isinstance(item, LTImage):
if self.imagewriter is not None:
self.imagewriter.export_image(item)
elif isinstance(item, LTAnnot):
self.write_text(item.get_text())

if self.showpageno:
self.write_text("Page %s\n" % ltpage.pageid)
render(ltpage)
self.write_text("\f")


class Textbox(TypedDict):
textbox: LTTextBoxHorizontal
bbox: BoundingBoxF
Expand Down Expand Up @@ -1039,11 +1081,115 @@ def get_possible_fields(
return fields


class ImproveNameVisitor:
def __init__(self):
self.used_field_names = set()

def improve_name_with_surrounding_text(
self, field_info: FormField, textboxes: List[Textbox]
) -> FormField:
dists = [
(
bbox_distance(field_info.get_bbox(), textbox["bbox"])[0],
textbox["textbox"],
textbox["bbox"],
)
for textbox in textboxes
]
if DEBUG:
print(f"For {field_info.name}, dists: {dists}")
min_textbox = min(dists, key=lambda d: d[0])
# TODO(brycew): remove the text boxes if they intersect something, unlikely they are the label for more than one.
# text_obj_bboxes.remove(min_obj[2])
# TODO(brycew): actual regex replacement of lots of underscores
label = re.sub("[\W]", "_", min_textbox[1].get_text().lower().strip(" \n\t_,."))
label = re.sub("_{3,}", "_", label).strip("_")
if label not in self.used_field_names:
field_info.name = label
self.used_field_names.add(label)
elif DEBUG:
print(f"avoiding using label {label} more than once")
return field_info


class AllCloseTextVisitor:
def __init__(self):
self.field_map = {}

def all_close_text(self, field_info, textboxes) -> FormField:
dists = [
(tb["bbox"][0] + tb["bbox"][1] * 1000, tb["textbox"].get_text())
for tb in textboxes
] + [
(
field_info.get_bbox()[0] + field_info.get_bbox()[1] * 1000,
"{{ " + field_info.name + "}} ",
)
]
textbox_order = sorted(dists, key=lambda d: d[0])
all_text = "".join([tb[1] for tb in textbox_order])
self.field_map[field_info.name] = all_text
return field_info


class LowestVertVisitor:
"""Gets just the closest text to the field, and returns that"""

def __init__(self):
self.field_map = {}

def lowest_vert(fi, tbs):
dists = []
for tb in tbs:
dist = pdf_wrangling.bbox_distance(fi.get_bbox(), tb["bbox"])
a_side, b_side = dist[1], dist[2]
closest_side_dist = min(
pdf_wrangling.get_dist(a_side[0], b_side[0]),
pdf_wrangling.get_dist(a_side[1], b_side[1]),
)
enumm = ("After" if closest_side_dist > 0 else "Before",)
tup = (dist[0], enumm, tb["textbox"], tb["bbox"])
dists.append(tup)
min_tb = min(dists, key=lambda d: d[0])
print(f"{fi.name}, {min_tb[2].get_text()}")
self.field_map[fi.name] = min_tb
return fi


def replace_in_original(original_text, field_map):
"""Given the original text of a PDF (extract_text(...)), adds the field's names in their best places.
Doesn't always work, especially with duplicate text.
"""
text = original_text
for field_info in field_map.items():
try:
idx = text.index(field_info[1][2].get_text())
print(f"{field_info[0]}, {idx}")
if field_info[1][1] == "Before":
text = text[:idx] + " {{ " + field_info[0] + " }} " + text[idx:]
else:
new_idx = idx + len(field_info[1][2].get_text())
text = text[:new_idx] + " {{ " + field_info[0] + " }} " + text[new_idx:]
except Exception as ex:
print(f"EXCEPTION on {field_info[0]}: {ex}")
return text


def improve_names_with_surrounding_text(
fields: List[List[FormField]], textboxes: List[List[Textbox]]
):
) -> List[List[FormField]]:
name_visitor = ImproveNameVisitor()
return surrounding_text_traverse(
fields,
textboxes,
lambda fi, tbs: name_visitor.improve_name_with_surrounding_text(fi, tbs),
)


def surrounding_text_traverse(
fields: List[List[FormField]], textboxes: List[List[Textbox]], visitor: Callable
) -> List[List[FormField]]:
new_fields = []
used_field_names = set()
for i, (fields_in_page, text_in_page) in enumerate(zip(fields, textboxes)):
# Get text boxes with more than one character (not including spaces, _, etc.)
text_in_page = [
Expand Down Expand Up @@ -1071,29 +1217,7 @@ def improve_names_with_surrounding_text(
if intersect
]
if intersected:
dists = [
(
bbox_distance(field_bbox, textbox["bbox"])[0],
textbox["textbox"],
textbox["bbox"],
)
for textbox in intersected
]
if DEBUG:
print(f"For {field_info.name}, dists: {dists}")
min_textbox = min(dists, key=lambda d: d[0])
# TODO(brycew): remove the text boxes if they intersect something, unlikely they are the label for more than one.
# text_obj_bboxes.remove(min_obj[2])
# TODO(brycew): actual regex replacement of lots of underscores
label = re.sub(
"[\W]", "_", min_textbox[1].get_text().lower().strip(" \n\t_,.")
)
label = re.sub("_{3,}", "_", label).strip("_")
if label not in used_field_names:
copied_field_info.name = label
used_field_names.add(label)
elif DEBUG:
print(f"avoiding using label {label} more than once")
copied_field_info = visitor(copied_field_info, intersected)
page_fields.append(copied_field_info)

new_fields.append(page_fields)
Expand Down

0 comments on commit 2c01ae0

Please sign in to comment.