In [1]:
import json
import os
from typing import Dict, Any


class PDFParser:
    """Placeholder parser: the parsing logic is not implemented yet."""

    def parse(
        self,
        template: Dict[str, Any],
        pdf_data: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Accept a template and PDF data; currently a stub.

        Args:
            template: Template dictionary (not inspected by this stub).
            pdf_data: PDF data dictionary (not inspected by this stub).

        Returns:
            None for every input, because the body is empty.
            NOTE(review): the annotation promises a dict — update one or the
            other once the real implementation lands.
        """
        return None


# Relative paths: they are resolved against the kernel's working directory.
template_path: str = os.path.join("src", "templates", "barclays_template.json")
pdf_data_path: str = os.path.join("src", "pdf_data", "barclays_pdf_data.json")

# Read each JSON file inside a context manager so the handle is closed as soon
# as parsing finishes, and decode explicitly as UTF-8 rather than relying on
# the platform's default encoding.
with open(template_path, encoding="utf-8") as template_file:
    template: Dict[str, Any] = json.load(template_file)
with open(pdf_data_path, encoding="utf-8") as pdf_data_file:
    pdf_data: Dict[str, Any] = json.load(pdf_data_file)

parser: PDFParser = PDFParser()
output_generated: Dict[str, Any] = parser.parse(template, pdf_data)

# Stored output file, loaded alongside the generated one.
with open("src/outputs/barclays_output.json", encoding="utf-8") as output_file:
    output_data = json.load(output_file)


In [2]:
# Create a FormParser

In [3]:
template["pages"]

[{'page_numbers': ['1'],
  'forms': ['customer_details', 'sort_code', 'account_number']},
 {'page_numbers': ['2'],
  'forms': ['start_balance',
   'money_in',
   'money_out',
   'end_balance',
   'overdraft_limit']},
 {'page_numbers': ['2-(-2)'],
  'forms': [],
  'tables': ['transactions_page_2_onwards']},
 {'page_numbers': ['-1'], 'ignore': True}]

In [4]:
# Page size taken from the PDF data; convert_coordinates multiplies the
# rule's coordinates by these two values.
pdf_width, pdf_height = (
    pdf_data["dimensions"]["width"],
    pdf_data["dimensions"]["height"],
)
page_number = 1
# First rule of the template, used as the working example in the cells below.
form_rule = template["rules"][0]

In [5]:
def convert_coordinates(form_rule: Dict[str, Any], pdf_width: int, pdf_height: int) -> Dict[str, int]:
    """Scale a rule's corner coordinates by the page size and truncate to ints.

    Args:
        form_rule: Rule dictionary; the values read are
            form_rule['config']['coordinates'][corner][axis] for the corners
            'top_left' / 'bottom_right' and the axes 'x' / 'y'.
            Presumably these are fractions of the page size — confirm against
            the template schema.
        pdf_width: Multiplier applied to the 'x' values.
        pdf_height: Multiplier applied to the 'y' values.

    Returns:
        Dictionary with the keys 'top_left_x', 'top_left_y', 'bottom_right_x'
        and 'bottom_right_y', each the product passed through int()
        (truncation toward zero, not rounding).
    """
    corners = form_rule['config']['coordinates']

    def scaled(corner: str, axis: str, extent: int) -> int:
        # int() drops the fractional part of the scaled value.
        return int(extent * corners[corner][axis])

    return {
        'top_left_x': scaled('top_left', 'x', pdf_width),
        'top_left_y': scaled('top_left', 'y', pdf_height),
        'bottom_right_x': scaled('bottom_right', 'x', pdf_width),
        'bottom_right_y': scaled('bottom_right', 'y', pdf_height),
    }

# Pixel-space box for the example rule, scaled to this PDF's page size.
coordinates = convert_coordinates(
    form_rule=form_rule,
    pdf_width=pdf_width,
    pdf_height=pdf_height,
)

In [6]:
coordinates

{'top_left_x': 270,
 'top_left_y': 509,
 'bottom_right_x': 294,
 'bottom_right_y': 520}

In [7]:
# Content list of the first entry in pdf_data["pages"] (index 0 is hard-coded).
# NOTE(review): page_number = 1 is set in an earlier cell but is not used
# here — confirm whether this index should be derived from it.
page_data = pdf_data["pages"][0]["content"]

In [8]:
page_data

[{'text': 'Your',
  'bounding_box': {'top_left': {'x': 433.8, 'y': 25.5},
   'bottom_right': {'x': 462.62600000000003, 'y': 39.5}}},
 {'text': 'statement',
  'bounding_box': {'top_left': {'x': 466.05600000000004, 'y': 25.5},
   'bottom_right': {'x': 527.628, 'y': 39.5}}},
 {'text': 'Miss',
  'bounding_box': {'top_left': {'x': 433.8, 'y': 53.35000000000002},
   'bottom_right': {'x': 451.84499999999997, 'y': 62.35000000000002}}},
 {'text': 'Meghan',
  'bounding_box': {'top_left': {'x': 454.05, 'y': 53.35000000000002},
   'bottom_right': {'x': 486.324, 'y': 62.35000000000002}}},
 {'text': 'Victoria',
  'bounding_box': {'top_left': {'x': 488.529, 'y': 53.35000000000002},
   'bottom_right': {'x': 518.562, 'y': 62.35000000000002}}},
 {'text': 'Thorneloe',
  'bounding_box': {'top_left': {'x': 520.767, 'y': 53.35000000000002},
   'bottom_right': {'x': 560.547, 'y': 62.35000000000002}}},
 {'text': '28',
  'bounding_box': {'top_left': {'x': 433.8, 'y': 67.14999999999998},
   'bottom_right': {'x'

In [9]:
# page_data = [{'text': 'Your',
#   'bounding_box': {'top_left': {'x': 433.8, 'y': 25.5},
#    'bottom_right': {'x': 462.62600000000003, 'y': 39.5}}},
#  {'text': 'statement',
#   'bounding_box': {'top_left': {'x': 466.05600000000004, 'y': 25.5},
#    'bottom_right': {'x': 527.628, 'y': 39.5}}},
#  {'text': 'Miss',
#   'bounding_box': {'top_left': {'x': 433.8, 'y': 53.35000000000002},
#    'bottom_right': {'x': 451.84499999999997, 'y': 62.35000000000002}}},
#  {'text': 'Meghan',
#   'bounding_box': {'top_left': {'x': 454.05, 'y': 53.35000000000002},
#    'bottom_right': {'x': 486.324, 'y': 62.35000000000002}}},
#  {'text': 'Victoria',
#   'bounding_box': {'top_left': {'x': 488.529, 'y': 53.35000000000002},
#    'bottom_right': {'x': 518.562, 'y': 62.35000000000002}}},
#  {'text': 'Thorneloe',
#   'bounding_box': {'top_left': {'x': 520.767, 'y': 53.35000000000002},
#    'bottom_right': {'x': 560.547, 'y': 62.35000000000002}}}]

# coordinates = {
#     'top_left_x': 420,
#     'top_left_y': 20,
#     'bottom_right_x': 470,
#     'bottom_right_y': 45
# }

def get_items_in_bounding_box(page_data, coordinates):
    """Select the items whose bounding box lies entirely inside a rectangle.

    Args:
        page_data: Iterable of items, each with a 'bounding_box' entry holding
            'top_left' and 'bottom_right' points with 'x' and 'y' values.
        coordinates: Rectangle given by the keys 'top_left_x', 'top_left_y',
            'bottom_right_x' and 'bottom_right_y'.

    Returns:
        A new list, in input order, of the items whose top-left corner is at
        or beyond the rectangle's top-left and whose bottom-right corner is at
        or before the rectangle's bottom-right (edges are inclusive). Items
        that only partially overlap the rectangle are left out.
    """
    def is_enclosed(entry):
        box = entry['bounding_box']
        # Checks run in a fixed order and stop at the first one that fails.
        return (
            box['top_left']['x'] >= coordinates['top_left_x']
            and box['top_left']['y'] >= coordinates['top_left_y']
            and box['bottom_right']['x'] <= coordinates['bottom_right_x']
            and box['bottom_right']['y'] <= coordinates['bottom_right_y']
        )

    return list(filter(is_enclosed, page_data))

# Items of the page whose bounding boxes fall fully inside the rule's box.
items_within_coordinates = get_items_in_bounding_box(
    page_data=page_data,
    coordinates=coordinates,
)

In [10]:
items_within_coordinates

[]