In [1]:
class TableParser:
    """Pairs parsed PDF data with the template that describes how to read it.

    Table extraction is not implemented yet: ``extract_table_data`` is a
    placeholder that always returns ``None``.
    """

    def __init__(self, pdf_data, template):
        """Keep references to both inputs on the instance, unmodified."""
        self.pdf_data, self.template = pdf_data, template

    def extract_table_data(self, table_config, page_data):
        """Placeholder: accepts a table config and page data, returns ``None``."""
        return

In [2]:
import json
import os
from typing import Dict, Any

template_path: str = os.path.join("src", "templates", "barclays_template.json")
pdf_data_path: str = os.path.join("src", "pdf_data", "barclays_march_2_pdf_data.json")
# Built with os.path.join like the two paths above, for consistency.
output_data_path: str = os.path.join("src", "outputs", "barclays_march_2_output.json")


def _load_json(path: str) -> Any:
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path) as json_file:
        return json.load(json_file)


# Every file is read through a context manager so its handle is released as
# soon as parsing finishes instead of being left open for the kernel's lifetime.
template: Dict[str, Any] = _load_json(template_path)
pdf_data: Dict[str, Any] = _load_json(pdf_data_path)

# NOTE(review): presumably the expected parser output for this statement —
# confirm; it is not referenced by the cells visible here.
output_data = _load_json(output_data_path)

table_parser = TableParser(pdf_data, template)


In [3]:
# Take the config of the last rule listed in the template.
table_config = template["rules"][-1]["config"]

In [4]:
# Display the first column definition of the table config.
table_config["columns"][0]

{'field_name': 'date',
 'coordinates': {'top_left': {'x': 0.097, 'y': 0.488},
  'bottom_right': {'x': 0.149, 'y': 0.898}},
 'type': 'text'}

In [5]:
# Bounding-box coordinates of the first column in the table config.
coordinates = table_config["columns"][0]["coordinates"]

In [6]:
# Display the selected bounding box.
coordinates

{'top_left': {'x': 0.097, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

In [7]:
# Second entry (index 1) of the parsed PDF's "pages" list.
page_data = pdf_data["pages"][1]

In [8]:
# The page's "content" entry: the collection of items searched by bounding box below.
page_content = page_data["content"]

In [9]:
def get_items_in_bounding_box(page_data, coordinates):
    """Return the items of ``page_data`` that lie entirely inside ``coordinates``.

    Each item's ``bounding_box["decimal_coordinates"]`` is compared with the
    target box. An item is kept only when its top-left corner is at or beyond
    the target's top-left corner and its bottom-right corner is at or before
    the target's bottom-right corner, on both axes. Coinciding edges count as
    inside. The order of ``page_data`` is preserved.
    """

    def _is_contained(box):
        # Guard clauses: reject as soon as one edge fails its bound.
        if not (box["top_left"]["x"] >= coordinates["top_left"]["x"]):
            return False
        if not (box["top_left"]["y"] >= coordinates["top_left"]["y"]):
            return False
        if not (box["bottom_right"]["x"] <= coordinates["bottom_right"]["x"]):
            return False
        if not (box["bottom_right"]["y"] <= coordinates["bottom_right"]["y"]):
            return False
        return True

    return [
        entry
        for entry in page_data
        if _is_contained(entry["bounding_box"]["decimal_coordinates"])
    ]

def get_text_from_items(items):
    """Join the ``"text"`` value of every item, in order, with single spaces."""
    texts = [entry["text"] for entry in items]
    separator = " "
    return separator.join(texts)




In [10]:
# Collect the page items that lie fully inside the `coordinates` box.
items_within_coordinates = get_items_in_bounding_box(
    page_data=page_content,
    coordinates=coordinates,
)

In [11]:
# Display the matched items.
items_within_coordinates

[{'text': '01',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 415.55},
    'bottom_right': {'x': 68.03, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.493527},
    'bottom_right': {'x': 0.114336, 'y': 0.504216}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 415.55},
    'bottom_right': {'x': 85.96, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.493527},
    'bottom_right': {'x': 0.144471, 'y': 0.504216}}}},
 {'text': '02',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 518.35},
    'bottom_right': {'x': 68.03, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.615618},
    'bottom_right': {'x': 0.114336, 'y': 0.626306}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 518.35},
    'bottom_right': {'x': 85.96, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.615618},

In [12]:
# Join the matched items' text into a single space-separated string.
get_text_from_items(items_within_coordinates)

'01 Mar 02 Mar 06 Mar 08 Mar'

In [13]:
# y positions at which the column box is cut into consecutive bands.
line_separation_y_coordinates = [0.488, 0.519, 0.671, 0.834, 0.898]
coordinates = {
    'top_left': {'x': 0.097, 'y': 0.488},
    'bottom_right': {'x': 0.149, 'y': 0.898},
}

# Every band keeps the column's horizontal extent.
left_x = coordinates['top_left']['x']
right_x = coordinates['bottom_right']['x']

# Pair each separator with its successor: each pair gives the top and bottom
# edge of one band, so N separators yield N - 1 boxes.
partitioned_box = [
    {
        'top_left': {'x': left_x, 'y': upper_y},
        'bottom_right': {'x': right_x, 'y': lower_y},
    }
    for upper_y, lower_y in zip(
        line_separation_y_coordinates, line_separation_y_coordinates[1:]
    )
]

In [14]:
# Narrow the working box to the first partition.
coordinates = partitioned_box[0]
# NOTE(review): get_text_from_page is not defined in the cells visible here —
# presumably a planned helper; confirm it exists before re-enabling this call.
# get_text_from_page(page_content, coordinates)


The table parser works the same way as the form parser, except that line separators are used to derive the bounding boxes.

# TABLE SPLITTER

In [15]:
from pdfplumber import open as pdf_open
from pdf2image import convert_from_path
from PIL import Image, ImageDraw


pdf_path = "data/bank_statements/barclays/pdf/barclays March 2.pdf"

# Zero-based index of the page to inspect (the second page). Shared by the
# pdfplumber lookup and the rasterised-image lookup so both refer to the same page.
page_index = 1

# Read everything needed from the page while the PDF is still open, rather
# than touching the page object after the context manager has closed the file.
with pdf_open(pdf_path) as pdf:
    page = pdf.pages[page_index]
    lines = page.lines  # Line objects pdfplumber reports for this page
    pdf_width = page.width
    pdf_height = page.height

# Rasterise the PDF and keep the image of the same page.
images = convert_from_path(pdf_path)
jpg_image_original = images[page_index]

# NOTE(review): the file name says "apr" although the source PDF is the
# March 2 statement — confirm the intended output name.
jpg_image_original.save("barclays_apr_2.jpeg", "JPEG")

# Draw on a copy so jpg_image_original itself is left unmodified.
jpg_image_original_copy = jpg_image_original.copy()
draw = ImageDraw.Draw(jpg_image_original_copy)

# PDF units -> image pixels. Constant for the whole page, so computed once
# instead of once per line.
scale_x = jpg_image_original.width / pdf_width
scale_y = jpg_image_original.height / pdf_height
padding = 2  # pixels added on each side so a thin line becomes a visible box

# Outline every detected line in red.
for line in lines:
    x0 = line['x0'] * scale_x - padding
    y0 = line['top'] * scale_y - padding
    x1 = line['x1'] * scale_x + padding
    y1 = line['bottom'] * scale_y + padding
    draw.rectangle([x0, y0, x1, y1], outline="red", width=2)

# Open the annotated copy in the default image viewer (nothing is written to disk here).
jpg_image_original_copy.show()

In [16]:
# Splitter based on delimiter


In [17]:
# Display the matched items again.
items_within_coordinates

[{'text': '01',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 415.55},
    'bottom_right': {'x': 68.03, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.493527},
    'bottom_right': {'x': 0.114336, 'y': 0.504216}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 415.55},
    'bottom_right': {'x': 85.96, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.493527},
    'bottom_right': {'x': 0.144471, 'y': 0.504216}}}},
 {'text': '02',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 518.35},
    'bottom_right': {'x': 68.03, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.615618},
    'bottom_right': {'x': 0.114336, 'y': 0.626306}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 518.35},
    'bottom_right': {'x': 85.96, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.615618},

In [20]:
def extract_unique_y_values(items_within_coordinates):
    """Return the distinct top-left decimal ``y`` values of the items, ascending.

    Items that share the same ``y`` contribute a single entry.
    """
    distinct_y = set()
    for entry in items_within_coordinates:
        distinct_y.add(entry['bounding_box']['decimal_coordinates']['top_left']['y'])
    ordered_y = list(distinct_y)
    ordered_y.sort()
    return ordered_y

unique_y_values = extract_unique_y_values(items_within_coordinates)
print(unique_y_values)

# Start from a fresh copy of the page image and attach a drawing context to it.
jpg_image_original_copy = jpg_image_original.copy()
draw = ImageDraw.Draw(jpg_image_original_copy)

image_width, image_height = jpg_image_original.size

# Each y is treated as a fraction of the image height: scale it to pixels and
# draw a red horizontal line across the full image width at that height.
for y in unique_y_values:
    pixel_y = y * image_height
    line_start = (0, pixel_y)
    line_end = (image_width, pixel_y)
    draw.line([line_start, line_end], fill="red", width=5)

jpg_image_original_copy.show()

[0.493527, 0.615618, 0.65861, 0.853622]
