In [1]:
import json
from src.extractor import TextExtractor

pdf_path = "data/bank_statements/barclays/pdf/barclays March 2.pdf"

with open(pdf_path, "rb") as pdf_file:
    text_extractor = TextExtractor(pdf_file.read())

    # Extract data from the PDF file
    extracted_data = text_extractor.extract_text()

    with open("src/pdf_data/barclays_march_2_pdf_data.json", "w") as f:
        json.dump(extracted_data, f)



In [2]:
import os
from typing import Dict, Any

template_path: str = os.path.join("src", "templates", "barclays_template.json")
pdf_data_path: str = os.path.join("src", "pdf_data", "barclays_march_2_pdf_data.json")

template: Dict[str, Any] = json.load(open(template_path))
pdf_data: Dict[str, Any] = json.load(open(pdf_data_path))

output_data = json.load(open("src/outputs/barclays_march_2_output.json"))

In [3]:
pdf_data["pages"][1]["lines"]

[{'decimal_coordinates': {'top_left': {'x': 0.482689, 'y': 0.95867},
   'bottom_right': {'x': 0.484706, 'y': 0.95867}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.97304},
   'bottom_right': {'x': 0.953277, 'y': 0.97304}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.935036},
   'bottom_right': {'x': 0.953277, 'y': 0.935036}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.759739},
   'bottom_right': {'x': 0.946555, 'y': 0.759739}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinat

In [4]:
pdf_data["dimensions"]

{'width': 595, 'height': 842}

In [5]:
table_config = template["rules"][-1]["config"]

In [6]:
table_config["columns"][0]

{'field_name': 'date',
 'coordinates': {'top_left': {'x': 0.095, 'y': 0.488},
  'bottom_right': {'x': 0.149, 'y': 0.898}},
 'type': 'text'}

In [7]:
coordinates = table_config["columns"][0]["coordinates"]

In [8]:
coordinates

{'top_left': {'x': 0.095, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

In [9]:
page_content = pdf_data["pages"][1]["content"]

In [10]:
from src.parser import Parser

parser = Parser()

items_within_coordinates = parser.get_items_in_bounding_box(
    page_content, coordinates
)

In [11]:
items_within_coordinates

[{'text': '01',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 415.55},
    'bottom_right': {'x': 68.03, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.493527},
    'bottom_right': {'x': 0.114336, 'y': 0.504216}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 415.55},
    'bottom_right': {'x': 85.96, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.493527},
    'bottom_right': {'x': 0.144471, 'y': 0.504216}}}},
 {'text': '02',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 518.35},
    'bottom_right': {'x': 68.03, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.615618},
    'bottom_right': {'x': 0.114336, 'y': 0.626306}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 518.35},
    'bottom_right': {'x': 85.96, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.615618},

In [12]:
page_content

[{'text': 'Statement',
  'bounding_box': {'coordinates': {'top_left': {'x': 294.8, 'y': 25.0},
    'bottom_right': {'x': 330.7, 'y': 33.0}},
   'decimal_coordinates': {'top_left': {'x': 0.495462, 'y': 0.029691},
    'bottom_right': {'x': 0.555798, 'y': 0.039192}}}},
 {'text': 'date',
  'bounding_box': {'coordinates': {'top_left': {'x': 332.66, 'y': 25.0},
    'bottom_right': {'x': 347.9, 'y': 33.0}},
   'decimal_coordinates': {'top_left': {'x': 0.559092, 'y': 0.029691},
    'bottom_right': {'x': 0.584706, 'y': 0.039192}}}},
 {'text': '31',
  'bounding_box': {'coordinates': {'top_left': {'x': 349.86, 'y': 25.0},
    'bottom_right': {'x': 358.78, 'y': 33.0}},
   'decimal_coordinates': {'top_left': {'x': 0.588, 'y': 0.029691},
    'bottom_right': {'x': 0.602992, 'y': 0.039192}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 360.74, 'y': 25.0},
    'bottom_right': {'x': 374.72, 'y': 33.0}},
   'decimal_coordinates': {'top_left': {'x': 0.606286, 'y': 0.029691},
   

TABLE PARSER IS THE SAME AS FORM PARSER BUT WE USE LINE SEPARATION TO GET THE BOUNDING BOXES

# TABLE SPLITTER

In [13]:
from pdfplumber import open as pdf_open
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import json

from src.parser import Parser

class TableSplitter:
    def __init__(self, template, parser):
        self.template = template
        self.parser = parser

    def split_table_by_delimiter(self, page_content, coordinates):
        items_within_coordinates = self.parser.get_items_in_bounding_box(
            page_content, coordinates
        )

        line_separation_y_coordinates = {
            item["bounding_box"]["decimal_coordinates"]["top_left"]["y"]
            for item in items_within_coordinates
        }
        return sorted(list(set(line_separation_y_coordinates)))

    def split_table_by_line(self, lines, coordinates):
        
        min_y = 1 - coordinates["top_left"]["y"]
        max_y = 1 - coordinates["bottom_right"]["y"]

        min_x = coordinates["top_left"]["x"] - 0.01
        max_x = coordinates["bottom_right"]["x"] + 0.01

        line_separation_y_coordinates = []

        print(f"min_x: {min_x}, max_x: {max_x}, min_y: {min_y}, max_y: {max_y}")

        for line in lines:
            x0 = line["decimal_coordinates"]["top_left"]["x"]
            x1 = line["decimal_coordinates"]["bottom_right"]["x"]
            y0 = line["decimal_coordinates"]["top_left"]["y"]
            y1 = line["decimal_coordinates"]["bottom_right"]["y"]

            if x0 > 0 and x1 < 0.2:
                print(f"x0: {x0}, x1: {x1}, y0: {y0}, y1: {y1}")

            if x0 >= min_x and x1 <= max_x and y0 >= min_y and y1 <= max_y:
                line_separation_y_coordinates.append(y0)

        return sorted(list(set(line_separation_y_coordinates)))

    def split_table(self, row_delimiter_type: str, page_content, coordinates):
        if row_delimiter_type == "line":
            return self.split_table_by_line(page_content, coordinates)
        elif row_delimiter_type == "delimiter":
            return self.split_table_by_delimiter(page_content, coordinates)

from src.pdf_utils import ImageDrawer

def get_vertical_lines(lines):
    return sorted(list(set([line["decimal_coordinates"]["top_left"]["y"] for line in lines])))

extracted_lines = pdf_data["pages"][1]["lines"]

table_splitter = TableSplitter(template, parser)

lines_y_coordinates = table_splitter.split_table_by_line(extracted_lines, coordinates)

# lines_y_coordinates = get_vertical_lines(extracted_lines)

print(f"Extracted {len(extracted_lines)} lines from PDF")

jpg_image = ImageDrawer(None, None, None).create_jpg_image(
    pdf_path
)

print("\nDrawing lines and rectangles...")
image_drawer = ImageDrawer(
    jpg_image,
    jpg_image.size[0],  # Use the image width
    jpg_image.size[1],  # Use the image height
)

# Draw both the coordinates (rectangles) and the horizontal lines
modified_image = image_drawer.draw_coordinates([coordinates])
modified_image = image_drawer.draw_horizontal_lines(lines_y_coordinates)

# Save and show the modified image
modified_image.save("output_with_lines_and_rectangles.jpeg", "JPEG")
modified_image.show()

print("\nProcess completed successfully!")
print("Check 'output_with_lines_and_rectangles.jpeg' for the result")

 min_x: 0.085, max_x: 0.159, min_y: 0.512, max_y: 0.10199999999999998
x0: 0.095126, x1: 0.152605, y0: 0.512708, y1: 0.512708
x0: 0.152269, x1: 0.181176, y0: 0.496318, y1: 0.496318
x0: 0.152269, x1: 0.181176, y0: 0.45677, y1: 0.45677
x0: 0.152269, x1: 0.181176, y0: 0.412233, y1: 0.412233
x0: 0.095126, x1: 0.152605, y0: 0.390618, y1: 0.390618
x0: 0.152269, x1: 0.181176, y0: 0.36924, y1: 0.36924
x0: 0.095126, x1: 0.152605, y0: 0.347625, y1: 0.347625
x0: 0.152269, x1: 0.181176, y0: 0.331235, y1: 0.331235
x0: 0.152269, x1: 0.181176, y0: 0.29323, y1: 0.29323
x0: 0.152269, x1: 0.181176, y0: 0.250238, y1: 0.250238
x0: 0.152269, x1: 0.181176, y0: 0.212233, y1: 0.212233
x0: 0.152269, x1: 0.181176, y0: 0.174228, y1: 0.174228
x0: 0.095126, x1: 0.152605, y0: 0.152613, y1: 0.152613
x0: 0.152269, x1: 0.181176, y0: 0.136223, y1: 0.136223
x0: 0.151261, x1: 0.153613, y0: 0.32791, y1: 0.32791
Extracted 86 lines from PDF
Image dimensions: 1653x2339

Drawing lines and rectangles...
Successfully drew 0 line

In [14]:
sorted_extracted_lines = sorted(
    [item for item in extracted_lines if 0.4 < item["decimal_coordinates"]["top_left"]["y"] < 0.6],
    key=lambda item: (item["decimal_coordinates"]["top_left"]["y"], item["decimal_coordinates"]["top_left"]["x"])
)


In [15]:
coordinates

{'top_left': {'x': 0.095, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

In [16]:
sorted_extracted_lines

[{'decimal_coordinates': {'top_left': {'x': 0.152269, 'y': 0.412233},
   'bottom_right': {'x': 0.181176, 'y': 0.412233}}},
 {'decimal_coordinates': {'top_left': {'x': 0.18084, 'y': 0.412233},
   'bottom_right': {'x': 0.423529, 'y': 0.412233}}},
 {'decimal_coordinates': {'top_left': {'x': 0.423193, 'y': 0.412233},
   'bottom_right': {'x': 0.514622, 'y': 0.412233}}},
 {'decimal_coordinates': {'top_left': {'x': 0.514286, 'y': 0.412233},
   'bottom_right': {'x': 0.605042, 'y': 0.412233}}},
 {'decimal_coordinates': {'top_left': {'x': 0.604706, 'y': 0.412233},
   'bottom_right': {'x': 0.695462, 'y': 0.412233}}},
 {'decimal_coordinates': {'top_left': {'x': 0.152269, 'y': 0.43361},
   'bottom_right': {'x': 0.695462, 'y': 0.43361}}},
 {'decimal_coordinates': {'top_left': {'x': 0.152269, 'y': 0.45677},
   'bottom_right': {'x': 0.181176, 'y': 0.45677}}},
 {'decimal_coordinates': {'top_left': {'x': 0.18084, 'y': 0.45677},
   'bottom_right': {'x': 0.423529, 'y': 0.45677}}},
 {'decimal_coordinates':

In [17]:
lines_y_coordinates

[]

In [18]:
pdf_data["pages"][1]["lines"]

[{'decimal_coordinates': {'top_left': {'x': 0.482689, 'y': 0.95867},
   'bottom_right': {'x': 0.484706, 'y': 0.95867}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.97304},
   'bottom_right': {'x': 0.953277, 'y': 0.97304}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.935036},
   'bottom_right': {'x': 0.953277, 'y': 0.935036}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.759739},
   'bottom_right': {'x': 0.946555, 'y': 0.759739}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinat

In [19]:
pdf_data["pages"][1]

{'page_number': 2,
 'content': [{'text': 'Statement',
   'bounding_box': {'coordinates': {'top_left': {'x': 294.8, 'y': 25.0},
     'bottom_right': {'x': 330.7, 'y': 33.0}},
    'decimal_coordinates': {'top_left': {'x': 0.495462, 'y': 0.029691},
     'bottom_right': {'x': 0.555798, 'y': 0.039192}}}},
  {'text': 'date',
   'bounding_box': {'coordinates': {'top_left': {'x': 332.66, 'y': 25.0},
     'bottom_right': {'x': 347.9, 'y': 33.0}},
    'decimal_coordinates': {'top_left': {'x': 0.559092, 'y': 0.029691},
     'bottom_right': {'x': 0.584706, 'y': 0.039192}}}},
  {'text': '31',
   'bounding_box': {'coordinates': {'top_left': {'x': 349.86, 'y': 25.0},
     'bottom_right': {'x': 358.78, 'y': 33.0}},
    'decimal_coordinates': {'top_left': {'x': 0.588, 'y': 0.029691},
     'bottom_right': {'x': 0.602992, 'y': 0.039192}}}},
  {'text': 'Mar',
   'bounding_box': {'coordinates': {'top_left': {'x': 360.74, 'y': 25.0},
     'bottom_right': {'x': 374.72, 'y': 33.0}},
    'decimal_coordinates':

In [20]:
lines_y_coordinates

[]

In [21]:
coordinates

{'top_left': {'x': 0.095, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

In [22]:
pdf_data["pages"][1]["lines"]

[{'decimal_coordinates': {'top_left': {'x': 0.482689, 'y': 0.95867},
   'bottom_right': {'x': 0.484706, 'y': 0.95867}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.97304},
   'bottom_right': {'x': 0.953277, 'y': 0.97304}}},
 {'decimal_coordinates': {'top_left': {'x': 0.729076, 'y': 0.935036},
   'bottom_right': {'x': 0.953277, 'y': 0.935036}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.795487},
   'bottom_right': {'x': 0.946555, 'y': 0.795487}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.759739},
   'bottom_right': {'x': 0.946555, 'y': 0.759739}}},
 {'decimal_coordinates': {'top_left': {'x': 0.736471, 'y': 0.777553},
   'bottom_right': {'x': 0.946555, 'y': 0.777553}}},
 {'decimal_coordinat

In [23]:
# Splitter based on delimiter
y_coordinates_from_delimiter = table_splitter.split_table_by_delimiter(page_content=page_content, coordinates=coordinates)

In [24]:
y_coordinates_from_delimiter

[0.493527, 0.615618, 0.65861, 0.853622]

In [25]:
items_within_coordinates

[{'text': '01',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 415.55},
    'bottom_right': {'x': 68.03, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.493527},
    'bottom_right': {'x': 0.114336, 'y': 0.504216}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 415.55},
    'bottom_right': {'x': 85.96, 'y': 424.55}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.493527},
    'bottom_right': {'x': 0.144471, 'y': 0.504216}}}},
 {'text': '02',
  'bounding_box': {'coordinates': {'top_left': {'x': 58.0, 'y': 518.35},
    'bottom_right': {'x': 68.03, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.097479, 'y': 0.615618},
    'bottom_right': {'x': 0.114336, 'y': 0.626306}}}},
 {'text': 'Mar',
  'bounding_box': {'coordinates': {'top_left': {'x': 70.23, 'y': 518.35},
    'bottom_right': {'x': 85.96, 'y': 527.35}},
   'decimal_coordinates': {'top_left': {'x': 0.118034, 'y': 0.615618},

In [26]:
# def extract_unique_y_values(items_within_coordinates):
#     y_values = {item['bounding_box']['decimal_coordinates']['top_left']['y'] for item in items_within_coordinates}
#     return sorted(y_values)

# unique_y_values = extract_unique_y_values(items_within_coordinates)

# print(unique_y_values)

# jpg_image_original_copy = jpg_image_original.copy()

# draw = ImageDraw.Draw(jpg_image_original_copy)

# # Ensure the lines are drawn at the correct height
# for y in unique_y_values:
#     # Calculate the y position in pixels
#     pixel_y = y * (jpg_image_original.height )
#     draw.line([(0, pixel_y), (jpg_image_original.width, pixel_y)], fill="red", width=5)

# jpg_image_original_copy.show()

In [27]:
# Debugging

In [28]:
import pdfplumber
from PIL import ImageDraw

# Open the PDF file
with pdfplumber.open(pdf_path) as pdf:
    # Get the second page
    second_page = pdf.pages[1]
    # Extract the lines from the second page
    lines = second_page.lines

    # Create a copy of the original image to draw on
    jpg_image_original_copy = jpg_image_original.copy()
    draw = ImageDraw.Draw(jpg_image_original_copy)

    # Get the dimensions of the original image
    img_width, img_height = jpg_image_original_copy.size

    # Draw the lines on the image with scaling
    for line in lines:
        # Check if the line has valid coordinates
        if 'x0' in line and 'y0' in line and 'x1' in line and 'y1' in line:
            # Scale the coordinates based on the image dimensions
            scaled_x0 = line['x0'] * img_width
            scaled_y0 = line['y0'] * img_height
            scaled_x1 = line['x1'] * img_width
            scaled_y1 = line['y1'] * img_height
            draw.line([(scaled_x0, scaled_y0), (scaled_x1, scaled_y1)], fill="red", width=5)

# Show the modified image
jpg_image_original_copy.show()

NameError: name 'jpg_image_original' is not defined