In [20]:
import json
from src.extractor import TextExtractor

pdf_path = "data/bank_statements/barclays/pdf/barclays March 2.pdf"

# Read the raw bytes first so the PDF handle is closed before extraction and
# before the JSON file is opened; TextExtractor only ever receives the bytes.
with open(pdf_path, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

# Extract data from the PDF file
text_extractor = TextExtractor(pdf_bytes)
extracted_data = text_extractor.extract_text()

# Persist the extraction result as JSON (the same file is reloaded from
# src/pdf_data further down in this notebook).
with open("src/pdf_data/barclays_march_2_pdf_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f)



In [21]:
import os
from typing import Dict, Any

def _load_json(path: str) -> Any:
    """Parse the JSON file at ``path``, closing the file handle before returning.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


template_path: str = os.path.join("src", "templates", "barclays_template.json")
pdf_data_path: str = os.path.join("src", "pdf_data", "barclays_march_2_pdf_data.json")

# Load through the helper so no file handle is left open
# (the previous `json.load(open(...))` form never closed them explicitly).
template: Dict[str, Any] = _load_json(template_path)
pdf_data: Dict[str, Any] = _load_json(pdf_data_path)

output_data = _load_json("src/outputs/barclays_march_2_output.json")

In [22]:
# Config of the last rule in the template — presumably the table rule;
# TODO confirm against barclays_template.json.
table_config = template["rules"][-1]["config"]

In [23]:
# Inspect the first column definition of the table config.
table_config["columns"][0]

{'field_name': 'date',
 'coordinates': {'top_left': {'x': 0.095, 'y': 0.488},
  'bottom_right': {'x': 0.149, 'y': 0.898}},
 'type': 'text'}

In [24]:
# Bounding box (top_left / bottom_right) of the first table column.
# NOTE(review): values look like page-relative fractions in [0, 1] — confirm.
coordinates = table_config["columns"][0]["coordinates"]

In [25]:
# Display the selected bounding box.
coordinates

{'top_left': {'x': 0.095, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

In [26]:
# "content" entry of the page at index 1 (the second page if `pages` is a
# zero-indexed list — TODO confirm against the extractor output).
page_content = pdf_data["pages"][1]["content"]

In [27]:
from src.parser import Parser

parser = Parser()

# Ask the parser for the page items for this bounding box — presumably the
# items located inside the first column's box; verify in src.parser.
items_within_coordinates = parser.get_items_in_bounding_box(
    page_content, coordinates
)

The table parser works the same way as the form parser, except that it uses line separation to get the bounding boxes.

# TABLE SPLITTER

In [28]:
from pdfplumber import open as pdf_open
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import json

from src.parser import TableSplitter
from src.pdf_utils import ImageDrawer

def extract_lines_from_pdf(pdf_data, page_number):
    """Return the ``"lines"`` entry stored for one page of the extracted PDF data.

    Args:
        pdf_data: Mapping that holds a ``"pages"`` sequence.
        page_number: Index into ``pdf_data["pages"]``.

    Returns:
        The value stored under that page's ``"lines"`` key.
    """
    selected_page = pdf_data["pages"][page_number]
    return selected_page["lines"]


def draw_lines_and_coordinates(image, coordinates, lines_y_coordinates):
    """Draw horizontal lines on ``image`` through ``ImageDrawer``.

    Args:
        image: Image object exposing ``size`` with width at index 0 and
            height at index 1.
        coordinates: Bounding box. Currently unused: only the horizontal
            lines are drawn, the bounding box itself is not.
        lines_y_coordinates: Y positions forwarded to
            ``ImageDrawer.draw_horizontal_lines``.

    Returns:
        Whatever ``ImageDrawer.draw_horizontal_lines`` returns.
    """
    width, height = image.size[0], image.size[1]
    drawer = ImageDrawer(image, width, height)
    return drawer.draw_horizontal_lines(lines_y_coordinates)

def process_pdf_page(
    pdf_data,
    pdf_path,
    template,
    parser,
    coordinates,
    page_number=1,
    output_path="output_with_lines_and_rectangles.jpeg",
):
    """Compute table split lines for one page and save an image with them drawn.

    Args:
        pdf_data: Extracted PDF data; ``pdf_data["pages"][page_number]`` is
            handed to ``TableSplitter.split_table``.
        pdf_path: Path of the source PDF, passed to
            ``ImageDrawer.create_jpg_image``.
        template: Template passed to the ``TableSplitter`` constructor.
        parser: Parser passed to the ``TableSplitter`` constructor.
        coordinates: Bounding box forwarded to the splitter and to
            ``draw_lines_and_coordinates``.
        page_number: Index into ``pdf_data["pages"]``. Defaults to 1, the
            index that used to be hard-coded.
        output_path: Where the annotated JPEG is saved. Defaults to the
            previously fixed file name.

    Returns:
        None. The annotated image is saved to ``output_path`` and shown.

    NOTE(review): ``ImageDrawer.create_jpg_image`` only receives ``pdf_path``,
    so which page it renders cannot be seen from here. Confirm that the
    rendered image matches ``page_number`` before using a non-default value.
    """
    table_splitter = TableSplitter(template, parser)
    page = pdf_data["pages"][page_number]

    lines_y_coordinates = table_splitter.split_table("line", page, coordinates)

    print(lines_y_coordinates)

    jpg_image = ImageDrawer.create_jpg_image(pdf_path)

    print("\nDrawing lines and rectangles...")
    modified_image = draw_lines_and_coordinates(jpg_image, coordinates, lines_y_coordinates)

    # Save and show the modified image
    modified_image.save(output_path, "JPEG")
    modified_image.show()

    print("\nProcess completed successfully!")
    print(f"Check '{output_path}' for the result")

# Run the split-and-draw pipeline on the objects prepared in the cells above.
process_pdf_page(
    pdf_data=pdf_data,
    pdf_path=pdf_path,
    template=template,
    parser=parser,
    coordinates=coordinates,
)

[0.48729199999999995, 0.609382, 0.6523749999999999, 0.6720900000000001, 0.847387]
Image dimensions: 1653x2339

Drawing lines and rectangles...
Successfully drew 5 lines

Process completed successfully!
Check 'output_with_lines_and_rectangles.jpeg' for the result
