In [1]:
import json
from src.extractor import TextExtractor

pdf_path = "data/bank_statements/barclays_student/pdf/barclays_student_march.pdf"

# Read the raw PDF bytes; the handle is only needed for the read itself.
with open(pdf_path, "rb") as pdf_stream:
    pdf_bytes = pdf_stream.read()

# Run the text extraction on the in-memory bytes.
text_extractor = TextExtractor(pdf_bytes)
extracted_data = text_extractor.extract_text()

# Persist the extracted data as JSON for the later cells.
with open("src/pdf_data/barclays_student_marchpdf_data.json", "w") as json_out:
    json.dump(extracted_data, json_out)



In [2]:
import os
from typing import Dict, Any

template_path: str = os.path.join("src", "templates", "barclays_template.json")
pdf_data_path: str = os.path.join("src", "pdf_data", "barclays_march_2_pdf_data.json")

# NOTE(review): this loads the "barclays_march_2" data, not the
# "barclays_student_march" JSON written by the extraction cell - confirm
# that mixing the two statements is intended.

# Context managers guarantee each file handle is closed once it is parsed
# (a bare json.load(open(...)) leaves the handle open until garbage collection).
with open(template_path) as template_file:
    template: Dict[str, Any] = json.load(template_file)

with open(pdf_data_path) as pdf_data_file:
    pdf_data: Dict[str, Any] = json.load(pdf_data_file)

with open("src/outputs/barclays_march_2_output.json") as output_file:
    output_data = json.load(output_file)

In [3]:
# Take the config of the last rule in the template
# (presumably the table rule - TODO confirm against the template file).
table_config = template["rules"][-1]["config"]

In [4]:
# Inspect the first column definition of the table config.
table_config["columns"][0]

{'field_name': 'date',
 'coordinates': {'top_left': {'x': 0.095, 'y': 0.488},
  'bottom_right': {'x': 0.149, 'y': 0.898}},
 'type': 'text'}

In [5]:
# Bounding box (top_left / bottom_right corners) of the first table column.
coordinates = table_config["columns"][0]["coordinates"]

In [6]:
# Display the selected bounding box.
coordinates

{'top_left': {'x': 0.095, 'y': 0.488},
 'bottom_right': {'x': 0.149, 'y': 0.898}}

The table parser works the same way as the form parser, except that line separation is used to get the bounding boxes.

# TABLE SPLITTER

In [7]:
from pdfplumber import open as pdf_open
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import json

from src.parser import TableSplitter
from src.pdf_utils import ImageDrawer

def extract_lines_from_pdf(pdf_data, page_number):
    """Return the ``lines`` entry of one page of the extracted PDF data.

    ``page_number`` is used directly as the index into ``pdf_data["pages"]``.
    """
    page = pdf_data["pages"][page_number]
    return page["lines"]


def draw_lines_and_coordinates(image, coordinates, lines_y_coordinates):
    """Split a bounding box into horizontal bands and draw them on the image.

    The box described by ``coordinates`` is cut at every y value from
    ``lines_y_coordinates`` that lies inside ``[top_left.y, bottom_right.y]``
    (bounds inclusive). N such values produce N + 1 stacked boxes that all
    share the original x range. The boxes are handed to
    ``ImageDrawer.draw_coordinates``.

    Args:
        image: Image to draw on; ``image.size[0]`` and ``image.size[1]`` are
            forwarded to ``ImageDrawer``.
        coordinates: Dict with ``top_left`` and ``bottom_right`` entries, each
            holding ``x`` and ``y`` values.
        lines_y_coordinates: Iterable of separator y positions, in any order;
            values outside the box are ignored.

    Returns:
        Whatever ``ImageDrawer.draw_coordinates`` returns for the generated
        boxes.
    """
    image_drawer = ImageDrawer(image, image.size[0], image.size[1])
    left = coordinates["top_left"]["x"]
    right = coordinates["bottom_right"]["x"]
    top = coordinates["top_left"]["y"]
    bottom = coordinates["bottom_right"]["y"]

    # Sort the cuts so consecutive edges always describe a band running from
    # its upper edge to its lower edge, even if the separators arrive unordered.
    cuts = sorted(y for y in lines_y_coordinates if top <= y <= bottom)

    # Pair each edge with the next one: top -> cut1 -> ... -> cutN -> bottom.
    edges = [top, *cuts, bottom]
    boxes = [
        {
            "top_left": {"x": left, "y": upper},
            "bottom_right": {"x": right, "y": lower},
        }
        for upper, lower in zip(edges, edges[1:])
    ]

    return image_drawer.draw_coordinates(boxes)

def process_pdf_page_split(pdf_path, table_splitter, page_content, coordinates, delimiter_type, page_number,
                           output_path="output_with_lines_and_rectangles.jpeg"):
    """Split a table region, draw the resulting boxes on the page image, save and show it.

    Args:
        pdf_path: Path of the PDF, passed to ``ImageDrawer.create_jpg_image``.
        table_splitter: Object whose ``split_table(delimiter_type, page_content,
            coordinates)`` returns the separator y coordinates.
        page_content: Extracted content of the page, forwarded to ``split_table``.
        coordinates: Bounding box dict (``top_left`` / ``bottom_right``) of the
            region to split.
        delimiter_type: Splitting strategy name forwarded to ``split_table``.
        page_number: Page number forwarded to ``ImageDrawer.create_jpg_image``.
        output_path: Where the annotated JPEG is written. Defaults to the
            previously hard-coded file name; pass a distinct path per call to
            keep successive runs from overwriting each other.

    Returns:
        None. The annotated image is saved to ``output_path`` and shown.
    """
    lines_y_coordinates = table_splitter.split_table(delimiter_type, page_content, coordinates)
    jpg_image = ImageDrawer.create_jpg_image(pdf_path, page_number)
    modified_image = draw_lines_and_coordinates(jpg_image, coordinates, lines_y_coordinates)

    # Save and show the modified image
    modified_image.save(output_path, "JPEG")
    modified_image.show()

from src.parser import Parser

# Wire up the splitter for the loaded template.
parser = Parser()
table_splitter = TableSplitter(template, parser)

# The pages list is indexed with page_number - 1, while page_number itself
# is passed on unchanged to the drawing helper.
page_number = 2
page_content = pdf_data["pages"][page_number - 1]

# Run the same page through both splitting strategies.
# NOTE: each run saves to the same default image file, so the second run
# overwrites the first one's output.
for delimiter_type in ("line", "delimiter"):
    process_pdf_page_split(pdf_path, table_splitter, page_content, coordinates, delimiter_type, page_number)
