In [13]:
from PIL import Image
import io
import pytesseract


class ImageExtractor:
    """Crop a region out of an encoded image and run OCR on it.

    The region is described with fractional coordinates: every ``x`` value is
    multiplied by the image width and every ``y`` value by the image height to
    obtain pixel positions.
    """

    def __init__(self, jpg_bytes, decimal_coordinates):
        """Decode the image and remember the region of interest.

        Args:
            jpg_bytes: Encoded image data; it is wrapped in ``io.BytesIO`` and
                handed to ``Image.open``.
            decimal_coordinates: Mapping with ``"top_left"`` and
                ``"bottom_right"`` entries, each holding ``"x"`` and ``"y"``
                values that are scaled by the image size when cropping.
        """
        buffer = io.BytesIO(jpg_bytes)
        self.image = Image.open(buffer)
        self.coordinates = decimal_coordinates

    def extract_section(self):
        """Return the crop of ``self.image`` selected by ``self.coordinates``.

        Scaled positions are truncated with ``int()`` before cropping.
        """
        img_width, img_height = self.image.size

        # Assemble the crop box corner by corner: x first, then y, for the
        # top-left corner followed by the bottom-right corner.
        box = []
        for corner in ("top_left", "bottom_right"):
            point = self.coordinates[corner]
            box.append(int(point["x"] * img_width))
            box.append(int(point["y"] * img_height))

        return self.image.crop(tuple(box))

    def extract_text(self):
        """Crop the configured section and return pytesseract's text for it."""
        return pytesseract.image_to_string(self.extract_section())


In [14]:
# All imports for this cell are grouped at the top: stdlib first, then local.
import json
from io import BytesIO
from pathlib import Path
from typing import Any, Dict

from pdf_utils import ImageDrawer

# --- Configuration ----------------------------------------------------------
template_name: str = "halifax"
identifier: str = "april"

# Page argument handed to ImageDrawer.create_jpg_image.
# NOTE(review): whether this is 0- or 1-based depends on pdf_utils -- confirm.
page_number: int = 1

# Region to OCR, expressed as fractions of the page image's width (x) and
# height (y); ImageExtractor scales these to pixel positions.
coordinates = {
    "top_left": {
        "x": 0.086,
        "y": 0.122
    },
    "bottom_right": {
        "x": 0.313,
        "y": 0.135
    }
}

# --- Paths ------------------------------------------------------------------
# Everything is located relative to the current working directory.
root_directory = Path().resolve()

parent_directory = root_directory.parent
grandparent_directory = parent_directory.parent

pdf_path = grandparent_directory / "data" / "bank_statements" / "halifax" / "pdf" / f"{template_name}_{identifier}.pdf"
template_path: Path = parent_directory / "templates" / f"{template_name}_template.json"
pdf_data_path: Path = parent_directory / "pdf_data" / f"{template_name}_{identifier}_pdf_data.json"
output_data_path: Path = root_directory / "src" / "outputs" / f"{template_name}_{identifier}_output.json"

# --- Load JSON inputs -------------------------------------------------------
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(template_path, encoding="utf-8") as template_file:
    template: Dict[str, Any] = json.load(template_file)

with open(pdf_data_path, encoding="utf-8") as pdf_data_file:
    pdf_data: Dict[str, Any] = json.load(pdf_data_file)

# --- Render the page and OCR the region -------------------------------------
jpg_image = ImageDrawer.create_jpg_image(pdf_path, page_number)

# ImageExtractor expects encoded bytes, so serialise the rendered page to an
# in-memory JPEG first.
jpg_image_bytes = BytesIO()
jpg_image.save(jpg_image_bytes, format='JPEG')
jpg_image_bytes.seek(0)  # leave the buffer rewound for any later reader

# Create an instance of ImageExtractor with the image bytes and coordinates
image_extractor = ImageExtractor(jpg_image_bytes.getvalue(), coordinates)
extracted_text = image_extractor.extract_text()

# Show the OCR result
print(extracted_text)



Mr Jake Holmes

