In [1]:
from typing import List, Dict


class Parser:
    """Resolve template page specifications and extract text from PDF data."""

    def page_number_converter(
        self, page_numbers: str, number_of_pages: int
    ) -> List[int]:
        """Convert a page specification string into zero-based page indexes.

        ``page_numbers`` is either a single integer (``"3"``) or an inclusive
        ``"first:last"`` range (``"2:5"``).  Positive values are one-based
        page numbers; negative values count back from the end of the
        document (``-1`` is the last page).

        Args:
            page_numbers: Single page number or ``"first:last"`` range.
            number_of_pages: Total page count, used to resolve negatives.

        Returns:
            Zero-based page indexes.  A range whose first page lies after
            its last page yields an empty list.

        Raises:
            ValueError: If a part of ``page_numbers`` is not an integer.
        """
        if ":" not in page_numbers:
            index = int(page_numbers)
            # NOTE(review): a literal "0" maps to -1 here; kept as-is for
            # backward compatibility -- confirm whether "0" should be rejected.
            return [index - 1 if index >= 0 else number_of_pages + index]

        # Only the first two ":"-separated parts are used.
        first_text, last_text = page_numbers.split(":")[:2]
        first_page = int(first_text)
        last_page = int(last_text)

        first_index = self._range_bound_to_index(first_page, number_of_pages)
        last_index = self._range_bound_to_index(last_page, number_of_pages)

        # Both bounds are inclusive, so "2:2" is page 2 and "1:-1" is every page.
        return list(range(first_index, last_index + 1))

    @staticmethod
    def _range_bound_to_index(page_number: int, number_of_pages: int) -> int:
        """Map one bound of a page range to a zero-based index."""
        if page_number > 0:
            return page_number - 1
        if page_number < 0:
            return number_of_pages + page_number
        # A bound of 0 is treated as the first page.
        return 0

    def get_rule_from_id(self, rule_id, template):
        """Return the first rule in ``template["rules"]`` whose id matches.

        Raises:
            IndexError: If no rule has ``rule_id``.
        """
        matching_rules = [
            rule for rule in template["rules"] if rule["rule_id"] == rule_id
        ]
        return matching_rules[0]

    def get_items_in_bounding_box(
        self, text_coordinates, box_coordinates, threshold=0.01
    ):
        """Return the items whose bounding box lies inside ``box_coordinates``.

        An item is kept when its top-left corner is not above/left of the
        box's top-left corner and its bottom-right corner is not below/right
        of the box's bottom-right corner, each with ``threshold`` of slack.

        Args:
            text_coordinates: Iterable of items carrying
                ``item["bounding_box"]["decimal_coordinates"]``.
            box_coordinates: Dict with ``top_left`` / ``bottom_right`` points.
            threshold: Tolerance added around the box on every side.

        Returns:
            List of the matching items, in input order.
        """

        def is_inside(item):
            item_box = item["bounding_box"]["decimal_coordinates"]
            return (
                item_box["top_left"]["x"]
                >= box_coordinates["top_left"]["x"] - threshold
                and item_box["top_left"]["y"]
                >= box_coordinates["top_left"]["y"] - threshold
                and item_box["bottom_right"]["x"]
                <= box_coordinates["bottom_right"]["x"] + threshold
                and item_box["bottom_right"]["y"]
                <= box_coordinates["bottom_right"]["y"] + threshold
            )

        return [item for item in text_coordinates if is_inside(item)]

    def get_text_from_items(self, items):
        """Join the ``"text"`` of every item with single spaces."""
        return " ".join(item["text"] for item in items)

    def get_text_from_page(self, page_content, coordinates):
        """Return the space-joined text of the items inside ``coordinates``."""
        return self.get_text_from_items(
            self.get_items_in_bounding_box(page_content, coordinates)
        )

    def get_output_data_from_form_rule(
        self, form_rule_id, page_index, pdf_data, template
    ):
        """Extract one form field from a page.

        Looks up the rule, reads its configured coordinates and returns a
        single-entry dict mapping the rule's ``field_name`` to the text found
        inside those coordinates on page ``page_index``.

        Raises:
            IndexError: If the rule id is unknown or ``page_index`` is out of
                range for ``pdf_data["pages"]``.
        """
        rule_config = self.get_rule_from_id(form_rule_id, template)["config"]
        coordinates = rule_config["coordinates"]
        page_content = pdf_data["pages"][page_index]["content"]
        return {
            rule_config["field_name"]: self.get_text_from_page(
                page_content, coordinates
            )
        }

    def get_output_data_from_table_rule(
        self, table_rule_id, page_index, pdf_data, template
    ):
        """Extract table data, producing one dict per processed table result.

        Every result returned by ``TableProcessor.process_tables`` becomes a
        dict mapping each column's ``field_name`` to the text found in the
        first split box of that column on page ``page_index``.

        NOTE(review): ``table_rule_id`` is not used to filter the results, and
        only the first split box (``split_boxes[0]``) of each column is read
        -- confirm this is intended.
        """
        table_processor = TableProcessor(template, self)
        table_splitter = TableSplitter(template, self)

        data = []
        for result in table_processor.process_tables(pdf_data):
            row_data = {}
            for column in result["columns"]:
                split_boxes = table_splitter.split_bounding_box_by_lines(
                    column["coordinates"], column["lines_y_coordinates"]
                )
                row_data[column["field_name"]] = self.get_text_from_page(
                    pdf_data["pages"][page_index]["content"], split_boxes[0]
                )
            data.append(row_data)

        return data


class TableProcessor:
    """Collect per-column row-split information for the tables of a template."""

    def __init__(self, template, parser):
        """Store the template and the parser used for lookups.

        Args:
            template: Template dict with ``"rules"`` and ``"pages"`` entries.
            parser: Object providing ``page_number_converter`` and
                ``get_rule_from_id``; it is also handed to ``TableSplitter``.
        """
        self.template = template
        self.parser = parser

    def get_delimiter_column_coordinates(self, template, delimiter_field_name):
        """Return the coordinates of the delimiter column from the template.

        All rules of type ``"table"`` are searched in order and the first
        column whose ``field_name`` equals ``delimiter_field_name`` wins.

        Raises:
            ValueError: If no such column exists or its coordinates are empty.
        """
        matching_coordinates = (
            column["coordinates"]
            for rule in template["rules"]
            if rule["type"] == "table"
            for column in rule["config"]["columns"]
            if column["field_name"] == delimiter_field_name
        )
        delimiter_coordinates = next(matching_coordinates, None)

        if not delimiter_coordinates:
            raise ValueError("Description column coordinates not found in template")

        return delimiter_coordinates

    def process_table_data(
        self,
        table_rule: Dict,
        page_content: Dict,
        delimiter_field_name: str,
        delimiter_type: str,
    ) -> List[Dict]:
        """Process a single table's data.

        Args:
            table_rule: Table rule whose ``config["columns"]`` are processed.
            page_content: Page dict passed on to ``TableSplitter.split_table``.
            delimiter_field_name: Field name of the row-delimiter column.
            delimiter_type: Either ``"line"`` or ``"field"``.

        Returns:
            One dict per column with its ``field_name``, ``coordinates`` and
            the shared ``lines_y_coordinates`` computed for the page.

        Raises:
            ValueError: If ``delimiter_type`` is unsupported or the delimiter
                column cannot be found in the template.
        """
        if delimiter_type not in ("line", "field"):
            raise ValueError(f"Unsupported row delimiter type: {delimiter_type!r}")

        # The delimiter column is resolved for both delimiter types, so a
        # template without that column is rejected even for "line" tables.
        delimiter_coordinates = self.get_delimiter_column_coordinates(
            self.template, delimiter_field_name
        )

        table_splitter = TableSplitter(self.template, self.parser)
        if delimiter_type == "line":
            lines_y_coordinates = table_splitter.split_table(
                delimiter_type, page_content
            )
        else:
            lines_y_coordinates = table_splitter.split_table(
                delimiter_type, page_content, delimiter_coordinates
            )

        return [
            {
                "field_name": column["field_name"],
                "coordinates": column["coordinates"],
                "lines_y_coordinates": lines_y_coordinates,
            }
            for column in table_rule["config"]["columns"]
        ]

    def process_tables(self, pdf_data: Dict) -> List[Dict]:
        """Process all tables according to template pages.

        Page rules without a non-empty ``"tables"`` entry are skipped.  For
        the others, every page selected by ``"page_numbers"`` is processed.

        Returns:
            The concatenated results of ``process_page_tables``.
        """
        results = []
        for page_rule in self.template["pages"]:
            if not page_rule.get("tables"):
                continue

            page_indexes = self.parser.page_number_converter(
                page_rule["page_numbers"], len(pdf_data["pages"])
            )
            for page_index in page_indexes:
                page_content = pdf_data["pages"][page_index]
                results.extend(self.process_page_tables(page_rule, page_content))

        return results

    def process_page_tables(self, page_rule: Dict, page_content: Dict) -> List[Dict]:
        """Process tables for a specific page.

        Returns:
            One dict (``rule_id``, ``page_number``, ``columns``) per table
            rule for which at least one column has non-empty
            ``lines_y_coordinates``.
        """
        results = []
        for rule_id in page_rule["tables"]:
            table_rule = self.parser.get_rule_from_id(rule_id, self.template)
            row_delimiter = table_rule["config"]["row_delimiter"]
            delimiter_field_name = row_delimiter["field_name"]
            delimiter_type = row_delimiter["type"]

            processed_columns = self.process_table_data(
                table_rule,
                page_content,
                delimiter_field_name,
                delimiter_type,
            )

            # Tables for which no row separators were found are left out.
            has_row_lines = any(
                column["lines_y_coordinates"] for column in processed_columns
            )
            if processed_columns and has_row_lines:
                results.append(
                    {
                        "rule_id": rule_id,
                        "page_number": page_content["page_number"],
                        "columns": processed_columns,
                    }
                )
        return results


class TableSplitter:
    """Find row separators on a page and cut bounding boxes along them."""

    def __init__(self, template, parser):
        """Keep references to the template and the parser."""
        self.template = template
        self.parser = parser

    def split_bounding_box_by_lines(
        self, bounding_box: Dict, lines_y_coordinates: List[float]
    ) -> List[Dict]:
        """Cut ``bounding_box`` horizontally at the given y-coordinates.

        Only y-values strictly between the box's top and bottom are used.
        The returned boxes keep the original x-extent and are ordered from
        top to bottom; an empty list is returned when the box's top is not
        above its bottom.
        """
        left_x = bounding_box["top_left"]["x"]
        top_y = bounding_box["top_left"]["y"]
        right_x = bounding_box["bottom_right"]["x"]
        bottom_y = bounding_box["bottom_right"]["y"]

        def make_box(upper_y, lower_y):
            return {
                "top_left": {"x": left_x, "y": upper_y},
                "bottom_right": {"x": right_x, "y": lower_y},
            }

        # Separators that actually fall inside the box, top to bottom.
        cuts = [y for y in sorted(lines_y_coordinates) if top_y < y < bottom_y]

        # Each cut closes the segment that started at the previous edge.
        upper_edges = [top_y] + cuts
        segments = [make_box(upper, lower) for upper, lower in zip(upper_edges, cuts)]

        # Remaining area between the last edge and the bottom of the box.
        last_edge = upper_edges[-1]
        if last_edge < bottom_y:
            segments.append(make_box(last_edge, bottom_y))

        return segments

    def filter_lines_by_pixel_value(self, lines, max_pixel_value=(100, 100, 100)):
        """Keep lines whose average colour is below the limit on every channel.

        Lines without an ``"average_pixel_value"`` entry are dropped.
        """

        def is_below_limit(line):
            if "average_pixel_value" not in line:
                return False
            red, green, blue = line["average_pixel_value"]
            red_limit, green_limit, blue_limit = max_pixel_value
            return red < red_limit and green < green_limit and blue < blue_limit

        return [line for line in lines if is_below_limit(line)]

    def split_table_by_delimiter(self, page_content, coordinates):
        """Return the sorted, de-duplicated top y-values of items in ``coordinates``.

        The items are taken from ``page_content["content"]`` and filtered
        with the parser's ``get_items_in_bounding_box``.
        """
        delimiter_items = self.parser.get_items_in_bounding_box(
            page_content["content"], coordinates
        )
        unique_tops = {
            item["bounding_box"]["decimal_coordinates"]["top_left"]["y"]
            for item in delimiter_items
        }
        return sorted(unique_tops)

    def split_table_by_line(self, lines):
        """Return the sorted, de-duplicated top y-values of the filtered lines."""
        unique_tops = {
            line["decimal_coordinates"]["top_left"]["y"]
            for line in self.filter_lines_by_pixel_value(lines)
        }
        return sorted(unique_tops)

    def split_table(
        self,
        row_delimiter_type: str,
        page_content,
        delimiter_coordinates=None,
    ):
        """Dispatch to the splitter matching ``row_delimiter_type``.

        ``"line"`` uses ``page_content["lines"]``; ``"field"`` uses the items
        inside ``delimiter_coordinates``.  Any other type yields ``None``.
        """
        if row_delimiter_type == "field":
            return self.split_table_by_delimiter(page_content, delimiter_coordinates)
        if row_delimiter_type == "line":
            return self.split_table_by_line(page_content["lines"])
        return None


In [2]:
import json
import os
from typing import Dict, Any
from datetime import datetime
import uuid

# Define template and identifier
template_name: str = "barclays_student"
identifier: str = "march"

# Set paths for template and PDF data.
# NOTE: '__file__' is a quoted string literal here, so os.path.dirname() returns
# '' and root_path resolves to the parent of the current working directory.
root_path = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
template_path: str = os.path.join(root_path, "templates", f"{template_name}_template.json")
pdf_data_path: str = os.path.join(root_path, "pdf_data", f"{template_name}_{identifier}_pdf_data.json")

# Load template and PDF data.
# The encoding is given explicitly so that reading the JSON files does not
# depend on the platform's default text encoding.
with open(template_path, encoding="utf-8") as template_file:
    template: Dict[str, Any] = json.load(template_file)

with open(pdf_data_path, encoding="utf-8") as pdf_data_file:
    pdf_data: Dict[str, Any] = json.load(pdf_data_file)

# Set output data path
output_data_path: str = os.path.join(root_path, "outputs", f"{template_name}_{identifier}_output.json")

def parse_pdf(template: Dict[str, Any], pdf_data: Dict[str, Any]) -> Dict[str, Any]:
    """Apply every page rule of ``template`` to ``pdf_data``.

    For each page selected by a page rule, the rule's form rules and table
    rules are evaluated with ``Parser``.  A rule that raises ``IndexError``
    is reported on stdout and skipped.

    Returns:
        A dict with a ``"metadata"`` section (random document id, parse
        timestamp, page count) and a single-entry ``"pages"`` list holding
        all collected ``"forms"`` and ``"tables"``.
    """
    forms = []
    tables = []
    number_of_pages = len(pdf_data["pages"])

    def report_failure(rule_id, page_index):
        print(
            f"Rule ID '{rule_id}' not found in template rules or page index '{page_index}' is out of range."
        )

    for page_rule in template["pages"]:
        parser = Parser()
        selected_indexes = parser.page_number_converter(
            page_rule["page_numbers"], number_of_pages
        )
        for page_index in selected_indexes:
            # Forms first, then tables, for every selected page.
            for rule_id in page_rule.get("forms", ()):
                try:
                    forms.append(
                        parser.get_output_data_from_form_rule(
                            rule_id, page_index, pdf_data, template
                        )
                    )
                except IndexError:
                    report_failure(rule_id, page_index)

            for rule_id in page_rule.get("tables", ()):
                try:
                    table_data = parser.get_output_data_from_table_rule(
                        rule_id, page_index, pdf_data, template
                    )
                    tables.append({"table_header": "header", "data": table_data})
                except IndexError:
                    report_failure(rule_id, page_index)

    metadata = {
        "document_id": str(uuid.uuid4()),
        "parsed_at": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "number_of_pages": number_of_pages,
    }
    return {
        "metadata": metadata,
        "pages": [{"forms": forms, "tables": tables}],
    }

# Run the parser on the loaded inputs, then store the result as indented JSON
output = parse_pdf(template, pdf_data)

with open(output_data_path, "w") as json_file:
    json_file.write(json.dumps(output, indent=4))
