In [9]:
from pydantic import BaseModel
from typing import List, Any

class TextElement(BaseModel):
    type: str
    text: str
    page_number: int
    table_index: Any
    is_table: bool = False

In [10]:
import os
import json
import pdfplumber
from bs4 import BeautifulSoup

def convert_pdf_to_pixels(pdf_width, pdf_height, dpi=300):
    """Convert PDF dimensions from points to pixels at a specific DPI."""
    pixels_per_point = dpi / 72
    return (pdf_width * pixels_per_point, pdf_height * pixels_per_point)

def adjust_coordinates_for_dpi(bbox, pdf_width, pdf_height, dpi=300): ## 이미지 처리하면서 올린 dpi 다시 pdf 기본 해상도 픽셀로 낮추는 함수
    """Adjust bbox coordinates based on DPI scaling relative to the PDF dimensions."""
    pixels_width, pixels_height = convert_pdf_to_pixels(pdf_width, pdf_height, dpi)
    scale_x = pixels_width / pdf_width
    scale_y = pixels_height / pdf_height
    return (bbox[0] / scale_x, bbox[1] / scale_y, bbox[2] / scale_x, bbox[3] / scale_y)

def load_page_objects(page_number, detection_folder):
    """Load JSON data for a specific page."""
    json_path = os.path.join(detection_folder, f"output_sample_page{page_number}_objects.json")
    if os.path.exists(json_path):
        with open(json_path, 'r') as file:
            return json.load(file)
    return []

def extract_text_within_bbox(page, bbox):
    """Extract text within a specified bbox on the given page."""
    text_inside_bbox = []
    for word in page.extract_words():
        if is_within_bbox((word['x0'], word['top'], word['x1'], word['bottom']), bbox):
            text_inside_bbox.append(word['text'] + ' ')
    return "".join(text_inside_bbox).strip()

def is_within_bbox(word_bbox, table_bbox):
    """Check if the word's bounding box is within the table's bounding box."""
    word_x0, word_top, word_x1, word_bottom = word_bbox
    table_x0, table_top, table_x1, table_bottom = table_bbox
    return (word_x0 >= table_x0 and word_x1 <= table_x1 and
            word_top >= table_top and word_bottom <= table_bottom)

def process_pdf_text_from_plumber(pdf_path: str, detection_folder: str, dpi: int = 300) -> List[TextElement]:
    """
    Process a PDF file and extract text elements.

    Args:
    pdf_path (str): The path to the PDF file.
    detection_folder (str): The folder containing the detection data.
    dpi (int, optional): The DPI to use for adjustment. Defaults to 300.

    Returns:
    List[TextElement]: A list of extracted text elements.
    """
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            objects = sorted(load_page_objects(page_number - 1, detection_folder), key=lambda x: x['bbox'][1])
            tables = [adjust_coordinates_for_dpi(obj['bbox'], page.width, page.height, dpi) for obj in objects]
            current_text = ""
            is_current_table = False
            table_index = -1 #index -1부터 시작
            
            for line in page.extract_text_lines(return_chars=True):
                line_bbox = (line['x0'], line['top'], line['x1'], line['bottom'])
                line_is_table = any(is_within_bbox(line_bbox, tbl) for tbl in tables)

                if line_is_table != is_current_table or not line['text'].strip():
                    if current_text:
                        results.append(TextElement(
                            type="table" if is_current_table else "text",
                            text=current_text,
                            page_number=page_number,
                            is_table=is_current_table,
                            table_index=table_index if is_current_table else None
                        ))
                        current_text = ""
                    is_current_table = line_is_table
                    if is_current_table:  # 테이블 시작 시 인덱스 증가
                        table_index += 1

                current_text += line['text'] + ' '
                

            if current_text:
                results.append(TextElement(
                    type="table" if is_current_table else "text",
                    text=current_text,
                    page_number=page_number,
                    is_table=is_current_table,
                    table_index=table_index if is_current_table else None
                ))

    return results

def postprocess_with_datr(elements:List[TextElement], structure_path):
    
    for element in elements:
        if element.is_table:
            html_path = os.path.join(structure_path, f"output_sample_page{element.page_number}_0_{element.table_index}.html")
            if os.path.exists(html_path):
                html_content = open(html_path, 'r', encoding='utf-8').read()
                soup = BeautifulSoup(html_content, 'html.parser')
                tables = soup.find_all('table')
                element.text = tables[0]

    return elements




# Example usage
from pathlib import Path

base_path = Path("preprocess_document/output")

pdf_path = base_path / "sample.pdf"
detection_path = base_path / "results/detection"
structure_path = base_path / "results/structure"

results = process_pdf_text_from_plumber(pdf_path, detection_path)
postprocess_result = postprocess_with_datr(results, structure_path)
print(postprocess_result)



[TextElement(type='text', text='Supply specification KWN 49037 Revision: A Material and testing requirements Forgings Created by: Dipl.-Ing. M. Krondorf 03/06/2022 sgd. M. Krondorf Examined by: Dr.-Ing. T. Hähnel 03/06/2022 sgd. T. Hähnel Name Date Signature KWD Kupplungswerk Dresden GmbH Löbtauer Straße 45 D – 01159 Dresden Postfach 270144 D – 01172 Dresden Tel.: + 49(0)351 – 4999-0 Fax.: + 49(0)351 – 4999-233 kwd@kupplungswerk-dresden.de http://www.kupplungswerk-dresden.de KWN49037_A_eng.docx Page 1 von 13 Copyright (\uf0e3) KWD Kupplungswerk Dresden, All rights reserved. Supply specification ', page_number=0, table_index=None, is_table=False), TextElement(type='text', text='Supply specification KWN 49037 Revision: A Table of contents Table of contents ........................................................................................................................... 2 1 Field of application ......................................................................................

In [None]:
for element in postprocess_result:
    if element.type=='table':
        print(element)

In [36]:
import pickle

with open('elements_processed.pkl', 'wb') as file:
    pickle.dump(postprocess_result, file)

print("List has been pickled.")

List has been pickled.
