In [9]:
from pydantic import BaseModel
from typing import List

class TextElement(BaseModel):
    type: str
    text: str
    page_number: int
    is_table: bool = False

In [13]:
import os
import json
import pdfplumber

def convert_pdf_to_pixels(pdf_width, pdf_height, dpi=300):
    """Convert PDF dimensions from points to pixels at a specific DPI."""
    pixels_per_point = dpi / 72
    return (pdf_width * pixels_per_point, pdf_height * pixels_per_point)

def adjust_coordinates_for_dpi(bbox, pdf_width, pdf_height, dpi=300): ## 이미지 처리하면서 올린 dpi 다시 pdf 기본 해상도 픽셀로 낮추는 함수
    """Adjust bbox coordinates based on DPI scaling relative to the PDF dimensions."""
    pixels_width, pixels_height = convert_pdf_to_pixels(pdf_width, pdf_height, dpi)
    scale_x = pixels_width / pdf_width
    scale_y = pixels_height / pdf_height
    return (bbox[0] / scale_x, bbox[1] / scale_y, bbox[2] / scale_x, bbox[3] / scale_y)

def load_page_objects(page_number, detection_folder):
    """Load JSON data for a specific page."""
    json_path = os.path.join(detection_folder, f"data_sample_page{page_number}_objects.json")
    if os.path.exists(json_path):
        with open(json_path, 'r') as file:
            return json.load(file)
    return []

def extract_text_within_bbox(page, bbox):
    """Extract text within a specified bbox on the given page."""
    text_inside_bbox = []
    for word in page.extract_words():
        if is_within_bbox((word['x0'], word['top'], word['x1'], word['bottom']), bbox):
            text_inside_bbox.append(word['text'] + ' ')
    return "".join(text_inside_bbox).strip()

def is_within_bbox(word_bbox, table_bbox):
    """Check if the word's bounding box is within the table's bounding box."""
    word_x0, word_top, word_x1, word_bottom = word_bbox
    table_x0, table_top, table_x1, table_bottom = table_bbox
    return (word_x0 >= table_x0 and word_x1 <= table_x1 and
            word_top >= table_top and word_bottom <= table_bottom)

def process_pdf_text(pdf_path: str, detection_folder: str, dpi=300) -> List[TextElement]:
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            objects = sorted(load_page_objects(page_number - 1, detection_folder), key=lambda x: x['bbox'][1])
            tables = [adjust_coordinates_for_dpi(obj['bbox'], page.width, page.height, dpi) for obj in objects]
            current_text = ""
            is_current_table = False

            for line in page.extract_text_lines(return_chars=True):
                line_bbox = (line['x0'], line['top'], line['x1'], line['bottom'])
                line_is_table = any(is_within_bbox(line_bbox, tbl) for tbl in tables)
                
                if line_is_table != is_current_table or not line['text'].strip():
                    if current_text:
                        results.append(TextElement(type="table" if is_current_table else "text",
                                                   text=current_text,
                                                   page_number=page_number,
                                                   is_table=is_current_table))
                        current_text = ""
                    is_current_table = line_is_table
                
                current_text += line['text'] + ' '
            
            if current_text:
                results.append(TextElement(type="table" if is_current_table else "text",
                                           text=current_text,
                                           page_number=page_number,
                                           is_table=is_current_table))

    return results

# Example usage
pdf_path = "data/sample.pdf"
detection_folder = "data/results/detection"
results = process_pdf_text(pdf_path, detection_folder)

for result in results:
    if result.is_table:
        print(f"Page: {result.page_number} - Detected table text: {result.text}")
    else:
        print(f"Page: {result.page_number} - Text: {result.text}")
    print("-" * 50)


Page: 1 - Text: Supply specification KWN 49037 Revision: A Material and testing requirements Forgings 
--------------------------------------------------
Page: 1 - Detected table text: Created by: Dipl.-Ing. M. Krondorf 03/06/2022 sgd. M. Krondorf Examined by: Dr.-Ing. T. Hähnel 03/06/2022 sgd. T. Hähnel Name Date Signature 
--------------------------------------------------
Page: 1 - Text: KWD Kupplungswerk Dresden GmbH Löbtauer Straße 45 D – 01159 Dresden Postfach 270144 D – 01172 Dresden Tel.: + 49(0)351 – 4999-0 Fax.: + 49(0)351 – 4999-233 kwd@kupplungswerk-dresden.de http://www.kupplungswerk-dresden.de KWN49037_A_eng.docx Page 1 von 13 Copyright () KWD Kupplungswerk Dresden, All rights reserved. Supply specification 
--------------------------------------------------
Page: 2 - Detected table text: Supply specification KWN 49037 Revision: A 
--------------------------------------------------
Page: 2 - Text: Table of contents Table of contents ......................................

In [13]:
import os

def load_file_content(base_page, directory):
    content = ""
    for i in range(5):
        filename = os.path.join(directory, f"data_sample_page{base_page}_{0}_{i}.html")
        # print(f"Checking file: {filename}")  # 파일 경로 출력
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as file:
                print(file.read())
        else:
            # print(f"{filename} not found.")
            pass
    return content

def process_pages(start_page, end_page, directory):
    all_content = ""
    for page in range(start_page, end_page + 1):
        print(f"Processing page {page}...")
        page_content = load_file_content(page, directory)
        all_content += page_content
    return all_content

# 예제 실행:
directory = 'data/results/structure'
start_page = 0
end_page = 10
combined_content = process_pages(start_page, end_page, directory)
print(combined_content)


Processing page 0...
<html><head><style>table{border-collapse: collapse; border: 0;}th{border: 1px solid gray;}td{border: 1px solid gray;}</style></head><body><table cellspacing="0"><thead><tr><th>Created by:</th><th>Dipl.-Ing. M. Krondorf</th><th>03/06/2022</th><th>sgd. M. Krondorf</th></tr></thead><tbody><tr><td>Examined by:</td><td>Dr.-Ing. T. Hähnel</td><td>03/06/2022</td><td>sgd. T. Hähnel</td></tr><tr><td></td><td>Name</td><td>Date</td><td>Signature</td></tr></tbody></table></body></html>
Processing page 1...
<html><head><style>table{border-collapse: collapse; border: 0;}th{border: 1px solid gray;}td{border: 1px solid gray;}</style></head><body><table cellspacing="0"><thead><tr><th>Supply specification KWN 49037</th></tr><tr><th>Revision: A</th></tr></thead></table></body></html>
Processing page 2...
<html><head><style>table{border-collapse: collapse; border: 0;}th{border: 1px solid gray;}td{border: 1px solid gray;}</style></head><body><table cellspacing="0"><thead><tr><th>Supply