In [38]:
import pdfplumber

def create_visual_pdf_with_pdfplumber(pdf_path):
    """Render every page of a PDF to a PNG with each extracted word outlined in red.

    One image per page is written next to the source file, named
    ``<pdf_path without its extension>_page_<N>.png`` where N starts at 1.

    Args:
        pdf_path: Path of the PDF to visualize (``str`` or ``os.PathLike``).
    """
    import os  # local import so this block does not depend on the cell's import line

    # splitext removes whatever extension is present (".pdf", ".PDF", none, ...)
    # instead of blindly cutting the last four characters off the path.
    base_path, _extension = os.path.splitext(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Rasterize the page at 300 DPI.
            im = page.to_image(resolution=300)

            # extract_words() yields one dict per word; draw_rect is handed
            # that dict directly and outlines the word in red.
            for word in page.extract_words():
                im.draw_rect(word, stroke='red', stroke_width=1)

            # Save the annotated page image.
            im.save(f"{base_path}_page_{page_number}.png")

# Example usage: write one annotated PNG per page of the sample PDF.
pdf_path = "example/sample_file.pdf"
create_visual_pdf_with_pdfplumber(pdf_path=pdf_path)


In [28]:
# For every page, print the per-character records of its first text line.
# NOTE: relies on `pdf_path` having been assigned by an earlier cell.
with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages):
        lines = page.extract_text_lines()

        # A page may yield no text lines at all; indexing lines[0] on such
        # a page would raise IndexError and abort the whole loop.
        if not lines:
            print(f"page index {page_number}: no text lines found")
            continue

        print(lines[0]['chars'])

[{'matrix': (1, 0, 0, 1, 391.18, 798.12), 'fontname': 'Arial-BoldMT', 'adv': 9.36468, 'upright': True, 'x0': 391.18, 'y0': 795.1716, 'x1': 400.54468, 'y1': 809.2116, 'width': 9.364680000000021, 'height': 14.039999999999964, 'size': 14.039999999999964, 'mcid': None, 'tag': 'Artifact', 'object_type': 'char', 'page_number': 1, 'ncs': 'DeviceGray', 'text': 'S', 'stroking_color': (0,), 'stroking_pattern': None, 'non_stroking_color': (0,), 'non_stroking_pattern': None, 'top': 32.70839999999998, 'bottom': 46.74839999999995, 'doctop': 32.70839999999998}, {'matrix': (1, 0, 0, 1, 400.54468, 798.12), 'fontname': 'Arial-BoldMT', 'adv': 8.578439999999999, 'upright': True, 'x0': 400.54468, 'y0': 795.1716, 'x1': 409.12312000000003, 'y1': 809.2116, 'width': 8.57844, 'height': 14.039999999999964, 'size': 14.039999999999964, 'mcid': None, 'tag': 'Artifact', 'object_type': 'char', 'page_number': 1, 'ncs': 'DeviceGray', 'text': 'u', 'stroking_color': (0,), 'stroking_pattern': None, 'non_stroking_color': (

In [14]:
import pdfplumber
from PIL import Image

# Open the PDF file.
with pdfplumber.open("example/to_test.pdf") as pdf:
    # Select a single page.
    page = pdf.pages[10]  # zero-based index 10, i.e. the 11th entry of pdf.pages

    # Render the page to an image.
    im = page.to_image(resolution=300)  # render resolution in DPI

    # Crop box as (left, top, right, bottom).
    # NOTE(review): values look like pixel coordinates of the 300-DPI render, not PDF points — confirm.
    bbox = (294.0290832519531,
      534.6534423828125,
      2297.125,
      2912.728271484375)

    # Crop the image to the box.
    # NOTE(review): im.original is presumably the rendered page as a PIL image — confirm.
    cropped_image = im.original.crop(bbox)

    # Save the cropped image.
    cropped_image.save("output_image.png")


In [38]:
import pdfplumber

def convert_pdf_to_pixels(pdf_width, pdf_height, dpi=300):
    """
    Scale a (width, height) pair given in PDF points to pixels.

    The scale factor is ``dpi / 72``; the result is returned as a
    ``(pixel_width, pixel_height)`` tuple.
    """
    scale = dpi / 72
    return tuple(dimension * scale for dimension in (pdf_width, pdf_height))

def adjust_coordinates_for_dpi(bbox, pdf_width, pdf_height, dpi=300):
    """
    Scale a bounding box down by the pixels-per-point factor for the given DPI.

    Each of the first four bbox values is divided by ``dpi / 72``, which maps
    coordinates measured on a page rendered at ``dpi`` back to point units.

    Args:
        bbox: Sequence whose first four items are (x0, top, x1, bottom).
        pdf_width: Page width in points. Kept for interface compatibility;
            the scale factor does not depend on it.
        pdf_height: Page height in points. Kept for interface compatibility;
            the scale factor does not depend on it.
        dpi: Resolution the bbox coordinates were measured at.

    Returns:
        A 4-tuple (x0, top, x1, bottom) of the scaled coordinates.
    """
    # The factor is the same on both axes and independent of the page size.
    # Computing it directly avoids the (dim * k) / dim round trip, which
    # raised ZeroDivisionError for a zero-sized dimension and added rounding.
    scale = dpi / 72

    # Echo the horizontal and vertical factors, as this function always has.
    print(scale, scale)
    return (bbox[0] / scale, bbox[1] / scale, bbox[2] / scale, bbox[3] / scale)

def extract_text_within_bbox(pdf_path, page_number, bbox, dpi=300):
    """
    Return the words of one page that lie fully inside a bounding box.

    ``page_number`` is used directly as an index into ``pdf.pages``. The
    given ``bbox`` is first passed through ``adjust_coordinates_for_dpi``
    together with the page's width and height; every word whose
    (x0, top, x1, bottom) box passes ``is_within_bbox`` against the adjusted
    box is kept. The kept words are returned as one space-separated string.
    """
    with pdfplumber.open(pdf_path) as document:
        target_page = document.pages[page_number]
        region = adjust_coordinates_for_dpi(
            bbox, target_page.width, target_page.height, dpi
        )

        # Collect each word (with a trailing space) that sits inside the region.
        pieces = [
            entry['text'] + ' '
            for entry in target_page.extract_words()
            if is_within_bbox(
                (entry['x0'], entry['top'], entry['x1'], entry['bottom']),
                region,
            )
        ]

    # Concatenate and drop the surrounding whitespace.
    return "".join(pieces).strip()

def is_within_bbox(word_bbox, table_bbox):
    """
    Tell whether the word box lies entirely inside the table box.

    Both arguments are (x0, top, x1, bottom) sequences; edges that touch
    count as inside.
    """
    inner_left, inner_top, inner_right, inner_bottom = word_bbox
    outer_left, outer_top, outer_right, outer_bottom = table_bbox

    fits_horizontally = outer_left <= inner_left and inner_right <= outer_right
    fits_vertically = outer_top <= inner_top and inner_bottom <= outer_bottom
    return fits_horizontally and fits_vertically

# Example usage: extract the text inside one region of the page at index 4.
pdf_path = "example/to_test.pdf"
page_number = 4
table_bbox = (
    285.0634765625,
    586.9825439453125,
    2307.5234375,
    1316.4169921875,
)
extracted_text = extract_text_within_bbox(pdf_path, page_number, bbox=table_bbox)
print("Extracted Text:", extracted_text)


# {'x0': 118.22, 'x1': 168.74707999999998, 'top': 208.27160000000003, 'doctop': 7785.5516, 'bottom': 218.23159999999996}

4.166666666666667 4.166666666666667
Extracted Text: Class Inclusion group A B C D DS Total length Total length Total length Count Diameter [µm] [µm] [µm] [µm] number [Pcs] 0,5 37 17 18 1 13 1 127 77 76 4 19 1,5 261 184 176 9 27 2 436 343 320 16 38 2,5 649 555 510 25 53 3 898 822 746 36 76
