In [15]:
# ==============================================================================
# PDF → Markdown 변환기 (with Text, Table, Image)
# ==============================================================================

import fitz  # PyMuPDF
import camelot
from pathlib import Path
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd

In [16]:
# ==============================================================================
# 텍스트 추출 함수
# ==============================================================================
def extract_text_from_page(page):
    """Return the plain text of a page as Markdown chunks.

    Args:
        page: Object exposing ``get_text("text")``; in this notebook it is the
            page loaded by ``process_pdf_to_markdown``.

    Returns:
        A two-item list (section heading, raw text) when the text contains at
        least one non-whitespace character, otherwise an empty list.
    """
    raw_text = page.get_text("text")
    # Whitespace-only pages contribute nothing to the Markdown output.
    has_content = bool(raw_text.strip())
    return ["### 텍스트 내용", raw_text] if has_content else []

In [17]:
def save_table_as_image(table_df: pd.DataFrame, save_path: Path, dpi: int = 300):
    """Render a DataFrame as a matplotlib table and save it as an image file.

    Args:
        table_df: Table to draw. Its values become the cells and its column
            labels become the header row.
        save_path: Destination passed to ``Figure.savefig``.
        dpi: Resolution of the saved image.

    Raises:
        Any exception matplotlib raises while building or saving the table is
        propagated; the figure is closed in every case.
    """
    n_rows, n_cols = table_df.shape
    # Figure size scales with the table: 2 in per column, 0.6 in per row.
    fig, ax = plt.subplots(figsize=(n_cols * 2, n_rows * 0.6))
    try:
        ax.axis('off')

        table = ax.table(
            cellText=table_df.values,
            colLabels=table_df.columns,
            loc='center',
            cellLoc='left',
            colLoc='center'
        )

        # Fixed font size so every exported table looks the same.
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1.2, 1.2)

        # Lay out this specific figure rather than pyplot's "current" one.
        fig.tight_layout()
        # NOTE(review): the notebook output shows repeated warnings raised from
        # this savefig call; possibly missing CJK glyphs in the default
        # matplotlib font - confirm and configure a suitable font if so.
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
    finally:
        # Callers catch errors and keep looping over tables, so the figure must
        # be released even on failure or figures accumulate in pyplot.
        plt.close(fig)


In [18]:
# ==============================================================================
# 표 추출 함수 (텍스트 + 이미지)
# ==============================================================================
def extract_tables_from_page(pdf_path, page, page_num, image_dir, page_image):
    """Extract the tables found on one page as Markdown plus rendered images.

    Tables are read with ``camelot.read_pdf`` (``flavor='stream'``). For each
    table a description line and the table's Markdown text are emitted; the
    table is also rendered to a PNG under ``image_dir`` and linked.

    Args:
        pdf_path: Path of the PDF; its ``stem`` and ``name`` are used for file
            names and log messages.
        page: Accepted but not used by this function.
        page_num: Zero-based page index.
        image_dir: Directory (``Path``) receiving the table images.
        page_image: Accepted but not used by this function.

    Returns:
        List of Markdown chunks. Failures are printed, not raised: a failed
        table image drops only its image link, a failed extraction returns
        whatever was collected so far.
    """
    page_no = page_num + 1  # 1-based page number used by camelot and messages
    chunks = []
    try:
        found = camelot.read_pdf(str(pdf_path), pages=str(page_no), flavor='stream')
        if not found.n > 0:
            return chunks

        chunks.append("### 추출된 표")
        for table_no, table in enumerate(found, start=1):
            chunks.append(f"* **설명**: 페이지 {page_no}에서 {table_no}번째로 발견된 표입니다.")

            # Keep the table as Markdown text as well as an image.
            chunks.append(table.df.to_markdown(index=False))

            try:
                # Render the table to a PNG next to the other extracted images.
                target = image_dir / f"{pdf_path.stem}_p{page_no}_table{table_no}.png"
                save_table_as_image(table.df, target)
            except Exception as img_err:
                print(f"[표 이미지 저장 실패] {pdf_path.name} p{page_no} table{table_no} - {img_err}")
            else:
                # Link the image only when it was written successfully.
                chunks.append(f"![표 이미지]({target.as_posix()})")

    except Exception as e:
        print(f"[표 추출 실패] {pdf_path.name} {page_no}페이지 - {e}")
    return chunks


In [19]:
# ==============================================================================
# 이미지 추출 함수
# ==============================================================================
def extract_images_from_page(pdf_document, page, page_num, pdf_stem, image_dir):
    """Save every image referenced by a page and return Markdown links to them.

    Args:
        pdf_document: Object exposing ``extract_image(xref)`` returning a
            mapping with ``"ext"`` and ``"image"`` entries.
        page: Object exposing ``get_images(full=True)``; the first item of
            each returned entry is used as the image xref.
        page_num: Zero-based page index.
        pdf_stem: Prefix for the generated image file names.
        image_dir: Directory (``Path``) the image files are written to.

    Returns:
        An empty list when the page has no images. Otherwise a heading followed
        by an image link and a description line per saved image. Images that
        fail to extract or save are reported via ``print`` and skipped.
    """
    image_entries = page.get_images(full=True)
    if not image_entries:
        return []

    page_no = page_num + 1  # 1-based page number for names and messages
    chunks = ["### 추출된 이미지"]
    for image_no, entry in enumerate(image_entries, start=1):
        try:
            extracted = pdf_document.extract_image(entry[0])
            extension = extracted["ext"]
            payload = extracted["image"]

            target = image_dir / f"{pdf_stem}_p{page_no}_{image_no}.{extension}"
            target.write_bytes(payload)
        except Exception as img_err:
            print(f"[이미지 저장 실패] {pdf_stem} {page_no}페이지 - {img_err}")
            continue

        chunks.append(f"![페이지 {page_no}의 {image_no}번째 이미지]({target.as_posix()})")
        chunks.append(f"* **설명**: 페이지 {page_no}에서 추출된 이미지입니다.")
    return chunks

In [20]:
# ==============================================================================
# 마크다운 저장 함수
# ==============================================================================
def save_markdown_file(markdown_chunks, pdf_path, output_dir):
    """Write the Markdown chunks to ``<output_dir>/<sanitised stem>.md``.

    The file name is the PDF stem reduced to alphanumeric characters, spaces
    and underscores, with trailing whitespace stripped. Chunks are joined with
    newlines and written as UTF-8.

    Args:
        markdown_chunks: Iterable of strings forming the document.
        pdf_path: Path of the source PDF; only its ``stem`` is used.
        output_dir: Directory (``Path``) the Markdown file is written to.
    """
    allowed_extra = (' ', '_')
    kept_chars = [ch for ch in pdf_path.stem if ch.isalnum() or ch in allowed_extra]
    safe_name = "".join(kept_chars).rstrip()

    target = output_dir / f"{safe_name}.md"
    with target.open("w", encoding="utf-8") as handle:
        handle.write("\n".join(markdown_chunks))

In [21]:
# ==============================================================================
# PDF 한 개 처리하는 메인 함수
# ==============================================================================
def process_pdf_to_markdown(pdf_path: Path, output_dir: Path, image_dir: Path):
    """Convert one PDF into a Markdown file containing text, tables and images.

    For every page a separator and a page heading are emitted, followed by the
    chunks produced by ``extract_text_from_page``, ``extract_tables_from_page``
    and ``extract_images_from_page``. The result is written with
    ``save_markdown_file``.

    Args:
        pdf_path: PDF file to convert.
        output_dir: Directory receiving the Markdown file.
        image_dir: Directory receiving table and embedded-image files.

    Raises:
        Any exception from opening the PDF or loading a page propagates; the
        document is closed in every case.
    """
    md_chunks = [f"# {pdf_path.stem}"]

    # The context manager closes the document even when a page fails, so the
    # batch loop does not leak one open file handle per processed PDF.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            md_chunks.append("\n---\n")
            md_chunks.append(f"## 페이지 {page_num + 1}")

            # Text
            md_chunks.extend(extract_text_from_page(page))

            # Tables. extract_tables_from_page never reads its page_image
            # argument, so the former 300 dpi page render (a pixmap plus a PIL
            # copy per page) was wasted work; pass None instead.
            md_chunks.extend(extract_tables_from_page(pdf_path, page, page_num, image_dir, None))

            # Embedded images
            md_chunks.extend(extract_images_from_page(doc, page, page_num, pdf_path.stem, image_dir))

    save_markdown_file(md_chunks, pdf_path, output_dir)

In [None]:
# ==============================================================================
# 진입점
# ==============================================================================
if __name__ == "__main__":
    # Root folder scanned recursively for PDFs; results go to sub-folders of it.
    pdf_root = Path(r"C:\Users\jhwoo\Desktop\SKN_ws\project\SKN13-FINAL-1TEAM\한국방송광고진흥공사\내부문서\재무성과")
    output_dir = pdf_root / "_markdown_output"
    image_dir = output_dir / "_images"

    # Make sure both destination folders exist before any conversion starts.
    output_dir.mkdir(parents=True, exist_ok=True)
    image_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = [*pdf_root.rglob("*.pdf")]

    if pdf_files:
        print(f"🔥 총 {len(pdf_files)}개의 PDF를 감지. 전투 개시.")
        for pdf_file in tqdm(pdf_files, desc="PDF 변환 중"):
            # One broken PDF must not abort the whole batch: report and go on.
            try:
                process_pdf_to_markdown(pdf_file, output_dir, image_dir)
            except Exception as e:
                print(f"[ERROR] {pdf_file.name} 처리 중 오류 발생: {e}")
        print(f"✅ 모든 PDF 처리 완료. 결과는 '{output_dir}'에서 확인하라.")
    else:
        print(f"❌ PDF 없음: {pdf_root}")

🔥 총 53개의 PDF를 감지. 전투 개시.


PDF 변환 중:   0%|          | 0/53 [00:00<?, ?it/s]CropBox missing from /Page, defaulting to MediaBox
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
  