In [6]:
!pip install pandas python-docx pdfplumber openpyxl xlrd 

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-win_amd64.whl.metadata (67 kB)
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
   ---------------------------------------- 0.0/6.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/6.6 MB ? eta -:--:--
   --- ------------------------------------ 0.5/6.6 MB 1.1 MB/s eta 0:00:06
   ---- ----------------------------------- 0.8/6.6 MB 922.2 kB/s eta 0:00:07
   ------ --------------------------------- 1.0/6.6 MB 1.2 MB/s eta 0:00:05
   ------- -----------------------------

In [4]:
def cloverocr(image_path: str) -> dict:
    """Placeholder for the CLOVA OCR call (TODO: not implemented yet).

    Intended contract: receive ``image_path`` and return the OCR result as a
    JSON-style ``dict``. For now the argument is ignored and a fixed marker
    payload is returned.

    Args:
        image_path: Path of the file to run OCR on (currently unused).

    Returns:
        A new ``{"status": "not_implemented"}`` dict on every call.
    """
    placeholder = dict(status="not_implemented")
    return placeholder


In [None]:
from pathlib import Path
import pandas as pd
from docx import Document
import pdfplumber

# Extension groups used to choose a handler. Values are lower-case without the
# leading dot, matching how handle_file normalises the suffix before comparing.
CSV_EXT = {"csv"}  # loaded with pd.read_csv
EXCEL_EXT = {"xls", "xlsx"}  # loaded with pd.read_excel
IMAGE_EXT = {"jpg", "png"}  # sent to cloverocr; NOTE(review): "jpeg" is not listed — confirm this is intentional
OTHER_EXT = {"pdf", "docx"}  # NOTE(review): not referenced in the visible code; handle_file compares "pdf"/"docx" literally

def handle_csv(path: str):
    """Load a CSV file into a DataFrame, detecting its text encoding.

    Encodings are tried strictest-first. UTF-8 is self-validating, so bytes in
    a Korean legacy encoding almost always fail to decode as UTF-8, whereas
    cp949/euc-kr accept many UTF-8 byte sequences and would silently produce
    mojibake if they were tried first. "utf-8-sig" is used instead of "utf-8"
    so that an optional leading byte-order mark is consumed rather than ending
    up in the first column name.

    Args:
        path: Path of the CSV file to read.

    Returns:
        dict with keys "kind" ("CSV"), "ext" ("csv") and "dataframe" (the
        DataFrame produced by ``pd.read_csv``).

    Raises:
        ValueError: If the file cannot be decoded with any candidate encoding.
    """
    # Order matters: UTF-8 must be attempted before the permissive legacy codecs.
    for enc in ("utf-8-sig", "cp949", "euc-kr"):
        try:
            df = pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
        return {"kind": "CSV", "ext": "csv", "dataframe": df}
    raise ValueError("인코딩 실패: cp949, euc-kr, utf-8 encoding을 지원합니다")

def handle_excel(path: str, ext: str):
    """Read an Excel workbook with pandas and wrap it in a result record.

    Args:
        path: Location of the workbook; passed to ``pd.read_excel`` unchanged.
        ext: Extension detected by the caller; echoed back in the result.

    Returns:
        dict with keys "kind" ("EXCEL"), "ext" and "dataframe" (whatever
        ``pd.read_excel(path)`` returned).
    """
    result = {"kind": "EXCEL", "ext": ext}
    result["dataframe"] = pd.read_excel(path)
    return result

def handle_image(path: str, ext: str):
    """Run OCR on an image file and package the result.

    Args:
        path: Path of the image; forwarded to ``cloverocr``.
        ext: Extension detected by the caller; echoed back in the result.

    Returns:
        dict with keys "kind" ("IMAGE"), "ext" and "content". "content" holds
        "pages" (the OCR result's "pages" entry, or an empty dict when the
        entry is absent) and "pageCount" (the length of that entry, or 0).
    """
    ocr_result = cloverocr(path)
    if "pages" in ocr_result:
        pages = ocr_result["pages"]
        page_count = len(pages)
    else:
        # No "pages" entry in the OCR result: report an empty page set.
        pages, page_count = {}, 0
    content = {"pageCount": page_count, "pages": pages}
    return {"kind": "IMAGE", "ext": ext, "content": content}

def handle_pdf(path: str, ext: str):
    """Extract a PDF's text layer, falling back to OCR when it is too sparse.

    A page "passes" when its extracted text, stripped of surrounding
    whitespace, has at least PDF_PAGE_MIN_CHARS characters. When the share of
    passing pages reaches PDF_PASS_RATIO the extracted text is returned
    (mode "text"); otherwise the file path is handed to ``cloverocr``
    (mode "ocr").

    Args:
        path: Path of the PDF file.
        ext: Extension detected by the caller; echoed back in the result.

    Returns:
        dict with keys "kind" ("PDF"), "ext", "mode" and "content". In text
        mode "content" maps "page1", "page2", ... to a record holding the
        page text with "tableCount" 0 and an empty "tables" dict. In OCR mode
        "content" carries the OCR result's "pages" entry (empty dict if
        absent) and its length as "pageCount".
    """
    PDF_PAGE_MIN_CHARS = 50
    PDF_PASS_RATIO = 0.7

    with pdfplumber.open(path) as pdf:
        page_texts = []
        for page in pdf.pages:
            extracted = page.extract_text()
            # extract_text() may yield a falsy value; normalise that to "".
            page_texts.append(extracted if extracted else "")

        passing = 0
        for text in page_texts:
            if len(text.strip()) >= PDF_PAGE_MIN_CHARS:
                passing += 1
        # max(..., 1) keeps a document with zero pages from dividing by zero.
        pass_ratio = passing / max(len(page_texts), 1)

    if pass_ratio < PDF_PASS_RATIO:
        # Text layer too sparse: delegate the whole file to OCR.
        ocr_result = cloverocr(path)
        if "pages" in ocr_result:
            ocr_pages = ocr_result["pages"]
            ocr_count = len(ocr_pages)
        else:
            ocr_pages, ocr_count = {}, 0
        return {
            "kind": "PDF",
            "ext": ext,
            "mode": "ocr",
            "content": {"pageCount": ocr_count, "pages": ocr_pages},
        }

    pages = {}
    for number, text in enumerate(page_texts, start=1):
        pages[f"page{number}"] = {"text": text, "tableCount": 0, "tables": {}}
    return {
        "kind": "PDF",
        "ext": ext,
        "mode": "text",
        "content": {"pageCount": len(page_texts), "pages": pages},
    }

def handle_docx(path: str, ext: str):
    """Collect the paragraphs and tables of a .docx file as a single page.

    Args:
        path: Path of the document; passed to ``Document``.
        ext: Extension detected by the caller; echoed back in the result.

    Returns:
        dict with keys "kind" ("DOCX"), "ext" and "content". "content" always
        reports "pageCount" 1 with one "page1" record holding:
        "text" (non-blank paragraphs, stripped and joined with newlines),
        "tableCount" (number of tables) and "tables" ("table1", "table2", ...
        each mapping "rows" to a list of per-row cell texts).
    """
    doc = Document(path)

    # Strip each paragraph once and drop the ones that end up empty.
    stripped = (paragraph.text.strip() for paragraph in doc.paragraphs)
    body_text = "\n".join(piece for piece in stripped if piece)

    table_count = len(doc.tables)
    tables = {}
    for index, table in enumerate(doc.tables, start=1):
        rows = [[cell.text for cell in row.cells] for row in table.rows]
        tables[f"table{index}"] = {"rows": rows}

    page = {"text": body_text, "tableCount": table_count, "tables": tables}
    return {
        "kind": "DOCX",
        "ext": ext,
        "content": {"pageCount": 1, "pages": {"page1": page}},
    }

In [None]:
def handle_file(path: str):
    """Dispatch a file to the handler that matches its extension.

    The extension is the final suffix of ``path``, lower-cased and without
    the leading dot.

    Args:
        path: Path of the file to process.

    Returns:
        Whatever the selected handler returns (handle_csv, handle_excel,
        handle_image, handle_pdf or handle_docx).

    Raises:
        ValueError: If the extension matches none of the supported types.
    """
    suffix = Path(path).suffix
    ext = suffix.lower().lstrip(".")

    if ext in CSV_EXT:
        result = handle_csv(path)
    elif ext in EXCEL_EXT:
        result = handle_excel(path, ext)
    elif ext in IMAGE_EXT:
        result = handle_image(path, ext)
    elif ext == "pdf":
        result = handle_pdf(path, ext)
    elif ext == "docx":
        result = handle_docx(path, ext)
    else:
        raise ValueError(f"지원하지 않는 파일 형식입니다: {ext}")
    return result