In [None]:
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract

def pdf_to_txt_with_ocr(pdf_path: str, txt_path: str = None) -> str:
    """Extract the text of a PDF, using OCR for pages without a text layer.

    Each page is first read with PyMuPDF's plain-text extraction. Only when
    that yields nothing but whitespace is the page rasterized with
    ``pdf2image`` and passed to Tesseract (``chi_sim+eng``). Pages that do
    have extractable text are not OCR'd, so text that exists only inside
    images on such pages is not captured.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Optional path of a UTF-8 text file to write the result to.
            Nothing is written when it is ``None`` or empty.

    Returns:
        The per-page sections, each prefixed with a
        ``--- Page N (direct text|OCR) ---`` header, joined by blank lines.
    """
    all_text = []

    # Open the document as a context manager so its file handle is released
    # even when extraction or OCR raises part-way through the loop.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # 1. Prefer the text layer embedded in the page.
            text = page.get_text("text")
            if text.strip():
                all_text.append(f"--- Page {page_num} (direct text) ---\n{text}")
                continue

            # 2. No text layer: render just this page to an image and OCR it.
            print(f"Page {page_num}: no extractable text, fallback to OCR...")
            images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
            if not images:
                # Rendering produced nothing; the page is left out of the output.
                continue
            ocr_text = pytesseract.image_to_string(images[0], lang="chi_sim+eng")
            all_text.append(f"--- Page {page_num} (OCR) ---\n{ocr_text}")

    output_text = "\n\n".join(all_text)

    if txt_path:
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(output_text)

    return output_text

# Example usage: convert test.pdf and save the extracted text next to it.
pdf_file, txt_file = "test.pdf", "test.txt"
text = pdf_to_txt_with_ocr(pdf_file, txt_file)
print("✅ 转换完成，输出保存到:", txt_file)

FileNotFoundError: no such file: 'test.pdf'

In [1]:
### Upload a PDF to TOS ###
import os
import re
import requests
from urllib.parse import urlparse

TOS_UPLOAD_API = "http://data-processing.bytedance.net/dmc_plat/single/upload/"

def upload_tos_pdf(file_path: str) -> str:
    """Upload a local PDF to TOS and return its ``tos://...`` URI.

    The file is sent as a multipart form upload to ``TOS_UPLOAD_API``. The
    URL found under ``data.file_url`` (or ``data.url``) in the JSON response
    is then converted with ``file_url_to_tos_uri``.

    Args:
        file_path: Path of the local file; must end in ``.pdf``
            (case-insensitive).

    Returns:
        The ``tos://`` URI of the uploaded object.

    Raises:
        ValueError: If ``file_path`` does not end in ``.pdf``, or the response
            body is not valid JSON, or the returned URL cannot be converted.
        requests.HTTPError: If the upload endpoint answers with an error status.
        RuntimeError: If the response carries no ``file_url``/``url``.
    """
    if not file_path.lower().endswith(".pdf"):
        raise ValueError(f"仅支持上传 PDF 文件，实际传入: {file_path}")

    with open(file_path, "rb") as f:
        # NOTE(review): the part is declared as "text/plain" although the
        # payload is a PDF; kept as-is because the endpoint's expectations are
        # not visible here - confirm whether "application/pdf" is accepted.
        files = {"file": (os.path.basename(file_path), f, "text/plain")}
        resp = requests.post(TOS_UPLOAD_API, files=files, timeout=30)

    resp.raise_for_status()
    try:
        payload = resp.json()
    except ValueError:
        # Dump the raw body to make a non-JSON response diagnosable.
        print("Upload response text:", resp.text)
        raise

    # The server may answer with `"data": null` or a non-object payload;
    # normalise to a dict so the missing-URL error below is raised instead of
    # an opaque AttributeError.
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        data = {}

    file_url = data.get("file_url") or data.get("url") or ""
    if not file_url:
        raise RuntimeError(f"TOS上传成功但未返回 file_url/url，原始响应：{resp.text}")

    return file_url_to_tos_uri(file_url)


def file_url_to_tos_uri(file_url: str) -> str:
    """Convert an uploaded file's URL into a ``tos://...`` URI.

    Only the path component of the URL is inspected: everything after
    ``/obj/aidataservice/`` becomes the part following ``tos://``.

    Raises:
        ValueError: If the URL path contains no ``/obj/aidataservice/`` segment
            followed by at least one character.
    """
    url_path = urlparse(file_url).path
    match = re.search(r"/obj/aidataservice/(.+)$", url_path)
    if match is None:
        raise ValueError(f"无法从URL中解析出 aidataservice 路径：{file_url}")
    return "tos://" + match.group(1)

# Upload the sample PDF and print the TOS URI returned for it.
tos_uri = upload_tos_pdf("test.pdf")
print("上传成功，TOS URI:", tos_uri)



上传成功，TOS URI: tos://cqc_program_data/2025-09-23/47204c90906e77c9b924e41876dd369c.pdf
