### 将 ipynb 文件里的图片在三种格式之间相互转换：base64 附件 / 外部文件夹 / HTML `<img>`

In [2]:
import base64
import html
import json
import mimetypes
import re
from pathlib import Path

# ======================
# Constants / regular expressions
# ======================

# File suffixes (lower-case, leading dot included) accepted as images when
# embedding files from disk.
IMG_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"}

# Markdown image pointing at a cell attachment: ![alt](attachment:xxx.png)
# group 1 = alt text, group 2 = attachment name.
ATTACH_RE = re.compile(r"!\[([^\]]*)\]\(attachment:([^)]+)\)")
# Any Markdown image: ![alt](path)
# group 1 = alt text, group 2 = everything between the parentheses.
MD_IMG_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

# A complete HTML <img ...> tag (case-insensitive).
IMG_TAG_RE = re.compile(r"<img\b[^>]*>", re.IGNORECASE)
# Quoted src=... / alt=... attributes inside a tag; group 1 is the quote
# character, group 2 the attribute value.
SRC_RE = re.compile(r"""src\s*=\s*(['"])(.*?)\1""", re.IGNORECASE)
ALT_RE = re.compile(r"""alt\s*=\s*(['"])(.*?)\1""", re.IGNORECASE)

# HTML line break: <br>, <br/> or <br /> (case-insensitive).
BR_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)


# ======================
# 通用工具函数
# ======================

def _read_nb(ipynb_path: Path) -> dict:
    return json.loads(Path(ipynb_path).read_text(encoding="utf-8"))

def _write_nb(nb: dict, out_path: Path) -> None:
    out_path.write_text(json.dumps(nb, indent=2), encoding="utf-8")

def _get_text(cell: dict) -> str:
    src = cell.get("source", [])
    return "".join(src) if isinstance(src, list) else str(src)

def _set_text(cell: dict, text: str) -> None:
    cell["source"] = [text]

def _guess_mime(name: str) -> str:
    mt, _ = mimetypes.guess_type(name)
    return mt or "image/png"

def _b64_to_bytes(b64: str) -> bytes:
    return base64.b64decode(b64)

def _bytes_to_b64(data: bytes) -> str:
    return base64.b64encode(data).decode("ascii")

def _unique_name(base_name: str, used: set[str]) -> str:
    """
    同一个 cell 内去重：如果 base_name 已存在，则在 stem 后追加 _1/_2/...
    e.g. cell_003_image.png -> cell_003_image_1.png
    """
    if base_name not in used:
        used.add(base_name)
        return base_name

    p = Path(base_name)
    stem, suf = p.stem, p.suffix
    k = 1
    while True:
        cand = f"{stem}_{k}{suf}"
        if cand not in used:
            used.add(cand)
            return cand
        k += 1


# ======================
# 1) attachment → 文件（按 cell 编号前缀；同 cell 重名自动 _1/_2）
# ======================

def attachment_to_files(
    ipynb_path,
    out_ipynb_path=None,
    img_dir="images",
    overwrite=False,
    drop_attachments=True,
):
    """Export Markdown-cell attachments to image files and relink the cells.

    Every ``![alt](attachment:name)`` reference in a Markdown cell whose
    ``name`` exists in the cell's attachments is rewritten to
    ``![alt](<img_dir>/cell_NNN_<name>)`` and the decoded image is written to
    ``<output notebook folder>/<img_dir>/``. ``NNN`` is the zero-padded cell
    index, so identical attachment names in different cells do not collide.

    Args:
        ipynb_path: Input notebook.
        out_ipynb_path: Output notebook; defaults to ``<stem>_files.ipynb``
            next to the input.
        img_dir: Image folder, relative to the output notebook's folder.
        overwrite: When False, an image file that already exists is left
            untouched (the reference is still rewritten to point at it).
        drop_attachments: When True, the ``attachments`` entry is removed from
            every processed cell, including attachments that were never
            referenced in the text.

    Returns:
        None. A summary is printed.
    """
    ipynb_path = Path(ipynb_path)
    if out_ipynb_path:
        out_ipynb_path = Path(out_ipynb_path)
    else:
        out_ipynb_path = ipynb_path.with_name(ipynb_path.stem + "_files.ipynb")

    nb = _read_nb(ipynb_path)
    out_img_dir = out_ipynb_path.parent / img_dir
    out_img_dir.mkdir(parents=True, exist_ok=True)

    exported = replaced = 0

    for i, cell in enumerate(nb.get("cells", [])):
        if cell.get("cell_type") != "markdown":
            continue

        attachments = cell.get("attachments") or {}
        if not attachments:
            continue

        text = _get_text(cell)
        prefix = f"cell_{i:03d}_"
        used_out_names = set()
        # attachment name -> exported file name, so that an attachment that is
        # referenced several times in one cell is written exactly once.
        exported_names = {}

        # Rebuild the text span by span rather than with str.replace():
        # str.replace() rewrites every identical occurrence at once, which
        # made later matches of the same reference export a duplicate file.
        pieces = []
        cursor = 0
        for m in ATTACH_RE.finditer(text):
            alt, name = m.groups()
            bundle = attachments.get(name)
            if not bundle:
                # Unknown attachment or empty mime bundle: leave the reference.
                continue

            out_name = exported_names.get(name)
            if out_name is None:
                # A bundle maps mime type -> base64 payload; use the first entry.
                _, b64 = next(iter(bundle.items()))

                # Avoid stacking the cell prefix twice.
                base_out_name = name if name.startswith(prefix) else f"{prefix}{name}"
                out_name = _unique_name(base_out_name, used_out_names)
                out_path = out_img_dir / out_name

                if overwrite or not out_path.exists():
                    out_path.write_bytes(_b64_to_bytes(b64))
                    exported += 1
                exported_names[name] = out_name

            pieces.append(text[cursor:m.start()])
            pieces.append(f"![{alt}]({img_dir}/{out_name})")
            cursor = m.end()
            replaced += 1

        pieces.append(text[cursor:])
        _set_text(cell, "".join(pieces))
        if drop_attachments:
            cell.pop("attachments", None)

    _write_nb(nb, out_ipynb_path)
    print("✅ attachment → 文件 完成")
    print(f"   输出 notebook: {out_ipynb_path}")
    print(f"   图片目录: {out_img_dir}")
    print(f"   导出 {exported} 张，替换 {replaced} 处")


# ======================
# 2) 文件 → attachment（同 cell 内 key 重名自动 _1/_2）
# ======================

def files_to_attachment(
    ipynb_path,
    out_ipynb_path=None,
    base_dir=None,
):
    """Embed locally referenced Markdown images as cell attachments.

    Every ``![alt](path)`` in a Markdown cell whose ``path`` resolves (relative
    to ``base_dir``) to an existing file with a suffix in ``IMG_EXTS`` is
    rewritten to ``![alt](attachment:<key>)`` and the file content is stored
    base64-encoded in the cell's ``attachments``. Links starting with
    ``attachment:``, ``data:``, ``http://`` or ``https://`` are left alone.

    Args:
        ipynb_path: Input notebook.
        out_ipynb_path: Output notebook; defaults to
            ``<stem>_attachments.ipynb`` next to the input.
        base_dir: Folder that relative image paths are resolved against;
            defaults to the input notebook's folder.

    Returns:
        None. A summary is printed.
    """
    ipynb_path = Path(ipynb_path)
    if out_ipynb_path:
        out_ipynb_path = Path(out_ipynb_path)
    else:
        out_ipynb_path = ipynb_path.with_name(ipynb_path.stem + "_attachments.ipynb")
    base_dir = Path(base_dir) if base_dir else ipynb_path.parent

    nb = _read_nb(ipynb_path)

    embedded = replaced = 0

    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "markdown":
            continue

        text = _get_text(cell)

        attachments = cell.get("attachments") or {}
        # Keys must be unique within the cell, including pre-existing ones.
        used_keys = set(attachments.keys())
        # resolved file path -> attachment key, so a file linked several times
        # in one cell is embedded once and every link shares the key.
        key_by_path = {}

        # Rebuild the text span by span rather than with str.replace():
        # str.replace() rewrites every identical link at once, which made a
        # repeated link embed the same file again under an unused "_1" key.
        pieces = []
        cursor = 0
        changed = False

        for m in MD_IMG_RE.finditer(text):
            alt, link = m.groups()
            link = link.strip()

            if link.startswith(("attachment:", "data:", "http://", "https://")):
                continue

            p = (base_dir / link).resolve()
            # is_file() also rejects a directory that merely has an image suffix.
            if not p.is_file() or p.suffix.lower() not in IMG_EXTS:
                continue

            key = key_by_path.get(p)
            if key is None:
                key = _unique_name(p.name, used_keys)
                attachments[key] = {_guess_mime(key): _bytes_to_b64(p.read_bytes())}
                key_by_path[p] = key
                embedded += 1

            pieces.append(text[cursor:m.start()])
            pieces.append(f"![{alt}](attachment:{key})")
            cursor = m.end()
            replaced += 1
            changed = True

        if changed:
            pieces.append(text[cursor:])
            cell["attachments"] = attachments
            _set_text(cell, "".join(pieces))

    _write_nb(nb, out_ipynb_path)
    print("✅ 文件 → attachment 完成")
    print(f"   输出 notebook: {out_ipynb_path}")
    print(f"   嵌入 {embedded} 张，替换 {replaced} 处")


# ======================
# 3) Markdown 图片 → HTML <img>
# ======================

def md_images_to_html(ipynb_path, out_path=None, width=600, break_line=False, self_close=False):
    """Rewrite Markdown images in Markdown cells as HTML ``<img>`` tags.

    ``![alt](link)`` becomes ``<img src="link" width="<width>" alt="alt">``.
    Links starting with ``attachment:``, ``data:``, ``http://`` or
    ``https://`` are left as Markdown.

    Args:
        ipynb_path: Input notebook.
        out_path: Output notebook; defaults to ``<stem>_img600.ipynb`` next to
            the input (the default name does not follow ``width``).
        width: Value written into the ``width`` attribute.
        break_line: When True, ``<br>`` is appended after each tag.
        self_close: When True, tags end with `` />`` instead of ``>``.

    Returns:
        None. A summary is printed.
    """
    ipynb_path = Path(ipynb_path)
    out_path = Path(out_path) if out_path else ipynb_path.with_name(ipynb_path.stem + "_img600.ipynb")

    nb = _read_nb(ipynb_path)

    changed_cells = replaced_total = 0

    # The tag ending is the same for every image, so build it once.
    closing = " />" if self_close else ">"
    if break_line:
        closing += "<br>"

    def _repl(m):
        nonlocal replaced_total
        alt, link = m.groups()
        link = link.strip()

        if link.startswith(("attachment:", "data:", "http://", "https://")):
            return m.group(0)

        # Escape attribute values: a double quote (or & / <) in the alt text or
        # path would otherwise terminate the attribute and corrupt the tag.
        safe_src = html.escape(link, quote=True)
        safe_alt = html.escape(alt, quote=True)

        replaced_total += 1
        return f'<img src="{safe_src}" width="{width}" alt="{safe_alt}"{closing}'

    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "markdown":
            continue

        text = _get_text(cell)
        new_text = MD_IMG_RE.sub(_repl, text)

        if new_text != text:
            _set_text(cell, new_text)
            changed_cells += 1

    _write_nb(nb, out_path)
    print("✅ 完成：Markdown 图片 → HTML <img>")
    print(f"   输入: {ipynb_path}")
    print(f"   输出: {out_path}")
    print(f"   修改 cell: {changed_cells} 个 | 替换图片: {replaced_total} 处")


# ======================
# 4) HTML <img> → attachment(base64)
#    - 同 cell 内 key 重名自动 _1/_2
#    - 可选 keep_html：只写 attachments，不替换 <img>
# ======================

def html_images_to_attachment(
    ipynb_path,
    out_ipynb_path=None,
    base_dir=None,
    keep_html=False,
):
    """Embed images referenced by HTML ``<img>`` tags as cell attachments.

    For each ``<img>`` tag in a Markdown cell whose ``src`` resolves (relative
    to ``base_dir``) to an existing file with a suffix in ``IMG_EXTS``, the
    file is stored base64-encoded in the cell's ``attachments``. Unless
    ``keep_html`` is set, the tag is replaced by ``![alt](attachment:<key>)``,
    where ``alt`` is the tag's ``alt`` attribute or, if absent, the file name.
    ``src`` values starting with ``http://``, ``https://``, ``data:`` or
    ``attachment:`` are left alone.

    All ``<br>`` tags are removed from every Markdown cell that contains an
    ``<img`` tag, regardless of ``keep_html``.

    Args:
        ipynb_path: Input notebook.
        out_ipynb_path: Output notebook; defaults to
            ``<stem>_html2attach.ipynb`` next to the input.
        base_dir: Folder that relative ``src`` paths are resolved against;
            defaults to the input notebook's folder.
        keep_html: When True, attachments are written but ``<img>`` tags are
            kept as they are.

    Returns:
        None. A summary is printed.
    """
    ipynb_path = Path(ipynb_path)
    if out_ipynb_path:
        out_ipynb_path = Path(out_ipynb_path)
    else:
        out_ipynb_path = ipynb_path.with_name(ipynb_path.stem + "_html2attach.ipynb")
    base_dir = Path(base_dir) if base_dir else ipynb_path.parent

    nb = _read_nb(ipynb_path)

    embedded = replaced = changed_cells = 0

    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "markdown":
            continue

        orig_text = _get_text(cell)
        if "<img" not in orig_text.lower():
            continue

        attachments = cell.get("attachments") or {}
        # Keys must be unique within the cell, including pre-existing ones.
        used_keys = set(attachments.keys())
        # resolved file path -> attachment key, so a file used by several tags
        # in one cell is embedded once and every tag shares the key.
        key_by_path = {}
        attachments_added = False

        pieces = []
        cursor = 0
        for m in IMG_TAG_RE.finditer(orig_text):
            tag = m.group(0)

            msrc = SRC_RE.search(tag)
            if not msrc:
                continue

            # Attribute values are HTML-encoded (e.g. &amp;, &quot;); decode
            # them before treating the value as a path.
            src = html.unescape(msrc.group(2)).strip()
            if src.startswith(("http://", "https://", "data:", "attachment:")):
                continue

            p = (base_dir / src).resolve()
            # is_file() also rejects a directory that merely has an image suffix.
            if not p.is_file() or p.suffix.lower() not in IMG_EXTS:
                continue

            key = key_by_path.get(p)
            if key is None:
                key = _unique_name(p.name, used_keys)
                attachments[key] = {_guess_mime(key): _bytes_to_b64(p.read_bytes())}
                key_by_path[p] = key
                embedded += 1
                attachments_added = True

            if keep_html:
                continue

            altm = ALT_RE.search(tag)
            alt = html.unescape(altm.group(2)) if altm else p.name

            pieces.append(orig_text[cursor:m.start()])
            pieces.append(f"![{alt}](attachment:{key})")
            cursor = m.end()
            replaced += 1

        pieces.append(orig_text[cursor:])

        # Strip <br> tags from the cell.
        text = BR_RE.sub("", "".join(pieces))
        text_changed = text != orig_text

        # Store attachments whenever something was embedded. Gating this on a
        # text change lost them for keep_html=True cells without any <br>.
        if attachments_added:
            cell["attachments"] = attachments
        if text_changed:
            _set_text(cell, text)
        if attachments_added or text_changed:
            changed_cells += 1

    _write_nb(nb, out_ipynb_path)
    print("✅ 完成：HTML <img> → attachment(base64)")
    print(f"   输入: {ipynb_path}")
    print(f"   输出: {out_ipynb_path}")
    print(f"   修改 cell: {changed_cells} 个 | 嵌入图片: {embedded} 张 | 替换 <img>: {replaced} 处")


### base64__转__外部文件夹格式

In [3]:
# Export the notebook's base64 attachments into a "figures" folder next to the
# output notebook (written as <stem>_files.ipynb beside the input).
# NOTE(review): the input is a hard-coded absolute path; adjust per machine.
attachment_to_files(
    # "01_README.ipynb",
    "/workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/02_x-anylableing.ipynb",
    img_dir="figures"
)


✅ attachment → 文件 完成
   输出 notebook: /workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/02_x-anylableing_files.ipynb
   图片目录: /workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/figures
   导出 6 张，替换 6 处


### 外部文件夹格式__转__base64

In [4]:
# Embed the image files linked from the notebook back into cell attachments;
# the result is written as <stem>_attachments.ipynb beside the input.
files_to_attachment(
    "01_README_files.ipynb"
)


✅ 文件 → attachment 完成
   输出 notebook: 01_README_files_attachments.ipynb
   嵌入 17 张，替换 17 处


### 外部文件夹格式__转__HTML格式

In [4]:
# Turn Markdown image links into <img width="600"> tags, each followed by <br>;
# the result is written as <stem>_img600.ipynb beside the input.
# NOTE(review): the input is a hard-coded absolute path; adjust per machine.
md_images_to_html(
    # "01_README_files.ipynb",
    "/workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/02_x-anylableing_files.ipynb", 
    width=600, break_line=True)

✅ 完成：Markdown 图片 → HTML <img>
   输入: /workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/02_x-anylableing_files.ipynb
   输出: /workspace/_ty/00_pipeline/00_current/01_PrepareTrainingData__fiftyone_to_trainingData/02_x-anylableing_files_img600.ipynb
   修改 cell: 5 个 | 替换图片: 6 处


### HTML格式__转__base64

In [7]:
# Embed the files referenced by <img> tags as attachments and replace the tags
# with Markdown images; the result is written as <stem>_html2attach.ipynb.
html_images_to_attachment("01_README_files_img600.ipynb")

✅ 完成：HTML <img> → attachment(base64)
   输入: 01_README_files_img600.ipynb
   输出: 01_README_files_img600_html2attach.ipynb
   修改 cell: 10 个 | 嵌入图片: 17 张 | 替换 <img>: 17 处


### markdown转docx
```
pandoc 01_README.md -o 01_README.docx
```