In [1]:
import os
import re
from urllib.parse import urlparse

def extract_images_from_md(file_path):
    """Collect every image reference found in a single Markdown file.

    Two syntaxes are recognised: Markdown images (``![alt](url)``, optionally
    with a quoted title after the URL) and HTML ``<img ... src="...">`` tags.
    All Markdown-syntax matches are listed first, followed by the HTML ones.

    Args:
        file_path: Path of the Markdown file; it is read as UTF-8.

    Returns:
        list[dict]: One dict per image with the keys ``alt_text`` (always
        ``''`` for HTML tags), ``image_url`` and ``file_path``. If the file
        cannot be read, the error is printed and an empty list is returned.
    """
    images = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others.
        print(f"读取文件 {file_path} 时出错: {e}")
        return images

    # Markdown image syntax: ![alt text](image_url) or ![alt text](image_url "title")
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    for alt_text, link_target in re.findall(pattern, content):
        # Cut off an optional quoted title, then strip the whitespace that
        # separated it from the URL; without the strip a titled image would be
        # reported as 'name.png ' (with a trailing space).
        image_url = link_target.split('"')[0].split("'")[0].strip()
        images.append({
            'alt_text': alt_text,
            'image_url': image_url,
            'file_path': file_path
        })

    # HTML image syntax: <img src="...">
    html_pattern = r'<img[^>]+src=["\']([^"\']+)["\'][^>]*>'
    for image_url in re.findall(html_pattern, content, re.IGNORECASE):
        images.append({
            'alt_text': '',
            'image_url': image_url,
            'file_path': file_path
        })

    return images

def find_md_files(directory):
    """Return the paths of all Markdown files below ``directory``.

    The tree is walked recursively with ``os.walk``; a file counts as Markdown
    when its name ends in ``.md``, compared case-insensitively. Paths are
    returned in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.md')
    ]

def main(directory=None):
    """Scan a directory tree for Markdown files and report their image links.

    Every ``.md`` file found by ``find_md_files`` is passed to
    ``extract_images_from_md``; the links are printed per file and then once
    more as a flat numbered list.

    Args:
        directory: Root directory to scan. When omitted (``None``) the current
            working directory is used, which is what the zero-argument call
            has always scanned.

    Returns:
        list[dict]: The image entries whose ``image_url`` does not start with
        ``http://`` or ``https://``.
    """
    current_directory = os.getcwd() if directory is None else directory
    print(f"正在扫描目录: {current_directory}")

    # Collect all .md files below the root directory.
    md_files = find_md_files(current_directory)
    print(f"找到 {len(md_files)} 个Markdown文件")

    all_images = []

    # Extract and print the image links of each Markdown file in turn.
    for md_file in md_files:
        print(f"\n正在处理: {md_file}")
        images = extract_images_from_md(md_file)

        if images:
            print(f"  找到 {len(images)} 个图片链接:")
            for img in images:
                print(f"    - {img['image_url']}")
                if img['alt_text']:
                    print(f"      (alt: {img['alt_text']})")
            all_images.extend(images)
        else:
            print("  未找到图片链接")

    print(f"\n总共找到 {len(all_images)} 个图片链接")

    # Print a flat, numbered list of every link together with its source file.
    if all_images:
        print("\n详细列表:")
        for i, img in enumerate(all_images, 1):
            print(f"{i}. 文件: {img['file_path']}")
            print(f"   图片: {img['image_url']}")
            if img['alt_text']:
                print(f"   描述: {img['alt_text']}")
            print()

    # Only links without an http(s) scheme are handed back to the caller.
    local_images = [
        img for img in all_images
        if not img['image_url'].startswith(('http://', 'https://'))
    ]

    return local_images


In [2]:
# Run the scan. `result` holds the entries main() returns, i.e. only the image
# links that do not start with http:// or https://.
result = main()

正在扫描目录: /Users/wangxujie/Downloads/_posts/机器学习习题
找到 8 个Markdown文件

正在处理: /Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md
  找到 3 个图片链接:
    - Figure_1.png
      (alt: image)
    - Figure_2.png
      (alt: image)
    - Figure_3.jpg
      (alt: image)

正在处理: /Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md
  找到 4 个图片链接:
    - Figure_1.png
      (alt: image)
    - Figure_2.png
      (alt: image)
    - Figure_3.png
      (alt: image)
    - Figure_4.png
      (alt: image)

正在处理: /Users/wangxujie/Downloads/_posts/机器学习习题/ex6-SVM.md
  找到 5 个图片链接:
    - Figure_1.png
      (alt: image)
    - Figure_1-1.png
      (alt: image)
    - Figure_1-2.png
      (alt: image)
    - Figure_1-3.png
      (alt: image)
    - Figure_1-4.png
      (alt: image)

正在处理: /Users/wangxujie/Downloads/_posts/机器学习习题/ex4-NN back propagation.md
  找到 7 个图片链接:
    - Figure_1.png
      (alt: image)
    - Figure_2.jpg
      (alt: image)
    - Figure_4
      (alt: image)
    - Figure_5
      (alt: 

In [15]:
from collections import Counter

def get_new_file_name(item, counter):
    """Build a sequentially numbered image name from the Markdown file's name.

    The result has the form ``<markdown stem>_<n><image extension>``, where
    ``n`` is how many names have been generated for that Markdown file so far
    (including this one).

    Args:
        item: Dict with the keys ``file_path`` (path of the Markdown file) and
            ``image_url`` (the image reference whose extension is kept).
        counter: ``collections.Counter`` keyed by Markdown stem. It is updated
            in place, so the same instance must be passed for every item.

    Returns:
        str: The new file name, e.g. ``'post_1.png'``. When the image URL has
        no extension the name has none either (``'post_1'``).
    """
    # basename/splitext instead of manual splitting: keeps stems that contain
    # dots intact and does not depend on '/' being the path separator.
    file_name = os.path.splitext(os.path.basename(item["file_path"]))[0]
    # splitext returns '' for a URL without an extension; splitting on '.'
    # would instead return the entire URL (or part of a './dir/' prefix).
    img_suffix = os.path.splitext(item["image_url"].strip())[1]
    counter.update([file_name])
    return f"{file_name}_{counter[file_name]}{img_suffix}"

In [20]:
# Number the images per Markdown file and store the generated name on each
# entry under the 'new_path' key (the entries in `result` are modified in place).
counter = Counter()
for item in result:
    new_name = get_new_file_name(item, counter)
    item['new_path'] = new_name

In [21]:
# Display the entries, which now carry the generated 'new_path'.
# NOTE(review): the saved output also shows a 'new_name' key that no visible
# cell sets — presumably left over from an earlier version of the loop;
# confirm with Restart Kernel -> Run All.
result

[{'alt_text': 'image',
  'image_url': 'Figure_1.png',
  'file_path': '/Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md',
  'new_name': 'ex3-neural network_1.png',
  'new_path': 'ex3-neural network_1.png'},
 {'alt_text': 'image',
  'image_url': 'Figure_2.png',
  'file_path': '/Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md',
  'new_name': 'ex3-neural network_2.png',
  'new_path': 'ex3-neural network_2.png'},
 {'alt_text': 'image',
  'image_url': 'Figure_3.jpg',
  'file_path': '/Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md',
  'new_name': 'ex3-neural network_3.jpg',
  'new_path': 'ex3-neural network_3.jpg'},
 {'alt_text': 'image',
  'image_url': 'Figure_1.png',
  'file_path': '/Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md',
  'new_name': 'ex1-linear regression_1.png',
  'new_path': 'ex1-linear regression_1.png'},
 {'alt_text': 'image',
  'image_url': 'Figure_2.png',
  'file_path': '/Users/wangxujie/Downloads/_posts/机器学习习题/

In [22]:
import os
import urllib.parse
import re

def _rewrite_image_links(content, image_url, new_path):
    """Point every Markdown image link for ``image_url`` at ``new_path``.

    Links are matched in their literal and their percent-encoded spelling,
    each optionally prefixed with ``./``. A matched link is replaced by
    ``![](new_path)``, i.e. its alt text is dropped.

    Args:
        content: Full text of the Markdown file.
        image_url: Decoded image URL to look for.
        new_path: Replacement link target.

    Returns:
        tuple[str, bool]: The updated text and whether anything was replaced.
    """
    replacement = f"![]({new_path})"
    replaced = False
    # dict.fromkeys drops the duplicate when quoting leaves the URL unchanged.
    for target in dict.fromkeys((image_url, urllib.parse.quote(image_url))):
        # [^\]]* keeps the alt text inside one pair of brackets; a lazy '.*?'
        # could start at an earlier image on the same line and swallow it.
        pattern = rf'!\[[^\]]*\]\((?:\./)?{re.escape(target)}\)'
        # A callable replacement is inserted verbatim, so backslashes in
        # new_path are not interpreted as regex escapes or group references.
        content, count = re.subn(pattern, lambda _match: replacement, content)
        replaced = replaced or count > 0
    return content, replaced


# Rewrite each Markdown file so that its image links use the generated names.
for item in result:
    # file_path is a real filesystem path, so it is used as-is; URL-decoding
    # it would corrupt names that contain a literal '%xx' sequence.
    file_path = item['file_path']
    image_url = urllib.parse.unquote(item['image_url'])
    new_path = urllib.parse.unquote(item['new_path'])

    if not os.path.exists(file_path):
        print(f"❌ 文件不存在: {file_path}")
        continue

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Percent-encoded spelling of the URL; also shown in the warning below.
    encoded_image_url = urllib.parse.quote(image_url)

    content, replaced = _rewrite_image_links(content, image_url, new_path)

    if replaced:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"✅ 已更新 {file_path}: {image_url} → {new_path}")
    else:
        print(f"⚠️ 未找到匹配: {image_url} 或 {encoded_image_url} in {file_path}")


✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md: Figure_1.png → ex3-neural network_1.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md: Figure_2.png → ex3-neural network_2.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex3-neural network.md: Figure_3.jpg → ex3-neural network_3.jpg
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md: Figure_1.png → ex1-linear regression_1.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md: Figure_2.png → ex1-linear regression_2.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md: Figure_3.png → ex1-linear regression_3.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex1-linear regression.md: Figure_4.png → ex1-linear regression_4.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex6-SVM.md: Figure_1.png → ex6-SVM_1.png
✅ 已更新 /Users/wangxujie/Downloads/_posts/机器学习习题/ex6-SVM.md: Figure_1-1.png → ex6-SVM_2.png
✅ 已更新 /Users/wangxujie/Downl