In [None]:
import re
import os


# Payload of an inline base64 image. Group 1 is the
# ``data:image/<type>;base64,`` prefix, which is kept so the attribute still
# holds a syntactically valid (but empty) data URI.
_IMAGE_DATA_URI_RE = re.compile(r"(data:image/[^;]+;base64,)[A-Za-z0-9+/=]+")

# CSS ``url(...)`` wrapping an inline base64 image. Group 1 is the optional
# quote character; it is reused in the replacement so that a ``url('...')``
# nested inside a double-quoted ``style="..."`` attribute is not broken.
# The MIME subtype is restricted to token characters because the whole match
# is replaced and must therefore never span unrelated markup.
_CSS_URL_DATA_URI_RE = re.compile(
    r"""url\((["']?)data:image/[\w.+-]+;base64,[A-Za-z0-9+/=]+\1\)"""
)


def _read_text_with_fallback(filepath):
    """Read *filepath* as text and return ``(content, encoding_used)``.

    UTF-8 is tried first, then GBK, then latin-1. latin-1 has to come last:
    it maps every possible byte to a character and therefore never fails, so
    any codec listed after it would never be attempted.

    ``newline=""`` disables newline translation so the original line endings
    reach the caller untouched.
    """
    for encoding in ("utf-8", "gbk"):
        try:
            with open(filepath, "r", encoding=encoding, newline="") as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            if encoding == "utf-8":
                print("警告：使用UTF-8解码失败，尝试使用latin-1或gbk。")

    with open(filepath, "r", encoding="latin-1", newline="") as f:
        return f.read(), "latin-1"


def remove_data_uris(input_filepath, output_filepath=None):
    """
    Strip inline base64 image data from an HTML file and save the result.

    Two kinds of occurrences are handled:

    * ``url(<data URI>)`` in CSS is replaced by ``url(data:,)`` (an empty data
      URI), keeping whatever quote style the original used.
    * Every other ``data:image/<type>;base64,<payload>`` has only its payload
      removed, leaving ``data:image/<type>;base64,`` in place.

    The output is written with the same encoding that successfully decoded
    the input and without newline translation, so the two files differ only
    where image data was removed.

    The whole file is held in memory while it is processed.

    Args:
        input_filepath (str): Path of the HTML file to read.
        output_filepath (str, optional): Path of the file to write. When
            omitted, ``_no_images`` is inserted before the extension of
            ``input_filepath`` (``original.html`` -> ``original_no_images.html``).

    Returns:
        None. Progress and errors are reported on stdout; on any error the
        function returns early instead of raising.
    """
    if not os.path.exists(input_filepath):
        print(f"错误：文件不存在 {input_filepath}")
        return

    if output_filepath is None:
        name, ext = os.path.splitext(input_filepath)
        output_filepath = f"{name}_no_images{ext}"

    print(f"正在处理文件: {input_filepath}")
    print(f"输出文件将保存到: {output_filepath}")

    try:
        content, encoding = _read_text_with_fallback(input_filepath)
    except Exception as e:
        print(f"读取文件时发生错误: {e}")
        return

    # The url(...) pass must run first: once the generic pass below has
    # emptied every payload, the url(...) pattern has nothing left to match.
    content = _CSS_URL_DATA_URI_RE.sub(r"url(\g<1>data:,\g<1>)", content)
    content = _IMAGE_DATA_URI_RE.sub(r"\g<1>", content)

    try:
        with open(output_filepath, "w", encoding=encoding, newline="") as f:
            f.write(content)
        print("处理完成！")
    except Exception as e:
        print(f"写入文件时发生错误: {e}")


# Example invocation, with the output file path given explicitly.
remove_data_uris(
    input_filepath="300_NAI_Styles.html",
    output_filepath="300_NAI_Styles_noimg.html",
)

正在处理文件: 300_NAI_Styles.html
输出文件将保存到: 300_NAI_Styles_noimg.html
处理完成！


In [3]:
import csv
from bs4 import BeautifulSoup
import os


def html_table_to_csv(html_file_path, csv_file_path):
    """
    Convert the first table of an HTML file into a two-column CSV file.

    Only the first two cells of each ``<tr>`` inside the table's ``<tbody>``
    are exported. The CSV header row is always ``Index, Artists``; the
    table's own header cells are not used. Text fragments inside the second
    cell are joined with ``", "``.

    Args:
        html_file_path (str): The path to the input HTML file (read as UTF-8).
        csv_file_path (str): The path to the output CSV file (written as
            UTF-8). Its parent directory is created when it does not exist;
            a bare file name is written to the current directory.

    Returns:
        None. Progress and errors are reported on stdout; on any error the
        function returns early instead of raising.
    """
    try:
        with open(html_file_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"错误：文件未找到 - {html_file_path}")
        return
    except Exception as e:
        print(f"读取文件时发生错误 {html_file_path}: {e}")
        return

    soup = BeautifulSoup(html_content, "html.parser")
    table = soup.find("table")

    if not table:
        print("错误：在 HTML 文件中找不到表格。")
        return

    # The output columns are fixed, independent of the table's header row.
    csv_headers = ["Index", "Artists"]

    data_rows = []
    tbody = table.find("tbody")
    if tbody:
        for row in tbody.find_all("tr"):
            # <th> is accepted too, in case a body row uses header cells.
            cells = row.find_all(["td", "th"])
            if len(cells) < 2:
                continue  # Not enough columns to fill both CSV fields.

            index_text = cells[0].get_text(strip=True)
            # A separator keeps fragments split by nested tags (e.g. <br>)
            # from being glued together.
            artists_text = cells[1].get_text(separator=", ", strip=True)
            data_rows.append([index_text, artists_text])

    if not data_rows:
        print("在表格主体中找不到数据行。")
        return

    try:
        # dirname() is "" for a bare file name, and os.makedirs("") raises,
        # so the directory is only created when the path actually has one.
        output_dir = os.path.dirname(csv_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(csv_headers)
            writer.writerows(data_rows)

        print(f"成功将数据写入到 {csv_file_path}")

    except Exception as e:
        print(f"写入 CSV 文件时发生错误 {csv_file_path}: {e}")


# Paths of the HTML page to read and of the CSV file to produce.
html_input_file, csv_output_file = (
    "./300_NAI_Styles.html",
    "./300_NAI_Styles_Table.csv",
)

# Export the page's table to the CSV file.
html_table_to_csv(html_file_path=html_input_file, csv_file_path=csv_output_file)

成功将数据写入到 ./300_NAI_Styles_Table.csv
