In [1]:
import datetime
import getpass
import json
import os
import random
import re
import time
from io import StringIO

import google.generativeai as genai
import langid
from google.ai.generativelanguage_v1beta.types import content
from ruamel.yaml import YAML

In [2]:
# Gemini initialization: take the API key from the GEMINI_API_KEY environment
# variable, and fall back to an interactive prompt when it is unset or empty.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # getpass prompts without echoing, so the typed key is not displayed in
    # (and therefore not saved with) the notebook's cell output.
    api_key = getpass.getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)

In [3]:
# Front-matter translation
# Path of the JSON file that load_cache() reads and save_cache() writes; it
# persists keyword translations between runs.
CACHE_FILE = "translation_cache.json"


def load_cache():
    """Load the translation cache from CACHE_FILE.

    Returns:
        dict: the cached mapping. An empty dict is returned when the file is
        missing, is not valid JSON, or holds a JSON value that is not an
        object (for example ``null`` or a list), so callers can always use
        the result as a dictionary.
    """
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cached = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}

    # Valid JSON is not necessarily a JSON object; anything else would break
    # the membership tests and .update() calls performed on the cache.
    if not isinstance(cached, dict):
        return {}
    return cached


def save_cache(cache):
    """Write the cache dictionary to CACHE_FILE as indented UTF-8 JSON.

    The parent directory of CACHE_FILE is created first when the path has one.
    """
    parent_dir = os.path.dirname(CACHE_FILE)
    if parent_dir != "":
        # A bare file name has no directory component, so nothing to create.
        os.makedirs(parent_dir, exist_ok=True)

    with open(CACHE_FILE, mode="w", encoding="utf-8") as cache_handle:
        # ensure_ascii=False keeps non-ASCII keys readable in the file.
        json.dump(cache, cache_handle, indent=4, ensure_ascii=False)


def translate_keywords(strings):
    """Translate a list of strings, reusing cached translations.

    Args:
        strings: the strings to translate.

    Returns:
        dict: maps every entry of ``strings`` to its translation. Empty or
        whitespace-only strings map to themselves and are never sent to the
        model.

    Raises:
        ValueError: if, after calling the model, some entries still have no
            translation. Translations obtained so far have already been
            saved to the cache, so a retry only requests the missing ones.
    """
    cache = load_cache()

    def is_blank(value):
        # There is nothing to translate in an empty / whitespace-only string.
        return isinstance(value, str) and not value.strip()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # prompt sent to the model is deterministic for a given input.
    strings_to_translate = list(
        dict.fromkeys(s for s in strings if not is_blank(s) and s not in cache)
    )

    if strings_to_translate:
        translations = llm_translate_keywords(strings_to_translate)
        cache.update(translations)
        save_cache(cache)

    result = {}
    missing = []
    for s in strings:
        if is_blank(s):
            result[s] = s
        elif s in cache:
            result[s] = cache[s]
        else:
            missing.append(s)

    if missing:
        # The model may omit keywords; report them instead of a bare KeyError.
        raise ValueError(f"以下关键词没有得到翻译结果：{missing}。")

    return result


def llm_translate_keywords(strings):
    """Translate a list of keywords into English with a Gemini model.

    Args:
        strings: the strings to translate.

    Returns:
        dict: maps each original keyword echoed back by the model to its
        English translation. Keywords the model leaves out are simply absent.

    Raises:
        ValueError: if the reply is not the expected JSON, the two arrays
            differ in length, a returned keyword is not in ``strings``, or a
            translation is empty or not classified as English by langid.
            The underlying exception is chained as ``__cause__``.
    """
    time.sleep(5)  # Pause before every request to avoid calling the API too often.

    generation_config = {
        "temperature": random.random() * 0.5,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_schema": content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=["original_keywords", "translated_keywords"],
            properties={
                "original_keywords": content.Schema(
                    type=content.Type.ARRAY,
                    items=content.Schema(
                        type=content.Type.STRING,
                    ),
                ),
                "translated_keywords": content.Schema(
                    type=content.Type.ARRAY,
                    items=content.Schema(
                        type=content.Type.STRING,
                    ),
                ),
            },
        ),
        "response_mime_type": "application/json",
    }

    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash-exp",
        generation_config=generation_config,
    )

    prompt = f"""
Translate the following list of Chinese keywords (or short phrases/sentences) into **accurate and natural English**.
Return a Json object with the original keywords (or short phrases/sentences) and their translations, respectively.
Keywords: {strings}
"""

    chat_session = model.start_chat(history=[])
    chat_response = chat_session.send_message(prompt)
    response = chat_response.text.strip()

    try:
        response_in_json = json.loads(response)
        original_keywords = response_in_json["original_keywords"]
        translated_keywords = response_in_json["translated_keywords"]

        # zip() would silently drop the surplus of the longer array; a length
        # mismatch means the pairing cannot be trusted.
        if len(original_keywords) != len(translated_keywords):
            raise ValueError(
                f"原始关键词数量（{len(original_keywords)}）与翻译结果数量（{len(translated_keywords)}）不一致。"
            )

        response_dict = dict(zip(original_keywords, translated_keywords))

        # Only values of existing keys are reassigned below, so updating the
        # dict while iterating over it is safe.
        for original_keyword, translated_keyword in response_dict.items():
            if original_keyword not in strings:
                raise ValueError(f"翻译关键词结果中包含了不在原始列表中的关键词：{original_keyword}。")
            elif not translated_keyword.strip():
                raise ValueError(f"关键词 {original_keyword} 的翻译结果为空。")
            elif langid.classify(translated_keyword)[0] != "en":
                raise ValueError(f"翻译关键词结果不是英语：{translated_keyword}。")

            # Capitalize the first letter unless the original keyword starts
            # with a lowercase letter. For digits and Chinese characters both
            # islower() and isupper() return False, so those get capitalized.
            elif translated_keyword[0].islower() and not original_keyword[:1].islower():
                response_dict[original_keyword] = translated_keyword[0].upper() + translated_keyword[1:]

        return response_dict

    except Exception as e:
        # Validation failures raised above are re-wrapped here too, so callers
        # always see one message format that includes the raw reply.
        raise ValueError(f"翻译关键词结果解析出错：{e}。具体内容：{response}。") from e

In [4]:
# Article body translation

# Opening-fence tags that indicate the model wrapped its whole answer in a
# code fence (as opposed to the article itself starting with a code block).
_WRAPPER_FENCE_TAGS = ("", "markdown", "md", "text", "plaintext")


def _strip_wrapping_fence(text):
    """Remove a ``` fence wrapped around the whole text; return (text, was_stripped)."""
    if len(text) < 6 or not (text.startswith("```") and text.endswith("```")):
        return text, False

    inner = text[3:-3]
    tag, newline, rest = inner.partition("\n")
    if newline:
        if tag.strip().lower() not in _WRAPPER_FENCE_TAGS:
            # A tag such as ```python most likely opens a real code block of
            # the article, so the text is left untouched.
            return text, False
        # Drop the rest of the opening fence line, including its tag.
        inner = rest
    # Known limitation: an article that starts with an untagged code block
    # and ends with another one is still treated as wrapped.
    return inner.strip(), True


def llm_translate_article(content, file_name="unknown"):
    """Translate an article body into English with a Gemini model.

    Note: inside this function the ``content`` parameter shadows the
    ``content`` module imported at the top of the file.

    Args:
        content: the text to translate.
        file_name: name used only in the printed warnings.

    Returns:
        str: the translated text with surrounding whitespace removed.

    Raises:
        ValueError: if the result is empty, is not classified as English by
            langid, or is shorter than half the length of ``content``.
    """

    time.sleep(5)  # Pause before every request to avoid calling the API too often.

    prompt = f"""
Translate the following file content into **accurate and natural English**, adhering to these guidelines:

1. **Content Formatting**:
   - Preserve all formatting elements, such as code blocks, styles, and markdown syntax.

2. **Link Handling**:
   - Remove links to domains `acwing.com` and `luogu.com` as these sites are only accessible to Chinese readers.

3. **Translation Quality**:
   - Ensure the translation is both precise and natural, capturing the original tone and intent. Make sure no details are lost in translation.

4. **Consistency and Directness**:
   - Treat this as a file translation. Avoid adding comments or conversational phrases like "Sure, I can help with that." Just provide the translated content directly.

Input File Content:
{content}
"""

    generation_config = {
        "temperature": random.random() * 0.5,
        "top_p": 0.9,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash-exp",
        generation_config=generation_config,
        safety_settings=[
            {
                "category": "HARM_CATEGORY_HARASSMENT",
                "threshold": "BLOCK_NONE"
            },
            {
                "category": "HARM_CATEGORY_HATE_SPEECH",
                "threshold": "BLOCK_NONE"
            },
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_NONE"
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_NONE"
            }
        ]
    )

    chat_session = model.start_chat(history=[])
    chat_response = chat_session.send_message(prompt)
    response = chat_response.text.strip()

    # Post-process and sanity-check the answer:
    #   * a fence wrapped around the whole answer is removed (green notice);
    #   * an empty or non-English answer is an error;
    #   * an answer much shorter than the source is an error;
    #   * an unusual first character only triggers a red warning.
    response, fence_removed = _strip_wrapping_fence(response)
    if fence_removed:
        print(f"\033[1;32m" + f"{file_name}的翻译结果以 ``` 开头和结尾，已去除。" + "\033[0m")

    if not response:
        # Guard before the response[0] check below, which needs a character.
        raise ValueError("报错原因：翻译结果为空。")

    if langid.classify(response)[0] != "en":
        raise ValueError(f"报错原因：翻译结果不是英语。具体内容：{response[:20]}...")

    if len(response) < 0.5 * len(content):
        # Heuristic: a result shorter than half the length of the source text
        # is treated as a failed or truncated translation.
        raise ValueError(f"报错原因：翻译结果长度显著短于原文。具体内容：{response[:20]}...")

    if not response[0].isalpha() and response[0] not in ("#", "*", "$"):
        print(
            f"\033[1;31m" + f"{file_name}的翻译结果首字母不是英语字母，可能需要手动调整。具体内容：{response[:20]}..." + "\033[0m")

    return response

In [5]:
# 1. Given a file path, return the file's content as a string.
def read_file(file_path):
    """Read the whole file at ``file_path`` as UTF-8 text and return it."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        text = source.read()
    return text


# 2. Given a file path and text, create the file or overwrite it.
def write_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file."""
    with open(file_path, mode="w", encoding="utf-8") as target:
        target.write(content)


# 3. Split raw file text into (front matter, main content), with surrounding
#    whitespace and the `---` delimiter lines removed.
def split_content(content):
    """Split a document into its YAML front matter and its body.

    The text must start (after whitespace is stripped) with a ``---`` line,
    followed by the front matter and a closing ``---`` line. The body may be
    empty, and so may the front matter.

    Args:
        content: full text of the file.

    Returns:
        tuple[str, str]: ``(front_matter, main_content)``, both stripped and
        without the ``---`` delimiter lines.

    Raises:
        ValueError: if no front matter block is found.
    """
    content = content.strip()
    # Group 1 is the front matter (optional, so "---\n---" is accepted);
    # group 2 is everything after the closing delimiter (optional, so a file
    # that ends right after the closing "---" is accepted too).
    match = re.match(r"---\n(?:(.*?)\n)?---(?:\n(.*))?\Z", content, re.DOTALL)
    if match is None:
        raise ValueError("文件格式不正确，没有找到 YAML 文件头。")

    front_matter = (match.group(1) or "").strip()
    main_content = (match.group(2) or "").strip()
    return front_matter, main_content


# 4. Given a file path, return when the file was last modified.
def get_last_modified_time(file_path):
    """Return the last-modified time of ``file_path``.

    Returns:
        datetime.datetime built from the file's modification timestamp, or,
        when the path does not exist, a message string saying so (callers
        that compare results should check existence first).
    """
    if not os.path.exists(file_path):
        return f"文件路径 {file_path} 不存在。"
    # Convert the raw modification timestamp into a datetime object.
    return datetime.datetime.fromtimestamp(os.path.getmtime(file_path))


# 5. Combine front matter and main content, then write the result to a file.
def build_content(front_matter, main_content, file_path):
    """Wrap ``front_matter`` in ``---`` lines, append the body, and write it out.

    The front matter is stripped first; a blank line separates it from
    ``main_content``. The file at ``file_path`` is created or overwritten.
    """
    header = "---\n" + front_matter.strip() + "\n---"
    document = "{}\n\n{}".format(header, main_content)
    write_file(file_path, document)

In [6]:
# Collect the string values, and the items of list values, from yaml_dict.
def extract_strings(yaml_dict):
    """Return a flat list of the translatable values of ``yaml_dict``.

    String values are taken as they are; for list values every item is taken
    (whatever its type). Values of any other type are ignored. The order
    follows the order of the dictionary's values.
    """
    collected = []
    for entry in yaml_dict.values():
        if isinstance(entry, list):
            collected += entry
        elif isinstance(entry, str):
            collected.append(entry)
    return collected

In [7]:
# 1. Walk the directory tree recursively and find every file whose name ends
#    with ".zh-cn.md" (for example index.zh-cn.md).
def find_chinese_files(directory):
    """Lazily yield the path of every ``*.zh-cn.md`` file under ``directory``."""
    suffix = ".zh-cn.md"
    for folder, _subdirs, names in os.walk(directory):
        yield from (
            os.path.join(folder, name) for name in names if name.endswith(suffix)
        )


# Materialize the generator: every *.zh-cn.md file below the current directory.
chinese_files = [*find_chinese_files(".")]


# 2. For each Chinese file, decide whether it has to be (re)translated.
def needs_translation(chinese_file):
    """Return True when the English counterpart is missing or out of date.

    The English file path is the Chinese path with ".zh-cn.md" replaced by
    ".md". It counts as out of date when the Chinese file was modified more
    recently than the English one.
    """
    english_file = chinese_file.replace(".zh-cn.md", ".md")
    if os.path.exists(english_file):
        source_mtime = get_last_modified_time(chinese_file)
        target_mtime = get_last_modified_time(english_file)
        return source_mtime > target_mtime
    return True


# Keep only the Chinese files whose English counterpart is missing or older.
files_to_translate = [file for file in chinese_files if needs_translation(file)]

# 3. List every file that is going to be translated.
for file_to_reformat in files_to_translate:
    print(f"需要翻译文件：{file_to_reformat}。")

# Retry policy shared by both LLM calls below.
max_retries = 3
retry_delay = 30  # base delay in seconds; the wait grows linearly with the attempt number

# 4. Translate the files
for file_to_translate in files_to_translate:
    print(f"开始翻译文件：{file_to_translate}。")
    # Read the file and separate the YAML front matter from the body.
    file_content = read_file(file_to_translate)
    front_matter, main_content = split_content(file_content)
    # Parse the YAML front matter.
    yaml_instance = YAML()
    # Indentation settings applied when the front matter is dumped back out.
    yaml_instance.indent(mapping=2, sequence=4, offset=2)
    yaml_dict = yaml_instance.load(front_matter)

    # Translate the front-matter strings, retrying on any failure.
    translate_keywords_success = False
    for attempt in range(max_retries):
        try:
            translation_map = translate_keywords(extract_strings(yaml_dict))
            translate_keywords_success = True
            break  # Exit the loop if successful
        except Exception as e:
            print(f"翻译文件 {file_to_translate} 的文件头时出错：{e}。")
            if attempt < max_retries - 1:
                # Report the delay that is actually slept.
                wait_seconds = retry_delay * (attempt + 1)
                print(f"等待 {wait_seconds} 秒后重试...")
                time.sleep(wait_seconds)
            else:
                print(f"重试 {max_retries} 次后仍然失败，跳过此文件。")

    if not translate_keywords_success:
        continue  # Skip this file; move on to the next one.

    # Replace string values, and the items of list values, by their
    # translations. Only existing keys are reassigned, so updating the
    # mapping while iterating over it is safe.
    for key, value in yaml_dict.items():
        if isinstance(value, str):
            yaml_dict[key] = translation_map[value]
        elif isinstance(value, list):
            yaml_dict[key] = [translation_map[item] for item in value]

    # Serialize the updated mapping back to a YAML string.
    output = StringIO()
    yaml_instance.dump(yaml_dict, output)
    front_matter_translated = output.getvalue()

    # Translate the main content, with the same retry policy.
    translate_main_content_success = False
    for attempt in range(max_retries):
        try:
            main_content_translated = llm_translate_article(main_content, file_to_translate)
            translate_main_content_success = True
            break
        except Exception as e:
            print(f"\033[1;31m" + f"翻译文件 {file_to_translate} 的主要内容时出错：{e}。" + "\033[0m")
            if attempt < max_retries - 1:
                wait_seconds = retry_delay * (attempt + 1)
                print(f"等待 {wait_seconds} 秒后重试...")
                time.sleep(wait_seconds)
            else:
                print(f"重试 {max_retries} 次后仍然失败，跳过此文件。")

    if not translate_main_content_success:
        continue  # Skip this file; nothing is written for it.

    # Write the English file next to the Chinese one (".zh-cn.md" -> ".md").
    build_content(front_matter_translated, main_content_translated, file_to_translate.replace(".zh-cn.md", ".md"))
    print(f"文件 {file_to_translate} 翻译完成。")

print("所有文件翻译完成。")

需要翻译文件：.\posts\1735350760948-cap\index.zh-cn.md。
开始翻译文件：.\posts\1735350760948-cap\index.zh-cn.md
文件 .\posts\1735350760948-cap\index.zh-cn.md 翻译完成。
所有文件翻译完成。
