In [None]:
import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ----------------------------------------------------
# 步驟 1: 載入模型
# ----------------------------------------------------
# Filesystem location the tokenizer and model are loaded from.
MODEL_PATH = "/home/nculcwu/DeepSeek/deepseek-llm-7b-chat"

# Configure PyTorch's CUDA caching allocator through its environment variable
# before the model load below makes any CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print(f"準備從 '{MODEL_PATH}' 載入模型和分詞器...")
if not torch.cuda.is_available():
    print("錯誤：未偵測到 CUDA。此程式碼需要 NVIDIA GPU 才能執行。")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit with a failure status so
    # scripts and schedulers can detect the error.
    raise SystemExit(1)

try:
    # NOTE: trust_remote_code=True allows Python code shipped with the
    # checkpoint to run; only use it with checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype="auto",
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    print("✅ 模型和分詞器已成功載入至 GPU！")
except Exception as e:
    print(f"模型載入失敗！錯誤訊息: {e}")
    # Stop with a non-zero status and keep the original exception as the cause.
    raise SystemExit(1) from e

# ----------------------------------------------------
# 步驟 2: 定義翻譯函式
# ----------------------------------------------------
def translate_to_plain_chinese(text: str) -> str:
    """Translate Classical Chinese text into vernacular Traditional Chinese.

    Wraps ``text`` in an instruction prompt, generates a continuation with the
    module-level ``model`` / ``tokenizer`` and returns only the newly
    generated text.

    Args:
        text: Source passage to translate.

    Returns:
        The generated translation with surrounding whitespace removed.
    """
    prompt = f"將以下文言文翻譯成繁體中文白話文，只翻譯內容，不要做解釋或分點：\n\n{text}\n\n翻譯："

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            # temperature / top_p only take effect when sampling is enabled;
            # state it explicitly instead of relying on the checkpoint's
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )

    # For a causal LM the returned sequence is prompt + continuation. Decode
    # only the continuation: splitting the full text on the "翻譯：" marker
    # breaks as soon as the source text itself contains that marker.
    new_tokens = outputs[0][prompt_length:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Drop the marker if the model repeats it at the start of its answer.
    marker = "翻譯："
    if result.startswith(marker):
        result = result[len(marker):].strip()
    return result

# ----------------------------------------------------
# 步驟 3: 找到資料夾裡的所有 txt 檔，逐檔翻譯
# ----------------------------------------------------
input_dir = r"."
output_file = os.path.join(input_dir, "翻譯結果.txt")
# Name of the combined output file, so it is never fed back in as input.
output_name = os.path.basename(output_file)

all_results = []

# os.listdir() returns entries in arbitrary order; sort so the combined output
# is ordered deterministically by file name.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith(".txt") or file_name == output_name:
        continue

    file_path = os.path.join(input_dir, file_name)
    # Skip directories (or other non-files) that merely end in ".txt".
    if not os.path.isfile(file_path):
        continue

    print(f"📖 正在處理：{file_name}")

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()

    if not content:
        continue

    # NOTE(review): the whole file is sent as a single prompt; long files may
    # exceed the model's context window — confirm, or segment the text first.
    translated = translate_to_plain_chinese(content)
    # splitext removes only the trailing extension; str.replace would also
    # strip any ".txt" occurring in the middle of the name.
    title = os.path.splitext(file_name)[0]
    all_results.append(f"=== {title} ===\n{translated}\n")

# ----------------------------------------------------
# Step 4: write everything to a single file
# ----------------------------------------------------
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n\n".join(all_results))

print(f"\n✅ 所有翻譯已完成，結果輸出至：{output_file}")


In [None]:
import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ----------------------------------------------------
# 步驟 1: 載入模型
# ----------------------------------------------------
# Filesystem location the tokenizer and model are loaded from.
MODEL_PATH = "/home/nculcwu/DeepSeek/deepseek-llm-7b-chat"

# Configure PyTorch's CUDA caching allocator through its environment variable
# before the model load below makes any CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print(f"準備從 '{MODEL_PATH}' 載入模型和分詞器...")
if not torch.cuda.is_available():
    print("錯誤：未偵測到 CUDA。此程式碼需要 NVIDIA GPU 才能執行。")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit with a failure status so
    # scripts and schedulers can detect the error.
    raise SystemExit(1)

try:
    # NOTE: trust_remote_code=True allows Python code shipped with the
    # checkpoint to run; only use it with checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype="auto",
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    print("✅ 模型和分詞器已成功載入至 GPU！")
except Exception as e:
    print(f"模型載入失敗！錯誤訊息: {e}")
    # Stop with a non-zero status and keep the original exception as the cause.
    raise SystemExit(1) from e

# ----------------------------------------------------
# 步驟 2: 定義分段函式
# ----------------------------------------------------
def segment_text(text: str):
    """Split ``text`` into segments at full stops (。) and semicolons (；).

    The delimiters are dropped, each piece is stripped of surrounding
    whitespace, and empty pieces are discarded. This is a first-pass
    splitter that may later be replaced by herb-based segmentation.
    """
    pieces = []
    for raw_piece in re.split(r"[。；]", text):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

# ----------------------------------------------------
# 步驟 3: 定義翻譯函式
# ----------------------------------------------------
def translate_to_plain_chinese(text: str) -> str:
    """Translate Classical Chinese text into vernacular Traditional Chinese.

    Wraps ``text`` in an instruction prompt, generates a continuation with the
    module-level ``model`` / ``tokenizer`` and returns only the newly
    generated text.

    Args:
        text: Source passage to translate.

    Returns:
        The generated translation with surrounding whitespace removed.
    """
    prompt = f"將以下文言文翻譯成繁體中文白話文，只翻譯內容，不要做解釋或分點：\n\n{text}\n\n翻譯："

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            # temperature / top_p only take effect when sampling is enabled;
            # state it explicitly instead of relying on the checkpoint's
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )

    # For a causal LM the returned sequence is prompt + continuation. Decode
    # only the continuation: splitting the full text on the "翻譯：" marker
    # breaks as soon as the source text itself contains that marker.
    new_tokens = outputs[0][prompt_length:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Drop the marker if the model repeats it at the start of its answer.
    marker = "翻譯："
    if result.startswith(marker):
        result = result[len(marker):].strip()
    return result

# ----------------------------------------------------
# 步驟 4: 找到資料夾裡的所有 txt 檔，逐檔翻譯
# ----------------------------------------------------
input_dir = r"."
output_dir = os.path.join(input_dir, "translated")
os.makedirs(output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a reproducible
# processing order and log.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith(".txt") or file_name == "翻譯結果.txt":
        continue

    file_path = os.path.join(input_dir, file_name)
    # Skip directories (or other non-files) that merely end in ".txt".
    if not os.path.isfile(file_path):
        continue

    print(f"📖 正在處理：{file_name}")

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()

    if not content:
        continue

    # Translate segment by segment, pairing each source segment with its
    # translation in the output.
    translated_results = []
    for seg in segment_text(content):
        translated = translate_to_plain_chinese(seg)
        translated_results.append(f"【原文】\n{seg}\n\n【翻譯】\n{translated}\n")

    # One output file per input file. splitext touches only the trailing
    # extension; str.replace would rewrite every ".txt" inside the name.
    stem = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_dir, stem + "_translated.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(translated_results))

    print(f"✅ 翻譯完成：{output_file}")
