In [1]:
import os
import re
import datetime
import csv

from odf.opendocument import load
from odf.table import Table, TableRow, TableCell
from odf.text import P
from tqdm import tqdm
from opencc import OpenCC

# Turn a directory listing into a list of file paths
def find_ods_files(directory):
    """Return the paths of the entries in ``directory`` whose name ends with ".ods".

    Each path is ``directory`` joined with the entry name; entries keep the
    order in which ``os.listdir`` reports them.
    """
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".ods")
    ]
def find_csv_files(directory):
    """Return the paths of the ".csv" entries of ``directory`` in reverse sorted order.

    Each path is ``directory`` joined with the entry name. Because the list is
    sorted descending, names that sort first lexicographically come last.
    """
    csv_names = filter(lambda name: name.endswith(".csv"), os.listdir(directory))
    joined_paths = (os.path.join(directory, name) for name in csv_names)
    # Descending order of the joined path strings
    return sorted(joined_paths, reverse=True)

def simplify_to_traditional(simplified_text):
    """Convert Simplified Chinese text to Traditional Chinese with OpenCC's 's2t' profile.

    The converter is created on the first call and reused afterwards. The
    loaders call this function once per row, and building a new ``OpenCC``
    object for every row repeats the converter's set-up work each time.

    Args:
        simplified_text: Text to convert.

    Returns:
        The text returned by ``OpenCC('s2t').convert``.
    """
    converter = getattr(simplify_to_traditional, "_converter", None)
    if converter is None:
        converter = OpenCC('s2t')  # Simplified -> Traditional profile
        # Cache on the function object so no extra module-level name is needed.
        simplify_to_traditional._converter = converter
    return converter.convert(simplified_text)

def get_text_from_cell(cell):
    """Return the stripped concatenation of the direct text nodes of a cell's paragraphs.

    Only child nodes whose ``nodeType`` equals ``TEXT_NODE`` contribute; text
    held by other kinds of child nodes is not collected. Paragraphs are joined
    without any separator.
    """
    fragments = [
        child.data
        for paragraph in cell.getElementsByType(P)
        for child in paragraph.childNodes
        if child.nodeType == child.TEXT_NODE
    ]
    return "".join(fragments).strip()

def load_translation_table_from_ods(file_paths):
    """Build a translation dictionary from the tables of one or more ODS files.

    For every table row that has at least five cells, the text of cell index 4
    is read as the Simplified term, index 2 as the Traditional term and index 1
    as the English term. The Simplified term is converted with
    ``simplify_to_traditional`` and used as the dictionary key.

    Rows whose Simplified or Traditional text is empty are skipped: an empty
    key matches at every position of a line during replacement, and an empty
    replacement would silently delete text.

    Args:
        file_paths: Iterable of paths to ``.ods`` files.

    Returns:
        dict mapping the converted term to
        ``{'traditional': <str>, 'english': <str>}``. When a key occurs more
        than once, the last row read wins.
    """
    translation_dict = {}
    for file_path in file_paths:
        doc = load(file_path)
        tables = doc.spreadsheet.getElementsByType(Table)
        print(f"Number of tables found: {len(tables)}")
        for table in tables:
            rows = list(table.getElementsByType(TableRow))
            print(f"Processing {len(rows)} rows in table")
            for row in tqdm(rows, desc="Processing rows"):
                cells = row.getElementsByType(TableCell)
                if len(cells) < 5:
                    continue  # row is too short to hold the columns read below
                simplified = get_text_from_cell(cells[4])
                traditional = get_text_from_cell(cells[2])
                english = get_text_from_cell(cells[1])  # English term
                if not simplified or not traditional:
                    continue  # blank or incomplete row
                # Keys are stored in Traditional script (Simplified -> Traditional)
                traditional_simplified = simplify_to_traditional(simplified)
                if not traditional_simplified:
                    continue
                translation_dict[traditional_simplified] = {'traditional': traditional, 'english': english}
    return translation_dict


def load_translation_table_from_csv(file_paths):
    """Build a translation dictionary from CSV files with 'zh-cn', 'zh-tw' and 'en' columns.

    Each row's 'zh-cn' value is converted with ``simplify_to_traditional`` and
    used as the key. Rows in which any of the three fields is empty or absent
    are counted as ignored. Files are read in the order given, so an entry from
    a later file (or later row) overwrites an earlier one with the same key.

    Args:
        file_paths: Iterable of CSV file paths (UTF-8, with or without a BOM).

    Returns:
        dict mapping the converted term to
        ``{'traditional': <str>, 'english': <str>}``.

    Raises:
        KeyError: if a file has data rows but lacks one of the required columns.
    """
    translation_dict = {}
    duplicate_count = 0  # keys seen more than once
    ignored_count = 0    # rows skipped because a field was missing or empty
    for file_path in file_paths:
        # 'utf-8-sig' drops a leading BOM, which would otherwise become part of
        # the first header name; BOM-less UTF-8 files read exactly as before.
        with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
            reader = csv.DictReader(csvfile)
            row_count = 0
            # tqdm wraps the reader to show progress
            for row in tqdm(reader, desc=f"Processing {file_path}"):
                row_count += 1
                # DictReader fills the missing trailing fields of a short row
                # with None, so coerce to '' before stripping.
                simplified = (row['zh-cn'] or '').strip()
                traditional = (row['zh-tw'] or '').strip()
                english = (row['en'] or '').strip()
                if simplified and traditional and english:
                    traditional_simplified = simplify_to_traditional(simplified)
                    if traditional_simplified in translation_dict:
                        duplicate_count += 1
                    translation_dict[traditional_simplified] = {'traditional': traditional, 'english': english}
                else:
                    ignored_count += 1
            # Count rows actually yielded: reader.line_num counts physical lines,
            # which is off for multi-line fields and gives -1 for an empty file.
            print(f"Total rows processed in {file_path}: {row_count}")
    print(f"Duplicates found: {duplicate_count}")
    print(f"Rows ignored due to missing fields: {ignored_count}")
    return translation_dict



In [2]:
def _find_non_overlapping_matches(line, translation_dict):
    """Locate dictionary terms in ``line`` and keep a non-overlapping, longest-first subset.

    Returns a list of ``(start, end, term, details)`` tuples whose offsets refer
    to the unmodified ``line``.
    """
    candidates = []
    for simplified, details in translation_dict.items():
        if not simplified:
            # An empty key is "found" at every offset and never advances the search.
            continue
        start = line.find(simplified)
        while start != -1:
            end = start + len(simplified)
            candidates.append((start, end, simplified, details))
            start = line.find(simplified, end)  # resume after this occurrence

    # Longest term first; among equal lengths the leftmost occurrence wins.
    candidates.sort(key=lambda m: (m[0] - m[1], m[0]))
    selected = []
    for candidate in candidates:
        if all(candidate[1] <= kept[0] or candidate[0] >= kept[1] for kept in selected):
            selected.append(candidate)
    return selected


def replace_text(line, translation_dict, auto_mode):
    """Replace dictionary terms found in ``line`` with their Traditional equivalents.

    Matches are chosen longest-first and never overlap, so a short term nested
    in (or straddling) a longer one cannot corrupt the longer replacement.
    Replacements are applied from right to left so that the offsets of the
    matches still to be processed stay valid.

    In interactive mode (``auto_mode`` false) the user is prompted per match:
    a number picks one of the '；'-separated alternatives, Enter accepts the
    first alternative, 'n' skips, 'auto' switches to automatic replacement for
    the rest, and 'stop' aborts.

    Args:
        line: Text to process.
        translation_dict: Mapping of term -> dict with at least a 'traditional'
            entry; alternatives are separated by '；'.
        auto_mode: When true, always use the first alternative without prompting.

    Returns:
        Tuple ``(new_line, auto_mode, stop_processing)``. ``auto_mode`` is true if
        it was passed in true or the user typed 'auto'; ``stop_processing`` is
        true if the user typed 'stop' (``new_line`` then holds only the
        replacements made before stopping).
    """
    auto_replace = auto_mode
    stop_processing = False

    matches = _find_non_overlapping_matches(line, translation_dict)
    # Right-to-left: editing the tail never shifts offsets further left.
    matches.sort(key=lambda m: m[0], reverse=True)

    for start, end, simplified, details in matches:
        traditional_choices = details['traditional'].split('；')
        chosen_traditional = traditional_choices[0]  # default: first alternative

        if not auto_replace:
            print(f"找到的文字: '{simplified}' 可以被替換成 '{chosen_traditional}'.")
            if len(traditional_choices) > 1:
                print(f"為 '{simplified}' 提供了多個選擇:")
                for idx, choice in enumerate(traditional_choices, 1):
                    print(f"{idx}. {choice}")
                print(f"{len(traditional_choices) + 1}. 跳過替換")
            else:
                # Enter applies the replacement (see the empty-input branch below),
                # so the hint must not claim that it skips.
                print("只有一個選擇可用。按 Enter 鍵進行替換。")
            print("輸入 'auto' 切換到自動替換模式，輸入 'stop' 停止處理。")

            user_input = input("輸入編號進行替換，'auto' 自動替換，'stop' 停止，'n' 跳過: ").strip().lower()
            if user_input == 'auto':
                auto_replace = True
            elif user_input == 'stop':
                stop_processing = True
                break
            elif user_input == 'n':
                continue
            elif user_input.isdecimal():  # isdecimal() only accepts what int() can parse
                choice_index = int(user_input) - 1
                if 0 <= choice_index < len(traditional_choices):
                    chosen_traditional = traditional_choices[choice_index]
                elif len(traditional_choices) > 1 and choice_index == len(traditional_choices):
                    continue  # the advertised "skip" menu entry
                else:
                    print("無效的選擇，跳過替換。")
                    continue
            elif user_input == "":
                # Empty input: replace using the default alternative
                print(f"進行替換: '{simplified}' -> '{chosen_traditional}'")
            else:
                print("無效的輸入，跳過替換。")
                continue

        line = line[:start] + chosen_traditional + line[end:]

    return line, auto_replace, stop_processing


In [3]:
# Directory holding the ODS spreadsheet(s); collect every .ods file in it
directory_path = "dataset"
ods_files = find_ods_files(directory_path)
print(ods_files)
# Collect the CSV translation tables.
# NOTE: directory_path is reassigned here, so from this point on it refers to
# the CSV directory, no longer to the ODS directory.
directory_path = "./wikiCGroupTools/outputData"
csv_files = find_csv_files(directory_path)
print(csv_files)

['dataset/兩岸對照名詞-計算機壓縮檔_Oliver微調.ods']
['./wikiCGroupTools/outputData/CGroup_Windows_Data.csv', './wikiCGroupTools/outputData/CGroup_SignalsandSystems_Data.csv', './wikiCGroupTools/outputData/CGroup_MediaWiki_Data.csv', './wikiCGroupTools/outputData/CGroup_IT_Temp_Data.csv', './wikiCGroupTools/outputData/CGroup_IT_Data.csv', './wikiCGroupTools/outputData/CGroup_Electronics_Data.csv', './wikiCGroupTools/outputData/CGroup_Communication_Data.csv', './wikiCGroupTools/outputData/0_Kicad_PoMingLee.csv']


In [4]:
# Context (translated from the original note): because of the "taotieren" case,
# the text being processed uses mainland-China vocabulary written in Traditional characters.
# As a pre-processing step the loaders convert the mainland terms to Traditional script.

# Load the translation table from the 樂詞網 ODS spreadsheet (currently disabled)
#translation_dict = load_translation_table_from_ods(ods_files)

# Load the CSV translation tables converted from the wiki
translation_dict = load_translation_table_from_csv(csv_files)


# Print the current number of entries in translation_dict
print(f"Number of entries in translation dictionary: {len(translation_dict)}")

Processing ./wikiCGroupTools/outputData/CGroup_Windows_Data.csv: 104it [00:01, 70.86it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_Windows_Data.csv: 104


Processing ./wikiCGroupTools/outputData/CGroup_SignalsandSystems_Data.csv: 20it [00:00, 64.98it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_SignalsandSystems_Data.csv: 20


Processing ./wikiCGroupTools/outputData/CGroup_MediaWiki_Data.csv: 78it [00:01, 72.23it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_MediaWiki_Data.csv: 78


Processing ./wikiCGroupTools/outputData/CGroup_IT_Temp_Data.csv: 1107it [00:15, 73.50it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_IT_Temp_Data.csv: 1107


Processing ./wikiCGroupTools/outputData/CGroup_IT_Data.csv: 1136it [00:15, 72.24it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_IT_Data.csv: 1136


Processing ./wikiCGroupTools/outputData/CGroup_Electronics_Data.csv: 149it [00:02, 66.74it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_Electronics_Data.csv: 149


Processing ./wikiCGroupTools/outputData/CGroup_Communication_Data.csv: 94it [00:01, 69.87it/s]


Total rows processed in ./wikiCGroupTools/outputData/CGroup_Communication_Data.csv: 94


Processing ./wikiCGroupTools/outputData/0_Kicad_PoMingLee.csv: 83it [00:00, 89.74it/s] 

Total rows processed in ./wikiCGroupTools/outputData/0_Kicad_PoMingLee.csv: 83
Duplicates found: 1159
Rows ignored due to missing fields: 227
Number of entries in translation dictionary: 1385





In [5]:
# format_and_trim
def format_and_trim(text, max_length):
    """Return ``text`` unchanged if it fits in ``max_length`` characters.

    Otherwise return its first ``max_length - 3`` characters followed by '...'.
    """
    fits = len(text) <= max_length
    return text if fits else text[:max_length-3] + '...'

# Column widths are computed from the data (the original note says this is so the
# code can later run as a .py script in a terminal). default=0 keeps the cell
# from raising ValueError when translation_dict is empty.
max_length_english = max((len(format_and_trim(detail['english'], 30)) for detail in translation_dict.values()), default=0)
max_length_simplified = max((len(format_and_trim(simplified, 20)) for simplified in translation_dict.keys()), default=0)
max_length_traditional = max((len(format_and_trim(detail['traditional'], 20)) for detail in translation_dict.values()), default=0)

# Header row
print(f"{'English'.ljust(max_length_english)}{'Simplified'.ljust(max_length_simplified)}{'Traditional'.ljust(max_length_traditional)}")
print("-" * (max_length_english + max_length_simplified + max_length_traditional))


# Preview only the first 10 entries; stop iterating once they have been printed
# instead of walking the whole dictionary.
for index, (simplified, details) in enumerate(translation_dict.items()):
    if index >= 10:
        break
    traditional = format_and_trim(details['traditional'], max_length_traditional)
    english = format_and_trim(details['english'], max_length_english)
    simplified = format_and_trim(simplified, max_length_simplified)
    print(f"{english.ljust(max_length_english)}{simplified.ljust(max_length_simplified)}{traditional.ljust(max_length_traditional)}")


English                       Simplified         Traditional        
--------------------------------------------------------------------
operation system              操作系統               作業系統               
data execution protection     數據執行保護             系統記憶體保護            
device                        設備                 裝置                 
Help                          幫助                 說明                 
thumbnail                     縮略圖                縮圖                 
Log Off                       註銷                 登出                 
search                        搜索                 搜尋                 
refresh                       刷新                 重新整理               
performance                   性能                 效能                 
font                          字體                 字型                 


In [6]:
# Alternative input used for development/testing (disabled)
#input_document = "KiCad_dev_testdata.po"
# .po file to read and .po file to write
input_document = "KiCad Taipei source zh Hant.po"
output_document = "KiCad Taipei source zh Hant_translated.po"
auto_mode = False  # automatic mode: replace without prompting
debug_mode = False  # when enabled, print more information
logging_mode = True # when enabled, record the translated line numbers and the before/after text in a separate file

In [7]:
def process_po_file(input_file, output_file, translation_dict, auto_mode=False, debug_mode=False, logging_mode=False):
    """Copy a .po file, replacing dictionary terms in single-line msgid/msgstr strings.

    Every line of the form ``msgid "..."`` or ``msgstr "..."`` (non-empty text)
    is passed through ``replace_text``; all other lines are copied verbatim.
    If the user answers 'stop' at a prompt, translation ends but the current
    line and all remaining lines are still copied unchanged, so the output file
    is never left truncated.

    Args:
        input_file: Path of the .po file to read (UTF-8).
        output_file: Path of the .po file to write (UTF-8, overwritten).
        translation_dict: Term dictionary handed to ``replace_text``.
        auto_mode: Start in automatic mode (no prompts).
        debug_mode: Print every original and resulting line.
        logging_mode: Write one entry per changed line to ``log/<timestamp>.log``.

    Returns:
        None. Exceptions raised while processing are caught and reported with
        ``print`` rather than propagated.
    """
    pattern = re.compile(r'^(msgid|msgstr)\s+"(.+?)"$')
    log_directory = "log"
    log_file_name = os.path.join(log_directory, datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".log")

    # Make sure the log directory exists (no check-then-create race)
    os.makedirs(log_directory, exist_ok=True)

    translated_count = 0  # number of lines whose text changed
    line_number = 0       # 1-based line number within the input file
    log_entries = []      # pending log lines
    stopped = False       # set once the user asked to stop translating

    try:
        with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            for line in tqdm(infile, desc="翻譯 .po 檔案"):
                line_number += 1
                if debug_mode:
                    print(f"原始行: {line.strip()}")  # original line

                match = None if stopped else pattern.match(line.strip())
                if match:
                    tag, original_text = match.groups()
                    translated_text, auto_mode, stop_processing = replace_text(original_text, translation_dict, auto_mode)
                    if stop_processing:
                        print("使用者選擇停止處理。")
                        # Discard the partial result; this line and the rest of
                        # the file are copied as-is by the fall-through below.
                        stopped = True
                    else:
                        newline = f'{tag} "{translated_text}"\n'
                        outfile.write(newline)

                        if original_text != translated_text:
                            translated_count += 1
                            if logging_mode:
                                log_entry = f"[{line_number}:{tag}] <{original_text}> -> <{translated_text}>\n"
                                log_entries.append(log_entry)

                        if debug_mode:
                            print(f"翻譯行: {newline.strip()}")
                        continue

                outfile.write(line)
                if debug_mode:
                    print(f"未更改行: {line.strip()}")

        if logging_mode and log_entries:
            with open(log_file_name, 'w', encoding='utf-8') as log_file:
                log_file.writelines(log_entries)

        print(f"總共翻譯了 {translated_count} 個簡體詞彙")

    except Exception as e:
        print(f"處理檔案錯誤: {e}")

In [8]:
# Run the .po translation using the configuration values and the loaded translation_dict
process_po_file(input_document, output_document, translation_dict, auto_mode, debug_mode, logging_mode)


翻譯 .po 檔案: 0it [00:00, ?it/s]

找到的文字: '工藝' 可以被替換成 '製程'.
只有一個選擇可用。按 Enter 鍵跳過。
輸入 'auto' 切換到自動替換模式，輸入 'stop' 停止處理。


輸入編號進行替換，'auto' 自動替換，'stop' 停止，'n' 跳過:  auto


翻譯 .po 檔案: 61495it [00:05, 11225.74it/s]

總共翻譯了 2271 個簡體詞彙



