In [3]:
import os
import re
import datetime

from odf.opendocument import load
from odf.table import Table, TableRow, TableCell
from odf.text import P
from tqdm import tqdm
from opencc import OpenCC

# Collect the .ods files of a directory into a list
def find_ods_files(directory):
    """Return the paths of the ``.ods`` files located directly in ``directory``.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by file
        name. Only regular files whose name ends with ``.ods`` are included.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot
            be read.
    """
    ods_files = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the order
    # in which tables are merged stable from run to run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip directories that merely happen to be named "*.ods".
        if filename.endswith(".ods") and os.path.isfile(file_path):
            ods_files.append(file_path)
    return ods_files


def simplify_to_traditional(simplified_text):
    """Convert Simplified Chinese text to Traditional Chinese with OpenCC.

    The ``OpenCC('s2t')`` converter is created on the first call and cached
    on the function object, because this function is called once per table
    row and constructing a converter for every call is needlessly slow.

    Args:
        simplified_text: Text to convert.

    Returns:
        The result of the converter's ``convert`` call on ``simplified_text``.
    """
    converter = getattr(simplify_to_traditional, "_converter", None)
    if converter is None:
        converter = OpenCC('s2t')  # Simplified -> Traditional characters
        simplify_to_traditional._converter = converter
    return converter.convert(simplified_text)

def get_text_from_cell(cell):
    """Return the text of a table cell, stripped of surrounding whitespace.

    The text of every paragraph (``P``) element found in the cell is
    concatenated without a separator. Text is collected recursively, so text
    wrapped in child elements of a paragraph (for example a formatted span)
    is included instead of being silently dropped.

    Args:
        cell: An element exposing ``getElementsByType``; the paragraphs it
            returns must expose ``childNodes``.

    Returns:
        The concatenated text, or ``""`` when the cell has no paragraph text.
    """
    def _collect_text(node, parts):
        # Depth-first walk that keeps document order of the text fragments.
        for child in node.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(child.data)
            else:
                _collect_text(child, parts)

    text_content = []
    for paragraph in cell.getElementsByType(P):
        _collect_text(paragraph, text_content)
    return "".join(text_content).strip()

def load_translation_table(file_paths):
    """Build the term-replacement dictionary from one or more .ods files.

    For every row with at least five cells, the fifth cell is read as the
    source term, the third as the Traditional Chinese term(s) and the second
    as the English term. The source term is passed through
    ``simplify_to_traditional`` and the result is used as the dictionary key.

    Rows whose source term is empty are skipped: an empty key is a substring
    of every string and would corrupt every line at replacement time.

    NOTE(review): a header row, if the sheet has one, is loaded like any
    other row — confirm whether it should be skipped.
    NOTE(review): cells are addressed by position in the row's cell list; this
    presumably assumes the sheet has no repeated/merged cells — verify.

    Args:
        file_paths: Iterable of .ods file paths.

    Returns:
        dict mapping the converted source term to
        ``{'traditional': str, 'english': str}``. When a key occurs more than
        once, the last row read wins.
    """
    translation_dict = {}
    for file_path in file_paths:
        doc = load(file_path)
        tables = doc.spreadsheet.getElementsByType(Table)
        print(f"Number of tables found: {len(tables)}")
        for table in tables:
            rows = list(table.getElementsByType(TableRow))
            print(f"Processing {len(rows)} rows in table")
            for row in tqdm(rows, desc="Processing rows"):
                cells = row.getElementsByType(TableCell)
                if len(cells) < 5:
                    continue  # row too short to hold all needed columns
                simplified = get_text_from_cell(cells[4])
                if not simplified:
                    continue  # no source term: nothing to look up
                traditional = get_text_from_cell(cells[2])
                english = get_text_from_cell(cells[1])  # English term
                # Convert Simplified characters to Traditional ones for the key
                traditional_simplified = simplify_to_traditional(simplified)
                if not traditional_simplified:
                    continue
                translation_dict[traditional_simplified] = {'traditional': traditional, 'english': english}
    return translation_dict
    
def replace_text(line, translation_dict, auto_mode):
    """Replace every known source term found in ``line``.

    Entries are applied in dictionary order, each one on the result of the
    previous replacements.

    Args:
        line: Text to process.
        translation_dict: Mapping of source term to a dict whose
            ``'traditional'`` value holds one or more candidate replacements
            separated by a full-width semicolon (``；``).
        auto_mode: When true, the first candidate is always used. When false
            and a term has several candidates, the user is asked to pick one;
            pressing Enter leaves that term unchanged, and an invalid answer
            is asked again.

    Returns:
        The text with the replacements applied.
    """
    for simplified, details in translation_dict.items():
        # An empty key is contained in every string, and str.replace('', x)
        # would insert x between every character — ignore such entries.
        if not simplified or simplified not in line:
            continue

        # Split the candidates and drop blanks (e.g. from a trailing '；').
        traditional_choices = [
            choice.strip() for choice in details['traditional'].split('；')
        ]
        traditional_choices = [choice for choice in traditional_choices if choice]
        if not traditional_choices:
            continue  # no replacement available; replacing would delete the term

        chosen_traditional = traditional_choices[0]  # default: first candidate

        if not auto_mode and len(traditional_choices) > 1:
            # Let the user choose among the candidates
            print(f"Choose the correct translation for '{simplified}':")
            for idx, choice in enumerate(traditional_choices, 1):
                print(f"{idx}. {choice}")

            choice_index = None
            while choice_index is None:
                user_input = input("Enter the number of the correct translation, or press Enter to skip: ").strip()
                if user_input == "":
                    break  # empty answer: leave this term untouched
                try:
                    number = int(user_input)
                except ValueError:
                    number = 0  # not a number: handled as out of range below
                if 1 <= number <= len(traditional_choices):
                    choice_index = number - 1
                else:
                    print(f"Please enter a number between 1 and {len(traditional_choices)}.")
            if choice_index is None:
                continue
            chosen_traditional = traditional_choices[choice_index]

        # Apply the replacement
        line = line.replace(simplified, chosen_traditional)
    return line



In [4]:
# Set the directory that holds the .ods term tables, then list the .ods
# files found in it (the list is printed for a quick visual check).
directory_path = "dataset"
ods_files = find_ods_files(directory_path)
print(ods_files)


['dataset/兩岸對照名詞-計算機壓縮檔_0.ods']


In [5]:
# Targets the "taotieren" case: mainland-China vocabulary written with
# Traditional characters.
# Preprocessing: load_translation_table converts each mainland term to its
# Traditional-character form and uses that form as the lookup key.

translation_dict = load_translation_table(ods_files)

Number of tables found: 1
Processing 1533 rows in table


Processing rows: 100%|███████████████████████████████████████| 1533/1533 [00:22<00:00, 69.66it/s]


In [6]:
# format_and_trim
def format_and_trim(text, max_length):
    """Shorten ``text`` to at most ``max_length`` characters.

    Text that already fits is returned unchanged. Longer text is cut and
    ends with ``'...'`` so the result is exactly ``max_length`` characters.
    When ``max_length`` is smaller than 3 there is no room for the ellipsis,
    so the text is simply cut to ``max_length`` characters (``''`` for zero
    or negative values).

    Args:
        text: The string to shorten.
        max_length: Maximum number of characters in the result.

    Returns:
        The original or shortened string.
    """
    if len(text) <= max_length:
        return text
    if max_length < 3:
        # A negative slice bound would count from the end of the string and
        # produce a result longer than max_length.
        return text[:max(max_length, 0)]
    return text[:max_length - 3] + '...'

# Column widths are computed from the data so the same code can later run as
# a plain .py script in a terminal. Each width is the longest trimmed value
# in the column but never narrower than the column header; seeding max()
# with the header length also keeps it from failing on an empty table.
max_length_english = max(
    [len('English')]
    + [len(format_and_trim(detail['english'], 30)) for detail in translation_dict.values()]
)
max_length_simplified = max(
    [len('Simplified')]
    + [len(format_and_trim(simplified, 20)) for simplified in translation_dict]
)
max_length_traditional = max(
    [len('Traditional')]
    + [len(format_and_trim(detail['traditional'], 20)) for detail in translation_dict.values()]
)

# Header row and separator
print(f"{'English'.ljust(max_length_english)}{'Simplified'.ljust(max_length_simplified)}{'Traditional'.ljust(max_length_traditional)}")
print("-" * (max_length_english + max_length_simplified + max_length_traditional))


# Preview of the first 10 entries only
for index, (simplified, details) in enumerate(translation_dict.items()):
    if index >= 10:
        break  # stop instead of walking the rest of the table
    traditional = format_and_trim(details['traditional'], max_length_traditional)
    english = format_and_trim(details['english'], max_length_english)
    simplified = format_and_trim(simplified, max_length_simplified)
    print(f"{english.ljust(max_length_english)}{simplified.ljust(max_length_simplified)}{traditional.ljust(max_length_traditional)}")


English                       Simplified    Traditional         
----------------------------------------------------------------
英文名稱                          中國大陸譯名        中文名稱                
See abnormal end              反常結束          異常終止                
abort                         夭折【動】         打斷；放棄；異常中止；中斷；故障(...
accelerator                   加速器           加速器                 
access                        訪問            存取；進接；讀取；進出；擷取；     
access path                   訪問路徑          存取路徑                
access right                  訪問權           存取權；使用權限            
accountability                可覈查性          可歸責性                
accumulator                   累加器           存儲器；累加器；蓄電池；儲能電路    
acknowledgment                確認            應答                  


In [7]:
# Run configuration: input/output .po paths and mode switches that are
# passed to process_po_file.
#input_document = "KiCad_dev_testdata.po"
input_document = "KiCad Taipei source zh Hant.po"
output_document = "KiCad_dev_testdata_translated.po"
auto_mode = True  # manual mode (False) has a bug that is not fixed yet
debug_mode = False  # prints more information when enabled
logging_mode = True # when enabled, the line numbers of translated lines and the before/after text are recorded in a separate file

In [8]:
def process_po_file(input_file, output_file, translation_dict, auto_mode=False, debug_mode=False, logging_mode=False):
    """Copy a .po file, passing single-line msgid/msgstr strings through replace_text.

    A line is rewritten only when, once stripped, it has the form
    ``msgid "text"`` or ``msgstr "text"`` with non-empty text. Every other
    line is copied unchanged.

    NOTE(review): continuation lines (a bare ``"text"`` line) and plural
    forms (``msgid_plural``, ``msgstr[N]``) do not match the pattern and are
    therefore never translated — confirm whether that is intended.
    NOTE(review): msgid strings are rewritten as well as msgstr strings —
    confirm that changing the lookup key of an entry is intended.

    Args:
        input_file: Path of the .po file to read (UTF-8).
        output_file: Path of the file to write (UTF-8). Must not be the same
            file as ``input_file``: opening it for writing would empty the
            input before it is read.
        translation_dict: Term table handed to ``replace_text``.
        auto_mode: Forwarded to ``replace_text``.
        debug_mode: When true, print every line before/after processing.
        logging_mode: When true and at least one line changed, write a
            ``log/<YYYYmmdd-HHMMSS>.log`` file with one entry per changed
            line (line number, tag, text before and after).

    Returns:
        None. A summary, or the error message if processing fails, is printed.
    """
    pattern = re.compile(r'^(msgid|msgstr)\s+"(.+?)"$')
    log_directory = "log"
    log_file_name = os.path.join(log_directory, datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".log")

    translated_count = 0  # number of lines whose text was changed
    try:
        # Refuse to overwrite the input in place: the 'w' open below would
        # truncate it and all of its content would be lost.
        if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
            raise ValueError("input_file and output_file refer to the same file")

        with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
            line_number = 0
            log_entries = []

            for line in tqdm(infile, desc="Translating .po file"):
                line_number += 1
                if debug_mode:
                    print(f"Original line: {line.strip()}")  # show the raw line

                match = pattern.match(line.strip())
                if match:
                    tag, original_text = match.groups()
                    translated_text = replace_text(original_text, translation_dict, auto_mode)
                    newline = f'{tag} "{translated_text}"\n'
                    outfile.write(newline)

                    if debug_mode:
                        print(f"Translated line: {newline.strip()}")

                    if original_text != translated_text:
                        translated_count += 1
                        if logging_mode:
                            log_entry = f"[{line_number}:{tag}] <{original_text}> -> <{translated_text}>\n"
                            log_entries.append(log_entry)

                else:
                    outfile.write(line)
                    if debug_mode:
                        print(f"Unchanged line: {line.strip()}")

        if logging_mode and log_entries:
            # Create the log directory only when a log is actually written.
            os.makedirs(log_directory, exist_ok=True)
            with open(log_file_name, 'w', encoding='utf-8') as log_file:
                log_file.writelines(log_entries)

        print(f"總共修正了: {translated_count} 簡體詞彙")

    except Exception as e:
        print(f"Error processing file: {e}")


In [9]:
# Run the conversion with the paths and mode switches set in the configuration cell.
process_po_file(input_document, output_document, translation_dict, auto_mode, debug_mode, logging_mode)


Translating .po file: 61495it [00:00, 97473.43it/s]

總共修正了: 2543 簡體詞彙



