In [1]:
import os
import re

def extract_word(line):
    """Return the English word or phrase at the start of a word-list line.

    Extraction rules, tried in order:

    1. If the line contains a phonetic transcription delimited by slashes
       (``/.../``), the text before the first such transcription is used.
    2. Otherwise, if the line contains a CJK ideograph (U+4E00..U+9FFF),
       the text before the first one is used.
    3. Otherwise the whole line is treated as the English phrase.

    Asterisks are removed, surrounding whitespace is stripped and the
    result is lower-cased.

    Args:
        line: One raw line of text from a word-list file.

    Returns:
        The normalised word/phrase, or ``None`` when the line is blank or
        nothing is left after clean-up (for example a line that starts with
        a Chinese character or consists only of ``*``).
    """
    text = line.strip()
    if not text:
        return None

    # A phonetic transcription takes priority over the Chinese-character rule.
    phonetic = re.match(r'^(.+?)\s*/[^/]+/', text)
    if phonetic:
        candidate = phonetic.group(1)
    else:
        chinese = re.search(r'[\u4e00-\u9fff]', text)
        # No transcription and no Chinese text: the whole line is the phrase.
        candidate = text[:chinese.start()] if chinese else text

    cleaned = candidate.replace('*', '').strip().lower()
    # Report "nothing extracted" uniformly as None rather than ''.
    return cleaned or None


def process_folder(folder_path, start_num=24, end_num=48):
    """Extract the English entries from a range of numbered word-list files.

    For every ``i`` from ``start_num`` to ``end_num`` (inclusive) the file
    ``word list{i}.txt`` in ``folder_path`` is read line by line and each
    line is passed through ``extract_word``. Duplicate entries are dropped
    (the first occurrence wins and the original order is kept) and the
    result is written, one entry per line, to
    ``output_txt/word list{i}_E.txt`` under ``folder_path``.

    Input files that do not exist are reported and skipped. A summary line
    is printed for every file that is processed.

    Args:
        folder_path: Directory that holds the ``word list{i}.txt`` files.
        start_num: First list number to process.
        end_num: Last list number to process (inclusive).
    """
    # The output directory is the same for every file, so create it once.
    output_folder = os.path.join(folder_path, "output_txt")
    os.makedirs(output_folder, exist_ok=True)

    for i in range(start_num, end_num + 1):
        input_filename = os.path.join(folder_path, f"word list{i}.txt")
        output_filename = os.path.join(output_folder, f"word list{i}_E.txt")

        if not os.path.exists(input_filename):
            print(f"文件不存在，跳过: {input_filename}")
            continue

        # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading BOM
        # so it does not end up glued to the first word of the file.
        with open(input_filename, 'r', encoding='utf-8-sig') as f:
            words = [w for w in map(extract_word, f) if w]

        # dict keys keep insertion order: de-duplicate, first occurrence wins.
        unique_words = list(dict.fromkeys(words))

        with open(output_filename, 'w', encoding='utf-8') as f_out:
            f_out.writelines(w + '\n' for w in unique_words)

        print(f"处理完成：{input_filename} -> {output_filename}, 共提取单词 {len(unique_words)} 个")

# Script entry point: process the word lists using the default number range.
if __name__ == "__main__":
    folder = "./"  # Folder containing the word-list files; change to the actual path
    process_folder(folder)


处理完成：./word list24.txt -> ./output_txt\word list24_E.txt, 共提取单词 72 个
处理完成：./word list25.txt -> ./output_txt\word list25_E.txt, 共提取单词 71 个
处理完成：./word list26.txt -> ./output_txt\word list26_E.txt, 共提取单词 73 个
处理完成：./word list27.txt -> ./output_txt\word list27_E.txt, 共提取单词 75 个
处理完成：./word list28.txt -> ./output_txt\word list28_E.txt, 共提取单词 74 个
处理完成：./word list29.txt -> ./output_txt\word list29_E.txt, 共提取单词 76 个
处理完成：./word list30.txt -> ./output_txt\word list30_E.txt, 共提取单词 72 个
处理完成：./word list31.txt -> ./output_txt\word list31_E.txt, 共提取单词 76 个
处理完成：./word list32.txt -> ./output_txt\word list32_E.txt, 共提取单词 72 个
处理完成：./word list33.txt -> ./output_txt\word list33_E.txt, 共提取单词 75 个
处理完成：./word list34.txt -> ./output_txt\word list34_E.txt, 共提取单词 74 个
处理完成：./word list35.txt -> ./output_txt\word list35_E.txt, 共提取单词 77 个
处理完成：./word list36.txt -> ./output_txt\word list36_E.txt, 共提取单词 76 个
处理完成：./word list37.txt -> ./output_txt\word list37_E.txt, 共提取单词 75 个
处理完成：./word list38.txt -> ./output