In [2]:
import re

# Path of the raw script text that the cleaning step reads.
INPUT_FILE = "摇滚红与黑剧本.txt"
# Path the cleaned "name：dialogue" lines are written to.
CLEAN_FILE = "摇滚红与黑.txt"

# 1. Character dictionary and alias mapping
# Canonical character name -> list of spellings accepted for that character.
# The canonical name is what ends up in the cleaned output; it does not have
# to appear among its own aliases.
ROLES = {
    "于连": ["于连", "索雷尔", "于连・索雷尔"],
    "瑞那夫人": ["瑞那夫人", "露易丝", "德瑞纳夫人"],
    "德瑞纳先生": ["德瑞纳先生", "市长先生"],
    "玛娣儿特": ["玛蒂尔德", "玛蒂尔德·德拉莫小姐"],
    "拉穆尔侯爵": ["拉莫尔侯爵", "父亲", "侯爵"],
    "杰洛尼莫": ["杰洛尼莫"],
    "瓦勒诺": ["瓦勒诺先生", "瓦勒诺", "瓦勒诺男爵"],
    "瓦勒诺夫人": ["瓦勒诺夫人"],
    "爱丽莎": ["爱丽莎"],
    "律师": ["律师"],
    "法官": ["法官"],
    "别人": ["别人"],
    "元帅夫人": ["元帅夫人"],
    "夸泽诺侯爵": ["夸泽诺侯爵"],
}

# Reverse lookup built from ROLES: alias -> canonical name. If the same alias
# were listed under two characters, the later entry in ROLES would win.
ALIAS_MAP = {
    alias: canonical
    for canonical, variants in ROLES.items()
    for alias in variants
}

# Speaker tag: the shortest non-empty prefix of a line that is followed by a
# full-width or ASCII colon. Applied to one line at a time, so no MULTILINE.
_SPEAKER_TAG_RE = re.compile(r"^(.+?)\s*[：:]")

# Bracketed note attached to a speaker tag, e.g. "(aside)" or "（低声）".
# Full-width brackets are included because a tag written with them would
# otherwise never equal a bare alias and the line would be dropped silently.
_STAGE_NOTE_RE = re.compile(r"[(\[（【].*?[)\]）】]")


def _parse_dialogue_line(line, alias_map):
    """Return "<canonical name>：<dialogue>" for a known speaker's line, else None."""
    line = line.strip()
    match = _SPEAKER_TAG_RE.match(line)
    if match is None:
        return None

    speaker = _STAGE_NOTE_RE.sub("", match.group(1)).strip()
    standard_name = alias_map.get(speaker)
    dialogue = line[match.end():].strip()
    # Unknown speakers and tags with no text after the colon are discarded.
    if not standard_name or not dialogue:
        return None
    return f"{standard_name}：{dialogue}"


def clean_script_file_final(input_file=None, output_file=None, alias_map=None):
    """Extract dialogue lines of known characters and write them normalised.

    Each input line is stripped and checked for a leading "speaker:" tag
    (full-width or ASCII colon). Bracketed notes inside the tag are removed,
    the remaining name is looked up in the alias table, and matching lines
    with non-empty dialogue are written as "<canonical name>：<dialogue>",
    one per line, joined with newlines and no trailing newline. Every other
    line is dropped.

    Args:
        input_file: Path of the raw script. Defaults to INPUT_FILE.
        output_file: Path to write the cleaned text to. Defaults to CLEAN_FILE.
        alias_map: Mapping of alias -> canonical name. Defaults to ALIAS_MAP.

    Returns:
        True if at least one dialogue line was found and written; False if
        the input file is missing, nothing matched, or writing failed. A
        status message is printed in every case.
    """
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = CLEAN_FILE
    if alias_map is None:
        alias_map = ALIAS_MAP

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except FileNotFoundError:
        print(f"清洗失败：未找到文件 {input_file}")
        return False

    # Split on '\n' only (not splitlines) so other Unicode line separators
    # inside a line are kept as part of the dialogue text.
    parsed = (_parse_dialogue_line(raw, alias_map) for raw in text.split('\n'))
    cleaned_lines = [entry for entry in parsed if entry is not None]

    if not cleaned_lines:
        print("清洗失败：未找到任何符合格式的对话行。")
        return False

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(cleaned_lines))
    except (OSError, UnicodeError) as e:
        print(f"清洗失败，写入文件错误：{e}")
        return False

    print(f"清洗成功： {output_file}（共 {len(cleaned_lines)} 行）")
    return True


# Run the cleaning step only when this file is executed as a script.
if __name__ == "__main__":
    clean_script_file_final()


清洗成功： 摇滚红与黑.txt（共 263 行）
