In [4]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# 定义文件路径
root_folder = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales/安徽820'
output_base_folder = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores'
stories_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241107generated_stories.xlsx'

# 遍历目录，收集所有 .txt 文件路径
all_files = []
for dirpath, _, filenames in os.walk(root_folder):
    for filename in filenames:
        if filename.endswith('.txt'):  # 仅处理 .txt 文件
            relative_path = os.path.join(*dirpath.split(os.sep)[-2:], filename)
            all_files.append((os.path.join(dirpath, filename), relative_path))

# 加载模型
model = SentenceTransformer('DMetaSoul/sbert-chinese-general-v2')

# 加载生成的伪故事
stories_df = pd.read_excel(stories_file)

# 读取文件内容的函数
def read_file(file_info):
    file_path, relative_path = file_info
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return relative_path, f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return relative_path, None

# 文件相似度计算函数
def compute_similarity(file_content, story_content):
    if file_content is None:
        return None
    embeddings = model.encode([file_content, story_content])
    similarity = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))
    return similarity

# 批量处理文件
def process_files_in_batches(all_files, story_content, theme_name, output_path, batch_size=1000):
    batch_number = 1
    os.makedirs(output_path, exist_ok=True)

    for i in range(0, len(all_files), batch_size):
        batch_files = all_files[i:i + batch_size]

        # 读取文件内容
        with ThreadPoolExecutor(max_workers=10) as executor:
            file_contents = list(tqdm(executor.map(read_file, batch_files), total=len(batch_files), desc="Reading files"))

        # 计算相似度
        with ThreadPoolExecutor(max_workers=10) as executor:
            similarities = list(tqdm(executor.map(lambda content: compute_similarity(content[1], story_content), 
                                                  file_contents), total=len(file_contents), desc="Computing similarity"))

        # 组织结果并保存
        batch_results = []
        for (relative_path, content), similarity in zip(file_contents, similarities):
            if content is not None:
                trimmed_relative_path = os.path.join(*relative_path.split(os.sep)[-2:])
                batch_results.append({
                    "文件路径": trimmed_relative_path,
                    "内容": content,
                    "相似度": similarity
                })
        
        # 存储当前批次的结果
        batch_df = pd.DataFrame(batch_results)
        batch_save_path = os.path.join(output_path, f"241121{theme_name}_batch_{batch_number}_similarity_scores.csv")
        batch_df.to_csv(batch_save_path, index=False, encoding='utf-8-sig')
        print(f"Batch {batch_number} for theme '{theme_name}' saved to {batch_save_path}")
        batch_number += 1

# 遍历每个主题，生成相应的相似度结果
for _, row in stories_df.iterrows():
    theme = row['主题']
    story_content = row['生成的故事']
    theme_output_path = os.path.join(output_base_folder, theme)
    
    # 执行批量处理
    process_files_in_batches(all_files, story_content, theme, theme_output_path)

Reading files: 100%|██████████| 820/820 [00:00<00:00, 23323.81it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.13it/s]


Batch 1 for theme '结拜' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/结拜/241121结拜_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20164.33it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.12it/s]


Batch 1 for theme '信任' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/信任/241121信任_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20388.46it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.11it/s]


Batch 1 for theme '道教' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/道教/241121道教_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 19509.05it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.80it/s]


Batch 1 for theme '慈善' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/慈善/241121慈善_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 19348.71it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.81it/s]


Batch 1 for theme '诚实' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/诚实/241121诚实_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20938.71it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.66it/s]


Batch 1 for theme '宗族' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/宗族/241121宗族_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20351.06it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.86it/s]


Batch 1 for theme '长寿' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/长寿/241121长寿_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20114.33it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.76it/s]


Batch 1 for theme '孝道' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/孝道/241121孝道_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 14443.74it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.74it/s]


Batch 1 for theme '平等' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/平等/241121平等_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 15009.79it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.70it/s]


Batch 1 for theme '祖先' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/祖先/241121祖先_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20952.99it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.03it/s]


Batch 1 for theme '惩罚' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/惩罚/241121惩罚_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20867.31it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.14it/s]


Batch 1 for theme '努力' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/努力/241121努力_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 21440.06it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.94it/s]


Batch 1 for theme '鬼神' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/鬼神/241121鬼神_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20625.17it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.01it/s]


Batch 1 for theme '古时候' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/古时候/241121古时候_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20762.00it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.14it/s]


Batch 1 for theme '馈赠' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/馈赠/241121馈赠_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20361.06it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.26it/s]


Batch 1 for theme '杀戮' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/杀戮/241121杀戮_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 21051.62it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.17it/s]


Batch 1 for theme '遵守' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/遵守/241121遵守_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 19961.98it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.19it/s]


Batch 1 for theme '忠诚' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/忠诚/241121忠诚_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 19203.29it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 28.96it/s]


Batch 1 for theme '儒家' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/儒家/241121儒家_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 19536.87it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.20it/s]


Batch 1 for theme '帝王' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/帝王/241121帝王_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20067.62it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.17it/s]


Batch 1 for theme '佛教' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/佛教/241121佛教_batch_1_similarity_scores.csv


Reading files: 100%|██████████| 820/820 [00:00<00:00, 20721.22it/s]
Computing similarity: 100%|██████████| 820/820 [00:28<00:00, 29.09it/s]

Batch 1 for theme '仁义' saved to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/仁义/241121仁义_batch_1_similarity_scores.csv





In [5]:
import os
import pandas as pd
from tqdm import tqdm

# 合并每个主题的相似度结果为单独的 CSV 文件
def merge_theme_similarity_files(output_base_folder):
    # 创建存储合并结果的文件夹
    merged_folder = os.path.join(output_base_folder, '241121similarity_scores_merged_anhui')
    os.makedirs(merged_folder, exist_ok=True)

    # 遍历输出文件夹，收集所有主题文件夹
    for theme_folder in os.listdir(output_base_folder):
        theme_path = os.path.join(output_base_folder, theme_folder)
        if os.path.isdir(theme_path):  # 确保是文件夹
            # 初始化一个空的 DataFrame 来存储该主题的所有 CSV 数据
            theme_combined_df = pd.DataFrame()
            csv_files = [file for file in os.listdir(theme_path) if file.endswith('_similarity_scores.csv')]

            # 显示进度条
            for file in tqdm(csv_files, desc=f"Merging theme '{theme_folder}'", unit='file'):
                file_path = os.path.join(theme_path, file)
                # 读取 CSV 文件并附加到主题 DataFrame
                df = pd.read_csv(file_path)
                theme_combined_df = pd.concat([theme_combined_df, df], ignore_index=True)

            # 保存合并后的结果为单独的文件到新文件夹
            if not theme_combined_df.empty:
                theme_output_file = os.path.join(merged_folder, f"241121{theme_folder}_similarity_scores_merged.csv")
                theme_combined_df.to_csv(theme_output_file, index=False, encoding='utf-8-sig')
                print(f"主题 '{theme_folder}' 的相似度结果已合并，文件保存为：{theme_output_file}")

# 调用合并函数
merge_theme_similarity_files(output_base_folder)

Merging theme '宗族': 100%|██████████| 1/1 [00:00<00:00, 28.15file/s]


主题 '宗族' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121宗族_similarity_scores_merged.csv


Merging theme '鬼神': 100%|██████████| 1/1 [00:00<00:00, 34.17file/s]


主题 '鬼神' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121鬼神_similarity_scores_merged.csv


Merging theme '241121similarity_scores_merged_anhui': 0file [00:00, ?file/s]
Merging theme '结拜': 100%|██████████| 1/1 [00:00<00:00, 34.05file/s]


主题 '结拜' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121结拜_similarity_scores_merged.csv


Merging theme '诚实': 100%|██████████| 1/1 [00:00<00:00, 32.96file/s]


主题 '诚实' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121诚实_similarity_scores_merged.csv


Merging theme '慈善': 100%|██████████| 1/1 [00:00<00:00, 34.14file/s]


主题 '慈善' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121慈善_similarity_scores_merged.csv


Merging theme '平等': 100%|██████████| 1/1 [00:00<00:00, 32.37file/s]


主题 '平等' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121平等_similarity_scores_merged.csv


Merging theme '孝道': 100%|██████████| 1/1 [00:00<00:00, 34.98file/s]


主题 '孝道' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121孝道_similarity_scores_merged.csv


Merging theme '佛教': 100%|██████████| 1/1 [00:00<00:00, 25.35file/s]


主题 '佛教' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121佛教_similarity_scores_merged.csv


Merging theme '馈赠': 100%|██████████| 1/1 [00:00<00:00, 35.27file/s]


主题 '馈赠' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121馈赠_similarity_scores_merged.csv


Merging theme '古时候': 100%|██████████| 1/1 [00:00<00:00, 34.56file/s]


主题 '古时候' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121古时候_similarity_scores_merged.csv


Merging theme '遵守': 100%|██████████| 1/1 [00:00<00:00, 34.27file/s]


主题 '遵守' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121遵守_similarity_scores_merged.csv


Merging theme '儒家': 100%|██████████| 1/1 [00:00<00:00, 35.43file/s]


主题 '儒家' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121儒家_similarity_scores_merged.csv


Merging theme '忠诚': 100%|██████████| 1/1 [00:00<00:00, 32.07file/s]


主题 '忠诚' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121忠诚_similarity_scores_merged.csv


Merging theme '道教': 100%|██████████| 1/1 [00:00<00:00, 29.80file/s]


主题 '道教' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121道教_similarity_scores_merged.csv


Merging theme '长寿': 100%|██████████| 1/1 [00:00<00:00, 30.08file/s]


主题 '长寿' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121长寿_similarity_scores_merged.csv


Merging theme '信任': 100%|██████████| 1/1 [00:00<00:00, 30.44file/s]


主题 '信任' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121信任_similarity_scores_merged.csv


Merging theme '仁义': 100%|██████████| 1/1 [00:00<00:00, 23.11file/s]


主题 '仁义' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121仁义_similarity_scores_merged.csv


Merging theme '杀戮': 100%|██████████| 1/1 [00:00<00:00, 33.23file/s]


主题 '杀戮' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121杀戮_similarity_scores_merged.csv


Merging theme '努力': 100%|██████████| 1/1 [00:00<00:00, 32.35file/s]


主题 '努力' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121努力_similarity_scores_merged.csv


Merging theme '帝王': 100%|██████████| 1/1 [00:00<00:00, 32.58file/s]


主题 '帝王' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121帝王_similarity_scores_merged.csv


Merging theme '祖先': 100%|██████████| 1/1 [00:00<00:00, 32.41file/s]


主题 '祖先' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121祖先_similarity_scores_merged.csv


Merging theme '惩罚': 100%|██████████| 1/1 [00:00<00:00, 30.92file/s]


主题 '惩罚' 的相似度结果已合并，文件保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores/241121similarity_scores_merged_anhui/241121惩罚_similarity_scores_merged.csv


In [None]:
file_path = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales/地图匹配_0426_v4.csv'
base = pd.read_csv(file_path)

In [8]:
import re

# 合并相似度列到基础表的函数
def merge_similarity_to_base(base_file, output_base_folder):
    # 加载基础表
    base = pd.read_csv(base_file)

    # 获取合并的相似度结果文件夹
    merged_folder = os.path.join(output_base_folder, '241121similarity_scores_merged_anhui')

    # 初始化合并后的基础表
    merged_base = base.copy()

    # 遍历合并的相似度文件
    for theme_file in tqdm(os.listdir(merged_folder), desc="Merging similarity scores to base"):
        if theme_file.endswith('_similarity_scores_merged.csv'):
            theme_path = os.path.join(merged_folder, theme_file)
            data = pd.read_csv(theme_path)

            # 提取文件名
            data['文件名'] = data['文件路径'].apply(
                lambda x: re.search(r'\/(.*?)\.txt', x).group(1) if re.search(r'\/(.*?)\.txt', x) else None
            )

            # 重新命名相似度列
            # 去掉文件名中的数字并使用主题名称作为列名
            theme_name = ''.join(filter(lambda x: not x.isdigit(), theme_file.split('_')[0]))
            similarity_column_name = f"{theme_name}_相似度"
            data.rename(columns={'相似度': similarity_column_name}, inplace=True)

            # 合并到基础表中
            merged_base = merged_base.merge(data[['文件名', similarity_column_name]], left_on='name', right_on='文件名', how='left')

            # 删除临时列
            merged_base.drop(columns=['文件名'], inplace=True)

    return merged_base

# 调用合并函数
output_base_folder = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores'
base_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales/241113similarity_scores_multitopic.csv'

# 2. 将合并的相似度列合并到基础表
merged_base = merge_similarity_to_base(base_file, output_base_folder)

# 保存最终合并的基础表
output_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores_multitopic.csv'  # 替换为保存文件的路径
merged_base.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"合并后的基础表已保存为：{output_file}")

Merging similarity scores to base: 100%|██████████| 22/22 [00:00<00:00, 27.37it/s]


合并后的基础表已保存为：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores_multitopic.csv


In [10]:
# 找到所有重复列的基础列名
cols_to_merge = [col[:-2] for col in merged_base.columns if col.endswith(('_x', '_y'))]
cols_to_merge = set(cols_to_merge)

# 合并列
for col in cols_to_merge:
    merged_base[col] = merged_base[[f"{col}_x", f"{col}_y"]].bfill(axis=1).iloc[:, 0]

# 删除重复列
merged_base = merged_base[[col for col in merged_base.columns if not col.endswith(('_x', '_y'))]]
merged_base.to_csv(output_file, index=False, encoding='utf-8-sig')
merged_base

Unnamed: 0,rowid,name,cailu,provgb,citygb,cntygb,area,geocode,PROVGB_1953,CITYGB_1953,...,佛教_相似度,杀戮_相似度,信任_相似度,惩罚_相似度,长寿_相似度,慈善_相似度,儒家_相似度,诚实_相似度,平等_相似度,努力_相似度
0,1,04946__322老汉的宝物,missing values,missing valuesd,missing valuesd,missing valuesd,宁夏撒拉族循化貝,"{'lng': 116.413384, 'lat': 39.910925}",,,...,0.717883,0.631977,0.731195,0.691494,0.713617,0.699354,0.689755,0.652281,0.691983,0.773002
1,2,04743__121造过麦山的白土城,missing values,missing valuesd,missing valuesd,missing valuesd,宁夏土族民和县,"{'lng': 102.83639, 'lat': 36.325561}",,,...,0.589558,0.769028,0.630614,0.643676,0.641812,0.702331,0.701671,0.629206,0.708508,0.704313
2,3,05119__496伊斯玛悔过,林昌林,missing valuesd,missing valuesd,missing valuesd,宁夏回族西宁市,"{'lng': 101.78445, 'lat': 36.623385}",,,...,0.620851,0.578783,0.738193,0.659119,0.743266,0.757087,0.735706,0.673171,0.770947,0.765007
3,4,05049__425三姊妹,李友楼,missing valuesd,missing valuesd,missing valuesd,宁夏土族互助县,"{'lng': 101.964569, 'lat': 36.850022}",,,...,0.623023,0.581853,0.689178,0.668782,0.664430,0.677478,0.658897,0.597253,0.733262,0.747147
4,5,04849__225老鼠的智慧,missing values,missing valuesd,missing valuesd,missing valuesd,宁夏藏抜同仁县,"{'lng': 116.413384, 'lat': 39.910925}",,,...,0.688885,0.613048,0.686730,0.702645,0.656781,0.692279,0.661375,0.632269,0.699725,0.714526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17258,17259,17040_252牛虎兄弟和大灰狼.txt,r更映月女23岁汉族燃样大手毕止,missing values,missing values,missing values,吉林1989年旭做,missing values,,,...,,,,,,,,,,
17259,17260,16886_071李子.txt,,missing values,missing values,missing values,吉林1989年如技,missing values,,,...,,,,,,,,,,
17260,17261,17073_291桃花仙子.txt,堡镇偏脸城甘农氏不识字 8杨彤歧男24罗汉族于部大专半业 采录时间 1952年5月（1988...,missing values,missing values,missing values,吉林1988年复犠,missing values,,,...,,,,,,,,,,
17261,17262,16901_091五台山.txt,于中源S 29罗汉袂救即大学毕业,missing values,missing values,missing values,吉林1987年JL植,missing values,,,...,,,,,,,,,,


In [None]:
merged_base