In [None]:
from openai import OpenAI
# 设置 API 密钥和组织
api_key = "sk-"
client = OpenAI(api_key=api_key)

# 设置组织信息
client.organization = "org-"

In [8]:
import os
import openai
import pandas as pd
import json
import csv
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def analyze_story_with_gpt(story_content, story_name, themes_df, output_dir):
    # 构建包含多个主题的分析提示
    themes_analysis = []
    for _, row in themes_df.iterrows():
        theme = row['theme']
        definition = row['definition']
        support = row['support']
        
        theme_analysis = f"主题：{theme}，主题定义：{definition}。对应“提及并支持（A）”的观点如下：{support}。"
        themes_analysis.append(theme_analysis)

    # 拼接多个主题的分析信息
    themes_prompt = " ".join(themes_analysis)

    prompt_template = f"""
    你是一名精通中国民间故事的研究员，你的任务是对传入的中国民间故事文本内容进行分析，逐一判断其与若干主题的相关性及观点，输出结构化的分析结果。

    分析步骤：
    第一步，主题相关性判断
    根据每个主题的定义，判断故事内容是否与该主题相关：
    • 若故事未涉及该主题，标记为“并没有提及（D）”。
    • 若故事涉及该主题，进入第二步分析。

    第二步，观点判断
    若故事内容涉及该主题，则进一步判断故事的观点：
    • 提及并支持（A）：故事明确表达支持对应观点。
    • 提及并反对（B）：故事明确表达反对对应观点。
    • 提及但没有明确观点（C）：故事提及该主题，但未明确表达支持或反对对应观点。

    故事文本：{story_content}。

    {themes_prompt}

    输出要求：
    请将每个故事的分析结果以JSON格式输出，结构如下：
    {{
      "故事名称": "{story_name}",
      "仁义": "A/B/C/D",
      "忠诚": "A/B/C/D",
      "佛教": "A/B/C/D",
      "道教": "A/B/C/D",
      "儒家": "A/B/C/D",
      "杀戮": "A/B/C/D",
      "孝道": "A/B/C/D",
      "慈善": "A/B/C/D",
      "信任": "A/B/C/D",
      "努力": "A/B/C/D",
      "惩罚": "A/B/C/D",
      "诚实": "A/B/C/D",
      "长寿": "A/B/C/D",
      "宗族": "A/B/C/D",
      "祖先": "A/B/C/D",
      "平等": "A/B/C/D",
      "鬼神": "A/B/C/D",
      "古时候": "A/B/C/D",
      "馈赠": "A/B/C/D"
    }}
    """
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt_template}],
        max_tokens=3000,  # 提高 token 限制以处理多个主题
        temperature=0.1,
        top_p=0.9
    )
    
    # 获取分析结果并清理不可见字符
    analysis_result = response.choices[0].message.content.strip()  # 去除前后的空格、换行符
    
    # 打印原始返回内容以供调试
    # print(f"Raw analysis result for {story_name}: {analysis_result}")
    
    # 清理返回的内容中的不可见字符，去除 markdown 代码块标记
    cleaned_analysis_result = re.sub(r'```json', '', analysis_result)  # 去除开头的 markdown json 标记
    cleaned_analysis_result = re.sub(r'```', '', cleaned_analysis_result)  # 去除结束的 markdown 标记
    cleaned_analysis_result = re.sub(r'[\x00-\x1F\x7F]', '', cleaned_analysis_result)  # 删除不可见字符
    
    # 检查清理后的分析结果是否为空
    if not cleaned_analysis_result:
        print(f"Error: No content in analysis result for {story_name}")
        return None

    # 打印清理后的分析结果
    # print(f"Cleaned analysis result for {story_name}: {cleaned_analysis_result}")
    
    # 尝试解析清理后的字符串
    try:
        analysis_json = json.loads(cleaned_analysis_result)  # 尝试解析清理后的内容
        output_file = os.path.join(output_dir, f"{story_name}_analysis.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(analysis_json, f, ensure_ascii=False, indent=4)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for story {story_name}: {e}")
        print(f"Returned content: {cleaned_analysis_result}")
    except Exception as e:
        print(f"Error saving JSON for story {story_name}: {e}")
    
    return cleaned_analysis_result

def process_stories_in_directory(stories_dir, themes_df, output_dir):
    # 获取所有的故事文件
    story_files = [f for f in os.listdir(stories_dir) if f.endswith(".txt")]
    
    # 创建进度条显示
    with ThreadPoolExecutor(max_workers=40) as executor, tqdm(total=len(story_files), desc="Processing Stories") as pbar:
        # 提交任务给线程池
        future_to_story = {}
        for story_name in story_files:
            story_path = os.path.join(stories_dir, story_name)
            with open(story_path, 'r', encoding='utf-8') as file:
                story_content = file.read()
                
                # 提交任务
                future = executor.submit(analyze_story_with_gpt, story_content, story_name, themes_df, output_dir)
                future_to_story[future] = story_name
        
        # 处理所有的返回结果
        for future in as_completed(future_to_story):
            story_name = future_to_story[future]
            try:
                analysis_result = future.result()
                # 不再需要处理保存结果到 CSV，这部分已替换为 JSON 文件
            except Exception as e:
                print(f"Error processing story {story_name}: {e}")
            finally:
                pbar.update(1)  # 更新进度条

# 读取 Excel 文件，获取主题定义
themes_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241124topic_prompt_dictionary.xlsx'
themes_df = pd.read_excel(themes_file)

# 定义故事所在目录和输出的 JSON 文件目录
stories_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales/安徽820'
output_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241124advanced_multitopic_story_analysis_jsons'

# 创建目录以保存分析结果
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 处理所有故事并保存结果
process_stories_in_directory(stories_dir, themes_df, output_dir)

Processing Stories: 100%|██████████| 820/820 [01:03<00:00, 12.84it/s]


In [2]:
import os
import openai
import pandas as pd
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def analyze_story_with_gpt(story_content, story_name, themes_df, output_dir):
    themes_analysis = []
    for _, row in themes_df.iterrows():
        theme = row['theme']
        definition = row['definition']
        support = row['support']

        theme_analysis = f"主题：{theme}，主题定义：{definition}。对应“提及并支持（A）”的观点如下：{support}。"
        themes_analysis.append(theme_analysis)

    themes_prompt = " ".join(themes_analysis)

    prompt_template = f"""
    你是一名精通中国民间故事的研究员，你的任务是对传入的中国民间故事文本内容进行分析，逐一判断其与若干主题的相关性及观点，输出结构化的分析结果。

    分析步骤：
    第一步，主题相关性判断
    根据每个主题的定义，判断故事内容是否与该主题相关：
    • 若故事未涉及该主题，标记为“并没有提及（D）”。
    • 若故事涉及该主题，进入第二步分析。

    第二步，观点判断
    若故事内容涉及该主题，则进一步判断故事的观点：
    • 提及并支持（A）：故事明确表达支持对应观点。
    • 提及并反对（B）：故事明确表达反对对应观点。
    • 提及但没有明确观点（C）：故事提及该主题，但未明确表达支持或反对对应观点。

    故事文本：{story_content}。

    {themes_prompt}

    注意事项：
    • 仅依据传入的故事文本和主题的定义与观点进行判断，不使用任何外部信息。
    • 不允许推断，不允许牵强附会或过度解读。
    • 输出结果必须严格按照JSON格式，以方便进一步处理。

    输出要求：
    请将每个故事的分析结果以JSON格式输出，结构如下：
    {{
      "故事名称": "{story_name}",
      "帝王": "A/B/C/D",
      "遵守": "A/B/C/D",
      "结拜": "A/B/C/D",
      "仁义": "A/B/C/D",
      "忠诚": "A/B/C/D",
      "佛教": "A/B/C/D",
      "道教": "A/B/C/D",
      "儒家": "A/B/C/D",
      "杀戮": "A/B/C/D",
      "孝道": "A/B/C/D",
      "慈善": "A/B/C/D",
      "信任": "A/B/C/D",
      "努力": "A/B/C/D",
      "惩罚": "A/B/C/D",
      "诚实": "A/B/C/D",
      "长寿": "A/B/C/D",
      "宗族": "A/B/C/D",
      "祖先": "A/B/C/D",
      "平等": "A/B/C/D",
      "鬼神": "A/B/C/D",
      "古时候": "A/B/C/D",
      "馈赠": "A/B/C/D"
    }}
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt_template}],
        max_tokens=3000,  # 提高 token 限制以处理多个主题
        temperature=0.1,
        top_p=0.9
    )
    
    # 获取分析结果并清理不可见字符
    analysis_result = response.choices[0].message.content.strip()  # 去除前后的空格、换行符
    
    cleaned_analysis_result = re.sub(r'```json', '', analysis_result)
    cleaned_analysis_result = re.sub(r'```', '', cleaned_analysis_result)
    cleaned_analysis_result = re.sub(r'[\x00-\x1F\x7F]', '', cleaned_analysis_result)

    if not cleaned_analysis_result:
        print(f"Error: No content in analysis result for {story_name}")
        return None

    try:
        analysis_json = json.loads(cleaned_analysis_result)
        story_output_dir = os.path.join(output_dir, os.path.basename(os.path.dirname(story_name)))
        os.makedirs(story_output_dir, exist_ok=True)
        output_file = os.path.join(story_output_dir, f"{os.path.basename(story_name)}_analysis.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(analysis_json, f, ensure_ascii=False, indent=4)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for story {story_name}: {e}")
    except Exception as e:
        print(f"Error saving JSON for story {story_name}: {e}")

    return cleaned_analysis_result


def process_stories_recursively(base_dir, themes_df, output_dir):
    story_files = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".txt"):
                story_files.append(os.path.join(root, file))

    with ThreadPoolExecutor(max_workers=60) as executor, tqdm(total=len(story_files), desc="Processing Stories") as pbar:
        future_to_story = {}
        for story_path in story_files:
            with open(story_path, 'r', encoding='utf-8') as file:
                story_content = file.read()
                future = executor.submit(analyze_story_with_gpt, story_content, story_path, themes_df, output_dir)
                future_to_story[future] = story_path

        for future in as_completed(future_to_story):
            story_name = future_to_story[future]
            try:
                future.result()
            except Exception as e:
                print(f"Error processing story {story_name}: {e}")
            finally:
                pbar.update(1)


# 读取 Excel 文件，获取主题定义
themes_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241124topic_prompt_dictionary.xlsx'
themes_df = pd.read_excel(themes_file)

# 定义故事所在目录和输出的 JSON 文件目录
base_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add'
output_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/250202advanced_multitopic_story_analysis_jsons'

# 创建目录以保存分析结果
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 处理所有故事并保存结果
process_stories_recursively(base_dir, themes_df, output_dir)

Processing Stories:   9%|▊         | 260/3046 [00:31<06:24,  7.24it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/黑龙江580/18486__553“夫"字的解释.txt: Expecting ',' delimiter: line 1 column 113 (char 112)


Processing Stories:  21%|██▏       | 649/3046 [01:16<03:06, 12.86it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/吉林-新/17024_231鱼神脸儿的传说.txt: Unterminated string starting at: line 1 column 314 (char 313)


Processing Stories:  32%|███▏      | 977/3046 [01:56<03:16, 10.52it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/吉林-新/17280_545六窍全通.txt: Unterminated string starting at: line 1 column 161 (char 160)


Processing Stories:  56%|█████▌    | 1709/3046 [03:12<02:24,  9.25it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/吉林-新/17099_320木头婆婆.txt: Unterminated string starting at: line 1 column 359 (char 358)


Processing Stories:  57%|█████▋    | 1740/3046 [03:15<01:40, 12.96it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/山东725/17535__287“没有麻”和“不赌了".txt: Expecting ',' delimiter: line 1 column 120 (char 119)


Processing Stories:  74%|███████▍  | 2247/3046 [04:14<01:19, 10.03it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/青海667/19091__587“敏干"云登和"一个"云登.txt: Expecting ',' delimiter: line 1 column 113 (char 112)


Processing Stories:  78%|███████▊  | 2370/3046 [04:28<01:09,  9.71it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/青海667/19158__654“啸嘴车".txt: Expecting ',' delimiter: line 1 column 114 (char 113)


Processing Stories:  78%|███████▊  | 2373/3046 [04:28<01:03, 10.61it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/山东725/17506__256曹植的七步诗台.txt: Unterminated string starting at: line 1 column 12 (char 11)


Processing Stories:  86%|████████▌ | 2605/3046 [04:59<01:01,  7.20it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/青海667/18944__440宝中宝.txt: Unterminated string starting at: line 1 column 348 (char 347)


Processing Stories:  89%|████████▉ | 2720/3046 [05:13<00:28, 11.60it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/内蒙古641/19689__528活捉“偷马贼".txt: Expecting ',' delimiter: line 1 column 117 (char 116)


Processing Stories:  96%|█████████▌| 2916/3046 [05:35<00:09, 13.71it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/内蒙古641/19612__449给山羊脱“鞋".txt: Expecting ',' delimiter: line 1 column 117 (char 116)


Processing Stories: 100%|█████████▉| 3040/3046 [05:57<00:03,  1.96it/s]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/内蒙古641/19366__197地方传说•.txt: Extra data: line 1 column 286 (char 285)


Processing Stories: 100%|█████████▉| 3044/3046 [06:11<00:07,  3.62s/it]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/内蒙古641/19429__260鲤鱼烟嘴儿..txt: Unterminated string starting at: line 1 column 220 (char 219)


Processing Stories: 100%|█████████▉| 3045/3046 [06:48<00:12, 12.46s/it]

Error decoding JSON for story /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add/内蒙古641/19277__106呼伦贝尔湖的传说..txt: Unterminated string starting at: line 1 column 253 (char 252)


Processing Stories: 100%|██████████| 3046/3046 [06:50<00:00,  7.42it/s]


In [3]:
from tqdm import tqdm
import os
import json
import pandas as pd

def save_results_to_csv(output_dir, csv_file, stories_dir):
    # 获取所有的 JSON 文件
    json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
    
    # 用于保存 CSV 数据的列表
    csv_data = []
    
    # 读取每个 JSON 文件并显示进度条
    for json_file in tqdm(json_files, desc="Processing files", unit="file"):
        # 去掉 "_analysis.json" 后缀，获取故事名称
        story_name = json_file.replace("_analysis.json", "")
        
        # 获取对应的 JSON 文件内容
        json_file_path = os.path.join(output_dir, json_file)
        with open(json_file_path, 'r', encoding='utf-8') as file:
            analysis_data = json.load(file)
        
        # 获取故事原文
        story_file_path = os.path.join(stories_dir, story_name)
        if os.path.exists(story_file_path):
            with open(story_file_path, 'r', encoding='utf-8') as file:
                story_text = file.read()
        else:
            story_text = "Story file not found"  # 如果对应的 .txt 文件没有找到
        
        # 处理每个分析结果
        story_analysis = {
            "故事名称": analysis_data.get("故事名称"),
            "帝王": analysis_data.get("帝王"),
            "遵守": analysis_data.get("遵守"),
            "结拜": analysis_data.get("结拜"),
            "仁义": analysis_data.get("仁义"),
            "忠诚": analysis_data.get("忠诚"),
            "佛教": analysis_data.get("佛教"),
            "道教": analysis_data.get("道教"),
            "儒家": analysis_data.get("儒家"),
            "杀戮": analysis_data.get("杀戮"),
            "孝道": analysis_data.get("孝道"),
            "慈善": analysis_data.get("慈善"),
            "信任": analysis_data.get("信任"),
            "努力": analysis_data.get("努力"),
            "惩罚": analysis_data.get("惩罚"),
            "诚实": analysis_data.get("诚实"),
            "长寿": analysis_data.get("长寿"),
            "宗族": analysis_data.get("宗族"),
            "祖先": analysis_data.get("祖先"),
            "平等": analysis_data.get("平等"),
            "鬼神": analysis_data.get("鬼神"),
            "古时候": analysis_data.get("古时候"),
            "馈赠": analysis_data.get("馈赠"),
            "故事原文": story_text
        }
        csv_data.append(story_analysis)
    
    # 保存 CSV 文件
    df = pd.DataFrame(csv_data)
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"结果已保存为 {csv_file}")

# 保存分析结果到 CSV 文件
csv_output_file = "/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/250202advanced_multitopic_story_analysis.csv"
save_results_to_csv(output_dir, csv_output_file, stories_dir)

NameError: name 'stories_dir' is not defined

In [4]:
from tqdm import tqdm
import os
import json
import pandas as pd

def save_results_to_csv(base_output_dir, csv_file, base_stories_dir):
    # 遍历所有子文件夹，获取所有 JSON 文件的路径
    json_files = []
    for root, _, files in os.walk(base_output_dir):
        for file in files:
            if file.endswith(".json"):
                json_files.append(os.path.join(root, file))
    
    # 用于保存 CSV 数据的列表
    csv_data = []
    
    # 处理每个 JSON 文件并显示进度条
    for json_file_path in tqdm(json_files, desc="Processing files", unit="file"):
        # 获取故事名称（去掉文件路径和 "_analysis.json" 后缀）
        story_name = os.path.basename(json_file_path).replace("_analysis.json", "")
        
        # 读取 JSON 文件内容
        with open(json_file_path, 'r', encoding='utf-8') as file:
            analysis_data = json.load(file)
        
        # 找到对应的故事原文路径
        relative_dir = os.path.relpath(os.path.dirname(json_file_path), base_output_dir)
        story_file_path = os.path.join(base_stories_dir, relative_dir, story_name)

        # 读取故事原文
        if os.path.exists(story_file_path):
            with open(story_file_path, 'r', encoding='utf-8') as file:
                story_text = file.read()
        else:
            story_text = "Story file not found"  # 如果找不到故事原文
        
        # 处理每个分析结果
        story_analysis = {
            "故事名称": analysis_data.get("故事名称"),
            "帝王": analysis_data.get("帝王"),
            "遵守": analysis_data.get("遵守"),
            "结拜": analysis_data.get("结拜"),
            "仁义": analysis_data.get("仁义"),
            "忠诚": analysis_data.get("忠诚"),
            "佛教": analysis_data.get("佛教"),
            "道教": analysis_data.get("道教"),
            "儒家": analysis_data.get("儒家"),
            "杀戮": analysis_data.get("杀戮"),
            "孝道": analysis_data.get("孝道"),
            "慈善": analysis_data.get("慈善"),
            "信任": analysis_data.get("信任"),
            "努力": analysis_data.get("努力"),
            "惩罚": analysis_data.get("惩罚"),
            "诚实": analysis_data.get("诚实"),
            "长寿": analysis_data.get("长寿"),
            "宗族": analysis_data.get("宗族"),
            "祖先": analysis_data.get("祖先"),
            "平等": analysis_data.get("平等"),
            "鬼神": analysis_data.get("鬼神"),
            "古时候": analysis_data.get("古时候"),
            "馈赠": analysis_data.get("馈赠"),
            "故事原文": story_text
        }
        csv_data.append(story_analysis)
    
    # 保存结果到 CSV 文件
    df = pd.DataFrame(csv_data)
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"结果已保存为 {csv_file}")

# 定义输出文件夹路径、原始故事文件夹路径和 CSV 文件路径
base_output_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/250202advanced_multitopic_story_analysis_jsons'
base_stories_dir = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales_add'
csv_output_file = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/250202advanced_multitopic_story_analysis.csv'

# 调用函数保存结果到 CSV 文件
save_results_to_csv(base_output_dir, csv_output_file, base_stories_dir)

Processing files:   0%|          | 0/3032 [00:00<?, ?file/s]

Processing files: 100%|██████████| 3032/3032 [00:00<00:00, 7170.17file/s]

结果已保存为 /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/250202advanced_multitopic_story_analysis.csv





In [18]:
df2.to_csv('/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125advanced_multitopic_story_analysis.csv', index=False, encoding='utf-8')

你看一下 分类不是D的故事，是不是在之前算bert similarity的时候，分数也更高 

In [8]:
import pandas as pd

# File paths
file_path_1 = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241121similarity_scores_multitopic.csv'
file_path_2 = '/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125advanced_multitopic_story_analysis.csv'

# Read the CSV files into DataFrames
df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

In [17]:
df2.head()

Unnamed: 0,故事名称,仁义,忠诚,佛教,道教,儒家,杀戮,孝道,慈善,信任,...,惩罚,诚实,长寿,宗族,祖先,平等,鬼神,古时候,馈赠,故事原文
0,01233__479-拿龙,A,A,D,D,D,D,D,A,A,...,A,D,D,D,D,D,A,D,D,479-拿 龙\n\n（回族）\n\n桃园坝子的回回人可多啦，相传回回人中还出了不少降龙伏虎...
1,01085__328-吐良的传说,A,A,D,D,D,A,D,D,D,...,D,D,D,D,D,D,D,A,D,328-吐良的传说\n\n（景颇族）\n\n传说在很久以前，瑞达崩地方有一个英俊剽悍的小伙子...
2,01362__611摇竹惊雀,C,D,D,D,D,D,D,D,C,...,C,D,D,D,D,D,D,A,D,611.摇竹惊雀\n\n（汉族）\n\n很早以前.在个旧锡矿山上，有一个叫金竹林的地方。这里...
3,01356__605-父亲长出了双牛脚,A,A,D,D,A,D,A,D,A,...,A,A,D,A,D,D,A,D,A,605-父亲长出了双牛脚\n\n（汉族）\n\n有一对相处得极好的朋友，一个叫张三，另一个叫...
4,01357__606偷盗的下场,A,A,D,D,A,D,A,A,C,...,A,A,D,A,D,D,A,A,A,606.偷盗的下场\n\n（阿昌族〉\n\n很久以前，石嘴山下住着两兄弟.哥哥叫岩?弟弟叫航...


In [9]:
df2['故事名称'] = df2['故事名称'].str.replace('.txt', '', regex=False)
df2['故事名称'] = df2['故事名称'].str.replace('/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/data/raw_data_tales/', '', regex=False)
df2.head()

Unnamed: 0,故事名称,仁义,忠诚,佛教,道教,儒家,杀戮,孝道,慈善,信任,...,惩罚,诚实,长寿,宗族,祖先,平等,鬼神,古时候,馈赠,故事原文
0,云南下686/01233__479-拿龙,A,A,D,D,D,D,D,A,A,...,A,D,D,D,D,D,A,D,D,479-拿 龙\n\n（回族）\n\n桃园坝子的回回人可多啦，相传回回人中还出了不少降龙伏虎...
1,云南下686/01085__328-吐良的传说,A,A,D,D,D,A,D,D,D,...,D,D,D,D,D,D,D,A,D,328-吐良的传说\n\n（景颇族）\n\n传说在很久以前，瑞达崩地方有一个英俊剽悍的小伙子...
2,云南下686/01362__611摇竹惊雀,C,D,D,D,D,D,D,D,C,...,C,D,D,D,D,D,D,A,D,611.摇竹惊雀\n\n（汉族）\n\n很早以前.在个旧锡矿山上，有一个叫金竹林的地方。这里...
3,云南下686/01356__605-父亲长出了双牛脚,A,A,D,D,A,D,A,D,A,...,A,A,D,A,D,D,A,D,A,605-父亲长出了双牛脚\n\n（汉族）\n\n有一对相处得极好的朋友，一个叫张三，另一个叫...
4,云南下686/01357__606偷盗的下场,A,A,D,D,A,D,A,A,C,...,A,A,D,A,D,D,A,A,A,606.偷盗的下场\n\n（阿昌族〉\n\n很久以前，石嘴山下住着两兄弟.哥哥叫岩?弟弟叫航...


In [10]:
# 检查是否有“故事名称”列
if '故事名称' in df2.columns:
    # 修改“故事名称”列，保留第一个“/”后的字符
    df2['故事名称'] = df2['故事名称'].apply(lambda x: x.split('/', 1)[1] if '/' in x else x)
else:
    print("df2中没有名为'故事名称'的列")

In [11]:
df2.loc[0, '故事名称']

'01233__479-拿龙'

In [12]:
merged_df = pd.merge(df2, df1, left_on='故事名称', right_on='name', how='left')
merged_df

Unnamed: 0,故事名称,仁义,忠诚,佛教,道教,儒家,杀戮,孝道,慈善,信任,...,佛教_相似度,杀戮_相似度,信任_相似度,惩罚_相似度,长寿_相似度,慈善_相似度,儒家_相似度,诚实_相似度,平等_相似度,努力_相似度
0,01233__479-拿龙,A,A,D,D,D,D,D,A,A,...,0.668796,0.670545,0.665954,0.678478,0.688834,0.688585,0.674124,0.613636,0.670598,0.711067
1,01085__328-吐良的传说,A,A,D,D,D,A,D,D,D,...,0.676918,0.647617,0.746513,0.704000,0.689166,0.679498,0.643999,0.650687,0.693016,0.757729
2,01362__611摇竹惊雀,C,D,D,D,D,D,D,D,C,...,0.672273,0.643334,0.740946,0.722522,0.765060,0.794517,0.764186,0.713423,0.762813,0.818382
3,01356__605-父亲长出了双牛脚,A,A,D,D,A,D,A,D,A,...,0.662645,0.568737,0.757755,0.674839,0.704335,0.693191,0.686799,0.692336,0.705453,0.730344
4,01357__606偷盗的下场,A,A,D,D,A,D,A,A,C,...,0.655725,0.667060,0.741811,0.685953,0.740729,0.781862,0.787774,0.678618,0.751584,0.811732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17319,13746__647百合花,D,D,D,D,D,D,D,D,D,...,0.629806,0.612175,0.635546,0.675744,0.707198,0.629043,0.620104,0.607564,0.644182,0.691505
17320,13522__422老虎怕“屋漏,D,D,D,D,D,D,D,D,C,...,0.694735,0.641131,0.679612,0.688364,0.665071,0.634898,0.638745,0.654686,0.670512,0.734439
17321,13120__017女蜗补天造人,D,D,D,D,D,D,D,D,D,...,0.664962,0.598921,0.677673,0.695205,0.624682,0.692207,0.630434,0.580305,0.644922,0.689768
17322,13695__596幌江山逗江湖客,D,D,D,D,D,D,D,D,A,...,0.633313,0.610292,0.755143,0.667938,0.723545,0.698319,0.656768,0.655553,0.742461,0.786270


In [15]:
# List of themes as per your provided input
themes = [
    "仁义", "忠诚", "佛教", "道教", "儒家", "杀戮", "孝道", "慈善", "信任", "努力",
    "惩罚", "诚实", "长寿", "宗族", "祖先", "平等", "鬼神", "古时候", "馈赠"
]

averages = {}

# Loop over each theme to calculate the mean similarity score for A, B, C, D values
for theme in themes:
    # Construct the column names for the theme and similarity
    theme_col = theme
    similarity_col = theme + '_相似度'
    
    # Ensure that the theme column and the similarity column exist in the DataFrame
    if theme_col in merged_df.columns and similarity_col in merged_df.columns:
        # Group by the theme column (A/B/C/D) and calculate the mean of the similarity scores
        mean_similarity = merged_df.groupby(theme_col)[similarity_col].mean()
        
        # Store the result in the dictionary
        averages[theme] = mean_similarity

# Convert the dictionary to a DataFrame for better readability
averages_df = pd.DataFrame(averages).reset_index()

averages_df

Unnamed: 0,index,仁义,忠诚,佛教,道教,儒家,杀戮,孝道,慈善,信任,努力,惩罚,诚实,长寿,宗族,祖先,平等,鬼神,古时候,馈赠
0,A,0.696454,0.700142,0.700348,0.687716,0.722503,0.658109,0.733784,0.719676,0.712159,0.712368,0.684974,0.689414,0.703828,0.723862,0.731989,0.710787,0.714663,0.71013,0.731601
1,B,0.695134,0.691725,0.685499,0.682674,0.700677,0.645302,0.730437,0.734375,0.710797,0.732742,0.677944,0.67111,0.642184,0.700376,0.691951,0.717817,0.672012,,0.717585
2,C,0.678473,0.689415,0.69102,0.679667,0.708595,0.639844,0.726876,0.711255,0.70434,0.714763,0.661073,0.671001,0.691223,0.714425,0.726817,0.701522,0.704445,0.683245,0.720917
3,D,0.653583,0.673656,0.671184,0.662815,0.688073,0.620224,0.690094,0.689061,0.690602,0.701663,0.674255,0.65768,0.69404,0.692837,0.711583,0.689685,0.690724,0.694181,0.709535


In [13]:
import pandas as pd
from scipy.stats import ttest_ind

# List of themes as per your provided input
themes = [
    "仁义", "忠诚", "佛教", "道教", "儒家", "杀戮", "孝道", "慈善", "信任", "努力",
    "惩罚", "诚实", "长寿", "宗族", "祖先", "平等", "鬼神", "古时候", "馈赠"
]

# Initialize a dictionary to store results
results = []

# Loop over each theme to calculate the mean similarity score for D and non-D values
for theme in themes:
    # Construct the column names for the theme and similarity
    theme_col = theme
    similarity_col = theme + '_相似度'
    
    # Ensure that the theme column and the similarity column exist in the DataFrame
    if theme_col in merged_df.columns and similarity_col in merged_df.columns:
        # Split the data into D and non-D groups
        d_group = merged_df[merged_df[theme_col] == "D"][similarity_col]
        non_d_group = merged_df[merged_df[theme_col] != "D"][similarity_col]
        
        # Calculate means
        mean_d = d_group.mean()
        mean_non_d = non_d_group.mean()
        
        # Perform t-test
        t_stat, p_value = ttest_ind(d_group, non_d_group, equal_var=False, nan_policy='omit')
        
        # Append results to the list
        results.append({
            "主题": theme,
            "D均值": mean_d,
            "非D均值": mean_non_d,
            "t值": t_stat,
            "p值": p_value
        })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Transpose the DataFrame
transposed_df = results_df.set_index("主题").T

# Save the transposed DataFrame to an Excel file
output_path = "/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125D_average_similarity_score_t_test.xlsx"
transposed_df.to_excel(output_path)

print(f"转置结果已保存到文件：{output_path}")

转置结果已保存到文件：/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125D_average_similarity_score_t_test.xlsx


In [14]:
transposed_df

主题,仁义,忠诚,佛教,道教,儒家,杀戮,孝道,慈善,信任,努力,惩罚,诚实,长寿,宗族,祖先,平等,鬼神,古时候,馈赠
D均值,0.653583,0.6736557,0.6711845,0.6628153,0.6880735,0.6202239,0.690094,0.6890614,0.6906025,0.7016629,0.6742548,0.6576796,0.6940396,0.6928373,0.7115831,0.6896854,0.6907236,0.6941808,0.709535
非D均值,0.691741,0.6978116,0.6994721,0.6853669,0.7199753,0.6577659,0.733165,0.7186381,0.7082457,0.7127804,0.6839248,0.6791405,0.702305,0.7227715,0.73169,0.708888,0.714084,0.7101184,0.7303407
t值,-44.06363,-34.43133,-19.51711,-27.31396,-27.95169,-35.54376,-45.652225,-34.27557,-23.97321,-15.99906,-12.49961,-25.68361,-6.107529,-33.01833,-13.41001,-10.69735,-33.49875,-21.50372,-25.61987
p值,0.0,9.754444e-251,3.334423e-74,2.273908e-155,6.157422000000001e-156,1.812161e-248,0.0,1.1741010000000001e-247,3.657586e-124,3.457466e-57,1.203861e-35,1.019308e-142,1.371204e-09,1.7769880000000002e-206,2.6311409999999997e-38,4.964827e-25,3.969511e-236,1.6890579999999998e-100,3.578987e-140


In [16]:
# Export the averages_df DataFrame to an Excel file
output_file = "/Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125ABCD_average_similarity_score.xlsx"  # Specify the path where you want to save the file
averages_df.to_excel(output_file, index=False)

print(f"Exported results to {output_file}")

Exported results to /Users/zhaorunping/Desktop/Research_Onging/2410_LSE_Xue/result/241125ABCD_average_similarity_score.xlsx
