In [47]:
import jieba
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from collections import Counter

In [9]:
# 佛教和道教词典
buddhism_words = """
佛 菩萨 僧 和尚 住持 庙 经 西天 寺 禅 空 悟 皈 菩提 乘 如来 观音 宗 法 沙弥 罗汉 夜叉 沙门 度/渡 布施 化缘 施主 世尊 迦 伽 婆 比丘 阿鼻地狱 供养 戒 出家 功德 涅槃 劫 慈 悲 愿 弥勒 自在 行者 金刚 清净 供养 袈裟 舍利 南无 钵 无量
""".split()

taoism_words = """
道 丹 仙 神 真 观 玄 五行 无 阴 阳 太极 玉帝 玉皇 天尊 星 太 坛 文昌 三清 龙王 阎罗 造化 本 归 霄 点化 金童 玉女 气 炁 洞天 箓 虚
""".split()


In [52]:
import re
from collections import Counter

# 定义别名映射字典
alias_map = {
    '唐僧': ['师父', '唐三藏', '玄奘', '三藏', '长老', '圣僧', '唐和尚', '金蝉禅', '金禅'],
    '孙悟空': ['孙行者', '美猴王', '齐天大圣', '石猴', '猴王', '大圣', '行者', '弼马温', '师兄', '泼猴', '猴子', '哥', '猴头', '心猿'],
    '猪八戒': ['悟能', '天蓬元帅', '八戒', '呆子', '猪刚鬣', '夯货', '老猪'],
    '沙和尚': ['悟净', '卷帘大将', '沙僧']
}

# 构造反向映射
reverse_alias_map = {alias: standard for standard, aliases in alias_map.items() for alias in aliases}

# 读取文本
input_path = r"C:\Users\yangx\Desktop\与国内的合作\人大DH组会\西游记.txt"
with open(input_path, "r", encoding="utf-8") as f:
    text = f.read()

# 统计替换前别名频率
freq_before = Counter(re.findall('|'.join(map(re.escape, reverse_alias_map.keys())), text))

# 使用正则替换别名为标准名称
def replace_alias(match):
    alias = match.group(0)  # 匹配的别名
    return reverse_alias_map[alias]  # 返回对应的标准名称

pattern = re.compile('|'.join(map(re.escape, reverse_alias_map.keys())))  # 构造匹配模式
cleaned_text = pattern.sub(replace_alias, text)  # 替换

# 统计替换后标准名称频率
freq_after = Counter(re.findall('|'.join(map(re.escape, alias_map.keys())), cleaned_text))

# 确保清洗后别名频率为 0
freq_after_alias = Counter(re.findall('|'.join(map(re.escape, reverse_alias_map.keys())), cleaned_text))

# 保存清洗后的文本
output_path = r"C:\Users\yangx\Desktop\与国内的合作\人大DH组会\西游记_cleaned_02.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

# 输出统计结果
print("替换前别名频率：", freq_before)
print("替换后标准名称频率：", freq_after)
print("清洗后别名频率：", freq_after_alias)
print(f"清洗后的文本已保存到 {output_path}")


替换前别名频率： Counter({'行者': 4148, '八戒': 1819, '师父': 1640, '三藏': 1329, '大圣': 1178, '沙僧': 821, '长老': 663, '哥': 590, '呆子': 432, '孙行者': 239, '师兄': 177, '圣僧': 163, '猴王': 143, '齐天大圣': 102, '老猪': 90, '玄奘': 89, '猴子': 78, '悟能': 71, '悟净': 69, '泼猴': 57, '唐三藏': 57, '弼马温': 49, '美猴王': 42, '猴头': 35, '心猿': 33, '夯货': 32, '天蓬元帅': 16, '石猴': 15, '卷帘大将': 11, '唐和尚': 8, '猪刚鬣': 5, '金禅': 4})
替换后标准名称频率： Counter({'孙悟空': 7012, '唐僧': 4965, '猪八戒': 2465, '沙和尚': 997})
清洗后别名频率： Counter({'八戒': 2465})
清洗后的文本已保存到 C:\Users\yangx\Desktop\与国内的合作\人大DH组会\西游记_cleaned_02.txt


In [53]:
# 分词
words = jieba.lcut(cleaned_text)

# 清洗词汇，保留所有词汇，包括佛教和道教词汇
cleaned_words = [word for word in words if len(word) > 1]

# 构建句子，以佛教和道教词汇为边界
# 通过在遇到佛教或道教词汇时开始一个新的句子，我们可以在模型训练中强调这些词汇与其他词汇之间的边界。
# 这有助于模型更好地捕捉到宗教词汇与其他词汇之间的关系，从而在后续分析中更准确地评估特定角色与佛教和道教的关系。
sentences = []
sentence = []
for word in cleaned_words:
    if word in buddhism_words or word in taoism_words:
        if sentence:
            sentences.append(sentence)
            sentence = [word]  # 以宗教词汇开始新的句子
        else:
            sentence = [word]
    else:
        sentence.append(word)
if sentence:
    sentences.append(sentence)


In [54]:
from collections import Counter

all_words = [word for sentence in sentences for word in sentence]
word_counts = Counter(all_words)

# 设置一个阈值，只保留出现次数超过10次的词汇
threshold = 10

# 筛选佛教和道教词典中的词汇
buddhism_words_filtered = [word for word in buddhism_words if word_counts[word] > threshold]
taoism_words_filtered = [word for word in taoism_words if word_counts[word] > threshold]

# 打印筛选后的词典
print("筛选后的佛教词汇：", buddhism_words_filtered)
print("筛选后的道教词汇：", taoism_words_filtered)

筛选后的佛教词汇： ['菩萨', '和尚', '西天', '菩提', '如来', '观音', '罗汉', '夜叉', '沙门', '布施', '施主', '世尊', '比丘', '供养', '出家', '弥勒', '金刚', '供养', '袈裟', '南无', '无量']
筛选后的道教词汇： ['五行', '玉帝', '玉皇', '天尊', '三清', '龙王', '造化', '洞天']


In [55]:
# 训练Word2Vec模型
model_1 = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [56]:
# 获取角色向量
characters = ['唐僧', '孙悟空', '猪八戒', '沙和尚']
char_vectors = {char: model_1.wv[char] for char in characters if char in model_1.wv}

In [57]:
# 计算每个宗教词汇的向量
buddhism_vectors = np.array([model_1.wv[w] for w in buddhism_words_filtered if w in model_1.wv])
taoism_vectors = np.array([model_1.wv[w] for w in taoism_words_filtered if w in model_1.wv])

# 计算每个宗教词汇的权重
buddhism_weights = Counter([word for sentence in sentences for word in sentence if word in buddhism_words_filtered])
taoism_weights = Counter([word for sentence in sentences for word in sentence if word in taoism_words_filtered])

# 将权重转换为numpy数组，并确保权重值是浮点数类型
buddhism_weights_array = np.array([buddhism_weights.get(word, 0) for word in buddhism_words_filtered], dtype=float)
taoism_weights_array = np.array([taoism_weights.get(word, 0) for word in taoism_words_filtered], dtype=float)

# 计算加权平均距离的函数
def weighted_average_distance(vectors, weights, char_vector):
    if len(vectors) == 0 or len(weights) == 0:
        return 0
    weighted_sum = np.zeros(100)
    total_weight = 0.0
    for vec, weight in zip(vectors, weights):
        weighted_sum += vec * weight
        total_weight += weight
    return np.linalg.norm(weighted_sum / total_weight - char_vector) if total_weight > 0 else 0

# 计算每个角色与佛教和道教的加权平均距离
buddhism_distances = {char: weighted_average_distance(buddhism_vectors, buddhism_weights_array, char_vector) for char, char_vector in char_vectors.items()}
taoism_distances = {char: weighted_average_distance(taoism_vectors, taoism_weights_array, char_vector) for char, char_vector in char_vectors.items()}

# 输出结果
for char in characters:
    if char in model_1.wv:
        print(f"{char} 与佛教的加权平均距离: {buddhism_distances[char]}")
        print(f"{char} 与道教的加权平均距离: {taoism_distances[char]}")

唐僧 与佛教的加权平均距离: 11.619473845366093
唐僧 与道教的加权平均距离: 14.591660077164432
孙悟空 与佛教的加权平均距离: 12.451545874163818
孙悟空 与道教的加权平均距离: 15.42293284967125
猪八戒 与佛教的加权平均距离: 9.99617432773271
猪八戒 与道教的加权平均距离: 12.964458146052356
沙和尚 与佛教的加权平均距离: 6.625226586177262
沙和尚 与道教的加权平均距离: 9.590450507888729


In [59]:
model_2 = Word2Vec(sentences, vector_size=100, window=10, min_count=1, workers=4)

In [60]:
# 获取角色向量
characters = ['唐僧', '孙悟空', '猪八戒', '沙和尚']
char_vectors = {char: model_2.wv[char] for char in characters if char in model_2.wv}

# 计算每个宗教词汇的向量
buddhism_vectors = np.array([model_2.wv[w] for w in buddhism_words_filtered if w in model_2.wv])
taoism_vectors = np.array([model_2.wv[w] for w in taoism_words_filtered if w in model_2.wv])

# 计算每个宗教词汇的权重
buddhism_weights = Counter([word for sentence in sentences for word in sentence if word in buddhism_words_filtered])
taoism_weights = Counter([word for sentence in sentences for word in sentence if word in taoism_words_filtered])

# 将权重转换为numpy数组，并确保权重值是浮点数类型
buddhism_weights_array = np.array([buddhism_weights.get(word, 0) for word in buddhism_words_filtered], dtype=float)
taoism_weights_array = np.array([taoism_weights.get(word, 0) for word in taoism_words_filtered], dtype=float)

# 计算加权平均距离的函数
def weighted_average_distance(vectors, weights, char_vector):
    if len(vectors) == 0 or len(weights) == 0:
        return 0
    weighted_sum = np.zeros(100)
    total_weight = 0.0
    for vec, weight in zip(vectors, weights):
        weighted_sum += vec * weight
        total_weight += weight
    return np.linalg.norm(weighted_sum / total_weight - char_vector) if total_weight > 0 else 0

# 计算每个角色与佛教和道教的加权平均距离
buddhism_distances = {char: weighted_average_distance(buddhism_vectors, buddhism_weights_array, char_vector) for char, char_vector in char_vectors.items()}
taoism_distances = {char: weighted_average_distance(taoism_vectors, taoism_weights_array, char_vector) for char, char_vector in char_vectors.items()}

# 输出结果
for char in characters:
    if char in model_2.wv:
        print(f"{char} 与佛教的加权平均距离: {buddhism_distances[char]}")
        print(f"{char} 与道教的加权平均距离: {taoism_distances[char]}")

唐僧 与佛教的加权平均距离: 15.788889389054194
唐僧 与道教的加权平均距离: 19.801046117289136
孙悟空 与佛教的加权平均距离: 17.501198811771737
孙悟空 与道教的加权平均距离: 21.512895613379303
猪八戒 与佛教的加权平均距离: 14.004802130298863
猪八戒 与道教的加权平均距离: 18.00960556976409
沙和尚 与佛教的加权平均距离: 9.875563360192356
沙和尚 与道教的加权平均距离: 13.87743662606008
