# 道德维度词向量投影脚本

# 将词向量（GST和GNT）投影到道德维度轴上，计算bias和intensity值

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from typing import Dict, Any, List, Tuple
import warnings

In [None]:
def load_moral_axes(file_path: str) -> Dict[str, np.ndarray]:
    """
    Load the moral-dimension axis vectors from a pickle file.

    Args:
        file_path: Path to the ``moral_axes.pkl`` file.

    Returns:
        The unpickled mapping from moral-dimension name to its axis vector.

    Raises:
        Exception: If the file cannot be opened or unpickled, or the loaded
            object has no ``keys()``. The original error is attached as
            ``__cause__``.
    """
    try:
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only use this on axis files that come from a trusted source.
        with open(file_path, 'rb') as f:
            moral_axes = pickle.load(f)
        print(f"成功加载道德维度轴，包含以下维度: {list(moral_axes.keys())}")
        return moral_axes
    except Exception as e:
        # Chain explicitly so the traceback shows the root cause as the direct
        # cause rather than as an error raised while handling another error.
        raise Exception(f"加载道德维度轴失败: {e}") from e

In [None]:
def load_embeddings(file_path: str) -> pd.DataFrame:
    """
    Load a word-embedding table from a CSV file.

    Args:
        file_path: Path to the embeddings CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.

    Raises:
        Exception: If the file cannot be read or parsed. The original error
            is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"成功加载词向量文件 {os.path.basename(file_path)}，共 {len(df)} 个词")
        return df
    except Exception as e:
        # Chain explicitly so the underlying pandas/OS error stays visible as
        # the direct cause of the wrapped exception.
        raise Exception(f"加载词向量文件失败: {e}") from e

In [None]:
def calculate_metrics(word_vector: np.ndarray, axis_vector: np.ndarray) -> Tuple[float, float]:
    """
    Compute the bias and intensity of a word vector relative to a moral axis.

    Bias is the cosine similarity between the two vectors. Intensity is the
    square of the scalar projection of the word vector onto the axis.

    Args:
        word_vector: Embedding of the word.
        axis_vector: Vector defining the moral-dimension axis.

    Returns:
        A ``(bias, intensity)`` tuple; ``(0.0, 0.0)`` when either vector has
        zero length.
    """
    norm_of_word = np.linalg.norm(word_vector)
    norm_of_axis = np.linalg.norm(axis_vector)

    if norm_of_word != 0 and norm_of_axis != 0:
        inner = np.dot(word_vector, axis_vector)

        # Cosine similarity: inner product scaled by both lengths.
        cosine = inner / (norm_of_word * norm_of_axis)

        # Scalar projection onto the axis direction, then squared.
        scalar_projection = inner / norm_of_axis
        return cosine, scalar_projection ** 2

    # A zero-length vector has no direction, so both metrics are defined as 0.
    return 0.0, 0.0

In [None]:
def _build_word_matrix(embeddings_df: pd.DataFrame, vector_cols: List[str]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert the embedding columns into a float matrix, tolerating bad rows.

    Returns:
        ``(matrix, usable)`` where ``matrix`` has one row per DataFrame row
        and one column per entry of ``vector_cols``, and ``usable`` is a
        boolean mask. Rows that could not be converted are left as zeros and
        flagged ``False`` after a warning has been issued for them.
    """
    try:
        # Fast path: convert all embedding columns in one call.
        matrix = embeddings_df[vector_cols].to_numpy(dtype=float)
    except (KeyError, ValueError, TypeError):
        matrix = None
    if matrix is not None:
        return matrix, np.ones(len(matrix), dtype=bool)

    # Slow path: something in the table is malformed. Convert row by row so
    # that one bad row only zeroes itself instead of failing the whole table.
    n_rows = len(embeddings_df)
    matrix = np.zeros((n_rows, len(vector_cols)), dtype=float)
    usable = np.zeros(n_rows, dtype=bool)
    for pos, (_, row) in enumerate(embeddings_df.iterrows()):
        try:
            matrix[pos] = np.array([row[col] for col in vector_cols], dtype=float)
            usable[pos] = True
        except Exception as e:
            warnings.warn(f"处理词'{row['词语']}'时出错: {e}")
    return matrix, usable


def process_embeddings(embeddings_df: pd.DataFrame, moral_axes: Dict[str, np.ndarray],
                       vector_dim: int = 300) -> pd.DataFrame:
    """
    Project every word vector onto every moral axis.

    The word vectors are extracted from the DataFrame once and all rows are
    projected per axis with vectorised NumPy operations. The formulas are the
    same as in ``calculate_metrics``: bias is cosine similarity, intensity is
    the squared scalar projection, and both are 0.0 when either vector has
    zero length.

    Args:
        embeddings_df: Table with the columns ``词语``, ``频率`` and
            ``dim_1`` .. ``dim_<vector_dim>``.
        moral_axes: Mapping from axis name to a 1-D axis vector.
        vector_dim: Number of ``dim_i`` columns making up a word vector.
            Defaults to 300.

    Returns:
        A DataFrame holding ``词语`` and ``频率`` plus a ``<axis>_bias`` and
        ``<axis>_intensity`` column for each axis.

    Raises:
        KeyError: If ``词语`` or ``频率`` is missing from ``embeddings_df``.

    Notes:
        Rows whose vector cannot be converted to floats get 0.0 for every
        metric and trigger a warning. An axis that cannot be projected onto
        (wrong length, not 1-D, non-numeric) gets 0.0 for every row and
        triggers a single warning.
    """
    # Start from the word and frequency columns; metrics are appended below.
    result_df = embeddings_df[['词语', '频率']].copy()

    vector_cols = [f'dim_{i}' for i in range(1, vector_dim + 1)]

    # Extract the vectors and their norms once; they do not depend on the axis.
    word_matrix, usable = _build_word_matrix(embeddings_df, vector_cols)
    word_norms = np.linalg.norm(word_matrix, axis=1)
    n_words = len(word_matrix)

    for axis_name, axis_vector in moral_axes.items():
        bias_values = np.zeros(n_words, dtype=float)
        intensity_values = np.zeros(n_words, dtype=float)
        try:
            axis = np.asarray(axis_vector, dtype=float)
            if axis.ndim != 1:
                raise ValueError(f"axis vector must be 1-D, got shape {axis.shape}")
            axis_norm = np.linalg.norm(axis)
            dot_products = word_matrix @ axis
        except (ValueError, TypeError) as e:
            warnings.warn(f"处理道德维度'{axis_name}'时出错: {e}")
        else:
            # Only divide where both norms are non-zero and the row converted
            # cleanly; everything else keeps its 0.0 default.
            valid = usable & (word_norms != 0) & (axis_norm != 0)
            np.divide(dot_products, word_norms * axis_norm, out=bias_values, where=valid)
            projection = np.zeros(n_words, dtype=float)
            np.divide(dot_products, axis_norm, out=projection, where=valid)
            intensity_values = projection ** 2

        result_df[f'{axis_name}_bias'] = bias_values
        result_df[f'{axis_name}_intensity'] = intensity_values

    return result_df

In [2]:
def main():
    """
    Run the projection pipeline for the GST and GNT embeddings.

    Loads the moral axes and both embedding tables from hard-coded paths,
    projects each table onto the axes, and writes the results to
    ``projected_gst.csv`` and ``projected_gnt.csv`` in the current working
    directory. Any exception is caught and reported on stdout.
    """
    try:
        # Input paths. Raw strings keep the Windows backslashes literal:
        # sequences such as "\p", "\C" or "\g" are invalid escapes in a normal
        # string literal and trigger a warning on current Python versions.
        moral_axes_path = r'D:\pythonProject\C_MFD2.0_embedding\代码区域\FramAxis嵌入测试\道德轴\moral_axes.pkl'
        gst_path = r'D:\pythonProject\C_MFD2.0_embedding\代码区域\FramAxis嵌入测试\GST向量\gst_embeddings.csv'
        gnt_path = r'D:\pythonProject\C_MFD2.0_embedding\代码区域\FramAxis嵌入测试\GNT向量\gnt_embeddings.csv'

        # Load the moral-dimension axes.
        moral_axes = load_moral_axes(moral_axes_path)

        # Load both embedding tables.
        gst_df = load_embeddings(gst_path)
        gnt_df = load_embeddings(gnt_path)

        # Project the GST embeddings.
        print("处理GST词向量...")
        projected_gst = process_embeddings(gst_df, moral_axes)

        # Project the GNT embeddings.
        print("处理GNT词向量...")
        projected_gnt = process_embeddings(gnt_df, moral_axes)

        # Persist the results without the DataFrame index.
        projected_gst.to_csv('projected_gst.csv', index=False)
        projected_gnt.to_csv('projected_gnt.csv', index=False)

        print("处理完成。结果已保存至projected_gst.csv和projected_gnt.csv")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print(f"程序执行错误: {e}")

if __name__ == "__main__":
    main()

成功加载道德维度轴，包含以下维度: ['care', 'fairness', 'loyalty', 'authority', 'purity']
成功加载词向量文件 gst_embeddings.csv，共 1018 个词
成功加载词向量文件 gnt_embeddings.csv，共 5378 个词
处理GST词向量...
处理GNT词向量...
处理完成。结果已保存至projected_gst.csv和projected_gnt.csv
