# GST词向量嵌入脚本

In [None]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import os
from tqdm import tqdm
import time

In [None]:
# Configure input/output locations.
# Windows paths are written as raw strings so backslashes are never read as
# escape sequences (pairs such as "\p" or "\z" are invalid escapes and
# trigger a SyntaxWarning on recent Python versions).
word2vec_model_path = r'D:\pythonProject\C_MFD2.0_embedding\词嵌入模型文件夹\zhihu\sgns.zhihu.word'
gst_excel_path = r'D:\pythonProject\C_MFD2.0_embedding\代码区域\FramAxis嵌入测试\指涉术语测试数据\测试GST.xlsx'
output_csv_path = 'gst_embeddings.csv'

print("开始执行GST词向量嵌入任务...")
# Wall-clock start; the script reports the elapsed time when it finishes.
start_time = time.time()

In [None]:
# Load the word vectors from the text-format (binary=False) word2vec file.
print(f"正在加载词向量模型: {word2vec_model_path}...")
try:
    word_vectors = KeyedVectors.load_word2vec_format(
        word2vec_model_path,
        binary=False,
    )
    vocabulary_size = len(word_vectors.key_to_index)
    print(f"词向量模型加载成功，词汇量: {vocabulary_size}")
except Exception as load_error:
    # Report the failure, then re-raise so execution stops at this point.
    print(f"加载词向量模型时出错: {load_error}")
    raise

In [None]:
# Load the GST vocabulary table from the Excel workbook.
print(f"正在加载GST词汇表: {os.path.basename(gst_excel_path)}...")
try:
    gst_df = pd.read_excel(gst_excel_path)
    # The data is addressed later by the column names '词语' (word) and
    # '频率' (frequency). A sheet with fewer than two columns cannot be
    # renamed that way, so fail here with a clear message instead of
    # reporting success and hitting a KeyError further down.
    column_count = len(gst_df.columns)
    if column_count < 2:
        raise ValueError(
            f"GST词汇表至少需要两列（词语、频率），实际只有{column_count}列"
        )
    # Keep only the first two columns and give them the expected names.
    gst_df = gst_df.iloc[:, :2]
    gst_df.columns = ['词语', '频率']
    print(f"GST词汇表加载成功，共{len(gst_df)}个词语")
except Exception as e:
    # Report the failure, then re-raise so execution stops at this point.
    print(f"加载GST词汇表时出错: {e}")
    raise

In [None]:
# Accumulators: one dict per embedded word, plus the words that could not
# be embedded.
results = []
missing_words = []

# Dimensionality of the loaded vectors.
vector_dim = word_vectors.vector_size
print(f"词向量维度: {vector_dim}")

# Walk the vocabulary table row by row.
print("开始处理词语...")
for _, entry in tqdm(gst_df.iterrows(), total=len(gst_df), desc="处理进度"):
    word = entry['词语']
    frequency = entry['频率']

    try:
        # Words absent from the model are recorded and skipped.
        if word not in word_vectors:
            missing_words.append(word)
            print(f"警告: 词语 '{word}' 不在词向量模型中")
            continue

        vector = word_vectors[word]

        # One output row: word, frequency, then dim_1 .. dim_N.
        record = {'词语': word, '频率': frequency}
        record.update(
            (f'dim_{position}', component)
            for position, component in enumerate(vector, start=1)
        )
        results.append(record)
    except Exception as e:
        # Any failure while handling a word is reported and the word is
        # counted as missing; processing continues with the next row.
        print(f"处理词语 '{word}' 时出错: {e}")
        missing_words.append(word)

# Build the result table. The column order is stated explicitly so the CSV
# still gets its header row ('词语', '频率', dim_1 .. dim_N) when no word was
# embedded; for a non-empty result it matches the key order of the row dicts.
output_columns = ['词语', '频率'] + [f'dim_{i}' for i in range(1, vector_dim + 1)]
result_df = pd.DataFrame(results, columns=output_columns)

# Write the table to CSV.
print(f"正在保存结果到: {output_csv_path}...")
try:
    result_df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"结果已成功保存到: {output_csv_path}")
except Exception as e:
    # A failed save is reported but not re-raised, so the script carries on.
    print(f"保存结果时出错: {e}")
    
# Print a summary of the run.
total_count = len(gst_df)
embedded_count = len(results)
missing_count = len(missing_words)

print("\n处理完成:")
print(f"- 总词语数: {total_count}")
print(f"- 成功嵌入词语数: {embedded_count}")
print(f"- 缺失词语数: {missing_count}")

# List every word that could not be embedded, one per line.
if missing_words:
    print("\n缺失词列表:")
    print("\n".join(f"- {missing_word}" for missing_word in missing_words))

# Report the total elapsed wall-clock time.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"\n任务完成，耗时: {elapsed_seconds:.2f}秒")