In [None]:
import pickle
import os
import time
from gensim.models import LsiModel  
from gensim.corpora import Dictionary
import multiprocessing


if __name__ == '__main__':
    # Training script: fit one LSI model per topic count in TOPIC_RANGE and
    # save each of them under MODEL_SAVE_DIR.
    PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
    MODEL_SAVE_DIR = 'data/LSI/model/'
    TOPIC_RANGE = range(3, 16)
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    print(f"--- LSI模型训练 ---")
    print("--- 正在加载语料库... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)

    # Build the vocabulary, then convert every document to bag-of-words form.
    dictionary = Dictionary(processed_texts)
    corpus = list(map(dictionary.doc2bow, processed_texts))

    # Timing covers model training and saving only, not corpus loading.
    start_time = time.time()

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        model_path = os.path.join(MODEL_SAVE_DIR, f'lsi_model_{n_topics}.model')
        model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=dictionary)
        model.save(model_path)

        print(f"--- 已保存: {n_topics} 主题的模型 ---")

    elapsed = time.time() - start_time
    print(f"\n 全部模型训练完毕, 总耗时: {elapsed:.2f} 秒")
    print(f"模型保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LsiModel, KeyedVectors 
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel


from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO


# Pickle file holding the processed texts that main() loads and turns into a
# gensim Dictionary.
PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
# Directory searched for saved models named `lsi_model_{n}.model`.
MODEL_SAVE_DIR = 'data/LSI/model/' 
# Tencent word-vector file; main() loads it as text-format word2vec
# (binary=False).
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt' 
# Topic counts to evaluate: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)
# Destination CSV for the evaluation table written by main().
RESULTS_CSV_PATH = 'data/LSI/lsi_evaluation_final.csv' 

def evaluate_models_focused(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score saved LSI models on semantic coherence and topic diversity.

    For every topic count in ``topic_range`` the file
    ``lsi_model_{n}.model`` inside ``model_dir`` is loaded and scored with:

    * gensim ``CoherenceModel`` with ``coherence='c_w2v'`` over the top-20
      words of each topic;
    * ``TopicDiversity`` and ``InvertedRBO`` over the top-10 words of each
      topic.

    Missing model files are reported (and skipped) up front, before any
    model is evaluated.

    Args:
        model_dir: Directory containing the saved LSI models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Documents passed to ``CoherenceModel`` as ``texts``.
        dictionary: gensim ``Dictionary`` passed to ``CoherenceModel``.
        word_vectors: Word vectors passed to ``CoherenceModel`` as
            ``keyed_vectors``.

    Returns:
        A ``pandas.DataFrame`` indexed by ``num_topics`` with the columns
        ``"C_W2V (Semantic)"``, ``"Topic Diversity"`` and ``"InvertedRBO"``.
        When no model file is found the frame is empty but keeps the same
        index name and columns, so callers can test ``.empty`` safely.
    """
    print("\n--- 3. 开始进行专注化评估 ---")
    metric_columns = ["C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]

    # Resolve model files first so that missing ones are reported right away.
    available_models = []
    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lsi_model_{n_topics}.model')
        if os.path.exists(model_path):
            available_models.append((n_topics, model_path))
        else:
            print(f"警告: 模型文件未找到，跳过: {model_path}")

    if not available_models:
        # DataFrame([]).set_index("num_topics") would raise KeyError because
        # the column does not exist; build the empty frame explicitly instead.
        return pd.DataFrame(columns=["num_topics", *metric_columns]).set_index("num_topics")

    diversity_metric = TopicDiversity(topk=10)
    rbo_metric = InvertedRBO(topk=10, weight=0.9)

    results = []
    for n_topics, model_path in available_models:
        print(f"--- 正在评估: {n_topics} 主题的模型 ---")

        lsi_model = LsiModel.load(model_path)

        # Coherence uses the top-20 words, the diversity metrics the top-10.
        topics_for_coherence = [[word for word, _ in lsi_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lsi_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"    - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"    - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"    - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    return pd.DataFrame(results).set_index("num_topics")

def plot_results(results_df):
    """Plot coherence against the diversity scores on a twin-axis chart.

    The left y-axis shows the ``'C_W2V (Semantic)'`` column, the right
    y-axis shows ``'Topic Diversity'`` and ``'InvertedRBO'``; the x-axis is
    the frame's index.

    Args:
        results_df: DataFrame containing the three metric columns above.
    """
    print("\n--- 4. 正在可视化评估结果... ---")

    topic_counts = results_df.index
    coherence_colour = 'tab:red'
    diversity_colour = 'tab:blue'

    fig, coherence_ax = plt.subplots(figsize=(14, 8))

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color=coherence_colour)
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color=coherence_colour,
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor=coherence_colour)
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity measures share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color=diversity_colour)
    diversity_series = (
        ('Topic Diversity', diversity_colour, '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, colour, dash in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=colour,
            marker='x',
            linestyle=dash,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor=diversity_colour)

    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('专注化LSI模型评估: 语义一致性 vs. 多样性', fontsize=16)
    # Leave room at the top for the title and the legend.
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()


def main():
    """Run the evaluation pipeline: load data, score models, save and plot.

    Loads the pickled corpus and builds a Dictionary from it, loads the
    word vectors from TENCENT_WV_PATH, evaluates the saved models and, when
    any results exist, prints them, writes them to RESULTS_CSV_PATH and
    plots them.

    Raises:
        FileNotFoundError: If TENCENT_WV_PATH does not exist.
    """
    print("--- 1. 正在加载本地语料库... ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)
    dictionary = Dictionary(processed_texts)

    print(f"\n--- 2. 正在加载腾讯中文词向量模型 (首次加载可能需要较长时间)... ---")
    vectors_available = os.path.exists(TENCENT_WV_PATH)
    if not vectors_available:
        raise FileNotFoundError(f"错误: 腾讯词向量文件未找到，请检查路径: {TENCENT_WV_PATH}")

    # The vector file is read as text-format word2vec.
    word_vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print("--- 词向量模型加载成功 ---")

    results_df = evaluate_models_focused(
        MODEL_SAVE_DIR,
        TOPIC_RANGE,
        processed_texts,
        dictionary,
        word_vectors,
    )

    # Nothing to report, save or plot when no model was evaluated.
    if results_df.empty:
        print("\n--- 未找到任何模型进行评估 ---")
        return

    print("\n--- 最终评估完成, 结果如下: ---")
    print(results_df)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")
    plot_results(results_df)


if __name__ == "__main__":
    main()