In [None]:
import pickle
from gensim.corpora import Dictionary
from collections import Counter

# Input: pickle read in step 1 and passed straight to gensim's Dictionary
# (presumably a list of tokenised documents -- confirm against the producer).
PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
# Output: the filtered gensim Dictionary is saved here in step 5.
FILTERED_DICTIONARY_PATH = 'data/LDA/final_dictionary.dict'
# Output: the pickled bag-of-words corpus built from the filtered dictionary.
FILTERED_CORPUS_PATH = 'data/LDA/final_corpus.pkl'

# Tokens whose document frequency is below this count are treated as removed.
NO_BELOW = 10
# Tokens appearing in more than this fraction of all documents are treated as removed.
NO_ABOVE = 0.5
# Forwarded as `keep_n` to Dictionary.filter_extremes; None is expected to
# disable the vocabulary-size cap (see the gensim documentation).
KEEP_N = None

if __name__ == '__main__':
    # Dictionary-filtering script:
    #   1. load the tokenised texts,
    #   2. build an unfiltered gensim Dictionary,
    #   3. predict by hand how many tokens NO_BELOW / NO_ABOVE / KEEP_N remove,
    #   4. run Dictionary.filter_extremes and compare with the prediction,
    #   5. build the bag-of-words corpus and save dictionary + corpus.

    # Local import: only this script body needs `os`, to create output folders.
    import os

    print("--- 1. 加载预处理好的分词后文本 ---")
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by a trusted preprocessing step.
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")

    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Rule 1: tokens that occur in fewer than NO_BELOW documents.
    no_below_limit = NO_BELOW
    low_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < no_below_limit}
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Rule 2: tokens that occur in more than NO_ABOVE (a fraction) of all
    # documents. Document frequencies are integers, so comparing against the
    # float limit selects the same tokens as comparing against its integer part.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq > no_above_limit}
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    if high_freq_ids:
        print(" - 示例 (将被移除的高频词):")
        # Show the five most widespread tokens among those about to be dropped.
        sorted_high_freq = sorted(high_freq_ids, key=lambda tokenid: dictionary.dfs[tokenid], reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # A token can only violate one rule at a time, but a set union keeps the
    # count correct regardless.
    total_removed_ids = low_freq_ids.union(high_freq_ids)
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)
    if KEEP_N is not None:
        # filter_extremes additionally truncates the surviving vocabulary to
        # the KEEP_N most frequent tokens, so the estimate must honour the cap.
        final_vocab_size_estimated = min(final_vocab_size_estimated, KEEP_N)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    # Reassign token ids so they are contiguous before the corpus is built.
    dictionary.compactify()

    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    # Cross-check the hand-computed estimate against what gensim actually kept.
    if final_vocab_size_actual == final_vocab_size_estimated:
        print("验证成功：手动分析结果与Gensim执行结果一致。")
    else:
        print("警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。")

    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    # Make sure the output folders exist; otherwise saving fails with
    # FileNotFoundError on a fresh checkout.
    for output_path in (FILTERED_DICTIONARY_PATH, FILTERED_CORPUS_PATH):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)

    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing

# Inputs: the filtered dictionary and the pickled bag-of-words corpus that the
# training script below loads before fitting any model.
FINAL_DICTIONARY_PATH = 'data/LDA/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/LDA/final_corpus.pkl'

# Output folder: one `lda_model_<n>.model` file is written here per topic count.
MODEL_SAVE_DIR = 'data/LDA/model/'
# Topic counts to train: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)


if __name__ == '__main__':
    # Batch-training script: fit one LdaMulticore model for every topic count
    # in TOPIC_RANGE and save each model under MODEL_SAVE_DIR.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Leave two cores free when the machine has more than two; never use
    # fewer than one worker.
    num_workers = max(1, multiprocessing.cpu_count() - 2)
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    inputs_present = os.path.exists(FINAL_DICTIONARY_PATH) and os.path.exists(FINAL_CORPUS_PATH)
    if not inputs_present:
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")

    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()

    # Everything except the topic count is identical across runs.
    shared_params = dict(
        corpus=corpus,
        id2word=dictionary,
        random_state=42,
        passes=10,
        workers=num_workers,
    )

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        model = LdaMulticore(num_topics=n_topics, **shared_params)

        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"\n--- 全部模型训练完毕, 总耗时: {elapsed_seconds:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO


# Inputs: tokenised texts (pickle), the filtered gensim dictionary, and the
# word-embedding file loaded in word2vec text format for the C_W2V coherence.
PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
FINAL_DICTIONARY_PATH = 'data/LDA/final_dictionary.dict'
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'

# Folder holding the trained `lda_model_<n>.model` files, and the CSV that
# receives the evaluation table.
MODEL_SAVE_DIR = 'data/LDA/model/'
RESULTS_CSV_PATH = 'data/LDA/lda_f_evaluation.csv'

# Topic counts to evaluate: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model found in *model_dir*.

    For each topic count ``n`` in *topic_range* the file
    ``lda_model_<n>.model`` is looked up in *model_dir*. Missing files are
    reported and skipped. Each model that exists is loaded and scored with:

    * ``c_w2v`` coherence (gensim ``CoherenceModel``) on the top 20 words
      of every topic,
    * ``TopicDiversity`` and ``InvertedRBO`` (OCTIS) on the top 10 words
      of every topic.

    Args:
        model_dir: Directory containing the saved model files.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Tokenised texts handed to ``CoherenceModel``.
        dictionary: gensim ``Dictionary`` handed to ``CoherenceModel``.
        word_vectors: Keyed vectors used by the ``c_w2v`` coherence.

    Returns:
        A ``pandas.DataFrame`` indexed by ``num_topics`` with the columns
        ``"C_W2V (Semantic)"``, ``"Topic Diversity"`` and ``"InvertedRBO"``.
        When no model file is found the frame is empty but still carries
        that index and those columns, so callers can test ``.empty``.
    """
    print("\n--- 开始进行模型评估 ---")

    # Declaring the columns explicitly keeps set_index("num_topics") valid
    # even when there are zero result rows (it would raise KeyError otherwise).
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]

    # First pass: find out which models actually exist on disk.
    available_models = []
    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if os.path.exists(model_path):
            available_models.append((n_topics, model_path))
        else:
            print(f"警告: 模型文件未找到，跳过: {model_path}")

    if not available_models:
        # Nothing to score: hand back an empty, correctly shaped table.
        return pd.DataFrame(columns=result_columns).set_index("num_topics")

    diversity_metric = TopicDiversity(topk=10)
    rbo_metric = InvertedRBO(topk=10, weight=0.9)

    results = []
    for n_topics, model_path in available_models:
        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the 20 highest-probability words per topic, the
        # diversity metrics only the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        # The OCTIS metrics are given a dict with the topic word lists under "topics".
        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")


def plot_results(results_df):
    """Draw coherence and diversity scores against the number of topics.

    The C_W2V coherence is plotted on the left y-axis and the two diversity
    scores (Topic Diversity, InvertedRBO) on a twin right y-axis. The figure
    is displayed with ``plt.show()``; nothing is returned.

    Args:
        results_df: DataFrame whose index holds the topic counts and which
            has the columns ``'C_W2V (Semantic)'``, ``'Topic Diversity'``
            and ``'InvertedRBO'``.
    """
    print("\n--- 正在可视化评估结果... ---")
    topic_counts = results_df.index
    figure, coherence_axis = plt.subplots(figsize=(14, 8))

    # Left axis: semantic coherence.
    coherence_axis.set_xlabel('Number of Topics')
    coherence_axis.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_axis.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_axis.tick_params(axis='y', labelcolor='tab:red')
    coherence_axis.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_axis = coherence_axis.twinx()
    diversity_axis.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, line_color, line_style in diversity_series:
        diversity_axis.plot(
            topic_counts,
            results_df[column],
            color=line_color,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_axis.tick_params(axis='y', labelcolor='tab:blue')

    # One shared legend above the plot, then the title and final layout.
    figure.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    figure.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    figure.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Run the evaluation pipeline end to end.

    Loads the tokenised texts, the filtered dictionary and the word vectors,
    evaluates the saved models, then prints the result table, writes it to
    RESULTS_CSV_PATH and plots it. If the result table is empty, only a
    failure message is printed.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(processed_texts)} 条原始文本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(dictionary)})。")

    print(" - 正在加载腾讯词向量模型...")
    word_vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    results_df = evaluate_models(
        model_dir=MODEL_SAVE_DIR,
        topic_range=TOPIC_RANGE,
        processed_texts=processed_texts,
        dictionary=dictionary,
        word_vectors=word_vectors,
    )

    # Guard clause: nothing was evaluated, so there is nothing to save or plot.
    if results_df.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(results_df)

    output_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(output_dir, exist_ok=True)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(results_df)

# Run the evaluation pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()