# In [None]:
import os
import time
import pickle
import numpy as np
import random 
from gensim.corpora import Dictionary

import bitermplus as btm
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == '__main__':
    # Training script: sample the pickled corpus, build a filtered vocabulary,
    # derive biterms, then fit and pickle one BTM model per topic count.

    PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
    MODEL_SAVE_DIR = 'data/BTM/model/'
    TOPIC_RANGE = range(3, 16)
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Arguments forwarded to Dictionary.filter_extremes below.
    FILTER_NO_BELOW = 50
    FILTER_NO_ABOVE = 0.6
    FILTER_KEEP_N = 4000

    # Window size forwarded to btm.get_biterms below.
    BITERM_WINDOW_SIZE = 4

    # Fraction of the corpus kept for training.
    SAMPLING_RATE = 0.6

    # Fixed seed so the sampled subset and the fitted models can be reproduced
    # from one run to the next.
    RANDOM_SEED = 42

    print("--- 正在加载语料库... ---")
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)

    original_count = len(processed_texts)
    sample_size = int(original_count * SAMPLING_RATE)
    print(f"--- [终极手段] 已启用数据抽样，将从 {original_count} 条数据中随机抽取 {sample_size} 条 ---")
    # A dedicated Random instance keeps the draw deterministic without touching
    # the global random state.
    processed_texts = random.Random(RANDOM_SEED).sample(processed_texts, sample_size)
    print(f"--- 语料库加载完成，将处理 {len(processed_texts)} 篇文档。---")

    print("\n--- 正在使用Gensim在抽样后的数据上构建和过滤词典... ---")

    dictionary = Dictionary(processed_texts)
    print(f"    - 抽样后语料的原始词汇表大小: {len(dictionary)}")

    dictionary.filter_extremes(
        no_below=FILTER_NO_BELOW,
        no_above=FILTER_NO_ABOVE,
        keep_n=FILTER_KEEP_N,
    )
    dictionary.compactify()

    vocab = list(dictionary.token2id.keys())
    # Built once and shared by get_vectorized_docs and every BTM instance.
    vocab_array = np.array(vocab)
    print(f"    - 【终极过滤后】词汇表大小: {len(vocab)}")

    print("--- 使用过滤后的词汇表进行最终向量化... ---")
    texts_str = [" ".join(doc) for doc in processed_texts]
    del processed_texts  # free the token lists before the next large allocation

    # The documents are already tokenised and were joined with single spaces,
    # so split on whitespace and keep the original case. CountVectorizer's
    # defaults (lower-casing and a token pattern that requires two or more
    # characters) would never match vocabulary entries that contain upper-case
    # letters or consist of a single character, leaving their columns in X empty.
    vectorizer = CountVectorizer(
        vocabulary=vocab,
        tokenizer=str.split,
        lowercase=False,
        token_pattern=None,
    )
    X = vectorizer.fit_transform(texts_str)
    print(f"    - 生成的词频矩阵 X 的维度: {X.shape}")

    print("\n--- 正在将文本转换为词ID序列 (for biterm generation)... ---")
    texts_vec = btm.get_vectorized_docs(texts_str, vocab_array)
    del texts_str

    print(f"--- 正在生成 Biterms (滑动窗口大小 = {BITERM_WINDOW_SIZE})... ---")
    biterms = btm.get_biterms(texts_vec, win=BITERM_WINDOW_SIZE)
    del texts_vec
    print(f"--- Biterms 生成完毕，共计 {len(biterms)} 个。---")

    print(f"\n即将开始训练 {len(TOPIC_RANGE)} 个模型...")

    start_time = time.time()

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")
        model = btm.BTM(
            X,
            vocab_array,
            T=n_topics,
            M=20,
            alpha=50 / n_topics,
            beta=0.01,
            seed=RANDOM_SEED,
        )
        model.fit(biterms, iterations=100)

        model_path = os.path.join(MODEL_SAVE_DIR, f'btm_model_{n_topics}.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"--- 已保存: {n_topics} 主题的模型到 {model_path} ---")

    print(f"\n--- 全部模型训练完毕, 总耗时: {time.time() - start_time:.2f} 秒 ---")
    print(f"--- 所有模型已保存在目录: {MODEL_SAVE_DIR}")

# In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO
from scipy.sparse import csr_matrix

# Pickled corpus that main() loads and passes on as the texts for coherence scoring.
PROCESSED_CORPUS_PICKLE = 'data/paragraph.pkl'
# Directory searched for model files named btm_model_<n_topics>.pkl.
MODEL_SAVE_DIR = 'data/BTM/model/'
# Word-vector file; main() loads it with KeyedVectors.load_word2vec_format(binary=False).
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
# Topic counts to evaluate: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)
# Destination of the evaluation table written by main().
RESULTS_CSV_PATH = 'data/BTM/btm_evaluation_final.csv'


def get_btm_topics(btm_model, topn=20):
    """
    Return the ``topn`` highest-weighted words of every topic in a BTM model.

    The topic-word matrix is read from ``btm_model.matrix_topics_words_`` or,
    if that attribute is absent, from ``btm_model.topics_``. When neither is
    present, ``btm_model.transform`` is called once on a one-document dummy
    matrix and the two attributes are looked up again.

    Args:
        btm_model: Model object exposing ``vocabulary_`` and one of the
            topic-word attributes named above.
        topn: Number of words to return per topic. ``0`` yields empty lists;
            values larger than the vocabulary return every word.

    Returns:
        A list with one entry per topic; each entry lists that topic's words
        ordered from highest to lowest weight.

    Raises:
        ValueError: If ``topn`` is negative.
        AttributeError: If no topic-word matrix is available even after
            calling ``transform``.
    """
    if topn < 0:
        raise ValueError(f"topn must be non-negative, got {topn}")

    matrix_attrs = ('matrix_topics_words_', 'topics_')

    def _available_attr():
        # First topic-word attribute the model exposes, or None.
        return next((name for name in matrix_attrs if hasattr(btm_model, name)), None)

    attr_name = _available_attr()
    if attr_name is None:
        print("    - 注意: 正在生成主题-词分布矩阵...")

        # NOTE(review): presumably transform() populates the matrix on the
        # model; confirm against the installed bitermplus version.
        vocab_size = len(btm_model.vocabulary_)
        dummy_X = csr_matrix(([1], ([0], [0])), shape=(1, vocab_size))
        btm_model.transform(dummy_X)
        attr_name = _available_attr()

    if attr_name is None:
        raise AttributeError("模型在调用 transform() 后未能生成 'matrix_topics_words_' 或 'topics_' 属性。")

    topic_word_dist = getattr(btm_model, attr_name)
    vocab = btm_model.vocabulary_

    topics = []
    for topic_dist in topic_word_dist:
        # Reverse first, then truncate: unlike ``[-topn:]`` this stays correct
        # for topn == 0 (``[-0:]`` would select the whole array).
        ranked = np.argsort(np.asarray(topic_dist).ravel())[::-1][:topn]
        topics.append([vocab[i] for i in ranked])
    return topics

def evaluate_btm_models_focused(model_dir, topic_range, full_processed_texts, full_dictionary, word_vectors):
    """
    Score every saved BTM model on semantic coherence and topic diversity.

    For each ``n_topics`` in ``topic_range`` the file
    ``<model_dir>/btm_model_<n_topics>.pkl`` is unpickled (missing files are
    skipped with a warning) and three scores are computed:

    * ``c_w2v`` coherence from gensim's ``CoherenceModel`` on the top-20 words,
    * ``TopicDiversity`` on the top-10 words,
    * ``InvertedRBO`` on the top-10 words.

    Args:
        model_dir: Directory containing the pickled models.
        topic_range: Iterable of topic counts to evaluate.
        full_processed_texts: Tokenised corpus passed to ``CoherenceModel`` as ``texts``.
        full_dictionary: Dictionary passed to ``CoherenceModel``.
        word_vectors: Keyed vectors passed to ``CoherenceModel``.

    Returns:
        A DataFrame indexed by ``num_topics`` with one column per score. It is
        empty, but keeps the same columns and index name, when no model file
        was found.
    """
    print("\n--- 3. 开始进行BTM模型评估 ---")
    # Declared up front so that an empty result still has a "num_topics"
    # column; set_index on a column-less frame raises KeyError.
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []
    # Metric objects are built on first use, so a run that finds no model
    # files does not construct them at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'btm_model_{n_topics}.pkl')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue
        print(f"--- 正在评估: {n_topics} 主题的模型 ---")

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        with open(model_path, 'rb') as f:
            btm_model = pickle.load(f)

        topics_for_coherence = get_btm_topics(btm_model, topn=20)
        # Words come back ordered by descending weight, so the top-10 list is
        # simply the first ten entries of the top-20 list.
        topics_for_diversity = [words[:10] for words in topics_for_coherence]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=full_processed_texts,
            dictionary=full_dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors,
        )
        cw2v_semantic = coherence_model.get_coherence()

        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"    - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"    - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"    - InvertedRBO (topk=10): {rbo:.4f}")
        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo,
        })

    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")


def plot_results(results_df):
    """
    Draw the evaluation scores against the number of topics.

    Coherence is plotted on the left y-axis and the two diversity scores on a
    twin right y-axis; the figure is then shown with ``plt.show()``.

    Args:
        results_df: DataFrame indexed by topic count with the columns
            'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'.
    """
    print("\n--- 4. 正在可视化评估结果... ---")

    topic_counts = results_df.index

    # Left axis: semantic coherence.
    figure, coherence_axis = plt.subplots(figsize=(14, 8))
    coherence_axis.set_xlabel('Number of Topics')
    coherence_axis.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_axis.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_axis.tick_params(axis='y', labelcolor='tab:red')
    coherence_axis.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores; each line is labelled with its column name.
    diversity_axis = coherence_axis.twinx()
    diversity_axis.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_lines = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, line_color, line_style in diversity_lines:
        diversity_axis.plot(
            topic_counts,
            results_df[column],
            color=line_color,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_axis.tick_params(axis='y', labelcolor='tab:blue')

    figure.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    figure.suptitle('BTM 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    figure.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()


def main():
    """
    Run the evaluation pipeline end to end.

    Loads the pickled corpus and the word vectors, builds a gensim Dictionary
    from the corpus, scores the saved models and, when the result table is
    non-empty, prints it, writes it to ``RESULTS_CSV_PATH`` and plots it.

    Raises:
        FileNotFoundError: If ``TENCENT_WV_PATH`` does not exist.
    """
    print("--- 1. 正在加载【完整】语料库和词向量... ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)

    wv_available = os.path.exists(TENCENT_WV_PATH)
    if not wv_available:
        raise FileNotFoundError(f"错误: 腾讯词向量文件未找到，请检查路径: {TENCENT_WV_PATH}")
    vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    corpus_dictionary = Dictionary(corpus)
    print("--- 完整语料库和词向量加载成功 ---")

    results_df = evaluate_btm_models_focused(
        model_dir=MODEL_SAVE_DIR,
        topic_range=TOPIC_RANGE,
        full_processed_texts=corpus,
        full_dictionary=corpus_dictionary,
        word_vectors=vectors,
    )

    # Nothing to report: say so and stop.
    if results_df.empty:
        print("\n--- 未找到任何模型进行评估 ---")
        return

    print("\n--- 最终评估完成, 结果如下: ---")
    print(results_df)
    output_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(output_dir, exist_ok=True)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")
    plot_results(results_df)


if __name__ == "__main__":
    main()