In [None]:
# Reduction techniques: NO-DR, CRL, UMAP, PCA (at dimension 128)

In [None]:
#NO-DR

In [None]:
import pickle
import pandas as pd
import time
import os

# Input files read by merge_documents_and_include_originals.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/O/hd_agc_results.pkl'

# Everything this cell writes goes under OUTPUT_DIR; both output file paths
# are derived from it.
OUTPUT_DIR = 'data/4-1/O/'
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')

# Create the output directory up front so later writes cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def merge_documents_and_include_originals(
    hd_agc_results_path=None,
    original_text_path=None,
    clusters_csv_path=None,
    merged_docs_path=None,
):
    """Label every document with its HD-AGC cluster and build the merged corpus.

    Loads the pickled clustering result (a mapping whose 'clusters' entry is a
    list of clusters, each a sized iterable of document indices) and the
    pickled original documents (each an iterable of string tokens), then:

    1. writes a CSV with one row per document (index, space-joined text and
       cluster label, where -1 means "in no cluster"), sorted by label;
    2. pickles a list made of all original documents followed by one
       "super document" per cluster that has at least one valid member (the
       concatenated tokens of its members).

    Document indices outside ``[0, number of documents)`` are ignored in both
    steps and reported once. If a document appears in several clusters, its
    CSV label is the last cluster that lists it.

    Args:
        hd_agc_results_path: Pickle holding the clustering result. Defaults
            to the module-level INPUT_HD_AGC_RESULTS_PATH.
        original_text_path: Pickle holding the original documents. Defaults
            to the module-level INPUT_ORIGINAL_TEXT_PATH.
        clusters_csv_path: Destination of the per-document CSV. Defaults to
            the module-level OUTPUT_CLUSTERS_CSV_PATH.
        merged_docs_path: Destination of the merged-documents pickle.
            Defaults to the module-level OUTPUT_MERGED_DOCS_PATH.

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    # Defaults are resolved at call time so a bare call keeps using the
    # module-level path constants.
    if hd_agc_results_path is None:
        hd_agc_results_path = INPUT_HD_AGC_RESULTS_PATH
    if original_text_path is None:
        original_text_path = INPUT_ORIGINAL_TEXT_PATH
    if clusters_csv_path is None:
        clusters_csv_path = OUTPUT_CLUSTERS_CSV_PATH
    if merged_docs_path is None:
        merged_docs_path = OUTPUT_MERGED_DOCS_PATH

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    print(f"--- 正在从 {hd_agc_results_path} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(hd_agc_results_path):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {hd_agc_results_path}")
    with open(hd_agc_results_path, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {original_text_path} 加载原始文本... ---")
    if not os.path.exists(original_text_path):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {original_text_path}")
    with open(original_text_path, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f" 从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        # sorted() is stable, so equally sized clusters keep their original order.
        largest = sorted(enumerate(cluster_sizes), key=lambda pair: pair[1], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for rank, (cluster_idx, size) in enumerate(largest, start=1):
            print(f"  - 簇 {cluster_idx} (第 {rank} 大): {size} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    n_invalid_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same validity rule as the merge step below. Without it a
            # negative index would silently label the wrong document and a
            # too-large one would raise IndexError.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                n_invalid_indices += 1
    if n_invalid_indices:
        print(f"   - 警告: 忽略了 {n_invalid_indices} 个超出范围的文档索引。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # Needed when a caller passes an output path in a not-yet-existing folder.
    os.makedirs(os.path.dirname(clusters_csv_path) or '.', exist_ok=True)
    results_df.to_csv(clusters_csv_path, index=False, encoding='utf-8-sig')
    print(f" 详细结果已保存到: {clusters_csv_path}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Id of the first cluster that produced a super document. That document
    # sits at merged_docs_list[n_total_docs] and is used for the sample print.
    sample_cluster_id = None
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = [
            token
            for doc_index in doc_indices
            if 0 <= doc_index < n_total_docs
            for token in paragraph[doc_index]
        ]
        if merged_doc:
            if sample_cluster_id is None:
                sample_cluster_id = cluster_id
            merged_docs_list.append(merged_doc)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    # Report the number of super documents actually created; clusters with no
    # valid, non-empty member do not produce one.
    n_super_docs = final_doc_count - n_total_docs
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_super_docs} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    os.makedirs(os.path.dirname(merged_docs_path) or '.', exist_ok=True)
    with open(merged_docs_path, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f" 成功将文档保存到: {merged_docs_path}")

    # Only show a sample when a super document really exists, and label it
    # with the cluster it was built from.
    if sample_cluster_id is not None:
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os  
from gensim.corpora import Dictionary
from collections import Counter


# Input: pickled tokenised documents. Outputs: the filtered gensim dictionary
# and the bag-of-words corpus built with it.
PROCESSED_CORPUS_PICKLE = 'data/4-1/O/merged_documents_with_clusters.pkl'
FILTERED_DICTIONARY_PATH = 'data/4-1/O/final_dictionary.dict'
FILTERED_CORPUS_PATH = 'data/4-1/O/final_corpus.pkl'

# Arguments passed to Dictionary.filter_extremes below.
NO_BELOW = 10
NO_ABOVE = 0.35
KEEP_N = None

if __name__ == '__main__':
    # Step 1: load the tokenised documents.
    print("--- 1. 加载预处理好的分词后文本 ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    # Step 2: build the unfiltered dictionary.
    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    # Step 3: predict by hand which tokens each rule will drop, so the
    # prediction can be compared with what gensim does in step 4.
    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")

    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Rule 1: tokens whose document frequency is below NO_BELOW.
    no_below_limit = NO_BELOW
    low_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count < no_below_limit
    }
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Rule 2: tokens found in more than the NO_ABOVE fraction of documents.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count > no_above_limit
    }
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    if high_freq_ids:
        # Show the five most widespread tokens that are about to be removed.
        print(" - 示例 (将被移除的高频词):")
        sorted_high_freq = sorted(high_freq_ids, key=dictionary.dfs.get, reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # A token can only be dropped once, hence the set union.
    total_removed_ids = low_freq_ids | high_freq_ids
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    # Step 4: let gensim do the real filtering and compare with the estimate.
    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    dictionary.compactify()

    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    print(
        "验证成功：手动分析结果与Gensim执行结果一致。"
        if final_vocab_size_actual == final_vocab_size_estimated
        else "警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。"
    )

    # Step 5: convert every document to bag-of-words and persist both artifacts.
    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")

    output_dir = os.path.dirname(FILTERED_DICTIONARY_PATH)
    os.makedirs(output_dir, exist_ok=True)

    corpus = list(map(dictionary.doc2bow, processed_texts))

    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)

    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing


# Inputs read by this cell: the filtered dictionary and the BoW corpus.
FINAL_DICTIONARY_PATH = 'data/4-1/O/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/O/final_corpus.pkl'

# One model per topic count in TOPIC_RANGE (3 through 15) is saved here.
MODEL_SAVE_DIR = 'data/4-1/O/model/'
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Leave two cores free, but always use at least one worker.
    num_workers = max(multiprocessing.cpu_count() - 2, 1)
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not (os.path.exists(FINAL_DICTIONARY_PATH) and os.path.exists(FINAL_CORPUS_PATH)):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as f:
        corpus = pickle.load(f)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")

    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        # Same seed and number of passes for every topic count, so the
        # models differ only in num_topics.
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            passes=10,
            workers=num_workers,
            random_state=42,
        )

        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    print(f"\n--- 全部模型训练完毕, 总耗时: {end_time - start_time:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Artifacts read back for evaluation: saved models, dictionary and texts.
MODEL_SAVE_DIR = 'data/4-1/O/model/'
FINAL_DICTIONARY_PATH = 'data/4-1/O/final_dictionary.dict'
PROCESSED_CORPUS_PICKLE = 'data/4-1/O/merged_documents_with_clusters.pkl'

# Word vectors in word2vec text format (loaded with binary=False in main).
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'

# Destination of the evaluation table.
RESULTS_CSV_PATH = 'data/4-1/O/lda_f_evaluation.csv'

# Topic counts whose saved models are evaluated: 3 through 15.
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score each saved LDA model on semantic coherence and topic diversity.

    For every topic count ``n`` in ``topic_range`` the model stored at
    ``<model_dir>/lda_model_<n>.model`` is loaded; a missing file is reported
    and skipped. Three scores are computed per model:

    * 'c_w2v' coherence (gensim CoherenceModel) over the top 20 words of each
      topic, using ``word_vectors`` as keyed vectors;
    * TopicDiversity over the top 10 words of each topic;
    * InvertedRBO (weight 0.9) over the top 10 words of each topic.

    Args:
        model_dir: Directory holding the saved models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Tokenised documents, passed to CoherenceModel as ``texts``.
        dictionary: gensim Dictionary, passed to CoherenceModel.
        word_vectors: Keyed vectors, passed to CoherenceModel.

    Returns:
        A DataFrame indexed by ``num_topics`` with the columns
        'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'. When no model
        file is found the frame is empty but keeps the same index name and
        columns, so callers can test ``.empty`` instead of hitting a KeyError.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    # The metric objects are created on the first model actually found, so a
    # run with no models on disk does no metric setup at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the top 20 words per topic, diversity the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        # The diversity metrics take a dict with a "topics" entry.
        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep set_index working when `results` is empty; without
    # them an empty frame has no "num_topics" column and set_index raises.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Plot coherence and the two diversity scores against the number of topics.

    Coherence is drawn on the left y-axis and the two diversity scores on a
    twin right y-axis of the same figure, which is then shown with plt.show().

    Args:
        results_df: DataFrame indexed by topic count with the columns
            'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'.
    """
    print("\n--- 正在可视化评估结果... ---")
    figure, coherence_ax = plt.subplots(figsize=(14, 8))
    topic_counts = results_df.index

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, colour, line_style in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=colour,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    figure.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    figure.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    # Keep the top strip of the figure free for the title and the legend.
    figure.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load texts, dictionary and word vectors, then score, save and plot the models.

    All paths and the topic range come from the module-level constants. When
    the evaluation yields an empty table a failure message is printed and
    nothing is saved or plotted.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(texts)} 条原始文本。")

    vocabulary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(vocabulary)})。")

    print(" - 正在加载腾讯词向量模型...")
    vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    scores = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, texts, vocabulary, vectors)

    # Guard clause: nothing to report when no model was evaluated.
    if scores.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(scores)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    scores.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(scores)

if __name__ == "__main__":
    main()

In [None]:
#CRL

In [None]:
import pickle
import pandas as pd
import time
import os

# Input files read by merge_documents_and_include_originals.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/CRL/hd_agc_results.pkl'

# Everything this cell writes goes under OUTPUT_DIR; both output file paths
# are derived from it.
OUTPUT_DIR = 'data/4-1/CRL/'
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')

# Create the output directory up front so later writes cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def merge_documents_and_include_originals(
    hd_agc_results_path=None,
    original_text_path=None,
    clusters_csv_path=None,
    merged_docs_path=None,
):
    """Label every document with its HD-AGC cluster and build the merged corpus.

    Loads the pickled clustering result (a mapping whose 'clusters' entry is a
    list of clusters, each a sized iterable of document indices) and the
    pickled original documents (each an iterable of string tokens), then:

    1. writes a CSV with one row per document (index, space-joined text and
       cluster label, where -1 means "in no cluster"), sorted by label;
    2. pickles a list made of all original documents followed by one
       "super document" per cluster that has at least one valid member (the
       concatenated tokens of its members).

    Document indices outside ``[0, number of documents)`` are ignored in both
    steps and reported once. If a document appears in several clusters, its
    CSV label is the last cluster that lists it.

    Args:
        hd_agc_results_path: Pickle holding the clustering result. Defaults
            to the module-level INPUT_HD_AGC_RESULTS_PATH.
        original_text_path: Pickle holding the original documents. Defaults
            to the module-level INPUT_ORIGINAL_TEXT_PATH.
        clusters_csv_path: Destination of the per-document CSV. Defaults to
            the module-level OUTPUT_CLUSTERS_CSV_PATH.
        merged_docs_path: Destination of the merged-documents pickle.
            Defaults to the module-level OUTPUT_MERGED_DOCS_PATH.

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    # Defaults are resolved at call time so a bare call keeps using the
    # module-level path constants.
    if hd_agc_results_path is None:
        hd_agc_results_path = INPUT_HD_AGC_RESULTS_PATH
    if original_text_path is None:
        original_text_path = INPUT_ORIGINAL_TEXT_PATH
    if clusters_csv_path is None:
        clusters_csv_path = OUTPUT_CLUSTERS_CSV_PATH
    if merged_docs_path is None:
        merged_docs_path = OUTPUT_MERGED_DOCS_PATH

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    print(f"--- 正在从 {hd_agc_results_path} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(hd_agc_results_path):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {hd_agc_results_path}")
    with open(hd_agc_results_path, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {original_text_path} 加载原始文本... ---")
    if not os.path.exists(original_text_path):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {original_text_path}")
    with open(original_text_path, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f" 从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        # sorted() is stable, so equally sized clusters keep their original order.
        largest = sorted(enumerate(cluster_sizes), key=lambda pair: pair[1], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for rank, (cluster_idx, size) in enumerate(largest, start=1):
            print(f"  - 簇 {cluster_idx} (第 {rank} 大): {size} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    n_invalid_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same validity rule as the merge step below. Without it a
            # negative index would silently label the wrong document and a
            # too-large one would raise IndexError.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                n_invalid_indices += 1
    if n_invalid_indices:
        print(f"   - 警告: 忽略了 {n_invalid_indices} 个超出范围的文档索引。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # Needed when a caller passes an output path in a not-yet-existing folder.
    os.makedirs(os.path.dirname(clusters_csv_path) or '.', exist_ok=True)
    results_df.to_csv(clusters_csv_path, index=False, encoding='utf-8-sig')
    print(f" 详细结果已保存到: {clusters_csv_path}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Id of the first cluster that produced a super document. That document
    # sits at merged_docs_list[n_total_docs] and is used for the sample print.
    sample_cluster_id = None
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = [
            token
            for doc_index in doc_indices
            if 0 <= doc_index < n_total_docs
            for token in paragraph[doc_index]
        ]
        if merged_doc:
            if sample_cluster_id is None:
                sample_cluster_id = cluster_id
            merged_docs_list.append(merged_doc)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    # Report the number of super documents actually created; clusters with no
    # valid, non-empty member do not produce one.
    n_super_docs = final_doc_count - n_total_docs
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_super_docs} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    os.makedirs(os.path.dirname(merged_docs_path) or '.', exist_ok=True)
    with open(merged_docs_path, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f" 成功将文档保存到: {merged_docs_path}")

    # Only show a sample when a super document really exists, and label it
    # with the cluster it was built from.
    if sample_cluster_id is not None:
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os  
from gensim.corpora import Dictionary
from collections import Counter


# Input: pickled tokenised documents. Outputs: the filtered gensim dictionary
# and the bag-of-words corpus built with it.
PROCESSED_CORPUS_PICKLE = 'data/4-1/CRL/merged_documents_with_clusters.pkl'
FILTERED_DICTIONARY_PATH = 'data/4-1/CRL/final_dictionary.dict'
FILTERED_CORPUS_PATH = 'data/4-1/CRL/final_corpus.pkl'

# Arguments passed to Dictionary.filter_extremes below.
NO_BELOW = 10
NO_ABOVE = 0.35
KEEP_N = None

if __name__ == '__main__':
    # Step 1: load the tokenised documents.
    print("--- 1. 加载预处理好的分词后文本 ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    # Step 2: build the unfiltered dictionary.
    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    # Step 3: predict by hand which tokens each rule will drop, so the
    # prediction can be compared with what gensim does in step 4.
    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")

    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Rule 1: tokens whose document frequency is below NO_BELOW.
    no_below_limit = NO_BELOW
    low_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count < no_below_limit
    }
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Rule 2: tokens found in more than the NO_ABOVE fraction of documents.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count > no_above_limit
    }
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    if high_freq_ids:
        # Show the five most widespread tokens that are about to be removed.
        print(" - 示例 (将被移除的高频词):")
        sorted_high_freq = sorted(high_freq_ids, key=dictionary.dfs.get, reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # A token can only be dropped once, hence the set union.
    total_removed_ids = low_freq_ids | high_freq_ids
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    # Step 4: let gensim do the real filtering and compare with the estimate.
    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    dictionary.compactify()

    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    print(
        "验证成功：手动分析结果与Gensim执行结果一致。"
        if final_vocab_size_actual == final_vocab_size_estimated
        else "警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。"
    )

    # Step 5: convert every document to bag-of-words and persist both artifacts.
    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")

    output_dir = os.path.dirname(FILTERED_DICTIONARY_PATH)
    os.makedirs(output_dir, exist_ok=True)

    corpus = list(map(dictionary.doc2bow, processed_texts))

    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)

    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing


# Inputs read by this cell: the filtered dictionary and the BoW corpus.
FINAL_DICTIONARY_PATH = 'data/4-1/CRL/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/CRL/final_corpus.pkl'

# One model per topic count in TOPIC_RANGE (3 through 15) is saved here.
MODEL_SAVE_DIR = 'data/4-1/CRL/model/'
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Leave two cores free, but always use at least one worker.
    num_workers = max(multiprocessing.cpu_count() - 2, 1)
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not (os.path.exists(FINAL_DICTIONARY_PATH) and os.path.exists(FINAL_CORPUS_PATH)):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as f:
        corpus = pickle.load(f)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")

    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        # Same seed and number of passes for every topic count, so the
        # models differ only in num_topics.
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            passes=10,
            workers=num_workers,
            random_state=42,
        )

        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    print(f"\n--- 全部模型训练完毕, 总耗时: {end_time - start_time:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Artifacts read back for evaluation: saved models, dictionary and texts.
MODEL_SAVE_DIR = 'data/4-1/CRL/model/'
FINAL_DICTIONARY_PATH = 'data/4-1/CRL/final_dictionary.dict'
PROCESSED_CORPUS_PICKLE = 'data/4-1/CRL/merged_documents_with_clusters.pkl'

# Word vectors in word2vec text format (loaded with binary=False in main).
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'

# Destination of the evaluation table.
RESULTS_CSV_PATH = 'data/4-1/CRL/lda_f_evaluation.csv'

# Topic counts whose saved models are evaluated: 3 through 15.
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score each saved LDA model on semantic coherence and topic diversity.

    For every topic count ``n`` in ``topic_range`` the model stored at
    ``<model_dir>/lda_model_<n>.model`` is loaded; a missing file is reported
    and skipped. Three scores are computed per model:

    * 'c_w2v' coherence (gensim CoherenceModel) over the top 20 words of each
      topic, using ``word_vectors`` as keyed vectors;
    * TopicDiversity over the top 10 words of each topic;
    * InvertedRBO (weight 0.9) over the top 10 words of each topic.

    Args:
        model_dir: Directory holding the saved models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Tokenised documents, passed to CoherenceModel as ``texts``.
        dictionary: gensim Dictionary, passed to CoherenceModel.
        word_vectors: Keyed vectors, passed to CoherenceModel.

    Returns:
        A DataFrame indexed by ``num_topics`` with the columns
        'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'. When no model
        file is found the frame is empty but keeps the same index name and
        columns, so callers can test ``.empty`` instead of hitting a KeyError.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    # The metric objects are created on the first model actually found, so a
    # run with no models on disk does no metric setup at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the top 20 words per topic, diversity the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        # The diversity metrics take a dict with a "topics" entry.
        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep set_index working when `results` is empty; without
    # them an empty frame has no "num_topics" column and set_index raises.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Plot coherence and the two diversity scores against the number of topics.

    Coherence is drawn on the left y-axis and the two diversity scores on a
    twin right y-axis of the same figure, which is then shown with plt.show().

    Args:
        results_df: DataFrame indexed by topic count with the columns
            'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'.
    """
    print("\n--- 正在可视化评估结果... ---")
    figure, coherence_ax = plt.subplots(figsize=(14, 8))
    topic_counts = results_df.index

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, colour, line_style in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=colour,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    figure.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    figure.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    # Keep the top strip of the figure free for the title and the legend.
    figure.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load texts, dictionary and word vectors, then score, save and plot the models.

    All paths and the topic range come from the module-level constants. When
    the evaluation yields an empty table a failure message is printed and
    nothing is saved or plotted.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(texts)} 条原始文本。")

    vocabulary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(vocabulary)})。")

    print(" - 正在加载腾讯词向量模型...")
    vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    scores = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, texts, vocabulary, vectors)

    # Guard clause: nothing to report when no model was evaluated.
    if scores.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(scores)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    scores.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(scores)

if __name__ == "__main__":
    main()

In [None]:
#UMAP

In [None]:
import pickle
import pandas as pd
import time
import os

# Input files read by merge_documents_and_include_originals.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/UMAP/hd_agc_results.pkl'

# Everything this cell writes goes under OUTPUT_DIR; both output file paths
# are derived from it.
OUTPUT_DIR = 'data/4-1/UMAP/'
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')

# Create the output directory up front so later writes cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def merge_documents_and_include_originals(
    hd_agc_results_path=None,
    original_text_path=None,
    clusters_csv_path=None,
    merged_docs_path=None,
):
    """Label every document with its HD-AGC cluster and build the merged corpus.

    Loads the pickled clustering result (a mapping whose 'clusters' entry is a
    list of clusters, each a sized iterable of document indices) and the
    pickled original documents (each an iterable of string tokens), then:

    1. writes a CSV with one row per document (index, space-joined text and
       cluster label, where -1 means "in no cluster"), sorted by label;
    2. pickles a list made of all original documents followed by one
       "super document" per cluster that has at least one valid member (the
       concatenated tokens of its members).

    Document indices outside ``[0, number of documents)`` are ignored in both
    steps and reported once. If a document appears in several clusters, its
    CSV label is the last cluster that lists it.

    Args:
        hd_agc_results_path: Pickle holding the clustering result. Defaults
            to the module-level INPUT_HD_AGC_RESULTS_PATH.
        original_text_path: Pickle holding the original documents. Defaults
            to the module-level INPUT_ORIGINAL_TEXT_PATH.
        clusters_csv_path: Destination of the per-document CSV. Defaults to
            the module-level OUTPUT_CLUSTERS_CSV_PATH.
        merged_docs_path: Destination of the merged-documents pickle.
            Defaults to the module-level OUTPUT_MERGED_DOCS_PATH.

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    # Defaults are resolved at call time so a bare call keeps using the
    # module-level path constants.
    if hd_agc_results_path is None:
        hd_agc_results_path = INPUT_HD_AGC_RESULTS_PATH
    if original_text_path is None:
        original_text_path = INPUT_ORIGINAL_TEXT_PATH
    if clusters_csv_path is None:
        clusters_csv_path = OUTPUT_CLUSTERS_CSV_PATH
    if merged_docs_path is None:
        merged_docs_path = OUTPUT_MERGED_DOCS_PATH

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    print(f"--- 正在从 {hd_agc_results_path} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(hd_agc_results_path):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {hd_agc_results_path}")
    with open(hd_agc_results_path, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {original_text_path} 加载原始文本... ---")
    if not os.path.exists(original_text_path):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {original_text_path}")
    with open(original_text_path, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f" 从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        # sorted() is stable, so equally sized clusters keep their original order.
        largest = sorted(enumerate(cluster_sizes), key=lambda pair: pair[1], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for rank, (cluster_idx, size) in enumerate(largest, start=1):
            print(f"  - 簇 {cluster_idx} (第 {rank} 大): {size} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    n_invalid_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same validity rule as the merge step below. Without it a
            # negative index would silently label the wrong document and a
            # too-large one would raise IndexError.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                n_invalid_indices += 1
    if n_invalid_indices:
        print(f"   - 警告: 忽略了 {n_invalid_indices} 个超出范围的文档索引。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # Needed when a caller passes an output path in a not-yet-existing folder.
    os.makedirs(os.path.dirname(clusters_csv_path) or '.', exist_ok=True)
    results_df.to_csv(clusters_csv_path, index=False, encoding='utf-8-sig')
    print(f" 详细结果已保存到: {clusters_csv_path}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Id of the first cluster that produced a super document. That document
    # sits at merged_docs_list[n_total_docs] and is used for the sample print.
    sample_cluster_id = None
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = [
            token
            for doc_index in doc_indices
            if 0 <= doc_index < n_total_docs
            for token in paragraph[doc_index]
        ]
        if merged_doc:
            if sample_cluster_id is None:
                sample_cluster_id = cluster_id
            merged_docs_list.append(merged_doc)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    # Report the number of super documents actually created; clusters with no
    # valid, non-empty member do not produce one.
    n_super_docs = final_doc_count - n_total_docs
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_super_docs} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    os.makedirs(os.path.dirname(merged_docs_path) or '.', exist_ok=True)
    with open(merged_docs_path, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f" 成功将文档保存到: {merged_docs_path}")

    # Only show a sample when a super document really exists, and label it
    # with the cluster it was built from.
    if sample_cluster_id is not None:
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os  
from gensim.corpora import Dictionary
from collections import Counter


# Input: pickled tokenised documents. Outputs: the filtered gensim dictionary
# and the bag-of-words corpus built with it.
PROCESSED_CORPUS_PICKLE = 'data/4-1/UMAP/merged_documents_with_clusters.pkl'
FILTERED_DICTIONARY_PATH = 'data/4-1/UMAP/final_dictionary.dict'
FILTERED_CORPUS_PATH = 'data/4-1/UMAP/final_corpus.pkl'

# Arguments passed to Dictionary.filter_extremes below.
NO_BELOW = 10
NO_ABOVE = 0.35
KEEP_N = None

if __name__ == '__main__':
    # Step 1: load the tokenised documents.
    print("--- 1. 加载预处理好的分词后文本 ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    # Step 2: build the unfiltered dictionary.
    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    # Step 3: predict by hand which tokens each rule will drop, so the
    # prediction can be compared with what gensim does in step 4.
    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")

    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Rule 1: tokens whose document frequency is below NO_BELOW.
    no_below_limit = NO_BELOW
    low_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count < no_below_limit
    }
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Rule 2: tokens found in more than the NO_ABOVE fraction of documents.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {
        token for token, doc_count in dictionary.dfs.items()
        if doc_count > no_above_limit
    }
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    if high_freq_ids:
        # Show the five most widespread tokens that are about to be removed.
        print(" - 示例 (将被移除的高频词):")
        sorted_high_freq = sorted(high_freq_ids, key=dictionary.dfs.get, reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # A token can only be dropped once, hence the set union.
    total_removed_ids = low_freq_ids | high_freq_ids
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    # Step 4: let gensim do the real filtering and compare with the estimate.
    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    dictionary.compactify()

    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    print(
        "验证成功：手动分析结果与Gensim执行结果一致。"
        if final_vocab_size_actual == final_vocab_size_estimated
        else "警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。"
    )

    # Step 5: convert every document to bag-of-words and persist both artifacts.
    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")

    output_dir = os.path.dirname(FILTERED_DICTIONARY_PATH)
    os.makedirs(output_dir, exist_ok=True)

    corpus = list(map(dictionary.doc2bow, processed_texts))

    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)

    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing


# Filtered dictionary and bag-of-words corpus for the UMAP variant.
FINAL_DICTIONARY_PATH = 'data/4-1/UMAP/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/UMAP/final_corpus.pkl'

# One LDA model is trained and saved for every topic count in TOPIC_RANGE (3..15).
MODEL_SAVE_DIR = 'data/4-1/UMAP/model/'
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Keep two cores free when more than two are available; otherwise use one.
    num_workers = max(1, multiprocessing.cpu_count() - 2)
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not (os.path.exists(FINAL_DICTIONARY_PATH) and os.path.exists(FINAL_CORPUS_PATH)):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")

    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()

    # Settings shared by every model; only the number of topics varies.
    lda_settings = dict(
        id2word=dictionary,
        random_state=42,
        passes=10,
        workers=num_workers,
    )

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        model = LdaMulticore(corpus=corpus, num_topics=n_topics, **lda_settings)

        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    elapsed_seconds = time.time() - start_time
    print(f"\n--- 全部模型训练完毕, 总耗时: {elapsed_seconds:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Pickled texts handed to the coherence model as reference texts (UMAP variant).
PROCESSED_CORPUS_PICKLE = 'data/4-1/UMAP/merged_documents_with_clusters.pkl' 
# gensim dictionary loaded for the coherence computation.
FINAL_DICTIONARY_PATH = 'data/4-1/UMAP/final_dictionary.dict'
# Word vectors in word2vec text format, used by the c_w2v coherence measure.
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
# Directory the saved models (lda_model_<n>.model) are read from.
MODEL_SAVE_DIR = 'data/4-1/UMAP/model/'
# Destination of the per-topic-count metric table.
RESULTS_CSV_PATH = 'data/4-1/UMAP/lda_f_evaluation.csv'

# Topic counts to evaluate (3..15 inclusive).
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model on semantic coherence and topic diversity.

    For each topic count in ``topic_range`` the file ``lda_model_<n>.model`` is
    loaded from ``model_dir``; missing files are skipped with a warning. Three
    metrics are computed per model: gensim ``c_w2v`` coherence over the top-20
    words of each topic, and OCTIS ``TopicDiversity`` and ``InvertedRBO`` over
    the top-10 words.

    Args:
        model_dir: Directory containing the saved models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Texts passed to ``CoherenceModel`` as ``texts``.
        dictionary: Dictionary passed to ``CoherenceModel``.
        word_vectors: Keyed vectors used by the ``c_w2v`` measure.

    Returns:
        A DataFrame indexed by ``num_topics`` with one column per metric. When
        no model file is found the frame is empty but keeps its index name and
        columns, so callers can test ``.empty`` instead of hitting a KeyError.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    # The metric objects are created on first use, so a run that finds no
    # model file does no evaluation work at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the top-20 words per topic, the diversity metrics the top-10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep "num_topics" present even when `results` is empty;
    # without them set_index raises KeyError on an empty frame.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Draw coherence (left axis) and the two diversity scores (right axis) against topic count.

    Args:
        results_df: Frame indexed by topic count with the columns
            'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'.
    """
    print("\n--- 正在可视化评估结果... ---")
    topic_counts = results_df.index
    fig, coherence_ax = plt.subplots(figsize=(14, 8))

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity curves share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_curves = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, line_color, line_style in diversity_curves:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=line_color,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    # A figure-level legend gathers the labelled lines from both axes.
    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load the evaluation inputs, score every saved model, then save and plot the scores."""
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(processed_texts)} 条原始文本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(dictionary)})。")

    print(" - 正在加载腾讯词向量模型...")
    word_vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    results_df = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, processed_texts, dictionary, word_vectors)

    # Nothing was evaluated: report it and stop before writing any output.
    if results_df.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(results_df)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(results_df)


if __name__ == "__main__":
    main()

In [None]:
#PCA

In [None]:
import pickle
import pandas as pd
import time
import os

# Pickled HD-AGC clustering result for the PCA variant; its 'clusters' entry is read below.
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/PCA/hd_agc_results.pkl'
# Pickled original documents.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'
# The output directory is created at import/cell-execution time so later writes cannot fail on it.
OUTPUT_DIR = 'data/4-1/PCA/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# CSV with one row per document and its cluster label.
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')
# Pickle holding the original documents followed by the merged cluster documents.
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')

def merge_documents_and_include_originals():
    """Build and save the document set: every original document plus one merged document per cluster.

    Steps:
      1. Load the HD-AGC result pickle (a dict whose 'clusters' entry is read as
         a sequence of clusters, each a sequence of document indices) and the
         original documents.
      2. Print cluster statistics, including the ten largest clusters.
      3. Write a CSV with one row per document and its cluster label
         (-1 for documents that belong to no cluster).
      4. Pickle the list made of every original document followed by one
         "super document" per non-empty cluster (its members concatenated).

    Document indices outside ``[0, len(paragraph))`` are skipped in both the
    labelling and the merging step and reported once, so a malformed cluster can
    neither mislabel another document through negative indexing nor abort the run.

    Raises:
        FileNotFoundError: If either input pickle does not exist.
    """
    print(f"--- 正在从 {INPUT_HD_AGC_RESULTS_PATH} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(INPUT_HD_AGC_RESULTS_PATH):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {INPUT_HD_AGC_RESULTS_PATH}")
    with open(INPUT_HD_AGC_RESULTS_PATH, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {INPUT_ORIGINAL_TEXT_PATH} 加载原始文本... ---")
    if not os.path.exists(INPUT_ORIGINAL_TEXT_PATH):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {INPUT_ORIGINAL_TEXT_PATH}")
    with open(INPUT_ORIGINAL_TEXT_PATH, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f" 从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        top_10_indices = sorted(range(n_clusters), key=lambda i: cluster_sizes[i], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for rank, cluster_idx in enumerate(top_10_indices, start=1):
            print(f"  - 簇 {cluster_idx} (第 {rank} 大): {cluster_sizes[cluster_idx]} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    n_invalid_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same range check as the merge step below: a negative index would
            # otherwise wrap around and label the wrong document.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                n_invalid_indices += 1
    if n_invalid_indices:
        print(f"   - 警告: 跳过了 {n_invalid_indices} 个超出范围的文档索引。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # utf-8-sig writes a BOM at the start of the file.
    results_df.to_csv(OUTPUT_CLUSTERS_CSV_PATH, index=False, encoding='utf-8-sig')
    print(f" 详细结果已保存到: {OUTPUT_CLUSTERS_CSV_PATH}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Cluster id of the first super document actually appended; it always lands
    # at position n_total_docs, but it is not necessarily cluster 0.
    sample_cluster_id = None
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = []
        for doc_index in doc_indices:
            if 0 <= doc_index < n_total_docs:
                merged_doc.extend(paragraph[doc_index])
        if merged_doc:
            if sample_cluster_id is None:
                sample_cluster_id = cluster_id
            merged_docs_list.append(merged_doc)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_clusters} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    with open(OUTPUT_MERGED_DOCS_PATH, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f" 成功将文档保存到: {OUTPUT_MERGED_DOCS_PATH}")

    # Show a sample only when a super document exists, and name the cluster it
    # really came from instead of assuming cluster 0.
    if sample_cluster_id is not None:
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import pandas as pd
import time
import os

# NOTE(review): this cell repeats the preceding PCA merge cell verbatim — confirm whether both are needed.
# Pickled HD-AGC clustering result for the PCA variant; its 'clusters' entry is read below.
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/PCA/hd_agc_results.pkl'
# Pickled original documents.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'
# The output directory is created at import/cell-execution time so later writes cannot fail on it.
OUTPUT_DIR = 'data/4-1/PCA/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# CSV with one row per document and its cluster label.
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')
# Pickle holding the original documents followed by the merged cluster documents.
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')

def merge_documents_and_include_originals():
    """Build and save the document set: every original document plus one merged document per cluster.

    Steps:
      1. Load the HD-AGC result pickle (a dict whose 'clusters' entry is read as
         a sequence of clusters, each a sequence of document indices) and the
         original documents.
      2. Print cluster statistics, including the ten largest clusters.
      3. Write a CSV with one row per document and its cluster label
         (-1 for documents that belong to no cluster).
      4. Pickle the list made of every original document followed by one
         "super document" per non-empty cluster (its members concatenated).

    Document indices outside ``[0, len(paragraph))`` are skipped in both the
    labelling and the merging step and reported once, so a malformed cluster can
    neither mislabel another document through negative indexing nor abort the run.

    Raises:
        FileNotFoundError: If either input pickle does not exist.
    """
    print(f"--- 正在从 {INPUT_HD_AGC_RESULTS_PATH} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(INPUT_HD_AGC_RESULTS_PATH):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {INPUT_HD_AGC_RESULTS_PATH}")
    with open(INPUT_HD_AGC_RESULTS_PATH, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {INPUT_ORIGINAL_TEXT_PATH} 加载原始文本... ---")
    if not os.path.exists(INPUT_ORIGINAL_TEXT_PATH):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {INPUT_ORIGINAL_TEXT_PATH}")
    with open(INPUT_ORIGINAL_TEXT_PATH, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f" 从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        top_10_indices = sorted(range(n_clusters), key=lambda i: cluster_sizes[i], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for rank, cluster_idx in enumerate(top_10_indices, start=1):
            print(f"  - 簇 {cluster_idx} (第 {rank} 大): {cluster_sizes[cluster_idx]} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    n_invalid_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same range check as the merge step below: a negative index would
            # otherwise wrap around and label the wrong document.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                n_invalid_indices += 1
    if n_invalid_indices:
        print(f"   - 警告: 跳过了 {n_invalid_indices} 个超出范围的文档索引。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # utf-8-sig writes a BOM at the start of the file.
    results_df.to_csv(OUTPUT_CLUSTERS_CSV_PATH, index=False, encoding='utf-8-sig')
    print(f" 详细结果已保存到: {OUTPUT_CLUSTERS_CSV_PATH}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Cluster id of the first super document actually appended; it always lands
    # at position n_total_docs, but it is not necessarily cluster 0.
    sample_cluster_id = None
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = []
        for doc_index in doc_indices:
            if 0 <= doc_index < n_total_docs:
                merged_doc.extend(paragraph[doc_index])
        if merged_doc:
            if sample_cluster_id is None:
                sample_cluster_id = cluster_id
            merged_docs_list.append(merged_doc)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_clusters} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    with open(OUTPUT_MERGED_DOCS_PATH, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f" 成功将文档保存到: {OUTPUT_MERGED_DOCS_PATH}")

    # Show a sample only when a super document exists, and name the cluster it
    # really came from instead of assuming cluster 0.
    if sample_cluster_id is not None:
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing


# Filtered dictionary and bag-of-words corpus for the PCA variant.
FINAL_DICTIONARY_PATH = 'data/4-1/PCA/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/PCA/final_corpus.pkl'

# One LDA model is trained and saved for every topic count in TOPIC_RANGE (3..15).
MODEL_SAVE_DIR = 'data/4-1/PCA/model/'
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Keep two cores free when more than two are available; otherwise use one.
    num_workers = max(1, multiprocessing.cpu_count() - 2)
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not (os.path.exists(FINAL_DICTIONARY_PATH) and os.path.exists(FINAL_CORPUS_PATH)):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")

    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()

    # Settings shared by every model; only the number of topics varies.
    lda_settings = dict(
        id2word=dictionary,
        random_state=42,
        passes=10,
        workers=num_workers,
    )

    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        model = LdaMulticore(corpus=corpus, num_topics=n_topics, **lda_settings)

        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    elapsed_seconds = time.time() - start_time
    print(f"\n--- 全部模型训练完毕, 总耗时: {elapsed_seconds:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Pickled texts handed to the coherence model as reference texts (PCA variant).
PROCESSED_CORPUS_PICKLE = 'data/4-1/PCA/merged_documents_with_clusters.pkl' 
# gensim dictionary loaded for the coherence computation.
FINAL_DICTIONARY_PATH = 'data/4-1/PCA/final_dictionary.dict'
# Word vectors in word2vec text format, used by the c_w2v coherence measure.
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
# Directory the saved models (lda_model_<n>.model) are read from.
MODEL_SAVE_DIR = 'data/4-1/PCA/model/'
# Destination of the per-topic-count metric table.
RESULTS_CSV_PATH = 'data/4-1/PCA/lda_f_evaluation.csv'

# Topic counts to evaluate (3..15 inclusive).
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model on semantic coherence and topic diversity.

    For each topic count in ``topic_range`` the file ``lda_model_<n>.model`` is
    loaded from ``model_dir``; missing files are skipped with a warning. Three
    metrics are computed per model: gensim ``c_w2v`` coherence over the top-20
    words of each topic, and OCTIS ``TopicDiversity`` and ``InvertedRBO`` over
    the top-10 words.

    Args:
        model_dir: Directory containing the saved models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Texts passed to ``CoherenceModel`` as ``texts``.
        dictionary: Dictionary passed to ``CoherenceModel``.
        word_vectors: Keyed vectors used by the ``c_w2v`` measure.

    Returns:
        A DataFrame indexed by ``num_topics`` with one column per metric. When
        no model file is found the frame is empty but keeps its index name and
        columns, so callers can test ``.empty`` instead of hitting a KeyError.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    # The metric objects are created on first use, so a run that finds no
    # model file does no evaluation work at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the top-20 words per topic, the diversity metrics the top-10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep "num_topics" present even when `results` is empty;
    # without them set_index raises KeyError on an empty frame.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Draw coherence (left axis) and the two diversity scores (right axis) against topic count.

    Args:
        results_df: Frame indexed by topic count with the columns
            'C_W2V (Semantic)', 'Topic Diversity' and 'InvertedRBO'.
    """
    print("\n--- 正在可视化评估结果... ---")
    topic_counts = results_df.index
    fig, coherence_ax = plt.subplots(figsize=(14, 8))

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity curves share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_curves = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, line_color, line_style in diversity_curves:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=line_color,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    # A figure-level legend gathers the labelled lines from both axes.
    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load the evaluation inputs, score every saved model, then save and plot the scores."""
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(processed_texts)} 条原始文本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(dictionary)})。")

    print(" - 正在加载腾讯词向量模型...")
    word_vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    results_df = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, processed_texts, dictionary, word_vectors)

    # Nothing was evaluated: report it and stop before writing any output.
    if results_df.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(results_df)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(results_df)


if __name__ == "__main__":
    main()

In [None]:
#analyse

In [None]:
import pickle
import numpy as np
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, FuncFormatter

# Pickled clustering result for the 'O' variant; its 'clusters' entry is read below.
INPUT_CLUSTER_RESULTS_PATH = 'data/4-1/O/hd_agc_results.pkl'
# Document vectors; clusters index into the rows of this array.
INPUT_VECTORS_PATH = 'data/doc_vectors.npy'

# Where the histogram is saved.
OUTPUT_FIGURE_PATH = 'data/4-1/O/similarity_distribution.png' 

# Drawn as a vertical reference line on the histogram; set to None to omit it.
S_MIN_THRESHOLD = 0.83

# Font sizes used for the figure text.
AXIS_LABEL_FONTSIZE = 18
LEGEND_FONTSIZE = 18
TICK_LABEL_FONTSIZE = 18
SUB_LABEL_FONTSIZE = 28


def evaluate_cluster_similarity():
    """Plot the distribution of pairwise cosine similarities inside the clusters.

    Loads the cluster assignments and the document vectors, collects the cosine
    similarity of every unordered pair of documents that share a cluster
    (clusters with fewer than two members contribute nothing), prints the mean,
    and saves and shows a histogram tagged '(a)' with the ``S_MIN_THRESHOLD``
    line drawn on it.

    Raises:
        FileNotFoundError: If the cluster result file or the vector file is missing.
    """
    print("--- 正在加载数据... ---")
    if not os.path.exists(INPUT_CLUSTER_RESULTS_PATH):
        raise FileNotFoundError(f"聚类结果文件未找到: {INPUT_CLUSTER_RESULTS_PATH}")
    if not os.path.exists(INPUT_VECTORS_PATH):
        raise FileNotFoundError(f"向量文件未找到: {INPUT_VECTORS_PATH}")

    with open(INPUT_CLUSTER_RESULTS_PATH, 'rb') as results_file:
        clusters = pickle.load(results_file)['clusters']
    print(f"成功加载 {len(clusters)} 个簇。")

    # Work in float32 whatever dtype the vectors were stored with.
    vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32, copy=False)
    print(f"成功加载 {len(vectors)} 个向量。")

    print("\n--- 正在计算簇内相似度... ---")
    pair_similarities = []
    for member_ids in tqdm(clusters, desc="[计算中]"):
        cluster_size = len(member_ids)
        if cluster_size >= 2:
            similarity = cosine_similarity(vectors[member_ids])
            # k=1 selects the strict upper triangle: each pair once, no self-pairs.
            rows, cols = np.triu_indices(cluster_size, k=1)
            pair_similarities.extend(similarity[rows, cols].tolist())

    if len(pair_similarities) == 0:
        print("\n错误：未能计算出任何相似度分数。请检查簇是否都过小（成员数<2）。")
        return

    print(f"\n--- 计算完成！共得到 {len(pair_similarities):,} 个相似度分数。---")
    print("--- 正在生成相似度分布图... ---")

    sim_array = np.array(pair_similarities)
    print(f"  - 平均相似度: {sim_array.mean():.4f}")

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)

    sns.histplot(sim_array, bins=100, kde=True, color='skyblue', alpha=0.7, ax=ax)

    if S_MIN_THRESHOLD is not None:
        ax.axvline(x=S_MIN_THRESHOLD, color='r', linestyle='--', linewidth=2,
                   label=f'S_MIN Threshold = {S_MIN_THRESHOLD}')
        ax.legend(fontsize=LEGEND_FONTSIZE)

    ax.set_xlabel('Cosine Similarity', fontsize=AXIS_LABEL_FONTSIZE)
    # Sub-figure tag, positioned in axes coordinates below the x axis.
    ax.text(0.5, -0.2, '(a)', transform=ax.transAxes, ha='center', va='top',
            fontsize=SUB_LABEL_FONTSIZE)
    ax.set_ylabel('Frequency', fontsize=AXIS_LABEL_FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=TICK_LABEL_FONTSIZE)

    # Fixed axis limits; bars above 20000 are cut off.
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 20000)
    ax.yaxis.set_major_locator(MaxNLocator(prune='lower'))

    def _format_tick(value, _pos):
        # Show the origin as a bare '0' and every other tick with one decimal.
        return '0' if value == 0 else f'{value:.1f}'

    ax.xaxis.set_major_formatter(FuncFormatter(_format_tick))

    output_dir = os.path.dirname(OUTPUT_FIGURE_PATH)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"已创建输出目录: {output_dir}")

    fig.tight_layout(pad=1.5)
    plt.savefig(OUTPUT_FIGURE_PATH, dpi=300)
    print(f"\n评估完成，图像已成功保存至: {OUTPUT_FIGURE_PATH}")

    print("正在显示相似度分布图...")
    plt.show()

if __name__ == '__main__':
    evaluate_cluster_similarity()

In [None]:
import pickle
import numpy as np
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, FuncFormatter


# Pickled clustering result for the CRL variant; its 'clusters' entry is read below.
INPUT_CLUSTER_RESULTS_PATH = 'data/4-1/CRL/hd_agc_results.pkl'
# Document vectors; clusters index into the rows of this array.
INPUT_VECTORS_PATH = 'data/doc_vectors.npy'

# Where the histogram is saved.
OUTPUT_FIGURE_PATH = 'data/4-1/CRL/similarity_distribution.png' 

# Drawn as a vertical reference line on the histogram; set to None to omit it.
S_MIN_THRESHOLD = 0.83

# Font sizes used for the figure text.
AXIS_LABEL_FONTSIZE = 18
LEGEND_FONTSIZE = 18
TICK_LABEL_FONTSIZE = 18
SUB_LABEL_FONTSIZE = 28

def evaluate_cluster_similarity():
    """Plot the distribution of pairwise cosine similarities inside the clusters.

    Loads the cluster assignments and the document vectors, collects the cosine
    similarity of every unordered pair of documents that share a cluster
    (clusters with fewer than two members contribute nothing), prints the mean,
    and saves and shows a histogram tagged '(b)' with the ``S_MIN_THRESHOLD``
    line drawn on it.

    Raises:
        FileNotFoundError: If the cluster result file or the vector file is missing.
    """
    print("--- 正在加载数据... ---")
    if not os.path.exists(INPUT_CLUSTER_RESULTS_PATH):
        raise FileNotFoundError(f"聚类结果文件未找到: {INPUT_CLUSTER_RESULTS_PATH}")
    if not os.path.exists(INPUT_VECTORS_PATH):
        raise FileNotFoundError(f"向量文件未找到: {INPUT_VECTORS_PATH}")

    with open(INPUT_CLUSTER_RESULTS_PATH, 'rb') as results_file:
        clusters = pickle.load(results_file)['clusters']
    print(f"成功加载 {len(clusters)} 个簇。")

    # Work in float32 whatever dtype the vectors were stored with.
    vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32, copy=False)
    print(f"成功加载 {len(vectors)} 个向量。")

    print("\n--- 正在计算簇内相似度... ---")
    pair_similarities = []
    for member_ids in tqdm(clusters, desc="[计算中]"):
        cluster_size = len(member_ids)
        if cluster_size >= 2:
            similarity = cosine_similarity(vectors[member_ids])
            # k=1 selects the strict upper triangle: each pair once, no self-pairs.
            rows, cols = np.triu_indices(cluster_size, k=1)
            pair_similarities.extend(similarity[rows, cols].tolist())

    if len(pair_similarities) == 0:
        print("\n错误：未能计算出任何相似度分数。请检查簇是否都过小（成员数<2）。")
        return

    print(f"\n--- 计算完成！共得到 {len(pair_similarities):,} 个相似度分数。---")
    print("--- 正在生成相似度分布图... ---")

    sim_array = np.array(pair_similarities)
    print(f"  - 平均相似度: {sim_array.mean():.4f}")

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)

    sns.histplot(sim_array, bins=100, kde=True, color='skyblue', alpha=0.7, ax=ax)

    if S_MIN_THRESHOLD is not None:
        ax.axvline(x=S_MIN_THRESHOLD, color='r', linestyle='--', linewidth=2,
                   label=f'S_MIN Threshold = {S_MIN_THRESHOLD}')
        ax.legend(fontsize=LEGEND_FONTSIZE)

    ax.set_xlabel('Cosine Similarity', fontsize=AXIS_LABEL_FONTSIZE)
    # Sub-figure tag, positioned in axes coordinates below the x axis.
    ax.text(0.5, -0.2, '(b)', transform=ax.transAxes, ha='center', va='top',
            fontsize=SUB_LABEL_FONTSIZE)
    ax.set_ylabel('Frequency', fontsize=AXIS_LABEL_FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=TICK_LABEL_FONTSIZE)

    # Fixed axis limits; bars above 20000 are cut off.
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 20000)
    ax.yaxis.set_major_locator(MaxNLocator(prune='lower'))

    def _format_tick(value, _pos):
        # Show the origin as a bare '0' and every other tick with one decimal.
        return '0' if value == 0 else f'{value:.1f}'

    ax.xaxis.set_major_formatter(FuncFormatter(_format_tick))

    output_dir = os.path.dirname(OUTPUT_FIGURE_PATH)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"已创建输出目录: {output_dir}")

    fig.tight_layout(pad=1.5)
    plt.savefig(OUTPUT_FIGURE_PATH, dpi=300)
    print(f"\n评估完成，图像已成功保存至: {OUTPUT_FIGURE_PATH}")

    print("正在显示相似度分布图...")
    plt.show()


if __name__ == '__main__':
    evaluate_cluster_similarity()

In [None]:
import pickle
import numpy as np
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, FuncFormatter

# Pickled clustering result for the UMAP variant; its 'clusters' entry is read below.
INPUT_CLUSTER_RESULTS_PATH = 'data/4-1/UMAP/hd_agc_results.pkl'
# Document vectors; clusters index into the rows of this array.
INPUT_VECTORS_PATH = 'data/doc_vectors.npy'

# Where the histogram is saved.
OUTPUT_FIGURE_PATH = 'data/4-1/UMAP/similarity_distribution.png' 

# Drawn as a vertical reference line on the histogram; set to None to omit it.
S_MIN_THRESHOLD = 0.83
# Font sizes used for the figure text.
AXIS_LABEL_FONTSIZE = 18
LEGEND_FONTSIZE = 18
TICK_LABEL_FONTSIZE = 18
SUB_LABEL_FONTSIZE = 28

def evaluate_cluster_similarity():
    """Plot the distribution of pairwise cosine similarities inside the clusters.

    Loads the cluster assignments and the document vectors, collects the cosine
    similarity of every unordered pair of documents that share a cluster
    (clusters with fewer than two members contribute nothing), prints the mean,
    and saves and shows a histogram tagged '(c)' with the ``S_MIN_THRESHOLD``
    line drawn on it.

    Raises:
        FileNotFoundError: If the cluster result file or the vector file is missing.
    """
    print("--- 正在加载数据... ---")
    if not os.path.exists(INPUT_CLUSTER_RESULTS_PATH):
        raise FileNotFoundError(f"聚类结果文件未找到: {INPUT_CLUSTER_RESULTS_PATH}")
    if not os.path.exists(INPUT_VECTORS_PATH):
        raise FileNotFoundError(f"向量文件未找到: {INPUT_VECTORS_PATH}")

    with open(INPUT_CLUSTER_RESULTS_PATH, 'rb') as results_file:
        clusters = pickle.load(results_file)['clusters']
    print(f"成功加载 {len(clusters)} 个簇。")

    # Work in float32 whatever dtype the vectors were stored with.
    vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32, copy=False)
    print(f"成功加载 {len(vectors)} 个向量。")

    print("\n--- 正在计算簇内相似度... ---")
    pair_similarities = []
    for member_ids in tqdm(clusters, desc="[计算中]"):
        cluster_size = len(member_ids)
        if cluster_size >= 2:
            similarity = cosine_similarity(vectors[member_ids])
            # k=1 selects the strict upper triangle: each pair once, no self-pairs.
            rows, cols = np.triu_indices(cluster_size, k=1)
            pair_similarities.extend(similarity[rows, cols].tolist())

    if len(pair_similarities) == 0:
        print("\n错误：未能计算出任何相似度分数。请检查簇是否都过小（成员数<2）。")
        return

    print(f"\n--- 计算完成！共得到 {len(pair_similarities):,} 个相似度分数。---")
    print("--- 正在生成相似度分布图... ---")

    sim_array = np.array(pair_similarities)
    print(f"  - 平均相似度: {sim_array.mean():.4f}")

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)

    sns.histplot(sim_array, bins=100, kde=True, color='skyblue', alpha=0.7, ax=ax)

    if S_MIN_THRESHOLD is not None:
        ax.axvline(x=S_MIN_THRESHOLD, color='r', linestyle='--', linewidth=2,
                   label=f'S_MIN Threshold = {S_MIN_THRESHOLD}')
        ax.legend(fontsize=LEGEND_FONTSIZE)

    ax.set_xlabel('Cosine Similarity', fontsize=AXIS_LABEL_FONTSIZE)
    # Sub-figure tag, positioned in axes coordinates below the x axis.
    ax.text(0.5, -0.2, '(c)', transform=ax.transAxes, ha='center', va='top',
            fontsize=SUB_LABEL_FONTSIZE)
    ax.set_ylabel('Frequency', fontsize=AXIS_LABEL_FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=TICK_LABEL_FONTSIZE)

    # Fixed axis limits; bars above 20000 are cut off.
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 20000)
    ax.yaxis.set_major_locator(MaxNLocator(prune='lower'))

    def _format_tick(value, _pos):
        # Show the origin as a bare '0' and every other tick with one decimal.
        return '0' if value == 0 else f'{value:.1f}'

    ax.xaxis.set_major_formatter(FuncFormatter(_format_tick))

    output_dir = os.path.dirname(OUTPUT_FIGURE_PATH)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"已创建输出目录: {output_dir}")

    fig.tight_layout(pad=1.5)
    plt.savefig(OUTPUT_FIGURE_PATH, dpi=300)
    print(f"\n评估完成，图像已成功保存至: {OUTPUT_FIGURE_PATH}")

    print("正在显示相似度分布图...")
    plt.show()


if __name__ == '__main__':
    evaluate_cluster_similarity()

In [None]:
import pickle
import numpy as np
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator, FuncFormatter

# Pickled clustering result for the PCA variant; its 'clusters' entry is read below.
INPUT_CLUSTER_RESULTS_PATH = 'data/4-1/PCA/hd_agc_results.pkl'
# Document vectors; clusters index into the rows of this array.
INPUT_VECTORS_PATH = 'data/doc_vectors.npy'

# Where the histogram is saved.
OUTPUT_FIGURE_PATH = 'data/4-1/PCA/similarity_distribution.png' 

# Drawn as a vertical reference line on the histogram; set to None to omit it.
S_MIN_THRESHOLD = 0.83

# Font sizes used for the figure text.
AXIS_LABEL_FONTSIZE = 18
LEGEND_FONTSIZE = 18
TICK_LABEL_FONTSIZE = 18
SUB_LABEL_FONTSIZE = 28

def evaluate_cluster_similarity():
    """Plot the distribution of intra-cluster pairwise cosine similarities.

    Loads the clustering result from ``INPUT_CLUSTER_RESULTS_PATH`` and the
    document vectors from ``INPUT_VECTORS_PATH``, computes the cosine
    similarity of every pair of documents that share a cluster, prints the
    mean, and saves a histogram (with KDE curve and, when
    ``S_MIN_THRESHOLD`` is not None, a dashed threshold line) to
    ``OUTPUT_FIGURE_PATH`` before showing it.

    Clusters with fewer than two members contribute no pairs and are skipped.
    If no pair exists at all, an error message is printed and the function
    returns without plotting.

    Raises:
        FileNotFoundError: if either input file does not exist.
    """
    print("--- 正在加载数据... ---")

    if not os.path.exists(INPUT_CLUSTER_RESULTS_PATH):
        raise FileNotFoundError(f"聚类结果文件未找到: {INPUT_CLUSTER_RESULTS_PATH}")
    if not os.path.exists(INPUT_VECTORS_PATH):
        raise FileNotFoundError(f"向量文件未找到: {INPUT_VECTORS_PATH}")

    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(INPUT_CLUSTER_RESULTS_PATH, 'rb') as f:
        cluster_results = pickle.load(f)
    clusters = cluster_results['clusters']
    print(f"成功加载 {len(clusters)} 个簇。")

    vectors = np.load(INPUT_VECTORS_PATH)
    if vectors.dtype != np.float32:
        vectors = vectors.astype(np.float32)
    print(f"成功加载 {len(vectors)} 个向量。")

    print("\n--- 正在计算簇内相似度... ---")
    # Keep each cluster's similarities as a compact NumPy array instead of
    # boxing every value into a Python float: the number of pairs grows
    # quadratically with cluster size, so a plain list gets large quickly.
    per_cluster_similarities = []

    for cluster_indices in tqdm(clusters, desc="[计算中]"):
        n_members = len(cluster_indices)
        if n_members < 2:
            continue

        sim_matrix = cosine_similarity(vectors[cluster_indices])
        # Strict upper triangle: each unordered pair once, diagonal excluded.
        pair_positions = np.triu_indices(n_members, k=1)
        per_cluster_similarities.append(sim_matrix[pair_positions])

    if not per_cluster_similarities:
        print("\n错误：未能计算出任何相似度分数。请检查簇是否都过小（成员数<2）。")
        return

    # Promote to float64 so the mean and the histogram are computed in
    # double precision regardless of the dtype cosine_similarity returned.
    sim_array = np.concatenate(per_cluster_similarities).astype(np.float64)

    print(f"\n--- 计算完成！共得到 {len(sim_array):,} 个相似度分数。---")

    print("--- 正在生成相似度分布图... ---")

    print(f"  - 平均相似度: {np.mean(sim_array):.4f}")

    sns.set_style("whitegrid")
    plt.figure(figsize=(12, 7))

    ax = plt.gca()

    sns.histplot(sim_array, bins=100, kde=True, color='skyblue', alpha=0.7, ax=ax)

    if S_MIN_THRESHOLD is not None:
        plt.axvline(x=S_MIN_THRESHOLD, color='r', linestyle='--', linewidth=2,
                    label=f'S_MIN Threshold = {S_MIN_THRESHOLD}')
        plt.legend(fontsize=LEGEND_FONTSIZE)

    plt.xlabel('Cosine Similarity', fontsize=AXIS_LABEL_FONTSIZE)

    # Sub-figure label, centred below the x axis (axes-relative coordinates).
    ax.text(0.5, -0.2, '(d)',
            transform=ax.transAxes,
            ha='center',
            va='top',
            fontsize=SUB_LABEL_FONTSIZE)

    plt.ylabel('Frequency', fontsize=AXIS_LABEL_FONTSIZE)

    plt.tick_params(axis='both', which='major', labelsize=TICK_LABEL_FONTSIZE)

    # Fixed axis ranges; bars taller than 20000 are clipped.
    # NOTE(review): presumably fixed so this panel is comparable with the
    # other similarity plots -- confirm before changing.
    plt.xlim(0, 1.0)
    plt.ylim(0, 20000)

    ax.yaxis.set_major_locator(MaxNLocator(prune='lower'))

    def custom_tick_formatter(x, pos):
        """Label the origin as a bare '0' and every other tick with one decimal."""
        if x == 0:
            return '0'
        return f'{x:.1f}'

    ax.xaxis.set_major_formatter(FuncFormatter(custom_tick_formatter))

    output_dir = os.path.dirname(OUTPUT_FIGURE_PATH)
    if output_dir and not os.path.exists(output_dir):
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(output_dir, exist_ok=True)
        print(f"已创建输出目录: {output_dir}")

    plt.tight_layout(pad=1.5)

    plt.savefig(OUTPUT_FIGURE_PATH, dpi=300)

    print(f"\n评估完成，图像已成功保存至: {OUTPUT_FIGURE_PATH}")

    print("正在显示相似度分布图...")
    plt.show()


# Run the similarity evaluation only when this cell/file is executed as the main program.
if __name__ == '__main__':
    evaluate_cluster_similarity()

In [None]:
# Dimension comparison: PCA reduced to 64 and to 256 dimensions

In [None]:
# PCA, 64 dimensions

In [None]:
import pickle
import pandas as pd
import time
import os

# Pickled HD-AGC clustering result; the merge step reads its 'clusters' entry.
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/64/pca_64_hd_agc_results.pkl'

# Pickled original documents; cluster members are used as indices into this sequence.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'

# Output directory for this run; created as soon as this cell executes.
OUTPUT_DIR = 'data/4-1/64/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CSV with one row per document: index, space-joined text and cluster label.
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')

# Pickle holding the original documents followed by the merged cluster documents.
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')

def merge_documents_and_include_originals():
    """Build the augmented corpus: all original documents plus one merged document per cluster.

    Steps:
      1. Load the HD-AGC result (``INPUT_HD_AGC_RESULTS_PATH``) and take its
         ``'clusters'`` entry, a sequence of document-index collections
         (missing key is treated as "no clusters").
      2. Load the original documents (``INPUT_ORIGINAL_TEXT_PATH``).
      3. Print cluster statistics.
      4. Write ``OUTPUT_CLUSTERS_CSV_PATH`` with one row per document; the
         label is the cluster position, or ``-1`` for unclustered documents.
      5. Pickle to ``OUTPUT_MERGED_DOCS_PATH`` a list made of every original
         document followed by one concatenated document per cluster that has
         at least one valid, non-empty member.

    Document indices outside ``[0, len(paragraph))`` are ignored both when
    labelling and when merging; their count is reported.

    Raises:
        FileNotFoundError: if either input file does not exist.
    """
    print(f"--- 正在从 {INPUT_HD_AGC_RESULTS_PATH} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(INPUT_HD_AGC_RESULTS_PATH):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {INPUT_HD_AGC_RESULTS_PATH}")
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(INPUT_HD_AGC_RESULTS_PATH, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {INPUT_ORIGINAL_TEXT_PATH} 加载原始文本... ---")
    if not os.path.exists(INPUT_ORIGINAL_TEXT_PATH):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {INPUT_ORIGINAL_TEXT_PATH}")
    with open(INPUT_ORIGINAL_TEXT_PATH, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f"从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        top_10_indices = sorted(range(n_clusters), key=lambda i: cluster_sizes[i], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for i, cluster_idx in enumerate(top_10_indices):
            print(f"  - 簇 {cluster_idx} (第 {i+1} 大): {cluster_sizes[cluster_idx]} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    skipped_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same validity rule as the merge loop below: without it a negative
            # index would silently label the wrong document and a too-large
            # one would abort the whole run.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                skipped_indices += 1
    if skipped_indices:
        print(f"   - 警告: 聚类结果中有 {skipped_indices} 个文档索引超出范围，已忽略。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # utf-8-sig writes a BOM at the start of the file.
    results_df.to_csv(OUTPUT_CLUSTERS_CSV_PATH, index=False, encoding='utf-8-sig')
    print(f"详细结果已保存到: {OUTPUT_CLUSTERS_CSV_PATH}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Cluster id of each appended merged document, in append order, so the
    # sample printed at the end is attributed to the right cluster.
    merged_cluster_ids = []
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = []
        for doc_index in doc_indices:
            if 0 <= doc_index < n_total_docs:
                merged_doc.extend(paragraph[doc_index])
        if merged_doc:
            merged_docs_list.append(merged_doc)
            merged_cluster_ids.append(cluster_id)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_clusters} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    with open(OUTPUT_MERGED_DOCS_PATH, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f"成功将文档保存到: {OUTPUT_MERGED_DOCS_PATH}")

    # Show a sample only if at least one merged document exists; the first
    # one sits right after the originals and may not come from cluster 0
    # when earlier clusters produced nothing.
    if merged_cluster_ids:
        sample_cluster_id = merged_cluster_ids[0]
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

# Run the merge step only when this cell/file is executed as the main program.
if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os 
from gensim.corpora import Dictionary
from collections import Counter

# Input: pickled documents written by the merge step (original + merged cluster documents).
PROCESSED_CORPUS_PICKLE = 'data/4-1/64/merged_documents_with_clusters.pkl'
# Outputs: the filtered gensim dictionary and the bag-of-words corpus built with it.
FILTERED_DICTIONARY_PATH = 'data/4-1/64/final_dictionary.dict'
FILTERED_CORPUS_PATH = 'data/4-1/64/final_corpus.pkl'

# Arguments forwarded to Dictionary.filter_extremes:
NO_BELOW = 10      # minimum document frequency (absolute count) for a token to stay
NO_ABOVE = 0.35    # maximum document frequency as a fraction of all documents
KEEP_N = None      # passed through as keep_n

if __name__ == '__main__':
    # Step 1: load the documents.
    print("--- 1. 加载预处理好的分词后文本 ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    # Step 2: build the unfiltered vocabulary.
    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    # Step 3: predict by hand, from the document frequencies, which tokens
    # each rule will remove, so the result of filter_extremes can be checked.
    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")
    
    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Tokens appearing in fewer than NO_BELOW documents.
    no_below_limit = NO_BELOW
    low_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < no_below_limit}
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Tokens appearing in more than NO_ABOVE * num_docs documents.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq > no_above_limit}
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    # Show the five most frequent tokens that are about to be dropped.
    if high_freq_ids:
        print(" - 示例 (将被移除的高频词):")
        sorted_high_freq = sorted(high_freq_ids, key=lambda tokenid: dictionary.dfs[tokenid], reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # Union of both sets, so a token caught by both rules is counted once.
    total_removed_ids = low_freq_ids.union(high_freq_ids)
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    # Step 4: apply the real filter and compare with the manual estimate.
    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    dictionary.compactify()
    
    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    if final_vocab_size_actual == final_vocab_size_estimated:
        print("验证成功：手动分析结果与Gensim执行结果一致。")
    else:
        print("警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。")


    # Step 5: convert every document to bag-of-words with the filtered
    # dictionary and persist both artefacts.
    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")

    output_dir = os.path.dirname(FILTERED_DICTIONARY_PATH)

    os.makedirs(output_dir, exist_ok=True)
    
    corpus = [dictionary.doc2bow(text) for text in processed_texts]
    
    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)
        
    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing

# Inputs: the filtered dictionary and the bag-of-words corpus.
FINAL_DICTIONARY_PATH = 'data/4-1/64/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/64/final_corpus.pkl'

# One model file per topic count is written into this directory.
MODEL_SAVE_DIR = 'data/4-1/64/model/'
# Topic counts to train: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Leave two cores free when more than two are available; otherwise use one worker.
    num_workers = multiprocessing.cpu_count() - 2 if multiprocessing.cpu_count() > 2 else 1
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    # Step 1: load the dictionary and corpus; fail early if either is missing.
    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not os.path.exists(FINAL_DICTIONARY_PATH) or not os.path.exists(FINAL_CORPUS_PATH):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")
        
    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as f:
        corpus = pickle.load(f)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")


    # Step 2: train and save one LDA model per topic count.
    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()
    
    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        # Same seed and number of passes for every topic count.
        model = LdaMulticore(
            corpus=corpus,                 
            num_topics=n_topics,           
            id2word=dictionary,            
            random_state=42,              
            passes=10,                     
            workers=num_workers,          
        )

        # The file name encodes the topic count.
        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    print(f"\n--- 全部模型训练完毕, 总耗时: {end_time - start_time:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Pickled documents handed to the coherence model as reference texts.
PROCESSED_CORPUS_PICKLE = 'data/4-1/64/merged_documents_with_clusters.pkl' 

# Filtered gensim dictionary.
FINAL_DICTIONARY_PATH = 'data/4-1/64/final_dictionary.dict'

# Word vectors in word2vec text format, used for the C_W2V coherence score.
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'

# Directory holding the trained models, named lda_model_<k>.model.
MODEL_SAVE_DIR = 'data/4-1/64/model/'

# Where the evaluation table is written.
RESULTS_CSV_PATH = 'data/4-1/64/lda_f_evaluation.csv'

# Topic counts to evaluate: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model on semantic coherence and topic diversity.

    For each ``k`` in ``topic_range`` the model ``<model_dir>/lda_model_<k>.model``
    is loaded (missing files are reported and skipped) and three scores are
    computed: ``c_w2v`` coherence over the top 20 words of each topic, and
    TopicDiversity and InvertedRBO over the top 10 words of each topic.

    Args:
        model_dir: directory containing the saved models.
        topic_range: iterable of topic counts to evaluate.
        processed_texts: forwarded to ``CoherenceModel`` as ``texts``.
        dictionary: forwarded to ``CoherenceModel`` as ``dictionary``.
        word_vectors: forwarded to ``CoherenceModel`` as ``keyed_vectors``.

    Returns:
        pandas.DataFrame indexed by ``num_topics`` with the columns
        ``"C_W2V (Semantic)"``, ``"Topic Diversity"`` and ``"InvertedRBO"``;
        one row per model found. When no model file exists the frame is
        empty (``.empty`` is True) but still has that index and those columns.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    diversity_metric = TopicDiversity(topk=10)
    rbo_metric = InvertedRBO(topk=10, weight=0.9)

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence looks at the top 20 words per topic, diversity at the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        # The diversity metrics take a dict with the topic word lists under "topics".
        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep set_index working when `results` is empty; without
    # them an empty frame has no "num_topics" column and set_index raises
    # KeyError instead of letting the caller see an empty result.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Draw coherence and diversity scores against the number of topics.

    The left y axis carries the C_W2V coherence curve; a twinned right y axis
    carries the Topic Diversity and InvertedRBO curves. The figure is shown,
    not saved.

    Args:
        results_df: frame indexed by topic count with the columns
            ``'C_W2V (Semantic)'``, ``'Topic Diversity'`` and ``'InvertedRBO'``.
    """
    print("\n--- 正在可视化评估结果... ---")
    fig, coherence_ax = plt.subplots(figsize=(14, 8))
    topic_counts = results_df.index

    # Left axis: semantic coherence.
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, colour, dash in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=colour,
            marker='x',
            linestyle=dash,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    # One shared legend above the plot area; tight_layout leaves room for it and the title.
    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load the evaluation inputs, score all models, then save and plot the scores.

    Reads the documents, the dictionary and the word vectors from the paths
    configured at module level, calls ``evaluate_models`` and, when at least
    one model was scored, writes the table to ``RESULTS_CSV_PATH`` and plots it.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(texts)} 条原始文本。")

    vocab = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(vocab)})。")

    print(" - 正在加载腾讯词向量模型...")
    embeddings = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    scores = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, texts, vocab, embeddings)

    # Nothing was scored: report and stop.
    if scores.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(scores)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    scores.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(scores)

# Run the evaluation only when this cell/file is executed as the main program.
if __name__ == "__main__":
    main()

In [None]:
# PCA, 256 dimensions

In [None]:
import pickle
import pandas as pd
import time
import os

# Pickled HD-AGC clustering result; the merge step reads its 'clusters' entry.
INPUT_HD_AGC_RESULTS_PATH = 'data/4-1/256/pca_256_hd_agc_results.pkl'

# Pickled original documents; cluster members are used as indices into this sequence.
INPUT_ORIGINAL_TEXT_PATH = 'data/paragraph.pkl'

# Output directory for this run; created as soon as this cell executes.
OUTPUT_DIR = 'data/4-1/256/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CSV with one row per document: index, space-joined text and cluster label.
OUTPUT_CLUSTERS_CSV_PATH = os.path.join(OUTPUT_DIR, 'document_clusters_hd_agc.csv')

# Pickle holding the original documents followed by the merged cluster documents.
OUTPUT_MERGED_DOCS_PATH = os.path.join(OUTPUT_DIR, 'merged_documents_with_clusters.pkl')

def merge_documents_and_include_originals():
    """Build the augmented corpus: all original documents plus one merged document per cluster.

    Steps:
      1. Load the HD-AGC result (``INPUT_HD_AGC_RESULTS_PATH``) and take its
         ``'clusters'`` entry, a sequence of document-index collections
         (missing key is treated as "no clusters").
      2. Load the original documents (``INPUT_ORIGINAL_TEXT_PATH``).
      3. Print cluster statistics.
      4. Write ``OUTPUT_CLUSTERS_CSV_PATH`` with one row per document; the
         label is the cluster position, or ``-1`` for unclustered documents.
      5. Pickle to ``OUTPUT_MERGED_DOCS_PATH`` a list made of every original
         document followed by one concatenated document per cluster that has
         at least one valid, non-empty member.

    Document indices outside ``[0, len(paragraph))`` are ignored both when
    labelling and when merging; their count is reported.

    Raises:
        FileNotFoundError: if either input file does not exist.
    """
    print(f"--- 正在从 {INPUT_HD_AGC_RESULTS_PATH} 加载 HD-AGC 聚类结果... ---")
    if not os.path.exists(INPUT_HD_AGC_RESULTS_PATH):
        raise FileNotFoundError(f"HD-AGC 结果文件未找到，请检查路径: {INPUT_HD_AGC_RESULTS_PATH}")
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(INPUT_HD_AGC_RESULTS_PATH, 'rb') as f:
        hd_agc_results = pickle.load(f)

    clusters = hd_agc_results.get('clusters', [])

    print(f"--- 正在从 {INPUT_ORIGINAL_TEXT_PATH} 加载原始文本... ---")
    if not os.path.exists(INPUT_ORIGINAL_TEXT_PATH):
        raise FileNotFoundError(f"原始文本文件未找到，请检查路径: {INPUT_ORIGINAL_TEXT_PATH}")
    with open(INPUT_ORIGINAL_TEXT_PATH, 'rb') as f:
        paragraph = pickle.load(f)

    n_total_docs = len(paragraph)

    print("\n--- HD-AGC 聚类结果分析 ---")
    n_clusters = len(clusters)
    cluster_sizes = [len(c) for c in clusters]
    num_clustered_docs = sum(cluster_sizes)

    print(f"从 {n_total_docs} 篇文档中，识别出:")
    print(f"   - {n_clusters} 个簇，共包含 {num_clustered_docs} 篇文档。")
    if n_clusters > 0:
        avg_docs_per_cluster = num_clustered_docs / n_clusters
        print(f"   - 平均每个簇由 {avg_docs_per_cluster:.2f} 篇原始文档构成。")
        top_10_indices = sorted(range(n_clusters), key=lambda i: cluster_sizes[i], reverse=True)[:10]

        print("\n--- Top 10 最大簇的文档数: ---")
        for i, cluster_idx in enumerate(top_10_indices):
            print(f"  - 簇 {cluster_idx} (第 {i+1} 大): {cluster_sizes[cluster_idx]} 个文档")

    print("\n--- 正在构建每篇文档的聚类标签... ---")
    labels = [-1] * n_total_docs
    skipped_indices = 0
    for cluster_id, doc_indices in enumerate(clusters):
        for doc_index in doc_indices:
            # Same validity rule as the merge loop below: without it a negative
            # index would silently label the wrong document and a too-large
            # one would abort the whole run.
            if 0 <= doc_index < n_total_docs:
                labels[doc_index] = cluster_id
            else:
                skipped_indices += 1
    if skipped_indices:
        print(f"   - 警告: 聚类结果中有 {skipped_indices} 个文档索引超出范围，已忽略。")

    documents_text = [" ".join(text) for text in paragraph]
    results_df = pd.DataFrame({
        'document_index': range(n_total_docs),
        'document_text': documents_text,
        'cluster_label': labels
    })
    results_df.sort_values(by='cluster_label', inplace=True)
    # utf-8-sig writes a BOM at the start of the file.
    results_df.to_csv(OUTPUT_CLUSTERS_CSV_PATH, index=False, encoding='utf-8-sig')
    print(f"详细结果已保存到: {OUTPUT_CLUSTERS_CSV_PATH}")

    print("\n--- 正在构建最终文档集合... ---")
    start_time = time.time()

    merged_docs_list = []

    print(f"--- 步骤 1/2: 添加 {n_total_docs} 篇原始文档 ---")
    merged_docs_list.extend(paragraph)

    print(f"--- 步骤 2/2: 正在合并 {n_clusters} 个簇为超级文档 ---")
    # Cluster id of each appended merged document, in append order, so the
    # sample printed at the end is attributed to the right cluster.
    merged_cluster_ids = []
    for cluster_id, doc_indices in enumerate(clusters):
        merged_doc = []
        for doc_index in doc_indices:
            if 0 <= doc_index < n_total_docs:
                merged_doc.extend(paragraph[doc_index])
        if merged_doc:
            merged_docs_list.append(merged_doc)
            merged_cluster_ids.append(cluster_id)

    print(f"--- 合并完成！耗时: {time.time() - start_time:.2f} 秒 ---")

    final_doc_count = len(merged_docs_list)
    expected_doc_count = n_total_docs + n_clusters
    print(f"--- 共生成 {final_doc_count} 篇文档 (由 {n_total_docs} 篇原始文档 + "
          f"{n_clusters} 个超级文档组成)。 ---")
    if final_doc_count != expected_doc_count:
        print(f"   - 警告: 最终文档数 ({final_doc_count}) 与预期数 ({expected_doc_count}) 不符，请检查。")

    with open(OUTPUT_MERGED_DOCS_PATH, 'wb') as f:
        pickle.dump(merged_docs_list, f)

    print(f"成功将文档保存到: {OUTPUT_MERGED_DOCS_PATH}")

    # Show a sample only if at least one merged document exists; the first
    # one sits right after the originals and may not come from cluster 0
    # when earlier clusters produced nothing.
    if merged_cluster_ids:
        sample_cluster_id = merged_cluster_ids[0]
        sample_cluster_size = len(clusters[sample_cluster_id])
        print(f"\n示例合并文档 (来自簇 {sample_cluster_id}) 由 {sample_cluster_size} 篇原始文档合并而成。")
        print(f"其前20个词为: {merged_docs_list[n_total_docs][:20]}")

# Run the merge step only when this cell/file is executed as the main program.
if __name__ == '__main__':
    merge_documents_and_include_originals()

In [None]:
import pickle
import os 
from gensim.corpora import Dictionary
from collections import Counter

# Input: pickled documents written by the merge step (original + merged cluster documents).
PROCESSED_CORPUS_PICKLE = 'data/4-1/256/merged_documents_with_clusters.pkl'
# Outputs: the filtered gensim dictionary and the bag-of-words corpus built with it.
FILTERED_DICTIONARY_PATH = 'data/4-1/256/final_dictionary.dict'
FILTERED_CORPUS_PATH = 'data/4-1/256/final_corpus.pkl'

# Arguments forwarded to Dictionary.filter_extremes:
NO_BELOW = 10      # minimum document frequency (absolute count) for a token to stay
NO_ABOVE = 0.35    # maximum document frequency as a fraction of all documents
KEEP_N = None      # passed through as keep_n

if __name__ == '__main__':
    # Step 1: load the documents.
    print("--- 1. 加载预处理好的分词后文本 ---")
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as f:
        processed_texts = pickle.load(f)
    print(f"成功加载 {len(processed_texts)} 条文本。")

    # Step 2: build the unfiltered vocabulary.
    print("\n--- 2. 创建初始Gensim词典 (不过滤) ---")
    dictionary = Dictionary(processed_texts)
    initial_vocab_size = len(dictionary)
    print(f"初始词典大小: {initial_vocab_size}")

    # Step 3: predict by hand, from the document frequencies, which tokens
    # each rule will remove, so the result of filter_extremes can be checked.
    print("\n--- 3. 分析将被各个过滤规则移除的词汇 ---")
    
    num_docs = dictionary.num_docs
    print(f"总文档数: {num_docs}")
    print("-" * 40)

    # Tokens appearing in fewer than NO_BELOW documents.
    no_below_limit = NO_BELOW
    low_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < no_below_limit}
    print(f"规则 'no_below = {NO_BELOW}' 分析:")
    print(f" - 文档频率低于 {no_below_limit} 的词汇有 {len(low_freq_ids)} 个。")

    # Tokens appearing in more than NO_ABOVE * num_docs documents.
    no_above_limit = num_docs * NO_ABOVE
    high_freq_ids = {tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq > no_above_limit}
    print(f"\n规则 'no_above = {NO_ABOVE}' 分析:")
    print(f" - 文档频率高于 {NO_ABOVE:.0%} (即 > {int(no_above_limit)}个文档) 的词汇有 {len(high_freq_ids)} 个。")

    # Show the five most frequent tokens that are about to be dropped.
    if high_freq_ids:
        print(" - 示例 (将被移除的高频词):")
        sorted_high_freq = sorted(high_freq_ids, key=lambda tokenid: dictionary.dfs[tokenid], reverse=True)
        for tokenid in sorted_high_freq[:5]:
            print(f"   - '{dictionary[tokenid]}' (在 {dictionary.dfs[tokenid]} 个文档中出现)")

    # Union of both sets, so a token caught by both rules is counted once.
    total_removed_ids = low_freq_ids.union(high_freq_ids)
    final_vocab_size_estimated = initial_vocab_size - len(total_removed_ids)

    print("\n--- 综合分析结果 ---")
    print(f"将被移除的低频词总数: {len(low_freq_ids)}")
    print(f"将被移除的高频词总数: {len(high_freq_ids)}")
    print(f"将被移除的独立词汇总数: {len(total_removed_ids)}")
    print("-" * 40)
    print(f"预计过滤后的词典大小: {final_vocab_size_estimated}")
    print("-" * 40)

    # Step 4: apply the real filter and compare with the manual estimate.
    print("\n--- 4. 实际执行Gensim的 filter_extremes 操作 ---")
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    dictionary.compactify()
    
    final_vocab_size_actual = len(dictionary)
    print(f"Gensim过滤后，实际最终词典大小: {final_vocab_size_actual}")

    if final_vocab_size_actual == final_vocab_size_estimated:
        print("验证成功：手动分析结果与Gensim执行结果一致。")
    else:
        print("警告：手动分析结果与Gensim执行结果不一致，请检查逻辑。")


    # Step 5: convert every document to bag-of-words with the filtered
    # dictionary and persist both artefacts.
    print("\n--- 5. 创建并保存最终的BoW语料库和词典 ---")

    output_dir = os.path.dirname(FILTERED_DICTIONARY_PATH)

    os.makedirs(output_dir, exist_ok=True)
    
    corpus = [dictionary.doc2bow(text) for text in processed_texts]
    
    dictionary.save(FILTERED_DICTIONARY_PATH)
    with open(FILTERED_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)
        
    print(f"最终词典已保存至: {FILTERED_DICTIONARY_PATH}")
    print(f"最终BoW语料库已保存至: {FILTERED_CORPUS_PATH}")

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing

# Inputs: the filtered dictionary and the bag-of-words corpus.
FINAL_DICTIONARY_PATH = 'data/4-1/256/final_dictionary.dict'
FINAL_CORPUS_PATH = 'data/4-1/256/final_corpus.pkl'

# One model file per topic count is written into this directory.
MODEL_SAVE_DIR = 'data/4-1/256/model/'
# Topic counts to train: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)

if __name__ == '__main__':
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Leave two cores free when more than two are available; otherwise use one worker.
    num_workers = multiprocessing.cpu_count() - 2 if multiprocessing.cpu_count() > 2 else 1
    print(f"--- 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    # Step 1: load the dictionary and corpus; fail early if either is missing.
    print("\n--- 1. 正在加载经过词典过滤的最终语料库和词典... ---")
    if not os.path.exists(FINAL_DICTIONARY_PATH) or not os.path.exists(FINAL_CORPUS_PATH):
        raise FileNotFoundError("错误：找不到最终的词典或语料库文件。请先运行词典过滤脚本。")
        
    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'rb') as f:
        corpus = pickle.load(f)
    print(f"加载成功。词典大小: {len(dictionary)}，语料库文档数: {len(corpus)}")


    # Step 2: train and save one LDA model per topic count.
    print("\n--- 2. 开始批量训练LDA模型 ---")
    start_time = time.time()
    
    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        # Same seed and number of passes for every topic count.
        model = LdaMulticore(
            corpus=corpus,                 
            num_topics=n_topics,           
            id2word=dictionary,            
            random_state=42,              
            passes=10,                     
            workers=num_workers,          
        )

        # The file name encodes the topic count.
        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    print(f"\n--- 全部模型训练完毕, 总耗时: {end_time - start_time:.2f} 秒 ---")
    print(f"所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

# Pickled documents handed to the coherence model as reference texts.
PROCESSED_CORPUS_PICKLE = 'data/4-1/256/merged_documents_with_clusters.pkl' 

# Filtered gensim dictionary.
FINAL_DICTIONARY_PATH = 'data/4-1/256/final_dictionary.dict'

# Word vectors in word2vec text format, used for the C_W2V coherence score.
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'

# Directory holding the trained models, named lda_model_<k>.model.
MODEL_SAVE_DIR = 'data/4-1/256/model/'

# Where the evaluation table is written.
RESULTS_CSV_PATH = 'data/4-1/256/lda_f_evaluation.csv'

# Topic counts to evaluate: 3 through 15 inclusive.
TOPIC_RANGE = range(3, 16)

def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model on semantic coherence and topic diversity.

    For each ``k`` in ``topic_range`` the model ``<model_dir>/lda_model_<k>.model``
    is loaded (missing files are reported and skipped) and three scores are
    computed: ``c_w2v`` coherence over the top 20 words of each topic, and
    TopicDiversity and InvertedRBO over the top 10 words of each topic.

    Args:
        model_dir: directory containing the saved models.
        topic_range: iterable of topic counts to evaluate.
        processed_texts: forwarded to ``CoherenceModel`` as ``texts``.
        dictionary: forwarded to ``CoherenceModel`` as ``dictionary``.
        word_vectors: forwarded to ``CoherenceModel`` as ``keyed_vectors``.

    Returns:
        pandas.DataFrame indexed by ``num_topics`` with the columns
        ``"C_W2V (Semantic)"``, ``"Topic Diversity"`` and ``"InvertedRBO"``;
        one row per model found. When no model file exists the frame is
        empty (``.empty`` is True) but still has that index and those columns.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    diversity_metric = TopicDiversity(topk=10)
    rbo_metric = InvertedRBO(topk=10, weight=0.9)

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence looks at the top 20 words per topic, diversity at the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        # The diversity metrics take a dict with the topic word lists under "topics".
        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Explicit columns keep set_index working when `results` is empty; without
    # them an empty frame has no "num_topics" column and set_index raises
    # KeyError instead of letting the caller see an empty result.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Draw coherence and diversity scores against the number of topics.

    The left y axis carries the C_W2V coherence curve; a twinned right y axis
    carries the Topic Diversity and InvertedRBO curves. The figure is shown,
    not saved.

    Args:
        results_df: frame indexed by topic count with the columns
            ``'C_W2V (Semantic)'``, ``'Topic Diversity'`` and ``'InvertedRBO'``.
    """
    print("\n--- 正在可视化评估结果... ---")
    fig, coherence_ax = plt.subplots(figsize=(14, 8))
    topic_counts = results_df.index

    # Left axis: semantic coherence.
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color='tab:red',
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color='tab:red')
    coherence_ax.tick_params(axis='y', labelcolor='tab:red')
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color='tab:blue')
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, colour, dash in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=colour,
            marker='x',
            linestyle=dash,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor='tab:blue')

    # One shared legend above the plot area; tight_layout leaves room for it and the title.
    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load the evaluation inputs, score all models, then save and plot the scores.

    Reads the documents, the dictionary and the word vectors from the paths
    configured at module level, calls ``evaluate_models`` and, when at least
    one model was scored, writes the table to ``RESULTS_CSV_PATH`` and plots it.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(texts)} 条原始文本。")

    vocab = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(vocab)})。")

    print(" - 正在加载腾讯词向量模型...")
    embeddings = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    scores = evaluate_models(MODEL_SAVE_DIR, TOPIC_RANGE, texts, vocab, embeddings)

    # Nothing was scored: report and stop.
    if scores.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(scores)

    csv_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(csv_dir, exist_ok=True)
    scores.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(scores)

# Run the evaluation only when this cell/file is executed as the main program.
if __name__ == "__main__":
    main()