In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pickle
import os
from tqdm import tqdm

# Pickle file holding the source documents that get embedded.
ORIGINAL_TEXTS_PATH = 'data/paragraph.pkl' 
# Destination .npy file for the pooled sentence vectors.
OUTPUT_VECTORS_PATH = 'data/BERT-LDA/bert_sentence_vectors.npy'
# Model identifier handed to the tokenizer and encoder `from_pretrained` loaders.
MODEL_NAME = 'hfl/chinese-roberta-wwm-ext'
# Number of documents tokenized and encoded per forward pass.
BATCH_SIZE = 64

def _masked_mean_pool(last_hidden_states, attention_mask):
    """Average the token vectors of each sequence, ignoring padded positions.

    Args:
        last_hidden_states: Encoder output; assumed to be laid out as
            (batch, seq_len, hidden), as pooling reduces over dimension 1.
        attention_mask: Tokenizer attention mask with one entry per token
            position (non-zero for real tokens).

    Returns:
        A tensor with one pooled vector per sequence.
    """
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
    sum_embeddings = torch.sum(last_hidden_states * mask_expanded, 1)
    # Clamp the denominator so a fully masked row cannot divide by zero.
    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


def generate_bert_embeddings():
    """Encode every document in ORIGINAL_TEXTS_PATH and save the vectors.

    Loads the pickled documents, runs them through the pretrained encoder
    named by MODEL_NAME in batches of BATCH_SIZE, mean-pools the last hidden
    states over the attention mask, and writes the stacked result to
    OUTPUT_VECTORS_PATH with ``np.save``.

    Raises:
        FileNotFoundError: If ORIGINAL_TEXTS_PATH does not exist.
        ValueError: If the loaded corpus contains no documents.
    """
    print("--- BERT句子向量生成脚本 ---")
    os.makedirs(os.path.dirname(OUTPUT_VECTORS_PATH), exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n--- 1. 设备检测: 将使用 '{device}' 设备进行计算 ---")
    if device.type == 'cpu':
        print("警告：未使用GPU，处理速度会非常慢。")

    print("\n--- 2. 正在加载数据和预训练模型... ---")

    if not os.path.exists(ORIGINAL_TEXTS_PATH):
        raise FileNotFoundError(f"错误: 找不到原始文本文件: {ORIGINAL_TEXTS_PATH}")
    # NOTE: pickle can execute arbitrary code; only load files you trust.
    with open(ORIGINAL_TEXTS_PATH, 'rb') as f:
        texts = pickle.load(f)
    print(f" - 成功加载 {len(texts)} 条文本。")

    # Fail early (before the expensive model load) instead of crashing later
    # in np.vstack with "need at least one array to concatenate".
    if len(texts) == 0:
        raise ValueError(f"错误: 原始文本文件为空，无法生成句子向量: {ORIGINAL_TEXTS_PATH}")

    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    model = BertModel.from_pretrained(MODEL_NAME)
    model.to(device)
    model.eval()  # inference mode: disables dropout
    print(f" - 成功加载 '{MODEL_NAME}' 模型。")

    print(f"\n--- 3. 开始生成句子向量 (批处理大小: {BATCH_SIZE}) ---")
    all_sentence_embeddings = []

    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="正在处理批次"):
        batch_texts = texts[i:i + BATCH_SIZE]
        # Each document is joined with spaces before tokenization; presumably
        # every document is a sequence of token strings -- verify against the
        # code that produced the pickle.
        batch_texts_str = [" ".join(doc) for doc in batch_texts]

        inputs = tokenizer(
            batch_texts_str,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        sentence_embeddings = _masked_mean_pool(
            outputs.last_hidden_state, inputs['attention_mask']
        )
        # Move each batch off the device right away so only one batch of
        # activations is held there at a time.
        all_sentence_embeddings.append(sentence_embeddings.cpu().numpy())

    print("\n--- 4. 正在合并和保存向量... ---")

    final_embeddings = np.vstack(all_sentence_embeddings)

    np.save(OUTPUT_VECTORS_PATH, final_embeddings)

    print(f"✅ 处理完成！")
    print(f" - 生成的句子向量维度: {final_embeddings.shape}")
    print(f" - 结果已成功保存到: {OUTPUT_VECTORS_PATH}")


# Run the embedding step only when this code is executed directly, not on import.
if __name__ == "__main__":
    generate_bert_embeddings()

In [None]:
import numpy as np
import umap
import os
import time

# .npy file with the high-dimensional vectors to reduce.
INPUT_VECTORS_PATH = 'data/BERT-LDA/bert_sentence_vectors.npy'
# .npy file the reduced vectors are written to.
OUTPUT_VECTORS_PATH = 'data/BERT-LDA/umap_reduced_vectors.npy'

# Settings forwarded to umap.UMAP as n_components, n_neighbors and min_dist.
N_COMPONENTS = 128
N_NEIGHBORS = 15
MIN_DIST = 0.1

def reduce_dimensionality():
    """Project the saved high-dimensional vectors with UMAP and store the result.

    Reads the array at INPUT_VECTORS_PATH, fits a UMAP model configured from
    N_COMPONENTS, N_NEIGHBORS and MIN_DIST (fixed random_state of 42), and
    writes the transformed array to OUTPUT_VECTORS_PATH.

    Raises:
        FileNotFoundError: If INPUT_VECTORS_PATH does not exist.
    """
    print("--- UMAP 降维脚本 ---")

    print(f"\n--- 1. 正在加载高维向量: {INPUT_VECTORS_PATH}... ---")
    input_is_missing = not os.path.exists(INPUT_VECTORS_PATH)
    if input_is_missing:
        raise FileNotFoundError(f"错误: 找不到输入文件: {INPUT_VECTORS_PATH}")

    source_vectors = np.load(INPUT_VECTORS_PATH)
    print(f" - 加载成功。向量维度: {source_vectors.shape}")

    print("\n--- 2. 正在执行UMAP降维... (这可能需要几分钟) ---")
    source_dim = source_vectors.shape[1]
    print(f"   - 原始维度: {source_dim}")
    print(f"   - 目标维度: {N_COMPONENTS}")

    umap_settings = {
        "n_components": N_COMPONENTS,
        "n_neighbors": N_NEIGHBORS,
        "min_dist": MIN_DIST,
        "random_state": 42,
        "verbose": True,
    }
    projector = umap.UMAP(**umap_settings)

    # Time only the fit/transform call itself.
    started_at = time.time()
    reduced_vectors = projector.fit_transform(source_vectors)
    elapsed_seconds = time.time() - started_at

    print(f"--- UMAP降维完成，耗时: {elapsed_seconds:.2f} 秒 ---")

    print("\n--- 3. 正在保存降维后的向量... ---")
    np.save(OUTPUT_VECTORS_PATH, reduced_vectors)

    print(f"✅ 处理完成！")
    print(f" - 生成的低维向量维度: {reduced_vectors.shape}")
    print(f" - 结果已成功保存到: {OUTPUT_VECTORS_PATH}")


# Run the UMAP reduction step only when this code is executed directly.
if __name__ == "__main__":
    reduce_dimensionality()

In [None]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import os
import pickle
import time
from collections import Counter

# .npy file with the reduced vectors that get clustered.
INPUT_VECTORS_PATH = 'data/BERT-LDA/umap_reduced_vectors.npy'

# Pickle file receiving the {cluster id: [document indices]} mapping.
OUTPUT_CLUSTERS_PATH = 'data/BERT-LDA/kmeans_clusters.pkl'

# Requested number of clusters (passed to MiniBatchKMeans as n_clusters).
N_CLUSTERS = 30000
# Mini-batch size passed to MiniBatchKMeans as batch_size.
BATCH_SIZE = 2048
def perform_clustering():
    """Cluster the reduced vectors with MiniBatchKMeans and save the grouping.

    Loads the array at INPUT_VECTORS_PATH, fits MiniBatchKMeans, builds a
    ``{cluster_id: [row indices]}`` dictionary from the fitted labels, pickles
    it to OUTPUT_CLUSTERS_PATH and prints summary statistics.

    The requested N_CLUSTERS is reduced to the number of samples when the
    data set is smaller, because scikit-learn rejects ``n_clusters`` larger
    than ``n_samples``. Cluster ids are stored as plain Python ints so the
    pickle does not carry numpy scalar keys.

    Raises:
        FileNotFoundError: If INPUT_VECTORS_PATH does not exist.
        ValueError: If the input array contains no rows.
    """
    print("--- MiniBatchKMeans 聚类脚本 ---")

    print(f"\n--- 1. 正在加载降维后的向量: {INPUT_VECTORS_PATH}... ---")
    if not os.path.exists(INPUT_VECTORS_PATH):
        raise FileNotFoundError(f"错误: 找不到输入文件: {INPUT_VECTORS_PATH}")

    low_dim_vectors = np.load(INPUT_VECTORS_PATH)
    # Contiguous float32 copy/view of the loaded array for the clustering step.
    low_dim_vectors = np.ascontiguousarray(low_dim_vectors, dtype=np.float32)
    print(f" - 加载成功。向量维度: {low_dim_vectors.shape}")

    n_samples = low_dim_vectors.shape[0]
    if n_samples == 0:
        raise ValueError(f"错误: 输入向量为空，无法聚类: {INPUT_VECTORS_PATH}")

    print(f"\n--- 2. 正在执行MiniBatchKMeans聚类... (目标簇数: {N_CLUSTERS}) ---")

    # k-means cannot produce more clusters than there are samples; clamp
    # instead of letting scikit-learn abort the whole pipeline step.
    n_clusters = min(N_CLUSTERS, n_samples)
    if n_clusters < N_CLUSTERS:
        print(f"警告：样本数 ({n_samples}) 少于目标簇数 ({N_CLUSTERS})，簇数已调整为 {n_clusters}。")

    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        random_state=42,
        batch_size=BATCH_SIZE,
        n_init='auto',
        max_iter=300,
        verbose=1
    )

    start_time = time.time()
    kmeans.fit(low_dim_vectors)
    end_time = time.time()

    print(f"--- K-Means聚类完成，耗时: {end_time - start_time:.2f} 秒 ---")

    print("\n--- 3. 正在处理和保存聚类结果... ---")

    # tolist() converts the numpy label scalars to built-in ints, so the
    # dictionary keys are ordinary Python integers.
    clusters_map = {}
    for doc_idx, cluster_id in enumerate(kmeans.labels_.tolist()):
        clusters_map.setdefault(cluster_id, []).append(doc_idx)

    with open(OUTPUT_CLUSTERS_PATH, 'wb') as f:
        pickle.dump(clusters_map, f)

    print(f"✅ 处理完成！")
    print(f" - 聚类结果已成功保存到: {OUTPUT_CLUSTERS_PATH}")

    print("\n--- 聚类结果统计信息 ---")
    # Only clusters that received at least one document appear in the map,
    # so this count can be lower than the requested number.
    print(f" - 实际生成的簇数量: {len(clusters_map)}")
    cluster_sizes = [len(docs) for docs in clusters_map.values()]
    print(f" - 最大簇包含的文档数: {max(cluster_sizes)}")
    print(f" - 最小簇包含的文档数: {min(cluster_sizes)}")
    print(f" - 平均每个簇包含的文档数: {np.mean(cluster_sizes):.2f}")

    size_counts = Counter(cluster_sizes)
    print(" - 簇大小分布 (前5个最常见的):")
    for size, count in size_counts.most_common(5):
        print(f"   - {count} 个簇的大小为 {size}")


# Run the clustering step only when this code is executed directly.
if __name__ == "__main__":
    perform_clustering()

In [None]:
import pickle
import os
from tqdm import tqdm

# Pickle file with the original documents.
ORIGINAL_TEXTS_PATH = 'data/paragraph.pkl'
# Pickle file with the {cluster id: [document indices]} mapping.
CLUSTERS_PATH = 'data/BERT-LDA/kmeans_clusters.pkl'

# Pickle file receiving the original documents plus the merged pseudo-documents.
FINAL_CORPUS_PATH = 'data/BERT-LDA/bert_lda_final_corpus.pkl'


def merge_documents_from_clusters():
    """Build pseudo-documents from clusters and append them to the corpus.

    Loads the original documents and the cluster mapping, concatenates the
    documents of every cluster that has at least two members into one
    pseudo-document, and pickles ``original documents + pseudo-documents``
    to FINAL_CORPUS_PATH.

    Raises:
        FileNotFoundError: If ORIGINAL_TEXTS_PATH or CLUSTERS_PATH is missing.
    """
    print("--- 文档合并脚本 ---")

    print("\n--- 1. 正在加载原始文本和聚类结果... ---")

    texts_available = os.path.exists(ORIGINAL_TEXTS_PATH)
    if not texts_available:
        raise FileNotFoundError(f"错误: 找不到原始文本文件: {ORIGINAL_TEXTS_PATH}")
    with open(ORIGINAL_TEXTS_PATH, 'rb') as texts_file:
        original_texts = pickle.load(texts_file)
    print(f" - 成功加载 {len(original_texts)} 条原始文本。")

    clusters_available = os.path.exists(CLUSTERS_PATH)
    if not clusters_available:
        raise FileNotFoundError(f"错误: 找不到聚类结果文件: {CLUSTERS_PATH}")
    with open(CLUSTERS_PATH, 'rb') as clusters_file:
        clusters_map = pickle.load(clusters_file)
    print(f" - 成功加载 {len(clusters_map)} 个簇的聚类结果。")

    print("\n--- 2. 正在根据聚类结果合并文档... ---")

    pseudo_documents = []
    for member_indices in tqdm(clusters_map.values(), desc="正在合并簇"):
        # A single-member cluster would only duplicate an existing document.
        if len(member_indices) >= 2:
            combined = [
                item
                for member_idx in member_indices
                for item in original_texts[member_idx]
            ]
            pseudo_documents.append(combined)

    print(f"--- 合并完成，生成了 {len(pseudo_documents)} 个新的伪文档。 ---")

    print("\n--- 3. 正在创建并保存最终的增强语料库... ---")

    final_corpus = original_texts + pseudo_documents

    output_dir = os.path.dirname(FINAL_CORPUS_PATH)
    os.makedirs(output_dir, exist_ok=True)
    with open(FINAL_CORPUS_PATH, 'wb') as corpus_file:
        pickle.dump(final_corpus, corpus_file)

    print(f"✅ 处理完成！")
    print("\n--- 最终语料库统计 ---")
    print(f" - 原始文档数: {len(original_texts)}")
    print(f" - 新增伪文档数: {len(pseudo_documents)}")
    print(f" - 最终语料库总文档数: {len(final_corpus)}")
    print(f" - 结果已成功保存到: {FINAL_CORPUS_PATH}")

# Run the document-merging step only when this code is executed directly.
if __name__ == "__main__":
    merge_documents_from_clusters()

In [None]:
import pickle
import os
import time
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import multiprocessing

# Pickle file with the augmented corpus used for training.
AUGMENTED_CORPUS_PATH = 'data/BERT-LDA/bert_lda_final_corpus.pkl'

# All artifacts of this step are written below OUTPUT_DIR.
OUTPUT_DIR = 'data/BERT-LDA/'
FINAL_DICTIONARY_PATH = os.path.join(OUTPUT_DIR, 'bert_lda_dictionary.dict')
FINAL_CORPUS_PATH = os.path.join(OUTPUT_DIR, 'bert_lda_corpus.pkl')
MODEL_SAVE_DIR = os.path.join(OUTPUT_DIR, 'models/')


# One LDA model is trained per topic count in this range (3 through 15).
TOPIC_RANGE = range(3, 16) 


# Passed to Dictionary.filter_extremes as no_below, no_above and keep_n.
FILTER_NO_BELOW = 20    
FILTER_NO_ABOVE = 0.5   
FILTER_KEEP_N = 100000  


# Training script: builds a dictionary and bag-of-words corpus from the
# augmented corpus, saves both, then trains and saves one LdaMulticore model
# for every topic count in TOPIC_RANGE.
if __name__ == '__main__':

    print("--- BERT-LDA 最终训练脚本 ---")
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True) 

    # Leave two cores free when more than two are available; otherwise use one worker.
    num_workers = multiprocessing.cpu_count() - 2 if multiprocessing.cpu_count() > 2 else 1
    print(f"\n--- 0. 将为每个LDA模型训练使用 {num_workers} 个CPU核心 ---")

    print(f"\n--- 1. 正在加载增强语料库: {AUGMENTED_CORPUS_PATH}... ---")
    if not os.path.exists(AUGMENTED_CORPUS_PATH):
        raise FileNotFoundError(f"错误: 找不到增强语料库文件。请先运行第四步脚本。")
    # NOTE: pickle can execute arbitrary code; only load files you trust.
    with open(AUGMENTED_CORPUS_PATH, 'rb') as f:
        augmented_corpus = pickle.load(f)
    print(f" - 加载成功。增强后总文档数: {len(augmented_corpus)}")

    print("\n--- 2. 正在根据增强语料库创建新的词典和BoW语料... ---")
    dictionary = Dictionary(augmented_corpus)
    print(f" - 初始词典大小: {len(dictionary)}")
    
    # Prune the vocabulary using the FILTER_* settings before building the BoW corpus.
    dictionary.filter_extremes(
        no_below=FILTER_NO_BELOW,
        no_above=FILTER_NO_ABOVE,
        keep_n=FILTER_KEEP_N
    )
    dictionary.compactify()
    print(f" - 过滤后词典大小: {len(dictionary)}")

    # Convert every document to its bag-of-words representation.
    corpus = [dictionary.doc2bow(doc) for doc in augmented_corpus]

    # Persist the dictionary and BoW corpus so they can be reloaded later.
    dictionary.save(FINAL_DICTIONARY_PATH)
    with open(FINAL_CORPUS_PATH, 'wb') as f:
        pickle.dump(corpus, f)
    print(f" - 新的词典已保存到: {FINAL_DICTIONARY_PATH}")
    print(f" - 新的BoW语料库已保存到: {FINAL_CORPUS_PATH}")

    print(f"\n--- 3. 开始批量训练LDA模型 (主题范围: {TOPIC_RANGE.start} 到 {TOPIC_RANGE.stop - 1}) ---")
    start_time = time.time()
    
    for n_topics in TOPIC_RANGE:
        print(f"\n--- 开始训练: {n_topics} 主题 ---")

        # Same seed and pass count for every model; only num_topics varies.
        model = LdaMulticore(
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            random_state=42,      
            passes=10,           
            workers=num_workers,
        )

        # File name encodes the topic count: lda_model_<n_topics>.model
        model_path = os.path.join(MODEL_SAVE_DIR, f'lda_model_{n_topics}.model')
        model.save(model_path)
        print(f"--- 已保存: {n_topics} 主题的模型至 {model_path} ---")

    end_time = time.time()
    print(f"\n✅ 全部模型训练完毕, 总耗时: {end_time - start_time:.2f} 秒")
    print(f"   所有模型已保存在: {MODEL_SAVE_DIR}")

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import LdaMulticore, KeyedVectors
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO


# Pickle file loaded in main() and passed to the coherence model as `texts`.
PROCESSED_CORPUS_PICKLE = 'data/BERT-LDA/bert_lda_corpus.pkl' 
# Saved gensim Dictionary.
FINAL_DICTIONARY_PATH = 'data/BERT-LDA/bert_lda_dictionary.dict'
# Word vectors in word2vec text format (loaded with binary=False).
TENCENT_WV_PATH = 'data/origin/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
# Directory searched for lda_model_<n_topics>.model files.
MODEL_SAVE_DIR = 'data/BERT-LDA/models/'
# CSV file the evaluation table is written to.
RESULTS_CSV_PATH = 'data/BERT-LDA/lda_f_evaluation.csv'

# Topic counts to evaluate (3 through 15).
TOPIC_RANGE = range(3, 16)


def evaluate_models(model_dir, topic_range, processed_texts, dictionary, word_vectors):
    """Score every saved LDA model on coherence and topic diversity.

    For each topic count in ``topic_range`` the file
    ``lda_model_<n_topics>.model`` is looked up in ``model_dir``; missing
    files are reported and skipped. Each model found is scored with c_w2v
    coherence (top 20 words per topic) and with TopicDiversity and
    InvertedRBO (top 10 words per topic).

    Args:
        model_dir: Directory containing the saved models.
        topic_range: Iterable of topic counts to evaluate.
        processed_texts: Passed to CoherenceModel as ``texts``.
        dictionary: Passed to CoherenceModel as ``dictionary``.
        word_vectors: Passed to CoherenceModel as ``keyed_vectors``.

    Returns:
        A DataFrame indexed by ``num_topics`` with the columns
        ``C_W2V (Semantic)``, ``Topic Diversity`` and ``InvertedRBO``. When
        no model file is found the frame is empty but keeps that index and
        those columns, so callers can test ``.empty`` instead of hitting a
        KeyError from ``set_index``.
    """
    print("\n--- 开始进行模型评估 ---")
    result_columns = ["num_topics", "C_W2V (Semantic)", "Topic Diversity", "InvertedRBO"]
    results = []

    # The diversity metrics are created on first use, so a run that finds
    # no model files does not need them at all.
    diversity_metric = None
    rbo_metric = None

    for n_topics in topic_range:
        model_path = os.path.join(model_dir, f'lda_model_{n_topics}.model')
        if not os.path.exists(model_path):
            print(f"警告: 模型文件未找到，跳过: {model_path}")
            continue

        if diversity_metric is None:
            diversity_metric = TopicDiversity(topk=10)
            rbo_metric = InvertedRBO(topk=10, weight=0.9)

        print(f"--- 正在评估: {n_topics} 主题的模型 ---")
        lda_model = LdaMulticore.load(model_path)

        # Coherence uses the top 20 words per topic, diversity the top 10.
        topics_for_coherence = [[word for word, _ in lda_model.show_topic(i, topn=20)] for i in range(n_topics)]
        topics_for_diversity = [[word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(n_topics)]

        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_w2v',
            keyed_vectors=word_vectors
        )
        cw2v_semantic = coherence_model.get_coherence()

        model_output_for_diversity = {"topics": topics_for_diversity}
        diversity = diversity_metric.score(model_output_for_diversity)
        rbo = rbo_metric.score(model_output_for_diversity)

        print(f"  - C_W2V (Semantic, topk=20): {cw2v_semantic:.4f}")
        print(f"  - Topic Diversity (topk=10): {diversity:.4f}")
        print(f"  - InvertedRBO (topk=10): {rbo:.4f}")

        results.append({
            "num_topics": n_topics,
            "C_W2V (Semantic)": cw2v_semantic,
            "Topic Diversity": diversity,
            "InvertedRBO": rbo
        })

    # Naming the columns explicitly keeps set_index valid for an empty list.
    return pd.DataFrame(results, columns=result_columns).set_index("num_topics")

def plot_results(results_df):
    """Plot coherence and diversity scores against the number of topics.

    Coherence is drawn on the left y-axis; Topic Diversity and InvertedRBO
    share a second y-axis on the right. The figure is shown with
    ``plt.show()``.

    Args:
        results_df: Frame indexed by topic count with the columns
            ``C_W2V (Semantic)``, ``Topic Diversity`` and ``InvertedRBO``.
    """
    print("\n--- 正在可视化评估结果... ---")
    topic_counts = results_df.index
    coherence_color = 'tab:red'
    diversity_color = 'tab:blue'

    fig, coherence_ax = plt.subplots(figsize=(14, 8))

    # Left axis: semantic coherence.
    coherence_ax.set_xlabel('Number of Topics')
    coherence_ax.set_ylabel('C_W2V Semantic Coherence (topk=20)', color=coherence_color)
    coherence_ax.plot(
        topic_counts,
        results_df['C_W2V (Semantic)'],
        color=coherence_color,
        marker='o',
        linewidth=2.5,
        label='C_W2V (Semantic)',
    )
    coherence_ax.tick_params(axis='y', labelcolor=coherence_color)
    coherence_ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Right axis: both diversity scores share one scale.
    diversity_ax = coherence_ax.twinx()
    diversity_ax.set_ylabel('Diversity Scores (topk=10)', color=diversity_color)
    diversity_series = (
        ('Topic Diversity', 'tab:blue', '-'),
        ('InvertedRBO', 'tab:cyan', '--'),
    )
    for column, line_color, line_style in diversity_series:
        diversity_ax.plot(
            topic_counts,
            results_df[column],
            color=line_color,
            marker='x',
            linestyle=line_style,
            label=column,
        )
    diversity_ax.tick_params(axis='y', labelcolor=diversity_color)

    fig.legend(loc="upper center", bbox_to_anchor=(0.5, 0.96), ncol=3, fontsize='medium')
    fig.suptitle('LDA-F 模型评估: 语义一致性 vs. 多样性', fontsize=16)
    # Reserve room at the top for the title and the figure-level legend.
    fig.tight_layout(rect=[0, 0.03, 1, 0.92])
    plt.show()

def main():
    """Load the evaluation inputs, score all models, then save and plot.

    Reads the pickled corpus, the dictionary and the word vectors, calls
    ``evaluate_models`` and, when it returns a non-empty table, prints it,
    writes it to RESULTS_CSV_PATH and plots it. An empty table only
    produces a failure message.
    """
    print("--- 1. 正在加载评估所需的文件... ---")

    # NOTE(review): this path points at bert_lda_corpus.pkl; confirm it holds
    # what the coherence model should receive as `texts`.
    with open(PROCESSED_CORPUS_PICKLE, 'rb') as corpus_file:
        processed_texts = pickle.load(corpus_file)
    print(f" - 成功加载 {len(processed_texts)} 条原始文本。")

    dictionary = Dictionary.load(FINAL_DICTIONARY_PATH)
    print(f" - 成功加载最终词典 (大小: {len(dictionary)})。")

    print(" - 正在加载腾讯词向量模型...")
    word_vectors = KeyedVectors.load_word2vec_format(TENCENT_WV_PATH, binary=False)
    print(" - 成功加载腾讯词向量。")

    results_df = evaluate_models(
        MODEL_SAVE_DIR,
        TOPIC_RANGE,
        processed_texts,
        dictionary,
        word_vectors,
    )

    # Nothing to report when no model could be evaluated.
    if results_df.empty:
        print("\n--- 评估失败: 在指定目录下未找到任何模型文件 ---")
        return

    print("\n--- 评估完成, 最终结果如下: ---")
    print(results_df)

    results_dir = os.path.dirname(RESULTS_CSV_PATH)
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(RESULTS_CSV_PATH)
    print(f"\n评估结果已保存到 {RESULTS_CSV_PATH}")

    plot_results(results_df)

# Run the evaluation only when this code is executed directly.
if __name__ == "__main__":
    main()