In [None]:
import json
import jieba
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from gensim.models.ldamodel import LdaModel
from gensim import corpora, models
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable
from transformers import BertModel, BertTokenizer

In [None]:
# Location of the exported review JSON (machine-specific absolute path; adjust when running elsewhere).
json_path = r'C:\Users\11435\Desktop\clutter\research\data\stock\BV1LuxZeVE25.json'

with open(json_path, 'r', encoding='utf-8') as file:
    # Parse the JSON file.
    data = json.load(file)

# Keep only the 'review' field of every record.
doc_data = [info['review'] for info in data]
# Preview a few reviews instead of dumping the whole corpus into the cell output.
doc_data[:5]

In [None]:
# Load the two stop-word lists and remove stop words from the tokenised reviews.
stopwords_path1 = r'C:\Users\11435\Desktop\clutter\research\data\corpus\stopwords_scu.txt'
with open(stopwords_path1, 'r', encoding='utf-8') as f:
    stopwords1 = {line.strip() for line in f}

stopwords_path2 = r'C:\Users\11435\Desktop\clutter\research\data\corpus\stopwords_hit.txt'
with open(stopwords_path2, 'r', encoding='utf-8') as f:
    stopwords2 = {line.strip() for line in f}

# The effective stop-word set is the union of both lists.
stopwords = stopwords1 | stopwords2

# Tokenise each review with jieba; drop whitespace-only tokens and stop words.
texts = [
    [word for word in jieba.cut(doc) if word.strip() and word not in stopwords]
    for doc in doc_data
]

In [None]:
# Frequency filter: keep only tokens whose corpus-wide count exceeds FREQ_LIMIT.
# Note: this rebinds `texts`, so re-running the cell filters the already-filtered corpus again.
FREQ_LIMIT = 1
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1
texts = [
    [token for token in text if frequency[token] > FREQ_LIMIT]
    for text in texts
]

In [None]:
# Build the gensim dictionary and the bag-of-words corpus (one BoW vector per document).
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Train one LDA model per candidate topic count and record each model's perplexity.
models_perplexity = []
num_topics_range = range(2, 21)  # 2 to 20 topics

for num_topics in num_topics_range:
    # A fixed random_state makes the sweep reproducible across runs.
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                                passes=10, random_state=42)
    # log_perplexity() returns the per-word likelihood bound (higher is better), NOT the
    # perplexity itself; gensim defines perplexity = 2 ** (-bound), where lower is better.
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity = np.exp2(-per_word_bound)
    models_perplexity.append((num_topics, perplexity))

# Print the topic count and perplexity of every model.
for num_topics, perplexity in models_perplexity:
    print(f"Number of Topics: {num_topics}, Perplexity: {perplexity}")

# Pick the model with the lowest perplexity.
best_num_topics = min(models_perplexity, key=lambda x: x[1])[0]
print(f"Best number of topics: {best_num_topics}")

In [None]:
# Plot the score against the number of topics. Passing the list of (num_topics, score)
# tuples straight to plt.plot() would draw two unrelated lines against the list index.
topic_counts, perplexity_values = zip(*models_perplexity)
fig, ax = plt.subplots()
ax.plot(topic_counts, perplexity_values, marker='o')
ax.set_xlabel('Number of topics')
ax.set_ylabel('Perplexity')
ax.set_title('LDA perplexity by number of topics');

In [None]:
# Topic count used for the final model (chosen manually; `best_num_topics` from the sweep is not used here).
num_topic = 5
# A fixed random_state keeps the learned topics stable across kernel restarts.
lda = models.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=15, random_state=42)

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Compute the c_v topic coherence of the final model `lda`.
# (`lda_model` is only the last model left over from the perplexity sweep, not the model used downstream.)
coherence_model = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print('Coherence Score: ', coherence_score)

In [None]:
# u_mass coherence of the final model `lda` (not `lda_model`, the leftover model from the perplexity sweep).
CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='u_mass').get_coherence()

In [None]:
def topic_data_fill(topic_distribution, topic_num):
    """Expand a sparse topic distribution into a dense vector of length ``topic_num``.

    Args:
        topic_distribution: iterable of ``(topic_id, probability)`` pairs; topics that
            are absent from the iterable are treated as probability 0.
        topic_num: total number of topics, i.e. the length of the returned vector.

    Returns:
        A list of length ``topic_num`` where position ``i`` holds the probability of topic ``i``.

    Raises:
        IndexError: if a topic id lies outside ``[0, topic_num)``.
    """
    features = [0.0] * topic_num
    # Always place each probability by its topic id, so the result does not depend on
    # the order of the pairs and always has exactly `topic_num` entries.
    for topic_id, probability in topic_distribution:
        if not 0 <= topic_id < topic_num:
            raise IndexError(f"topic id {topic_id} out of range for {topic_num} topics")
        features[topic_id] = probability
    return features

# Extract the dense topic vector of every document.
lda_features = []
for document in corpus:
    topic_distribution = lda.get_document_topics(document)
    # Size the vector with `num_topic` (the topic count `lda` was trained with) rather than
    # a hard-coded 5, so changing the topic count cannot silently mis-size the features.
    lda_features.append(topic_data_fill(topic_distribution, num_topic))

In [None]:
# Show the 4 highest-weighted words of every topic, one topic per line.
topics = lda.print_topics(num_words=4)
for topic_summary in topics:
    print(topic_summary)

In [None]:
# Sanity check: print the index of every document whose topic vector has fewer than 5 entries.
short_vector_indices = [idx for idx, features in enumerate(lda_features) if len(features) < 5]
for idx in short_vector_indices:
    print(idx)

In [None]:
# Preview the first few topic vectors instead of dumping the whole list into the output.
lda_features[:5]

In [None]:
# Load the pretrained BERT tokenizer and model from a local directory.
# Note: the name `model` is rebound to the autoencoder in the training cell further down,
# so this cell must be re-run before the BERT feature extraction is re-run.
model_path = r'D:\tool\toolkit\nlp\distiluse-base-multilingual-cased-v2-finetuned-stsb_multi_mt-es'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

In [None]:
# Encode every review with BERT and keep one feature vector per review.
bert_features = []
for article in doc_data:
    # Tokenise and encode the review (truncated to at most 512 tokens).
    encoded_input = tokenizer(
        article,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**encoded_input)
    # Take the embedding at sequence position 0 (the CLS token) as the review's feature vector.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    bert_features.append(cls_embedding.detach().numpy())

In [None]:
# Concatenate each document's BERT vector with its LDA topic vector (BERT part first).
concatenated_features = [
    np.concatenate((bert_feature, lda_feature))
    for bert_feature, lda_feature in zip(bert_features, lda_features)
]

In [None]:
# Autoencoder definition.
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 128 -> 64 -> hidden_size -> 64 -> 128 -> input_dim.

    Args:
        input_dim: size of each input vector (and of the reconstruction).
        hidden_size: size of the bottleneck code.
    """

    def __init__(self, input_dim, hidden_size):
        super().__init__()

        # Encoder trunk: two Linear + ReLU stages.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
        )
        # Linear projections into and out of the bottleneck (no activation on the code).
        self.en_fc = nn.Linear(64, hidden_size)
        self.de_fc = nn.Linear(hidden_size, 64)
        # Decoder trunk; the final layer is linear.
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, input_dim),
        )

    def forward(self, x):
        """Return ``(code, decoded)``: the bottleneck code and the reconstruction of ``x``."""
        code = self.en_fc(self.encoder(x))
        decoded = self.decoder(self.de_fc(code))
        return code, decoded

In [None]:
# Prepare the data.
# Model parameters default to float32; to train in double (float64) use model = model.double().
# Stack the per-document vectors into a single ndarray first: building a tensor directly
# from a list of ndarrays is very slow and triggers a PyTorch warning.
feature_matrix = np.asarray(concatenated_features, dtype=np.float32)
tensor_data = torch.from_numpy(feature_matrix)

# Seed torch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

dataset = TensorDataset(tensor_data)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Set up training.
HIDDEN_SIZE = 32
input_dim = tensor_data.shape[1]

# Note: from here on `model` refers to the autoencoder, no longer to the BERT model loaded earlier.
model = AutoEncoder(input_dim, HIDDEN_SIZE)
criterion = nn.MSELoss()  # mean squared reconstruction error
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model.
model = model.to("cpu")
model.train()
epochs = 5
for epoch in range(epochs):
    for batch_idx, (data,) in enumerate(data_loader):
        # Forward pass: element [1] of the model output is the reconstruction.
        outputs = model(data)[1]
        loss = criterion(outputs, data)

        # Backward pass and optimisation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss every 100 batches.
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Batch [{batch_idx+1}/{len(data_loader)}], Loss: {loss.item():.4f}')

In [None]:
# Extract the bottleneck code of every document.
# The model must be called on the feature tensor (`tensor_data`), not on the TensorDataset
# wrapper, and `.item()` only works for single-element tensors, so convert to a NumPy array.
model.eval()
with torch.no_grad():
    encoder_features = model(tensor_data)[0].numpy()

In [None]:
# Cluster the encoded documents with KMeans (keywords are extracted from the clusters afterwards).

from sklearn.cluster import KMeans
import numpy as np

K = 5

# Create the KMeans instance with K clusters. random_state and an explicit n_init make the
# cluster labels reproducible across runs and scikit-learn versions.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=42)

# Fit the model.
kmeans.fit(encoder_features)

# Cluster label of every data point.
labels = kmeans.labels_

# Cluster centers.
centers = kmeans.cluster_centers_

# Print the results.
print("Cluster labels:", labels)
print("Cluster centers:", centers)


In [None]:
# Refine topic words for every cluster from the clustering result.

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Collect the tokenised documents of every cluster.
category_docs = {}
for doc, label in zip(texts, labels):
    category_docs.setdefault(label, []).append(doc)

# `texts` is already tokenised (lists of tokens), so use a pass-through analyzer;
# the default analyzer expects raw strings and would fail on lists.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)

# TF-IDF matrix with one row per document.
X = vectorizer.fit_transform(texts)

# Maps cluster label -> L1-normalised mean TF-IDF vector (1 x vocabulary sparse row).
category_topics = {}

# Number of documents in every cluster.
category_doc_counts = {label: len(docs) for label, docs in category_docs.items()}

# Compute the mean TF-IDF vector of every cluster.
for label, docs in category_docs.items():
    # Rows of the documents that belong to the current cluster.
    category_tfidf = X[labels == label]

    # The mean of a sparse matrix comes back dense; turn it back into a sparse row so that
    # normalize() accepts it and downstream code can read `.indices` / `.data`.
    mean_row = np.asarray(category_tfidf.mean(axis=0)).reshape(1, -1)
    category_avg_tfidf = sparse.csr_matrix(mean_row)

    # L1-normalise the mean TF-IDF vector.
    category_avg_tfidf = normalize(category_avg_tfidf, norm='l1')

    # Store the cluster's mean TF-IDF vector.
    category_topics[label] = category_avg_tfidf

# Vocabulary terms, indexed by TF-IDF column.
feature_names = vectorizer.get_feature_names_out()

# Extract the topic words of a cluster.
def get_category_topic(label, num_words=10):
    """Return up to ``num_words`` vocabulary terms with the highest mean TF-IDF for a cluster.

    Reads the module-level ``category_topics`` (label -> mean TF-IDF vector) and
    ``feature_names`` (column index -> term).

    Args:
        label: cluster label, a key of ``category_topics``.
        num_words: maximum number of terms to return.

    Returns:
        List of terms ordered by descending score; terms with a zero score are never returned.
    """
    avg_tfidf = category_topics[label]

    # Accept both sparse and dense (ndarray / np.matrix) vectors: only sparse matrices expose
    # `.indices` / `.data`, so work on a flat dense score array instead.
    if hasattr(avg_tfidf, "toarray"):
        avg_tfidf = avg_tfidf.toarray()
    word_scores = np.asarray(avg_tfidf, dtype=float).ravel()

    # Column indices sorted by descending score.
    ranked_indices = np.argsort(-word_scores, kind="stable")

    # Keep the top N, skipping terms that never occur in the cluster.
    top_n_words = [
        feature_names[index]
        for index in ranked_indices[:num_words]
        if word_scores[index] > 0
    ]
    return top_n_words

# Extract the topic words of every cluster.
category_topic_words = {}
for label in category_topics:
    category_topic_words[label] = get_category_topic(label)

# Report the topic words cluster by cluster.
for label, topic_words in category_topic_words.items():
    print(f"Category {label} Topic Words: {topic_words}")

# Next: compute the evaluation metrics (topic coherence, silhouette coefficient, Jaccard coefficient).

In [None]:
# 指标umass计算方式,如果采用UCI计算方式,把window_size的参数进行调整
import numpy as np
from collections import defaultdict, Counter
from itertools import combinations

def build_co_occurrence_matrix(documents, window_size=2):
    """Count ordered word pairs that co-occur inside a sliding window.

    A window of ``window_size`` consecutive words is slid over every document one position
    at a time, and every 2-combination inside each window (in document order) is counted.
    Documents shorter than ``window_size`` contribute nothing.

    Args:
        documents: iterable of documents, each an iterable of words.
        window_size: number of consecutive words per window.

    Returns:
        ``defaultdict(int)`` mapping ``(word_a, word_b)`` to its count.
    """
    pair_counts = defaultdict(int)
    for document in documents:
        tokens = list(document)
        window_starts = range(len(tokens) - window_size + 1)
        for start in window_starts:
            window = tokens[start:start + window_size]
            for pair in combinations(window, 2):
                pair_counts[pair] += 1
    return pair_counts

def calculate_pmi(co_occurrence):
    """Replace every pair count in ``co_occurrence`` with a pointwise mutual information score.

    Word totals are derived from the pair counts themselves (a word's total is the sum of
    the counts of all pairs it appears in), and probabilities are taken relative to the sum
    of all pair counts. The mapping is modified in place.

    Args:
        co_occurrence: mapping ``(word1, word2) -> count``.

    Returns:
        The same mapping object, now holding PMI values instead of counts.
    """
    # Total occurrences of every individual word across all pairs.
    word_counts = Counter()
    for (first, second), count in co_occurrence.items():
        word_counts[first] += count
        word_counts[second] += count

    # Normalising constant: the sum of all pair counts.
    total_documents = sum(co_occurrence.values())

    # Iterate over a snapshot of the keys; values are overwritten but no key is added or removed.
    for pair in list(co_occurrence):
        word1, word2 = pair
        pair_prob = co_occurrence[pair] * 2 / total_documents
        independent_prob = (word_counts[word1] / total_documents) * (word_counts[word2] / total_documents)
        if independent_prob > 0:
            co_occurrence[pair] = np.log(pair_prob / independent_prob)
        else:
            co_occurrence[pair] = 0

    return co_occurrence

def umass_score(texts, top_n=-1):
    """Return the mean PMI over all co-occurring word pairs of a set of documents.

    Despite the name, the score is the plain average of the values produced by
    ``calculate_pmi`` on the co-occurrence counts of ``texts``.

    Args:
        texts: iterable of tokenised documents.
        top_n: currently unused; kept for interface compatibility.

    Returns:
        The mean PMI, or ``0.0`` when the documents contain no word pair at all
        (for example when every document has fewer than two tokens).
    """
    co_occurrence = build_co_occurrence_matrix(texts)
    pmi_matrix = calculate_pmi(co_occurrence)
    # Without any pair the average is undefined; return 0.0 instead of dividing by zero.
    if not pmi_matrix:
        return 0.0
    umass = sum(pmi_matrix.values()) / len(pmi_matrix)
    return umass


# Score every cluster, then average the per-cluster scores.
coherence_scores = {}
for label, cluster in category_docs.items():
    coherence_scores[label] = umass_score(cluster)
average_coherence = sum(coherence_scores.values()) / len(coherence_scores)

In [None]:
# Silhouette coefficient: scikit-learn already ships an implementation, so use it directly.
from sklearn.metrics import silhouette_score
def sc(features, labels):
    """Return the silhouette score of ``features`` under the cluster assignment ``labels``."""
    return silhouette_score(features, labels)

# Score the KMeans assignment on the encoded features it was fitted on
# (the call previously passed no arguments and raised a TypeError).
silhouette_avg = sc(encoder_features, labels)
print(f"轮廓系数: {silhouette_avg:.2f}")

In [None]:
# Jaccard coefficient / divergence -> not meaningful: for hard clustering the result is basically always 1.
def jaccard_distance(u, v):
    """Return the Jaccard distance ``1 - |U ∩ V| / |U ∪ V|`` between the element sets of ``u`` and ``v``.

    Returns 0 when both inputs are empty.
    """
    set_u, set_v = set(u), set(v)
    union_size = len(set_u | set_v)
    if union_size == 0:
        return 0
    return 1 - len(set_u & set_v) / union_size

def jaccard_coefficient(centers):
    """Return the mean pairwise Jaccard distance between cluster centers.

    Args:
        centers: sequence of cluster centers, each an iterable of values.

    Returns:
        The average of ``jaccard_distance`` over all unordered pairs of centers, or ``0.0``
        when there are fewer than two centers (no pair exists to compare).
    """
    n_clusters = len(centers)
    pair_count = n_clusters * (n_clusters - 1) // 2
    # With zero or one center there is no pair; avoid dividing by zero.
    if pair_count == 0:
        return 0.0

    jaccard_sum = 0
    for i in range(n_clusters):
        for j in range(i + 1, n_clusters):
            jaccard_sum += jaccard_distance(centers[i], centers[j])

    return jaccard_sum / pair_count

# Example usage
# centers = np.array([...])  # cluster centers
# jc = jaccard_coefficient(centers)
# print("Jaccard 散度:", jc)
# Evaluate the mean pairwise Jaccard distance on `centers` (the KMeans cluster centers).
jaccard_coefficient(centers)