# In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
import torch
import pandas as pd
import numpy as np

# Read the tag table from disk into a DataFrame.
file_path = '../Data/Tags/tag_counter.csv'
tags_df = pd.read_csv(filepath_or_buffer=file_path)

# Load the pretrained BERT tokenizer and encoder from the same checkpoint
# (tokenizer first, then the model).
tokenizer, model = (
    loader.from_pretrained("bert-base-uncased")
    for loader in (AutoTokenizer, AutoModel)
)

def get_embedding(text, max_length=10):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` without gradient tracking, and the final
    hidden states are averaged over the token positions.

    Args:
        text: The text to embed. A list of texts is passed straight to the
            tokenizer as a batch.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated. Defaults to 10, the previously hard-coded value.

    Returns:
        A NumPy array holding the pooled embedding, with size-1 dimensions
        squeezed out (so a single text yields a 1-D vector).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight each position by the attention mask so padding tokens (present
    # only when a batch of differently sized texts is passed) do not dilute
    # the average. For a single text the mask is all ones, so this equals a
    # plain mean over the sequence dimension up to floating-point rounding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embedding = ((hidden * mask).sum(dim=1) / token_counts).squeeze()
    return embedding.cpu().numpy()

# Build one embedding per tag.
# pandas parses tag strings such as "null", "NA" or "nan" as missing values
# (float NaN), which the tokenizer rejects, so rows without a usable tag are
# dropped up front instead of crashing halfway through the embedding pass.
missing_tags = tags_df['tag'].isna()
if missing_tags.any():
    print(f"Skipping {int(missing_tags.sum())} row(s) with a missing tag")
    tags_df = tags_df[~missing_tags].copy()
# Cast to str only for tokenization so the stored 'tag' column keeps its
# original values.
tags_df['embedding'] = tags_df['tag'].astype(str).apply(get_embedding)

# Stack the per-row vectors into a (n_tags, embedding_dim) matrix for
# clustering.
embeddings = np.vstack(tags_df['embedding'].to_numpy())

# KMeans clustering.
num_clusters = 10  # adjust the number of clusters to the data at hand
kmeans = KMeans(
    # KMeans raises when asked for more clusters than there are samples.
    n_clusters=min(num_clusters, len(embeddings)),
    # Pin n_init explicitly: its default differs between scikit-learn
    # versions, which would otherwise change the clustering result.
    n_init=10,
    random_state=0,
)
tags_df['cluster'] = kmeans.fit_predict(embeddings)

# Collect the tags that belong to each cluster.
clustered_tags = tags_df.groupby('cluster')['tag'].apply(list)

# Print the list of tags in every cluster.
print(clustered_tags)
