In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from bert_score import score
from tqdm import tqdm
import os

# Hook tqdm into pandas; the notebook later calls df.progress_apply, which relies on this.
tqdm.pandas()

In [26]:
# Folder holding the exported Stack Exchange CSV files. The default is the
# original author's machine-specific location; set PHYSICS_SE_CSV_DIR to point
# the notebook at a different folder without editing this cell.
csv_folder = os.environ.get(
    "PHYSICS_SE_CSV_DIR",
    r"C:\Users\jiali\OneDrive - TUM\Studium\Data-Mining\physics.stackexchange.com\CSV",
)
csv_path = os.path.join(csv_folder, "Posts_with_sentiment.csv")

# Only the first 5000 posts are analysed, so let the parser stop after that many
# rows instead of loading the whole file and discarding the rest.
max_rows = 5000
df = pd.read_csv(csv_path, nrows=max_rows)

In [27]:
def parse_tags(tag_str):
    """Split a raw Stack Exchange tag string into a list of tag names.

    Accepts the pipe-delimited form (``"|optics|waves|"``) as well as the
    angle-bracket form (``"<optics><waves>"``). Surrounding whitespace is
    removed from every tag and empty or whitespace-only pieces are dropped.

    Parameters
    ----------
    tag_str : object
        Raw cell value. Anything that is not a ``str`` (e.g. NaN for posts
        without tags, or ``None``) is treated as "no tags".

    Returns
    -------
    list of str
        Tag names in their original order; ``[]`` when there are none.
    """
    if not isinstance(tag_str, str):
        return []
    # Map both bracket characters onto the pipe delimiter so one split handles
    # either format.
    normalized = tag_str.replace("<", "|").replace(">", "|")
    pieces = (piece.strip() for piece in normalized.split("|"))
    return [piece for piece in pieces if piece]

# Turn the raw "Tags" column into a list-valued "TagList" column (one list of tag names per post).
df["TagList"] = df["Tags"].apply(parse_tags)

In [28]:
# Step 2: TF-IDF vectorisation of the "CleanBodyNoMath" column.
# The vocabulary is capped at 10000 terms and English stop words are removed.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
# Empty text fields come back from read_csv as NaN, which the vectorizer cannot
# tokenise; treat such posts as empty documents instead of crashing.
X = vectorizer.fit_transform(df["CleanBodyNoMath"].fillna("").astype(str))
# Vocabulary terms, indexed by the column positions of X.
feature_names = vectorizer.get_feature_names_out()

In [29]:
# Step 3: extract topics with NMF.
# n_topics is the number of NMF components; random_state is fixed so repeated runs agree.
n_topics = 20
nmf_model = NMF(n_components=n_topics, random_state=42)
# W: output of fitting on X; indexed later by post position, with one weight per topic.
W = nmf_model.fit_transform(X)
# H: the fitted components; indexed later by topic, with one weight per vocabulary term.
H = nmf_model.components_

In [30]:
# Step 4: derive up to top_n keywords for each post from its dominant topic.
def extract_keywords(row_index, W, H, feature_names, top_n=5):
    """Return the highest-weighted terms of the dominant topic of one post.

    Parameters
    ----------
    row_index : int
        Position of the post in ``W``.
    W : array-like, indexed as ``W[row_index]``
        Per-post topic weights.
    H : array-like, indexed as ``H[topic_idx]``
        Per-topic term weights.
    feature_names : sequence
        Term for each column index of ``H``.
    top_n : int, default 5
        Maximum number of keywords to return. Values ``<= 0`` yield ``[]``.

    Returns
    -------
    list
        Up to ``top_n`` terms, ordered from highest to lowest weight. Empty
        when the post has no weight on any topic.
    """
    if top_n <= 0:
        # A negative value would otherwise slice from the wrong end of the ranking.
        return []

    topic_weights = W[row_index]
    if not topic_weights.any():
        # argmax of an all-zero row is 0, which would mislabel a post that
        # matches no topic with topic 0's keywords.
        return []

    topic_idx = topic_weights.argmax()
    # argsort is ascending; reverse it so the heaviest terms come first.
    top_word_indices = H[topic_idx].argsort()[::-1][:top_n]
    return [feature_names[i] for i in top_word_indices]

# One keyword list per post; position i in df is looked up as row i of W.
df["Keywords"] = [extract_keywords(i, W, H, feature_names) for i in range(len(df))]

In [31]:
# Step 5: Jaccard similarity between a post's tags and its extracted keywords.
def compute_jaccard(row):
    """Jaccard index of the ``"TagList"`` and ``"Keywords"`` entries of ``row``.

    Both entries are treated as sets, so duplicates do not matter. The result
    is ``|intersection| / |union|``; two empty collections count as a perfect
    match and give ``1.0``.
    """
    tag_set, keyword_set = set(row["TagList"]), set(row["Keywords"])
    combined = tag_set | keyword_set
    if not combined:
        # Both sides are empty: define the similarity as 1.0 rather than dividing by zero.
        return 1.0
    shared = tag_set & keyword_set
    return len(shared) / len(combined)

# Row-wise Jaccard score with a progress bar (progress_apply is provided by tqdm's pandas integration).
df["Jaccard"] = df.progress_apply(compute_jaccard, axis=1)

100%|██████████| 5000/5000 [00:00<00:00, 126554.79it/s]


In [32]:
from bert_score import score
from tqdm import tqdm
import pandas as pd

# Build one space-joined sentence per post: keywords form the candidate side,
# tags form the reference side.
candidates = [" ".join(keyword_list) for keyword_list in df["Keywords"]]
references = [" ".join(tag_list) for tag_list in df["TagList"]]

# Score all candidate/reference pairs in one batched call; device selection is
# left to bert_score.
P, R, F1 = score(candidates, references, lang="en", verbose=True)

# Store the per-post precision, recall and F1 as plain Python floats.
df["BERT_Precision"] = P.tolist()
df["BERT_Recall"] = R.tolist()
df["BERT_F1"] = F1.tolist()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 19/19 [00:01<00:00, 11.33it/s]


computing greedy matching.


100%|██████████| 79/79 [00:01<00:00, 67.58it/s]


done in 2.86 seconds, 1751.09 sentences/sec


In [None]:
# Compute the overall averages across all rows of df.
# NOTE(review): rows whose TagList is empty (non-string "Tags" values) are included
# in both means; they score 0 on Jaccard whenever keywords exist, which pulls the
# average down. Consider restricting to posts that actually carry tags — confirm intent.
mean_jaccard = df["Jaccard"].mean()
mean_bert_f1 = df["BERT_F1"].mean()

# Report both averages rounded to three decimals.
print(f"🔍 平均 Jaccard 相似度：{mean_jaccard:.3f}")
print(f"🔍 平均 BERT F1 分数：  {mean_bert_f1:.3f}")

🔍 平均 Jaccard 相似度：0.005
🔍 平均 BERT F1 分数：  0.229


: 