In [1]:
import pandas as pd
import re
from pathlib import Path
from keybert import KeyBERT
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import torch
import os
from tqdm import tqdm
tqdm.pandas() 

# ✅ 1. 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# ✅ 2. 加载轻量语义嵌入模型到 GPU
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# ✅ 3. 读取数据
csv_folder_physics = r"C:\Users\jiali\OneDrive - TUM\Studium\Data-Mining\physics.stackexchange.com\CSV"
csv_folder_literature = r"D:\Studium\DM-Dataset\literature.stackexchange.com\Literature_CSV_raw"
csv_folder_math = r"D:\Studium\DM-Dataset\math.stackexchange.com\CSV"

csv_path = os.path.join(csv_folder_literature, "Posts.csv")
df = pd.read_csv(csv_path, usecols=["Id", "Title", "Body", "Tags", "PostTypeId"])
df = df[df["PostTypeId"] == 1].dropna(subset=["Body", "Tags"]).reset_index(drop=True)
 

In [3]:
# ✅ 4. 文本清洗
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^\w\s]", " ", text.lower())
    words = [w for w in text.split() if w not in ENGLISH_STOP_WORDS]
    return " ".join(words)

df["CleanText"] = (df["Title"] + " " + df["Body"]).apply(clean_text)

In [4]:
# ✅ 5. 初始化 KeyBERT（内部使用 SentenceTransformer，已在 GPU 上）
kw_model = KeyBERT(model=embedder)

In [5]:
# ✅ 6. 提取关键词
def extract_keywords(text):
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)
    return [kw for kw, _ in keywords]

df["Keywords"] = df["CleanText"].progress_apply(extract_keywords)

100%|██████████| 6780/6780 [05:36<00:00, 20.14it/s]


In [6]:
# ✅ 7. 标签清洗
def extract_tags(tag_str):
    return [tag.strip().lower().replace("-", " ") for tag in tag_str.strip("|").split("|") if tag]

df["TagList"] = df["Tags"].progress_apply(extract_tags)

100%|██████████| 6780/6780 [00:00<00:00, 483991.10it/s]


In [7]:
from bert_score import score as bertscore

# ✅ 1. 拼接关键词和标签列表为句子
candidates = df["Keywords"].apply(lambda kws: " ".join(kws)).tolist()   # 关键词作为候选
references = df["TagList"].apply(lambda tags: " ".join(tags)).tolist()  # 标签作为参考

# ✅ 2. 一次性计算所有样本的 BERTScore（推荐开启 GPU）
P, R, F1 = bertscore(
    candidates,
    references,
    lang="en",
    device="cuda",        # 或 device="mps"（Mac）/ "cpu"
    batch_size=64,        # 可视 GPU 内存调整
    verbose=True
)

# ✅ 3. 写入结果到 DataFrame
df["BERT_Precision"] = P
df["BERT_Recall"]    = R
df["BERT_F1"]        = F1

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 171/171 [00:25<00:00,  6.60it/s]


computing greedy matching.


100%|██████████| 106/106 [00:01<00:00, 59.05it/s]

done in 27.73 seconds, 244.48 sentences/sec





In [8]:
# ✅ 9. 嵌入向量相似度（cosine）匹配（加速）
def cosine_match_ratio(keywords, tags, threshold=0.6):
    if not keywords or not tags:
        return 0.0, []
    kw_emb = embedder.encode(keywords, convert_to_tensor=True, device=device, batch_size=32)
    tag_emb = embedder.encode(tags,     convert_to_tensor=True, device=device)
    cos = util.cos_sim(kw_emb, tag_emb)
    matched = [kw for i, kw in enumerate(keywords) if max(cos[i]).item() >= threshold]
    return len(matched) / max(len(keywords), 1), matched

df[["MatchRatio", "MatchedKeywords"]] = df.progress_apply(
    lambda r: pd.Series(cosine_match_ratio(r["Keywords"], r["TagList"])), axis=1
)

100%|██████████| 6780/6780 [02:13<00:00, 50.97it/s]


In [9]:
# ✅ 10. 标记可疑标签
df["PotentialTagIssue"] = df["MatchRatio"] < 0.2

# ✅ 11. 精准率 / 召回率 / F1
def compute_prf1(row):
    keywords_set = set(row["Keywords"])
    tags_set = set(row["TagList"])
    matched_set = set(row["MatchedKeywords"])

    tp = len(matched_set)
    fp = len(keywords_set - matched_set)
    fn = len(tags_set - matched_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return pd.Series([precision, recall, f1])

df[["Precision", "Recall", "F1"]] = df.progress_apply(compute_prf1, axis=1)

100%|██████████| 6780/6780 [00:00<00:00, 9112.90it/s] 


In [10]:
# ✅ 12. 平均指标
print(f"Average MatchRatio: {df['MatchRatio'].mean():.2f}")
print(f"Average Precision: {df['Precision'].mean():.2f}")
print(f"Average Recall: {df['Recall'].mean():.2f}")
print(f"Average F1-score: {df['F1'].mean():.2f}")
print(f"Average BERTScore-P:  {df['BERT_Precision'].mean():.2f}")
print(f"Average BERTScore-R:  {df['BERT_Recall'].mean():.2f}")
print(f"Average BERTScore-F1: {df['BERT_F1'].mean():.2f}")

# ✅ 13. 打印前几条可疑标签结果
print(df[df["PotentialTagIssue"]][["Id", "Title", "TagList", "Keywords", "MatchedKeywords", "MatchRatio"]].head(5))

Average MatchRatio: 0.27
Average Precision: 0.27
Average Recall: 0.37
Average F1-score: 0.30
Average BERTScore-P:  0.79
Average BERTScore-R:  0.83
Average BERTScore-F1: 0.81
    Id                                              Title  \
2    3                    Why does Mr. Poe cough so much?   
3    4   In Brave New World, what caste is Lenina Crowne?   
12  18  How did Madeline Usher survive without food or...   
13  21               Why was The Call of the Wild banned?   
20  43  Is this hypothesis about the significance of t...   

                                              TagList  \
2   [character analysis, lemony snicket, a series ...   
3   [character analysis, aldous huxley, brave new ...   
12                   [edgar allan poe, short stories]   
13    [censorship, jack london, the call of the wild]   
20  [name significance, patrick rothfuss, the king...   

                                             Keywords       MatchedKeywords  \
2   [poe cough, poe portrayed, arthur

In [11]:
df[df["PotentialTagIssue"]][["Id", "Title", "TagList", "Keywords", "MatchedKeywords", "MatchRatio"]].head(5)

Unnamed: 0,Id,Title,TagList,Keywords,MatchedKeywords,MatchRatio
2,3,Why does Mr. Poe cough so much?,"[character analysis, lemony snicket, a series ...","[poe cough, poe portrayed, arthur poe, cough b...",[series unfortunate],0.1
3,4,"In Brave New World, what caste is Lenina Crowne?","[character analysis, aldous huxley, brave new ...","[lenina caste, caste lenina, world caste, cast...",[],0.0
12,18,How did Madeline Usher survive without food or...,"[edgar allan poe, short stories]","[madeline usher, usher madeline, usher survive...",[],0.0
13,21,Why was The Call of the Wild banned?,"[censorship, jack london, the call of the wild]","[banning classics, banned book, book banned, b...",[banning],0.1
20,43,Is this hypothesis about the significance of t...,"[name significance, patrick rothfuss, the king...","[addiction denna, denna addictive, textual den...",[],0.0


In [12]:
df["PerfectTags"] = df["MatchRatio"] > 0.9

In [13]:
df[df["PerfectTags"]][["Id", "Title", "TagList", "Keywords", "MatchedKeywords", "MatchRatio"]].head(5)

Unnamed: 0,Id,Title,TagList,Keywords,MatchedKeywords,MatchRatio
14,22,How did the Strugatsky Brothers' experience wi...,"[censorship, russian language, strugatsky brot...","[publishing censorship, censorship experience,...","[publishing censorship, censorship experience,...",1.0
16,24,Did J. K. Rowling base platform 9 3/4 on the G...,"[harry potter, j k rowling, inspiration, the s...","[rowling base, potter gump, potter series, pot...","[rowling base, potter gump, potter series, pot...",1.0
17,25,"In Foucault's Pendulum, by what method does th...","[translation, italian language, foucalts pendu...","[translate italian, italian foucault, italian ...","[translate italian, italian foucault, italian ...",1.0
74,178,How many of Shakespeare's words in his plays w...,"[william shakespeare, word coinage]","[words shakespeare, shakespeare words, word sh...","[words shakespeare, shakespeare words, word sh...",1.0
87,228,Was Isidora's fate in Melmoth the Wanderer dir...,"[inspiration, allusions, faust, charles robert...","[faustian tale, inspired faust, melmoth faust,...","[faustian tale, inspired faust, melmoth faust,...",1.0
