#### 下載 & 合併dataset

In [1]:
from datasets import load_dataset
import pandas as pd

# 從 Hugging Face 下載 Liar 資料集 (只載入 train)
dataset_name = "Blueeeeee/Liar_Embeddings"    # 更改bert的截斷成sliding window
dataset = load_dataset(dataset_name, split="train[:1280]")  # 只取前 1280 筆

# 轉換為 Pandas DataFrame
train_df = pd.DataFrame(dataset)

# 儲存為 Excel 檔案
output_file = "news_final.xlsx"
train_df.to_excel(output_file, index=False)

print(f"資料儲存為 {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


資料儲存為 news_final.xlsx


#### 將 news embedding 取出存為NumPy檔案

In [2]:
import pandas as pd
import numpy as np

input_file = "news_final.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['bert_embeddings'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/news_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 儲存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 儲存為 Embeddings/news_embeddings.npy
(1280, 768)
[[-0.52442306 -0.32204252 -0.46295533 ... -0.3363893   0.46377271
  -0.08748707]
 [-0.50526059  0.14018615 -0.39540514 ... -0.30953017  0.83329642
   0.28554618]
 [-0.51595151 -0.05632376 -0.71297407 ... -0.59846878  0.84010178
  -0.12291477]
 ...
 [-0.55240524 -0.02351463 -0.18972731 ... -0.19426206  0.4685699
   0.74677795]
 [-0.41504064  0.04059103 -0.62946928 ... -0.30588928  0.58409637
   0.18748848]
 [-0.71262294 -0.24458    -0.04365434 ... -0.100983    0.75221813
  -0.00396588]]


#### 整理news_final.xlsx檔案欄位

In [3]:
import pandas as pd

file = "news_final.xlsx"
df = pd.read_excel(file)

# 刪除 'embeddings' 欄位
if 'bert_embeddings' in df.columns:
    df = df.drop(columns=['bert_embeddings'])

if 'roberta_embeddings' in df.columns:
    df = df.drop(columns=['roberta_embeddings'])

# 新增 'news_id' 欄位，從 0 開始到 len(df)-1
if 'news_id' not in df.columns:
    df.insert(0, 'news_id', range(len(df)))

df.to_excel(file, index=False)

print(f"成功處理檔案")


成功處理檔案


#### Bertopic


In [4]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
import umap
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity

file_path = "news_final.xlsx"
df = pd.read_excel(file_path)
texts = df['text'].tolist()

# 調整 UMAP 參數
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=5, metric='cosine')

# 調整 HDBSCAN 參數
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# BERTopic 模型
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, nr_topics=31, n_gram_range=(1, 2), verbose=True)
topics, probs = topic_model.fit_transform(texts)
embeddings = topic_model.embedding_model.embedding_model.encode(texts)
df['topic'] = topics
print(df['topic'])

# 對未分配主題的文檔進行後處理
for i, topic in enumerate(topics):
    if topic == -1:
        # 計算該文檔與所有0-29主題中心的相似度，並分配到最接近的主題
        similarities = cosine_similarity([embeddings[i]], topic_model.topic_embeddings_[0:30])[0]
        closest_topic = np.argmax(similarities)
        topics[i] = closest_topic
        df.at[i, 'topic'] = closest_topic  # 同步更新 DataFrame 的主題欄位

df['topic'] = topics
print(df['topic'])

results = []
for i, row in df.iterrows():
    result = {
        "news_id": row['news_id'],
        "text": row['text'],
        "topic": row['topic']
    }
    results.append(result)

for result in results:
    print(f"新聞 ID: {result['news_id']}")
    print(f"內容: {result['text'][:10]}...")
    print(f"主題: {result['topic']}\n")

# 獲取每個主題的主題詞和它們的權重
topic_info = topic_model.get_topic_info()
topic_details = {}
for topic in range(30):
    topic_details[topic] = topic_model.get_topic(topic)

for topic, words in topic_details.items():
    print(f"\n主題 {topic} 的主題詞和權重:")
    for word, weight in words:
        print(f"{word}: {weight:.4f}")
        # 只印到0~29主題
        # 只會有31-1個主題，-1會被指定給其他相近的主題


2025-06-08 18:09:16,765 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 40/40 [00:01<00:00, 23.60it/s]
2025-06-08 18:09:21,751 - BERTopic - Embedding - Completed ✓
2025-06-08 18:09:21,752 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-08 18:09:40,219 - BERTopic - Dimensionality - Completed ✓
2025-06-08 18:09:40,222 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-08 18:09:41,988 - BERTopic - Cluster - Completed ✓
2025-06-08 18:09:41,989 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-08 18:09:42,483 - BERTopic - Representation - Completed ✓
2025-06-08 18:09:42,486 - BERTopic - Topic reduction - Reducing number of topics
2025-06-08 18:09:42,758 - BERTopic - Topic reduction - Reduced number of topics from 150 to 31


0        5
1       14
2        0
3        2
4       -1
        ..
1275     9
1276     0
1277    -1
1278    13
1279     3
Name: topic, Length: 1280, dtype: int64
0        5
1       14
2        0
3        2
4        2
        ..
1275     9
1276     0
1277     0
1278    13
1279     3
Name: topic, Length: 1280, dtype: int64
新聞 ID: 0
內容: Dwayne Boh...
主題: 5

新聞 ID: 1
內容: Scott Suro...
主題: 14

新聞 ID: 2
內容: Barack Oba...
主題: 0

新聞 ID: 3
內容: Blog Posti...
主題: 2

新聞 ID: 4
內容: Charlie Cr...
主題: 2

新聞 ID: 5
內容: Robin Vos ...
主題: 3

新聞 ID: 6
內容: Republican...
主題: 12

新聞 ID: 7
內容: Barack Oba...
主題: 0

新聞 ID: 8
內容: Oregon Lot...
主題: 18

新聞 ID: 9
內容: Duey Stroe...
主題: 1

新聞 ID: 10
內容: Robert Men...
主題: 10

新聞 ID: 11
內容: Bernie S s...
主題: 4

新聞 ID: 12
內容: Mitt Romne...
主題: 0

新聞 ID: 13
內容: Doonesbury...
主題: 1

新聞 ID: 14
內容: George Wil...
主題: 2

新聞 ID: 15
內容: Bernie S s...
主題: 11

新聞 ID: 16
內容: Barack Oba...
主題: 0

新聞 ID: 17
內容: National R...
主題: 4

新聞 ID: 18
內容: Gwen Moore...
主題: 1

新聞 ID: 19
內容: Jack

### BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)


### 利用BERT 生成Topic的加權嵌入，存為NumPy檔案

In [6]:
import numpy as np

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    # 取最後一層的輸出 (batch_size, seq_len, hidden_size)
    last_hidden_state = outputs.last_hidden_state
    # 取 [CLS] 位置的詞嵌入
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()

    return word_embedding

topic_embeddings = []

for topic in range(30):
    # 獲取主題詞和它們的權重
    topic_words = topic_model.get_topic(topic)
    
    # 計算主題的加權嵌入
    topic_embedding = np.zeros(768)
    for word, weight in topic_words:
        word_embedding = get_word_embedding(word)
        weighted_embedding = word_embedding * weight  # 加權嵌入
        topic_embedding += weighted_embedding
    
    topic_embeddings.append(topic_embedding)

# 組合為矩陣
topic_embeddings_matrix = np.vstack(topic_embeddings)
np.save("Embeddings/topic_embeddings_30.npy", topic_embeddings_matrix)

print(f"topic_embeddings已保存，形狀為: {topic_embeddings_matrix.shape}")


topic_embeddings已保存，形狀為: (30, 768)


#### 存news2topic.xlsx檔案

In [7]:
df['topic'] = topics
news2topic_df = df[['news_id', 'topic']].rename(columns={'topic': 'topic_id'})

file_name = "news2topic_30.xlsx"
output_file = "graph/edges/" + file_name 
news2topic_df.to_excel(output_file, index=False)

print(f"結果已存為 {file_name}")

結果已存為 news2topic_30.xlsx
