#### 下載 & 合併dataset

In [1]:
from datasets import load_dataset
import pandas as pd

# 從 Hugging Face 下載資料集
dataset_name = "Blueeeeee/PolitiFact_Embeddings"    # 更改bert的截斷成sliding window
dataset = load_dataset(dataset_name)

train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])
# 合併 train 和 test 資料
combined_df = pd.concat([train_df, test_df], ignore_index=True)

output_file = "news_final.xlsx"
combined_df.to_excel(output_file, index=False)
print(f"資料儲存為 {output_file}")


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 381/381 [00:00<00:00, 9027.50 examples/s]
Generating test split: 100%|██████████| 102/102 [00:00<00:00, 8081.36 examples/s]


資料儲存為 news_final.xlsx


#### 將 news embedding 取出存為NumPy檔案

In [2]:
import pandas as pd
import numpy as np

input_file = "news_final.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['bert_embeddings'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/news_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 儲存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 儲存為 Embeddings/news_embeddings.npy
(483, 768)
[[ 0.06093387 -0.3279475   0.11908183 ...  0.01421782  0.62325835
   0.22739467]
 [-0.64453363 -0.34649041 -0.71970427 ... -0.06940594  0.40790594
  -0.02794933]
 [-0.36820391 -0.61358404 -0.18433622 ...  0.07476719  0.55231643
   0.43681467]
 ...
 [ 0.11793014 -0.02744537  0.31966382 ... -0.12530054  0.49212432
   0.03951231]
 [ 0.33778453  0.17068006 -0.10644968 ... -0.11142035  0.65453684
  -0.0594855 ]
 [ 0.0282645  -0.65166104  0.29448652 ... -0.20162188  0.23490971
   0.23264199]]


#### 整理news_final.xlsx檔案欄位

In [3]:
import pandas as pd

file = "news_final.xlsx"
df = pd.read_excel(file)

# 刪除 'embeddings' 欄位
if 'bert_embeddings' in df.columns:
    df = df.drop(columns=['bert_embeddings'])

if 'roberta_embeddings' in df.columns:
    df = df.drop(columns=['roberta_embeddings'])

# 新增 'news_id' 欄位，從 0 開始到 len(df)-1
if 'news_id' not in df.columns:
    df.insert(0, 'news_id', range(len(df)))

df.to_excel(file, index=False)

print(f"成功處理檔案")


成功處理檔案


#### Bertopic


In [4]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
import umap
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity

file_path = "news_final.xlsx"
df = pd.read_excel(file_path)
texts = df['text'].tolist()

# 調整 UMAP 參數
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=5, metric='cosine')

# 調整 HDBSCAN 參數
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# BERTopic 模型
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, nr_topics=31, n_gram_range=(1, 2), verbose=True)
topics, probs = topic_model.fit_transform(texts)
embeddings = topic_model.embedding_model.embedding_model.encode(texts)
df['topic'] = topics
print(df['topic'])

# 對未分配主題的文檔進行後處理
for i, topic in enumerate(topics):
    if topic == -1:
        # 計算該文檔與所有0-29主題中心的相似度，並分配到最接近的主題
        similarities = cosine_similarity([embeddings[i]], topic_model.topic_embeddings_[0:30])[0]
        closest_topic = np.argmax(similarities)
        topics[i] = closest_topic
        df.at[i, 'topic'] = closest_topic  # 同步更新 DataFrame 的主題欄位

df['topic'] = topics
print(df['topic'])

results = []
for i, row in df.iterrows():
    result = {
        "news_id": row['news_id'],
        "text": row['text'],
        "topic": row['topic']
    }
    results.append(result)

for result in results:
    print(f"新聞 ID: {result['news_id']}")
    print(f"內容: {result['text'][:10]}...")
    print(f"主題: {result['topic']}\n")

# 獲取每個主題的主題詞和它們的權重
topic_info = topic_model.get_topic_info()
topic_details = {}
for topic in range(30):
    topic_details[topic] = topic_model.get_topic(topic)

for topic, words in topic_details.items():
    print(f"\n主題 {topic} 的主題詞和權重:")
    for word, weight in words:
        print(f"{word}: {weight:.4f}")
        # 只印到0~29主題
        # 只會有31-1個主題，-1會被指定給其他相近的主題


2025-03-27 23:18:34,225 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 16/16 [00:02<00:00,  7.17it/s]
2025-03-27 23:18:40,058 - BERTopic - Embedding - Completed ✓
2025-03-27 23:18:40,059 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-27 23:18:54,329 - BERTopic - Dimensionality - Completed ✓
2025-03-27 23:18:54,333 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-27 23:18:54,498 - BERTopic - Cluster - Completed ✓
2025-03-27 23:18:54,501 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-27 23:18:57,150 - BERTopic - Representation - Completed ✓
2025-03-27 23:18:57,157 - BERTopic - Topic reduction - Reducing number of topics
2025-03-27 23:18:59,972 - BERTopic - Topic reduction - Reduced number of topics from 56 to 31


0       3
1       1
2      -1
3      -1
4      -1
       ..
478    19
479     1
480    18
481     6
482     3
Name: topic, Length: 483, dtype: int64
0       3
1       1
2       1
3       3
4       1
       ..
478    19
479     1
480    18
481     6
482     3
Name: topic, Length: 483, dtype: int64
新聞 ID: 0
內容: Inside a F...
主題: 3

新聞 ID: 1
內容: Democrats ...
主題: 1

新聞 ID: 2
內容: Palin: Pio...
主題: 1

新聞 ID: 3
內容: Sasse Stat...
主題: 3

新聞 ID: 4
內容: Tim Kaine'...
主題: 1

新聞 ID: 5
內容: Text of H....
主題: 7

新聞 ID: 6
內容: Health Ref...
主題: 13

新聞 ID: 7
內容: Remarks by...
主題: 0

新聞 ID: 8
內容: Bill Clint...
主題: 1

新聞 ID: 9
內容: Student Ha...
主題: 1

新聞 ID: 10
內容: Donald J. ...
主題: 0

新聞 ID: 11
內容: House GOP ...
主題: 25

新聞 ID: 12
內容: Remarks by...
主題: 0

新聞 ID: 13
內容: Family's p...
主題: 3

新聞 ID: 14
內容: Barack Oba...
主題: 9

新聞 ID: 15
內容: 'This Week...
主題: 8

新聞 ID: 16
內容: YouTube Yo...
主題: 4

新聞 ID: 17
內容: America's ...
主題: 4

新聞 ID: 18
內容: Department...
主題: 17

新聞 ID: 19
內容: CONFIRMED ...
主題: 3

新聞 ID: 20

### BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)


### 利用BERT 生成Topic的加權嵌入，存為NumPy檔案

In [6]:
import numpy as np

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    # 取最後一層的輸出 (batch_size, seq_len, hidden_size)
    last_hidden_state = outputs.last_hidden_state
    # 取 [CLS] 位置的詞嵌入
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()

    return word_embedding

topic_embeddings = []

for topic in range(30):
    # 獲取主題詞和它們的權重
    topic_words = topic_model.get_topic(topic)
    
    # 計算主題的加權嵌入
    topic_embedding = np.zeros(768)
    for word, weight in topic_words:
        word_embedding = get_word_embedding(word)
        weighted_embedding = word_embedding * weight  # 加權嵌入
        topic_embedding += weighted_embedding
    
    topic_embeddings.append(topic_embedding)

# 組合為矩陣
topic_embeddings_matrix = np.vstack(topic_embeddings)
np.save("Embeddings/topic_embeddings_30.npy", topic_embeddings_matrix)

print(f"topic_embeddings已保存，形狀為: {topic_embeddings_matrix.shape}")


topic_embeddings已保存，形狀為: (30, 768)


#### 存news2topic.xlsx檔案

In [7]:
df['topic'] = topics
news2topic_df = df[['news_id', 'topic']].rename(columns={'topic': 'topic_id'})

file_name = "news2topic_30.xlsx"
output_file = "graph/edges/" + file_name 
news2topic_df.to_excel(output_file, index=False)

print(f"結果已存為 {file_name}")

結果已存為 news2topic_30.xlsx
