#### 下載 & 合併dataset

In [1]:
from datasets import load_dataset
import pandas as pd

# 從 Hugging Face 下載 FND_KDD2020 資料集 (只載入 train)
dataset_name = "Blueeeeee/FND_KDD2020_Embeddings"    # 更改bert的截斷成sliding window
dataset = load_dataset(dataset_name, split="train[:800]")  # 只取前 800 筆

# 轉換為 Pandas DataFrame
train_df = pd.DataFrame(dataset)

# 儲存為 Excel 檔案
output_file = "news_final.xlsx"
train_df.to_excel(output_file, index=False)

print(f"資料儲存為 {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


資料儲存為 news_final.xlsx


#### 將 news embedding 取出存為NumPy檔案

In [2]:
import pandas as pd
import numpy as np

input_file = "news_final.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['bert_embeddings'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/news_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 儲存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 儲存為 Embeddings/news_embeddings.npy
(800, 768)
[[ 0.15288252 -0.10234407  0.24855722 ... -0.09516022  0.47187915
   0.13419698]
 [ 0.18765637 -0.51727855  0.25113443 ...  0.12069827  0.67971748
  -0.03194928]
 [-0.0340025  -0.25536323  0.2337451  ... -0.06719862  0.22721767
   0.06320038]
 ...
 [-0.47238192 -0.20615265  0.00135468 ... -0.48358735  0.52605253
  -0.05876901]
 [-0.26525265 -0.44008005  0.15422758 ...  0.01753697  0.65917617
  -0.15203035]
 [ 0.20859168 -0.13752888 -0.26137102 ... -0.43239805  0.45633665
   0.20235544]]


#### 整理news_final.xlsx檔案欄位

In [3]:
import pandas as pd

file = "news_final.xlsx"
df = pd.read_excel(file)

# 刪除 'embeddings' 欄位
if 'bert_embeddings' in df.columns:
    df = df.drop(columns=['bert_embeddings'])

if 'roberta_embeddings' in df.columns:
    df = df.drop(columns=['roberta_embeddings'])

# 新增 'news_id' 欄位，從 0 開始到 len(df)-1
if 'news_id' not in df.columns:
    df.insert(0, 'news_id', range(len(df)))

df.to_excel(file, index=False)

print(f"成功處理檔案")


成功處理檔案


#### Bertopic


In [4]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
import umap
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity

file_path = "news_final.xlsx"
df = pd.read_excel(file_path)
texts = df['text'].tolist()

# 調整 UMAP 參數
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=5, metric='cosine')

# 調整 HDBSCAN 參數
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# BERTopic 模型
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, nr_topics=31, n_gram_range=(1, 2), verbose=True)
topics, probs = topic_model.fit_transform(texts)
embeddings = topic_model.embedding_model.embedding_model.encode(texts)
df['topic'] = topics
print(df['topic'])

# 對未分配主題的文檔進行後處理
for i, topic in enumerate(topics):
    if topic == -1:
        # 計算該文檔與所有0-29主題中心的相似度，並分配到最接近的主題
        similarities = cosine_similarity([embeddings[i]], topic_model.topic_embeddings_[0:30])[0]
        closest_topic = np.argmax(similarities)
        topics[i] = closest_topic
        df.at[i, 'topic'] = closest_topic  # 同步更新 DataFrame 的主題欄位

df['topic'] = topics
print(df['topic'])

results = []
for i, row in df.iterrows():
    result = {
        "news_id": row['news_id'],
        "text": row['text'],
        "topic": row['topic']
    }
    results.append(result)

for result in results:
    print(f"新聞 ID: {result['news_id']}")
    print(f"內容: {result['text'][:10]}...")
    print(f"主題: {result['topic']}\n")

# 獲取每個主題的主題詞和它們的權重
topic_info = topic_model.get_topic_info()
topic_details = {}
for topic in range(30):
    topic_details[topic] = topic_model.get_topic(topic)

for topic, words in topic_details.items():
    print(f"\n主題 {topic} 的主題詞和權重:")
    for word, weight in words:
        print(f"{word}: {weight:.4f}")
        # 只印到0~29主題
        # 只會有31-1個主題，-1會被指定給其他相近的主題


2025-06-08 17:16:02,847 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 25/25 [00:02<00:00,  9.43it/s]
2025-06-08 17:16:08,958 - BERTopic - Embedding - Completed ✓
2025-06-08 17:16:08,959 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-08 17:16:24,856 - BERTopic - Dimensionality - Completed ✓
2025-06-08 17:16:24,858 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-08 17:16:25,502 - BERTopic - Cluster - Completed ✓
2025-06-08 17:16:25,503 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-08 17:16:28,890 - BERTopic - Representation - Completed ✓
2025-06-08 17:16:28,899 - BERTopic - Topic reduction - Reducing number of topics
2025-06-08 17:16:32,160 - BERTopic - Topic reduction - Reduced number of topics from 96 to 31


0      0
1      3
2      0
3      2
4     -1
      ..
795   -1
796    0
797    4
798    0
799    0
Name: topic, Length: 800, dtype: int64
0      0
1      3
2      0
3      2
4      0
      ..
795    1
796    0
797    4
798    0
799    0
Name: topic, Length: 800, dtype: int64
新聞 ID: 0
內容: As the cou...
主題: 0

新聞 ID: 1
內容: [This stor...
主題: 3

新聞 ID: 2
內容: CHERRY Sea...
主題: 0

新聞 ID: 3
內容: The 2018 G...
主題: 2

新聞 ID: 4
內容: The gang’s...
主題: 0

新聞 ID: 5
內容: iZombie st...
主題: 0

新聞 ID: 6
內容: Carrie Und...
主題: 0

新聞 ID: 7
內容: Camila Cab...
主題: 0

新聞 ID: 8
內容: It started...
主題: 0

新聞 ID: 9
內容: The tale a...
主題: 3

新聞 ID: 10
內容: Last week,...
主題: 17

新聞 ID: 11
內容: Scarlett J...
主題: 0

新聞 ID: 12
內容: Lamar Odom...
主題: 0

新聞 ID: 13
內容: Taylor Swi...
主題: 1

新聞 ID: 14
內容: Robert Pat...
主題: 0

新聞 ID: 15
內容: On Saturda...
主題: 0

新聞 ID: 16
內容: In actuali...
主題: 1

新聞 ID: 17
內容: For Échame...
主題: 14

新聞 ID: 18
內容: Ming Xi Br...
主題: 0

新聞 ID: 19
內容: Awards sea...
主題: 2

新聞 ID: 20
內容: Chance the...
主題: 

### BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)


### 利用BERT 生成Topic的加權嵌入，存為NumPy檔案

In [6]:
import numpy as np

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    # 取最後一層的輸出 (batch_size, seq_len, hidden_size)
    last_hidden_state = outputs.last_hidden_state
    # 取 [CLS] 位置的詞嵌入
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()

    return word_embedding

topic_embeddings = []

for topic in range(30):
    # 獲取主題詞和它們的權重
    topic_words = topic_model.get_topic(topic)
    
    # 計算主題的加權嵌入
    topic_embedding = np.zeros(768)
    for word, weight in topic_words:
        word_embedding = get_word_embedding(word)
        weighted_embedding = word_embedding * weight  # 加權嵌入
        topic_embedding += weighted_embedding
    
    topic_embeddings.append(topic_embedding)

# 組合為矩陣
topic_embeddings_matrix = np.vstack(topic_embeddings)
np.save("Embeddings/topic_embeddings_30.npy", topic_embeddings_matrix)

print(f"topic_embeddings已保存，形狀為: {topic_embeddings_matrix.shape}")


topic_embeddings已保存，形狀為: (30, 768)


#### 存news2topic.xlsx檔案

In [7]:
df['topic'] = topics
news2topic_df = df[['news_id', 'topic']].rename(columns={'topic': 'topic_id'})

file_name = "news2topic_30.xlsx"
output_file = "graph/edges/" + file_name 
news2topic_df.to_excel(output_file, index=False)

print(f"結果已存為 {file_name}")

結果已存為 news2topic_30.xlsx
