#### 下載 & 合併dataset

In [1]:
from datasets import load_dataset
import pandas as pd

# 從 Hugging Face 下載 Fake_News_GossipCop 資料集 (只載入 train)
dataset_name = "LittleFish-Coder/Fake_News_GossipCop"
dataset = load_dataset(dataset_name, split="train[:2000]")  # 只取前 2000 筆

# 轉換為 Pandas DataFrame
train_df = pd.DataFrame(dataset)

# 儲存為 Excel 檔案
output_file = "news_final.xlsx"
train_df.to_excel(output_file, index=False)

print(f"資料儲存為 {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


資料儲存為 news_final.xlsx


#### 將 news embedding 取出存為NumPy檔案

In [2]:
import pandas as pd
import numpy as np

input_file = "news_final.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['embeddings'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/news_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 儲存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 儲存為 Embeddings/news_embeddings.npy
(2000, 768)
[[-0.53123206 -0.37109652 -0.29041293 ...  0.10078647  0.43716702
  -0.23624143]
 [-0.12955573 -0.48822027 -0.12921876 ... -0.04940382  0.6457653
  -0.4960807 ]
 [-0.67361116 -0.43797556  0.05991269 ... -0.0837379   0.74227601
  -0.30755681]
 ...
 [-0.04545198 -0.41613433  0.40091962 ... -0.68856812  0.50836545
   0.63971943]
 [ 0.26808956 -0.76493889  0.20588247 ... -0.68922728  0.48159507
   0.22199568]
 [-0.27066821 -0.15012741  0.40688345 ... -0.37924266  0.50118625
   0.13602512]]


#### 整理news_final.xlsx檔案欄位

In [3]:
import pandas as pd

file = "news_final.xlsx"
df = pd.read_excel(file)

# 刪除 'embeddings' 欄位
if 'embeddings' in df.columns:
    df = df.drop(columns=['embeddings'])

# 新增 'news_id' 欄位，從 0 開始到 len(df)-1
if 'news_id' not in df.columns:
    df.insert(0, 'news_id', range(len(df)))

df.to_excel(file, index=False)

print(f"成功處理檔案")


成功處理檔案


#### Bertopic


In [4]:
import pandas as pd
from bertopic import BERTopic
import numpy as np
import umap
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity

file_path = "news_final.xlsx"
df = pd.read_excel(file_path)
texts = df['text'].tolist()

# 調整 UMAP 參數
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=5, metric='cosine')

# 調整 HDBSCAN 參數
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# BERTopic 模型
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, nr_topics=31, n_gram_range=(1, 2), verbose=True)
topics, probs = topic_model.fit_transform(texts)
embeddings = topic_model.embedding_model.embedding_model.encode(texts)
df['topic'] = topics
print(df['topic'])

# 對未分配主題的文檔進行後處理
for i, topic in enumerate(topics):
    if topic == -1:
        # 計算該文檔與所有0-29主題中心的相似度，並分配到最接近的主題
        similarities = cosine_similarity([embeddings[i]], topic_model.topic_embeddings_[0:30])[0]
        closest_topic = np.argmax(similarities)
        topics[i] = closest_topic
        df.at[i, 'topic'] = closest_topic  # 同步更新 DataFrame 的主題欄位

df['topic'] = topics
print(df['topic'])

results = []
for i, row in df.iterrows():
    result = {
        "news_id": row['news_id'],
        "text": row['text'],
        "topic": row['topic']
    }
    results.append(result)

for result in results:
    print(f"新聞 ID: {result['news_id']}")
    print(f"內容: {result['text'][:10]}...")
    print(f"主題: {result['topic']}\n")

# 獲取每個主題的主題詞和它們的權重
topic_info = topic_model.get_topic_info()
topic_details = {}
for topic in range(30):
    topic_details[topic] = topic_model.get_topic(topic)

for topic, words in topic_details.items():
    print(f"\n主題 {topic} 的主題詞和權重:")
    for word, weight in words:
        print(f"{word}: {weight:.4f}")
        # 只印到0~29主題
        # 只會有31-1個主題，-1會被指定給其他相近的主題


2025-03-06 18:03:32,668 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 63/63 [00:04<00:00, 12.96it/s]
2025-03-06 18:03:41,839 - BERTopic - Embedding - Completed ✓
2025-03-06 18:03:41,840 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-06 18:04:04,872 - BERTopic - Dimensionality - Completed ✓
2025-03-06 18:04:04,875 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-06 18:04:10,371 - BERTopic - Cluster - Completed ✓
2025-03-06 18:04:10,377 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-06 18:04:18,197 - BERTopic - Representation - Completed ✓
2025-03-06 18:04:18,212 - BERTopic - Topic reduction - Reducing number of topics
2025-03-06 18:04:25,601 - BERTopic - Topic reduction - Reduced number of topics from 233 to 31


0        7
1        0
2        2
3        1
4        9
        ..
1995    -1
1996    13
1997    -1
1998     3
1999     1
Name: topic, Length: 2000, dtype: int64
0        7
1        0
2        2
3        1
4        9
        ..
1995     1
1996    13
1997     0
1998     3
1999     1
Name: topic, Length: 2000, dtype: int64
新聞 ID: 0
內容: Cher Steal...
主題: 7

新聞 ID: 1
內容: Thomas Rhe...
主題: 0

新聞 ID: 2
內容: 2019 Sprin...
主題: 2

新聞 ID: 3
內容: Taylor Swi...
主題: 1

新聞 ID: 4
內容: 15 Times C...
主題: 9

新聞 ID: 5
內容: After ever...
主題: 0

新聞 ID: 6
內容: Hackers Th...
主題: 16

新聞 ID: 7
內容: Selena Gom...
主題: 1

新聞 ID: 8
內容: Watch This...
主題: 0

新聞 ID: 9
內容: The Platin...
主題: 0

新聞 ID: 10
內容: Khloé Kard...
主題: 0

新聞 ID: 11
內容: Billy Bush...
主題: 0

新聞 ID: 12
內容: Untangling...
主題: 4

新聞 ID: 13
內容: Carrie Und...
主題: 0

新聞 ID: 14
內容: Botched pa...
主題: 6

新聞 ID: 15
內容: Mandy Moor...
主題: 1

新聞 ID: 16
內容: Khloe Kard...
主題: 1

新聞 ID: 17
內容: Kourtney's...
主題: 0

新聞 ID: 18
內容: Corinne Ol...
主題: 1

新聞 ID: 19
內容: Devastat

### BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)


### 利用BERT 生成Topic的加權嵌入，存為NumPy檔案

In [6]:
import numpy as np

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    # 取最後一層的輸出 (batch_size, seq_len, hidden_size)
    last_hidden_state = outputs.last_hidden_state
    # 取 [CLS] 位置的詞嵌入
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()

    return word_embedding

topic_embeddings = []

for topic in range(30):
    # 獲取主題詞和它們的權重
    topic_words = topic_model.get_topic(topic)
    
    # 計算主題的加權嵌入
    topic_embedding = np.zeros(768)
    for word, weight in topic_words:
        word_embedding = get_word_embedding(word)
        weighted_embedding = word_embedding * weight  # 加權嵌入
        topic_embedding += weighted_embedding
    
    topic_embeddings.append(topic_embedding)

# 組合為矩陣
topic_embeddings_matrix = np.vstack(topic_embeddings)
np.save("Embeddings/topic_embeddings_30.npy", topic_embeddings_matrix)

print(f"topic_embeddings已保存，形狀為: {topic_embeddings_matrix.shape}")


topic_embeddings已保存，形狀為: (30, 768)


#### 存news2topic.xlsx檔案

In [7]:
df['topic'] = topics
news2topic_df = df[['news_id', 'topic']].rename(columns={'topic': 'topic_id'})

file_name = "news2topic_30.xlsx"
output_file = "graph/edges/" + file_name 
news2topic_df.to_excel(output_file, index=False)

print(f"結果已存為 {file_name}")

結果已存為 news2topic_30.xlsx
