# 假新聞分類與分析

In [None]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fetch the NLTK resources (stop words, WordNet, Punkt tokenizer data)
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Load the two raw datasets
fake_df = pd.read_csv('./raw_data/fake.csv')
true_df = pd.read_csv('./raw_data/true.csv')

# Label convention: 1 = fake news, 0 = true news
fake_df['label'] = 1
true_df['label'] = 0

# Keep the first 1000 rows of each source, then drop rows that have no article text
data = pd.concat([fake_df.head(1000), true_df.head(1000)], ignore_index=True)
data = data.dropna(subset=['text']).reset_index(drop=True)

# Show how many rows remain in each class
print(data['label'].value_counts())


## 需要做文本預處理嗎?

目的:
- 建立分類器來預測真假新聞 -> (TF-IDF + 分類模型需要乾淨的資料，有幫助)
- 分析NER 結果與語意分佈 -> (會破壞語意)
- 建立主題模型來探索語意主題（BERTopic）-> (會破壞語意)

In [None]:

# Classic preprocessing pipeline (lowercase -> strip non-letters -> tokenize ->
# drop stop words -> lemmatize), kept for reference only.
# NOTE(review): presumably left disabled because this cleaning would damage the
# raw text that the NER and BERTopic steps rely on -- confirm before re-enabling.

# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

# def preprocess(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z ]', '', text)
#     tokens = word_tokenize(text)
#     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     return tokens

# data['tokens'] = data['text'].astype(str).apply(preprocess)
# data['clean_text'] = data['tokens'].apply(lambda x: ' '.join(x))



## NER

In [None]:
from matplotlib.font_manager import fontManager
import matplotlib.pyplot as plt

# Register the bundled font file and make it the default sans-serif face
# (presumably so the Chinese plot titles and axis labels render correctly).
fontManager.addfont('./public/TaipeiSansTCBeta-Regular.ttf')
plt.rcParams.update({
    'font.sans-serif': ['Taipei Sans TC Beta'],
    'font.size': '16',
})

In [None]:
from sklearn.cluster import KMeans
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from transformers import BertTokenizerFast, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm

# Load the tokenizer and token-classification model from one checkpoint name
ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

# Wrap both in a Hugging Face NER pipeline
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Accumulates one dict per detected entity across all articles
ner_rows = []

# Split a long string into chunks small enough to feed to the NER pipeline
def split_text(text, chunk_size=512):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    Chunks are cut at a whitespace boundary whenever one exists inside the
    window, so a word (and therefore an entity name) is not sliced in half.
    A run of non-whitespace longer than *chunk_size* is still hard-cut.
    Whitespace is kept, so ``"".join(split_text(text)) == text``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty strings; an empty list for empty *text*.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start, length = 0, len(text)
    while start < length:
        end = min(start + chunk_size, length)
        if end < length:
            # Walk the cut point back until it no longer falls inside a word.
            cut = end
            while cut > start and not (text[cut - 1].isspace() or text[cut].isspace()):
                cut -= 1
            # cut == start means the window holds no whitespace: keep the hard cut.
            if cut > start:
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks

# Run NER over every article. `.items()` yields (row index, text) pairs and has
# no length, so tqdm is given the total explicitly to show a real progress bar.
for idx, text in tqdm(data['text'].astype(str).items(), total=len(data)):
    try:
        # Build all rows for this article first, so a failure part-way through
        # never leaves a half-processed article in ner_rows.
        article_rows = [
            {
                "index": idx,
                "entity": ent['entity_group'],  # e.g., PER, LOC
                "word": ent['word'],
                "score": ent['score'],
            }
            for chunk in split_text(text)
            for ent in ner_pipeline(chunk)
        ]
    except Exception as e:
        # Best effort: report the failing article and carry on with the rest.
        print(f"Error at idx {idx}: {e}")
        continue
    ner_rows.extend(article_rows)

# Build the DataFrame; explicit columns keep the schema even when no entity was found
ner_df = pd.DataFrame(ner_rows, columns=["index", "entity", "word", "score"])

In [None]:
# Preview the first 10 extracted entity rows
ner_df.head(10)


In [None]:
# Attach each article's label to its entity rows (ner_df['index'] is the row index of `data`)
merged_df = ner_df.merge(data[['label']], left_on='index', right_index=True)

# Count entities per (article, entity type), then pivot to one row per article
# with one column per entity type; missing combinations become 0.
per_article_counts = merged_df.groupby(['index', 'entity']).size()
entity_counts_all = per_article_counts.unstack(fill_value=0).reset_index()

# Bring the label back onto the per-article table
entity_counts_all = entity_counts_all.merge(data[['label']], left_on='index', right_index=True)

# Feature columns = every entity-type column (everything except the key and the label)
non_feature_cols = ('index', 'label')
feature_cols = [col for col in entity_counts_all.columns if col not in non_feature_cols]
kmeans_fit_pred_data = entity_counts_all[feature_cols]

# 做 KMeans 聚類
from sklearn.cluster import KMeans
import seaborn as sns

# Partition the articles into two clusters using the entity-count features
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_ids = kmeans.fit_predict(kmeans_fit_pred_data)
entity_counts_all['cluster'] = cluster_ids

from sklearn.decomposition import PCA

# Project the same features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(kmeans_fit_pred_data)
for component, column in enumerate(('PC1', 'PC2')):
    entity_counts_all[column] = X_pca[:, component]

# Scatter plot: colour encodes the KMeans cluster, marker style encodes the label
scatter_options = dict(
    x='PC1', y='PC2', hue='cluster', style='label',
    palette='Set2', s=100,
)
plt.figure(figsize=(8, 6))
sns.scatterplot(data=entity_counts_all, **scatter_options)

plt.title('NER 特徵的主成分分析 + KMeans 聚類')
plt.grid(True)
plt.tight_layout()
plt.show()

#### 嘗試用 NER 提取出的「人名」、「組織」、「地名」數量作為數值特徵，直接餵給 Logistic Regression 模型來預測這篇新聞是真/假（此處未使用 TF-IDF）

In [None]:
# Aggregate ner_df into a feature table: one row per article (keyed by the
# article's row index), one column per entity type, values = occurrence counts.
entity_counts = ner_df.groupby(['index', 'entity']).size().unstack(fill_value=0)

# Guarantee the feature columns exist even if an entity type was never detected,
# otherwise the column selection below raises KeyError.
ner_feature_cols = ['PER', 'ORG', 'LOC']
for col in ner_feature_cols:
    if col not in entity_counts.columns:
        entity_counts[col] = 0

# Join the counts back onto the articles. Articles with no entities get NaN in
# the count columns; fill only those columns so NaNs elsewhere are left untouched.
data_with_ner = data.join(entity_counts, how='left')
count_cols = list(entity_counts.columns)
data_with_ner[count_cols] = data_with_ner[count_cols].fillna(0).astype(int)

# Features: number of person, organisation and location mentions per article
X = data_with_ner[ner_feature_cols]
y = data_with_ner['label']

# Train / evaluate a logistic-regression baseline on an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print(classification_report(y_test, preds))

模型分辨力有點低?

## Text Clustering 與 真假新聞分辨

### Topic model: BERTopic 主題詞來源使用c-TF-IDF頻率導向，表現方式偏向詞頻高的詞

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Topic modelling runs over all articles, true and fake together
article_texts = data['text'].astype(str)
docs = article_texts.tolist()

# Embedding model name; per the original note it can be swapped for another
# model such as 'microsoft/Phi-4-mini-instruct'.
embedding_model = 'all-MiniLM-L6-v2'

# Clustering layer. Values are tunable and were chosen for the ~2000-document test run.
#   min_cluster_size: smallest group of points accepted as a cluster; smaller
#                     groups are treated as noise.
#   min_samples:      NOTE(review): in hdbscan this is understood to control how
#                     conservative the clustering is (larger -> more points
#                     labelled noise), not a minimum topic size -- confirm.
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=30)

# Topic words are taken from English unigrams and bigrams, minus stop words
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)
topics, probs = topic_model.fit_transform(docs)


In [None]:
# Topic ids to report; -1 is the "unassigned" topic and is skipped
valid_topics = [topic for topic in topic_model.get_topic_info().Topic if topic != -1]

# One row per (topic, word) pair with that word's c-TF-IDF score
all_topics = [
    {"Topic": topic_id, "Word": word, "C-TF-IDF": score}
    for topic_id in valid_topics
    for word, score in topic_model.get_topic(topic_id)
]

# Explicit columns keep the schema (and the sort below) valid even when the
# model found no topics at all.
tfidf_df = pd.DataFrame(all_topics, columns=["Topic", "Word", "C-TF-IDF"])
tfidf_df = tfidf_df.sort_values(by=["Topic", "C-TF-IDF"], ascending=[True, False])

# Show the first rows
tfidf_df.head(20)

In [None]:

# List the per-document BERTopic information for every article
topic_model.get_document_info(docs)

In [None]:
def visualize_fake_news_ratio_by_topic(model, docs, labels, title="主題的假新聞比例", min_ratio=0.1):
    """Draw a bar chart of the share of fake-news documents in each topic.

    Args:
        model: Fitted topic model exposing ``get_document_info(docs)`` (with a
            ``Topic`` column) and ``get_topic_info()`` (with ``Topic`` and
            ``Name`` columns).
        docs: Documents passed to ``model.get_document_info``.
        labels: One label per document, in the same order as ``docs``. The
            per-topic mean is plotted, so labels are expected to be 0/1 with
            1 meaning fake.
        title: Figure title.
        min_ratio: Topics whose fake-news ratio is below this value are left
            out of the chart.

    Returns:
        None. The chart is drawn with seaborn/matplotlib and shown via
        ``plt.show()``; nothing is drawn when no topic passes the filter.
    """
    doc_info = model.get_document_info(docs).copy()
    # Assign by position: a Series with a different index would otherwise be
    # index-aligned and silently produce NaN labels.
    doc_info['label'] = np.asarray(labels)

    # Mean label per topic = fake-news ratio; topic -1 (unassigned) is ignored
    assigned = doc_info[doc_info['Topic'] != -1]
    topic_stats = (
        assigned.groupby('Topic')['label']
        .mean()
        .rename('fake_news_ratio')
        .reset_index()
    )

    # Attach the topic names used as bar labels
    topic_names = model.get_topic_info()[['Topic', 'Name']]
    topic_stats_named = topic_stats.merge(topic_names, on='Topic')

    # Drop topics whose ratio is too low to be of interest
    topic_stats_named = topic_stats_named[topic_stats_named['fake_news_ratio'] >= min_ratio]
    if topic_stats_named.empty:
        print(f"沒有假新聞比例 >= {min_ratio} 的主題可供繪圖")
        return

    # Horizontal bars, highest ratio first
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(
        data=topic_stats_named.sort_values(by='fake_news_ratio', ascending=False),
        x='fake_news_ratio', y='Name', palette='Reds'
    )
    plt.title(title)
    plt.xlabel('假新聞比例 (label=1)')
    plt.ylabel('主題代表詞')
    plt.grid(True, axis='x')
    # Topic names are long, so shrink the y tick labels
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)
    plt.tight_layout()
    plt.show()

### representation topic model: 加上語意導向的KeyBERT, 表現方式是語意向量相似的詞 

In [None]:
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer

from sklearn.base import clone

# Load the sentence-transformer once and pre-compute the document embeddings,
# which are then handed to fit_transform below.
embedding_model_with_st = SentenceTransformer(embedding_model)
embeddings = embedding_model_with_st.encode(docs, show_progress_bar=True)

# Keyword representation model (non-generative)
keybert = KeyBERTInspired()

# Assemble the representation model.
# NOTE(review): this dict has no "Main" key; per BERTopic's multi-aspect docs it
# appears that KeyBERT is then added as an extra aspect while the main topic
# representation (and the `Name` column) stays c-TF-IDF -- confirm this is intended.
representation_model = {
    "KeyBERT": keybert
}

# Build the second BERTopic model. The clustering and vectoriser sub-models are
# cloned (same hyper-parameters, unfitted) so fitting this model does not
# overwrite the fitted sub-models still held by `topic_model`.
representation_topic_model = BERTopic(
    embedding_model=embedding_model_with_st,
    vectorizer_model=clone(vectorizer_model),
    hdbscan_model=clone(hdbscan_model),
    representation_model=representation_model,
    top_n_words=30,
    verbose=True
)

# Train the model on the precomputed embeddings
topics, probs = representation_topic_model.fit_transform(docs, embeddings)

# Inspect the resulting topic representations
representation_topic_model.get_topic_info()

In [None]:
# Visualise the topic distribution: circle size reflects topic size, and the
# distance between circles reflects how similar the topics are.
topic_model.visualize_topics()

In [None]:
# Same topic-distribution view for the model built with the KeyBERT representation
representation_topic_model.visualize_topics()

In [None]:
# Fake-news ratio per topic for the original model
visualize_fake_news_ratio_by_topic(topic_model, docs, data['label'], title="原始主題的假新聞比例")

# Fake-news ratio per topic for the model that uses the KeyBERT representation
visualize_fake_news_ratio_by_topic(representation_topic_model, docs, data['label'], title="KeyBERT 主題的假新聞比例")