# BERT (Encoder-only-model)

- Token classification
    - NER
- Sequence classification
    - Sentiment Classification
    - Relation Extraction (RE)
- Text Clustering (BERTopic)
    - Embedding model
    - Clustering model
    - 使用 Representation 方法微調主題表示


### 真假新聞資料集 ###

In [None]:
# 🔧 安裝核心資料處理與模型套件
!pip install pandas numpy scikit-learn matplotlib seaborn tqdm nltk vaderSentiment empath tabulate
# 🔎 安裝 BERT 相關（transformers, pipeline）
!pip install transformers
# 🤖 Install PyTorch (backend used to run the transformers models)
!pip install torch
# 🧠 Install sentence-transformers (SBERT sentence embeddings)
!pip install sentence-transformers
# 📊 安裝主題建模：BERTopic + HDBSCAN（支援 clustering）
!pip install bertopic hdbscan
# 🗂 字體設定用（如你加載了自訂字體）
!pip install fonttools
!pip install --upgrade nltk

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk, ssl, os
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

nltk.data.path.extend([
    '/usr/nltk_data',
    '/usr/local/nltk_data',
    '/usr/share/nltk_data',
    '/usr/local/share/nltk_data',
    '/root/nltk_data'
])

In [None]:
# Load the fake / real news datasets.
# Alternative Colab locations (full data / small sample):
#   /content/drive/MyDrive/Colab Notebooks/期末專案/raw_data/Fake.csv
#   /content/drive/MyDrive/Colab Notebooks/期末專案/test_data/Fake_Sample.csv
#   /content/drive/MyDrive/Colab Notebooks/期末專案/raw_data/True.csv
#   /content/drive/MyDrive/Colab Notebooks/期末專案/test_data/True_Sample.csv
fake_df = pd.read_csv('./raw_data/fake.csv')
true_df = pd.read_csv('./raw_data/true.csv')

# Merge title and body into a single `text` column.
# Missing values are replaced by '' first: astype(str) alone would turn NaN
# into the literal string "nan", which the empty-text filter below could
# never detect.
for _df in (fake_df, true_df):
    _df['text'] = (
        _df['title'].fillna('').astype(str)
        + " "
        + _df['text'].fillna('').astype(str)
    )

# Label convention used throughout the notebook: 1 = fake, 0 = real.
fake_df['label'] = 1
true_df['label'] = 0

# Keep only the first `dataNum` rows of each class (balanced subset).
dataNum = 50
news_data = pd.concat([fake_df.iloc[:dataNum], true_df.iloc[:dataNum]], ignore_index=True)

# Drop rows whose merged text is blank, then keep only the text / label columns.
news_data = news_data[news_data['text'].str.strip().ne('')].reset_index(drop=True)
news_data = news_data[['text', 'label']]

# Sanity check: class balance and one sample row.
print(news_data['label'].value_counts())
print(news_data.head(1))

### 推特真假推文資料集 ###

In [None]:
# Tweet dataset (TruthSeeker). Added as a second source to counter the
# over-fitting seen with TF-IDF / embeddings, which may be caused by the news
# texts having overly distinctive surface features.
# Alternative Colab locations (full data / small sample):
#   /content/drive/MyDrive/Colab Notebooks/期末專案/raw_data/Truth_Seeker_Model_Dataset.csv
#   /content/drive/MyDrive/Colab Notebooks/期末專案/test_data/Truth_Seeker_Model_Dataset_Sample.csv
tweet_df = pd.read_csv('./raw_data/Truth_Seeker_Model_Dataset_unindex.csv', encoding='ISO-8859-1')
tweet_data = tweet_df[['BinaryNumTarget', 'tweet', '5_label_majority_answer']].copy()

# Cleaning: drop incomplete rows and spreadsheet '#REF!' artefacts, and keep
# only tweets whose majority annotation agrees with the underlying statement.
tweet_data = tweet_data.dropna()
tweet_data = tweet_data[~tweet_data['tweet'].str.contains('#REF!', na=False)]
valid_labels = ['Agree', 'Mostly Agree']
tweet_data = tweet_data[tweet_data['5_label_majority_answer'].isin(valid_labels)]

# Drop the annotation column and rename to the notebook's text / label schema.
tweet_data = tweet_data.rename(columns={'BinaryNumTarget': 'label', 'tweet': 'text'})
tweet_data = tweet_data[['text', 'label']]

# Align label polarity with the news data (1 = fake, 0 = real).
# Per the TruthSeeker dataset description, BinaryNumTarget is 1 for a TRUE
# statement and 0 for a FALSE one, i.e. the opposite of the convention used
# for `news_data`; since only agreeing tweets are kept, the tweet inherits the
# statement's truth value and the flag must be flipped before merging.
# NOTE(review): confirm against the dataset documentation.
tweet_data = tweet_data.assign(label=1 - tweet_data['label'].astype(int))

# Sample up to `tweet_data_num` tweets per class. Groups are sampled one by
# one and concatenated instead of using groupby.apply, whose handling of the
# grouping column is deprecated in recent pandas releases.
tweet_data_num = 1000  # rows per class
tweet_data = pd.concat(
    [
        grp.sample(n=min(len(grp), tweet_data_num), random_state=42)
        for _, grp in tweet_data.groupby('label')
    ],
    ignore_index=True,
)


print(tweet_data['label'].value_counts())
print(tweet_data.head(10))

In [None]:
# 合併兩份不同來源資料集
data = pd.concat([news_data, tweet_data], ignore_index=True)

## 需要做文本預處理嗎?

目的:
- 建立分類器來預測真假新聞 -> (TF-IDF + 分類模型需要乾淨的資料，有幫助)
- 分析NER 結果與語意分佈 -> (會破壞語意)
- 建立主題模型來探索語意主題（BERTopic） -> (會破壞語意)

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
punct_pattern = re.compile(r"[^a-z ]")

def preprocess(text):
    """Turn raw text into a list of lemmatised content tokens.

    The text is lower-cased, every character other than a-z or a space is
    replaced by a space, and the result is tokenised. Stop words and
    one-character tokens are dropped; the remaining tokens are lemmatised.

    Args:
        text: The string to clean.

    Returns:
        A list of lemmatised tokens, in their original order.
    """
    normalized = punct_pattern.sub(" ", text.lower())
    kept = []
    # preserve_line=True skips sentence splitting, which avoids the need for
    # the punkt_tab resource.
    for token in word_tokenize(normalized, preserve_line=True):
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

data['tokens'] = data['text'].astype(str).apply(preprocess)
data['clean_text'] = data['tokens'].apply(lambda x: ' '.join(x))

## 06/10 嘗試方向 ##

1. 以真假新聞資料集的 8 成作為訓練集
2. 以真假新聞資料集其餘 2 成作為測試集 1，並以推特真假推文作為測試集 2

## NER 預測新聞真假

In [None]:
from matplotlib.font_manager import fontManager
import matplotlib.pyplot as plt

fontManager.addfont('./public/TaipeiSansTCBeta-Regular.ttf')
plt.rcParams['font.sans-serif'] = ['Taipei Sans TC Beta']
plt.rcParams['font.size'] = '16'

In [None]:
from sklearn.cluster import KMeans
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from transformers import BertTokenizerFast, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm

# 載入模型與 tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# 建立 NER 結果列表
ner_rows = []

# 分切字串
def split_text(text, chunk_size=512):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    Chunks are cut at whitespace whenever possible so that a word (and hence
    a named entity) is not torn in half at a chunk boundary. A single run of
    non-whitespace characters longer than *chunk_size* is still hard-split.
    Concatenating the returned chunks reproduces *text* exactly.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).

    Returns:
        A list of chunk strings; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        # The window would end in the middle of a word: walk back to the
        # character just after the last whitespace inside the window.
        if end < total and not text[end].isspace():
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:  # otherwise the word fills the window: hard split
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks

# Run NER on every document. `total` lets tqdm show real progress, since the
# `.items()` iterator has no length of its own.
for idx, text in tqdm(data['text'].astype(str).items(), total=len(data)):
    try:
        chunks = split_text(text)
        all_ents = []
        for chunk in chunks:
            all_ents.extend(ner_pipeline(chunk))  # NER on each chunk
        # Rows are appended only after every chunk of the document succeeded,
        # so a failing document contributes no partial results.
        for ent in all_ents:
            ner_rows.append({
                "index": idx,
                "entity": ent['entity_group'],  # e.g., PER, LOC
                "word": ent['word'],
                "score": ent['score']
            })
    except Exception as e:
        # Best effort: report the failing document and keep going.
        print(f"Error at idx {idx}: {e}")

# Build the result frame. Explicit columns keep the schema stable even when
# no entity was found at all, so later groupby(['index', 'entity']) calls
# still find their columns.
ner_df = pd.DataFrame(ner_rows, columns=["index", "entity", "word", "score"])

In [None]:
ner_df.head(10)

In [None]:
# 整合 label
merged_df = ner_df.merge(data[['label']], left_on='index', right_index=True)

# 聚合所有 entity 類型的出現次數
entity_counts_all = (
    merged_df.groupby(['index', 'entity'])
    .size()
    .unstack(fill_value=0)  # 得到每篇文章各類實體數
    .reset_index()
)

# 合併 label
entity_counts_all = entity_counts_all.merge(data[['label']], left_on='index', right_index=True)

# 建模欄位選擇：所有實體類別欄位（排除 index, label）
feature_cols = [col for col in entity_counts_all.columns if col not in ['index', 'label']]
kmeans_fit_pred_data = entity_counts_all[feature_cols]

# 做 KMeans 聚類
from sklearn.cluster import KMeans
import seaborn as sns

kmeans = KMeans(n_clusters=2, random_state=42)
entity_counts_all['cluster'] = kmeans.fit_predict(kmeans_fit_pred_data)

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(kmeans_fit_pred_data)
entity_counts_all['PC1'] = X_pca[:, 0]
entity_counts_all['PC2'] = X_pca[:, 1]
# 視覺化聚類結果
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=entity_counts_all,
    x='PC1', y='PC2', hue='cluster', style='label',
    palette='Set2', s=100
)

plt.title('NER 特徵的主成分分析 + KMeans 聚類')
plt.grid(True)
plt.tight_layout()
plt.show()

使用NER特徵進行KMeans聚類，無監督學習自動分成兩群，上圖為模型前的探索性資料分析(EDA)結果，觀察結果:KMeans聚類有部分成功聚出假新聞群，橘色cluster1幾乎都是叉叉為假新聞群，綠色cluster0包含較多真新聞與部分假新聞。結論：分群重疊明顯，整體分群效果不算非常好，但初步判斷NER有區別能力，需結合更多分類方式進行多模態聚類模型評估!

#### 嘗試用 NER 提取出的'人名'、'組織'、'地名數量'作為詞彙特徵，再餵給 TF-IDF + 模型來預測這篇新聞是真/假

Part1. 使用命名實體辨識(NER)的結果當作特徵，來分類真假新聞(label)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.base import clone
import numpy as np
import pandas as pd
from tabulate import tabulate

# === Build features (NER entity counts per document) ===
ner_feature_cols = ['PER', 'ORG', 'LOC']

# One row per document index, one column per entity type, values = counts.
entity_counts = ner_df.groupby(['index', 'entity']).size().unstack(fill_value=0)

# Guarantee the three feature columns exist even if an entity type never
# occurred in the corpus (otherwise the column selection below raises KeyError).
for col in ner_feature_cols:
    if col not in entity_counts.columns:
        entity_counts[col] = 0

# Left join keeps documents without any entity; only the count columns are
# zero-filled so unrelated columns of `data` are left untouched.
data_with_ner = data.join(entity_counts, how='left')
count_cols = list(entity_counts.columns)
data_with_ner[count_cols] = data_with_ner[count_cols].fillna(0)

X = data_with_ner[ner_feature_cols]
y = data_with_ner['label']

# === 分類器列表 ===
classifiers = {
    "LogReg": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": svm.SVC(probability=True),
    "RandomForest": RandomForestClassifier(random_state=42)
}

# === K-fold 設定 ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 儲存平均 f1-score 結果
results = []

# === 執行交叉驗證並印出每個模型報告 ===
for name, model in classifiers.items():
    print(f"\n=== {name} 分類結果（5-fold） ===")
    y_true_all, y_pred_all = [], []
    fold_f1_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        clf = clone(model)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        y_true_all.extend(y_test)
        y_pred_all.extend(y_pred)

        fold_f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    avg_f1 = np.mean(fold_f1_scores)
    print(classification_report(
        y_true_all, y_pred_all,
        target_names=["真新聞", "假新聞"],
        digits=2
    ))

    results.append({
        "classifier": name,
        "f1_weighted": avg_f1
    })

# === 比較結果表格 ===
result_df = pd.DataFrame(results).sort_values(by="f1_weighted", ascending=False).reset_index(drop=True)
print("🏁 各模型比較：")
print(tabulate(result_df, headers="keys", tablefmt="fancy_grid"))

# === 找出最佳模型 ===
best = result_df.iloc[0]
print(f"\n🏆 最佳分類器為：{best['classifier']}，weighted F1 = {best['f1_weighted']:.4f}")


### NER提取特徵預測結果尚可
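小結:

| 預測真新聞 | LR | RF |
|-----------|------|------|
| precision | 0.71 | 0.75 |
| recall    | 0.62 | 0.74 |
| f1        | 0.66 | 0.74 |

| 預測假新聞 | LR | RF |
|-----------|------|------|
| precision | 0.66 | 0.74 |
| recall    | 0.74 | 0.75 |
| f1        | 0.70 | 0.74 |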

NER 特徵對真假新聞辨識有一定程度作用，且用RandomForest的結果較優

BY Chuya
結論：特徵太少，只有三維(PER,ORG,LOC)，只提供「人名/地名/組織」的數量，資訊量太低。很多新聞或推文不一定包含這三類實體，造成大量為 0。

👉 資料區辨性低，模型難以學習。

Part2.使用情緒分析辨識真假新聞


1. distilbert：一款基於SST-2微調的輕量級BERT模型，常用於英文產品評論或客服對話中的情緒正負分類任務。
2. roberta-twitter：專為Twitter資料訓練的RoBERTa模型，廣泛應用於社群貼文的輿情分析與社會事件情緒偵測。
3. bertweet：以海量推文語料訓練的BERT模型，特別適用於社群媒體上的即時情緒追蹤與用戶反應分析。
4. nlptown：一個支援多語言、可輸出1～5星等級的情緒強度模型，常用於多語評論評等、顧客滿意度分析等任務。

In [None]:
# ── Part 2：情緒特徵模型比較 ───────────────────────────
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import pandas as pd, numpy as np
from tabulate import tabulate

sentiment_models = {
    "distilbert"      : "distilbert-base-uncased-finetuned-sst-2-english",
    "roberta-twitter" : "cardiffnlp/twitter-roberta-base-sentiment",
    "bertweet"        : "finiteautomata/bertweet-base-sentiment-analysis",
    "nlptown"         : "nlptown/bert-base-multilingual-uncased-sentiment"
}

classifiers = {
    "LogReg"       : LogisticRegression(max_iter=1000),
    "DecisionTree" : DecisionTreeClassifier(),
    "LinearSVC"    : LinearSVC(),
    "RandomForest" : RandomForestClassifier(random_state=42)
}

# Labels that carry polarity without containing "POS"/"NEG" in their name:
#   - cardiffnlp/twitter-roberta-base-sentiment emits LABEL_0 / LABEL_1 / LABEL_2
#     (negative / neutral / positive);
#   - nlptown/bert-base-multilingual-uncased-sentiment emits "1 star" .. "5 stars".
# NOTE(review): the LABEL_n mapping is specific to the models compared in this
# notebook; verify it before adding another model that uses generic label ids.
_EXTRA_POSITIVE_LABELS = frozenset({"LABEL_2", "4 STARS", "5 STARS"})
_EXTRA_NEGATIVE_LABELS = frozenset({"LABEL_0", "1 STAR", "2 STARS"})


def split_chunks(txt, size=512):
    """Slice *txt* into consecutive pieces of at most *size* characters."""
    return [txt[i:i+size] for i in range(0, len(txt), size)]


def _label_polarity(label):
    """Map a pipeline label to +1 (positive), -1 (negative) or 0 (neutral/unknown)."""
    name = str(label).strip().upper()
    if 'POS' in name or name in _EXTRA_POSITIVE_LABELS:
        return 1
    if 'NEG' in name or name in _EXTRA_NEGATIVE_LABELS:
        return -1
    return 0


def senti_score(txt, pipe):
    """Return a signed sentiment score for *txt*.

    The text is chunked, every chunk is classified by *pipe*, and the chunk
    scores are grouped by polarity. If the positive chunks outweigh the
    negative ones (by summed score) the mean positive score is returned;
    otherwise the negated mean negative score; 0.0 when no chunk carries a
    polarity.

    Args:
        txt: Text to score.
        pipe: Callable taking a list of strings and returning one
            ``{'label': ..., 'score': ...}`` dict per string.

    Returns:
        A float: positive for positive sentiment, negative for negative
        sentiment, 0.0 for neutral/empty text or when the pipeline fails.
    """
    import warnings

    chunks = split_chunks(txt)
    if not chunks:
        return 0.0

    try:
        res = pipe(chunks)
    except Exception as exc:
        # Best effort: keep the run alive, but do not hide the failure.
        warnings.warn(f"senti_score: pipeline failed, scoring 0.0 ({exc!r})",
                      RuntimeWarning)
        return 0.0

    pos = [r['score'] for r in res if _label_polarity(r['label']) > 0]
    neg = [r['score'] for r in res if _label_polarity(r['label']) < 0]

    if pos and sum(pos) > sum(neg):
        return sum(pos) / len(pos)
    if neg:
        return -(sum(neg) / len(neg))
    return 0.0

# Compare sentiment models: for each model, score every document, then train
# each classifier on that single sentiment feature and record its weighted F1.
all_results, model_mean = [], []
tqdm.pandas()  # registering progress_apply once is enough

for m_key, m_name in sentiment_models.items():
    print(f"\n🔍 Sentiment Model: {m_key}")
    pipe = pipeline("sentiment-analysis", model=m_name, truncation=True)
    data_tmp = data.copy()
    data_tmp['sentiment_score'] = data_tmp['text'].astype(str).progress_apply(
        lambda t: senti_score(t, pipe)
    )

    # A constant feature means the model's labels were not understood (or
    # every call failed); the F1 values below would then be meaningless.
    if data_tmp['sentiment_score'].nunique() <= 1:
        print(f"⚠️ {m_key}: sentiment_score is constant - check label mapping in senti_score")

    X_senti = data_tmp[['sentiment_score']]
    y       = data_tmp['label']

    # The split depends only on the data and the fixed seed, so it is
    # computed once per sentiment model rather than once per classifier.
    X_tr, X_te, y_tr, y_te = train_test_split(X_senti, y, test_size=0.2, random_state=42)

    f1_collect = []
    for clf_key, clf in classifiers.items():
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        f1_val = f1_score(y_te, y_pred, average='weighted')
        f1_collect.append(f1_val)
        all_results.append({"model": m_key, "classifier": clf_key, "f1": f1_val})

    model_mean.append({"model": m_key, "mean_f1": np.mean(f1_collect)})

# ➜ 找平均 F1 最高的情緒模型
model_df = pd.DataFrame(model_mean).sort_values('mean_f1', ascending=False)
best_senti = model_df.iloc[0]['model']
print("\n📊  情緒模型平均 F1：")
print(tabulate(model_df, headers="keys", tablefmt="fancy_grid"))
print(f"\n🏆 最佳情緒模型：{best_senti}")


In [None]:
"""# 載入情緒分析模型(微調後的BERT)
model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# 因為這個語言也是BERT = 效果仰賴'自然語言語序與上下文' = 使用data['text']即可

# 切割文字 每段不超過 512 字
def split_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# 整合段落的情緒分數
def analyze_long_text(text):
    try:
        chunks = split_text(text)
        results = model(chunks)

        # 統計情緒
        pos_scores = [r['score'] for r in results if r['label'] == 'POSITIVE']
        neg_scores = [r['score'] for r in results if r['label'] == 'NEGATIVE']

        # 平均分數
        avg_pos = sum(pos_scores) / len(pos_scores) if pos_scores else 0
        avg_neg = sum(neg_scores) / len(neg_scores) if neg_scores else 0

        # 決定總體情續
        if avg_pos > avg_neg:
            return pd.Series(['POSITIVE', avg_pos])
        elif avg_neg > avg_pos:
            return pd.Series(['NEGATIVE', avg_neg])
        else:
            return pd.Series(['NEUTRAL', 0.5])
    except Exception:
        return pd.Series(['ERROR', 0.0])

# 執行分析
tqdm.pandas()
data[['sentiment_label', 'sentiment_score']] = data['text'].progress_apply(analyze_long_text)

data.head(10)

sentiment_pred_X = data[['sentiment_score']]
sentiment_pred_y = data['label']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(sentiment_pred_X, sentiment_pred_y, test_size=0.2, random_state=42)

# 建立模型並訓練
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# 預測與評估
y_pred = lr_clf.predict(X_test)
print(classification_report(y_test, y_pred))

# 預測與評估
rf_y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, rf_y_pred))
"""

### 小結: 情緒預測真假新聞表現不好
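小結:

| 預測真新聞 | LR | RF |
|-----------|------|------|
| precision | 0.60 | 0.52 |
| recall    | 0.37 | 0.50 |
| f1        | 0.46 | 0.51 |

| 預測假新聞 | LR | RF |
|-----------|------|------|
| precision | 0.54 | 0.52 |
| recall    | 0.75 | 0.54 |
| f1        | 0.63 | 0.53 |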

整體分類效果偏弱，跟丟銅板差不多
模型偏好預測為假新聞（recall 高），但也多誤判

採用HuggingFace的distilbert-base-uncased-finetuned-sst-2-english模型，這是一個對英文-電影評論做情緒分類(positive/negative)的預訓練模型。

Part3.　嘗試整合兩者(NER+情緒)

In [None]:
# ── Part 3：NER + Best Sentiment 特徵訓練 ──────────────
from transformers import pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.base import clone
import matplotlib.pyplot as plt, seaborn as sns

# 1️⃣ 用最佳情緒模型重新計算 sentiment_score
best_pipe = pipeline("sentiment-analysis", model=sentiment_models[best_senti], truncation=True)
tqdm.pandas()
data['sentiment_score'] = data['text'].astype(str).progress_apply(lambda t: senti_score(t, best_pipe))

# 2️⃣ 合併 NER (PER/ORG/LOC) + sentiment_score
feature_df = data_with_ner[['PER', 'ORG', 'LOC']].join(data['sentiment_score'])
X_full = feature_df
y_full = data['label']

# 3️⃣ 四個分類器
final_clfs = {
    "LogReg"       : LogisticRegression(max_iter=1000),
    "DecisionTree" : DecisionTreeClassifier(),
    "SVM"          : svm.SVC(probability=True),
    "RandomForest" : RandomForestClassifier(random_state=42)
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
final_res = []

for clf_key, base_clf in final_clfs.items():
    y_all_t, y_all_p, f1_list = [], [], []

    for tr_idx, te_idx in kf.split(X_full, y_full):
        X_tr, X_te = X_full.iloc[tr_idx], X_full.iloc[te_idx]
        y_tr, y_te = y_full.iloc[tr_idx], y_full.iloc[te_idx]

        clf = clone(base_clf)
        clf.fit(X_tr, y_tr)
        y_pr = clf.predict(X_te)

        y_all_t.extend(y_te); y_all_p.extend(y_pr)
        f1_list.append(f1_score(y_te, y_pr, average='weighted'))

    print(f"\n=== {clf_key} 整體報告 ===")
    print(classification_report(y_all_t, y_all_p, target_names=["真新聞","假新聞"], digits=2))

    final_res.append({"classifier": clf_key, "f1_weighted": np.mean(f1_list)})

# 4️⃣ 比較表 & 最佳分類器
final_df = pd.DataFrame(final_res).sort_values('f1_weighted', ascending=False).reset_index(drop=True)
print("\n📊  最終 4 分類器比較：")
print(tabulate(final_df, headers="keys", tablefmt="fancy_grid"))

best_cls = final_df.iloc[0]
print(f"\n🏆 最終最佳組合：情緒模型={best_senti} + 分類器={best_cls['classifier']}，weighted F1={best_cls['f1_weighted']:.4f}")

# 5️⃣ (可選) 視覺化
plt.figure(figsize=(8,5))
sns.barplot(x='classifier', y='f1_weighted', data=final_df, palette='Set2')
plt.title(f'NER + Sentiment({best_senti})  4 分類器比較')
plt.ylabel('Weighted F1')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()


In [None]:
"""from sklearn.ensemble import RandomForestClassifier

combined_X = pd.concat([data_with_ner[['PER', 'ORG', 'LOC']], sentiment_pred_X], axis=1)
combined_y = data['label']
# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(combined_X, combined_y, test_size=0.2, random_state=42)

# Logistic Regression
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)
lr_preds = clf_lr.predict(X_test)

# Random Forest
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train, y_train)
rf_preds = clf_rf.predict(X_test)

# 評估結果
print("=== Logistic Regression 分類結果 ===")
print(classification_report(y_test, lr_preds))

print("=== Random Forest 分類結果 ===")
print(classification_report(y_test, rf_preds)) """

Part4.NER+情緒+TFIDF

In [None]:
# ── Part 4：TF-IDF + NER + Best Sentiment 模型 ──────────────────────
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from tabulate import tabulate
from transformers import pipeline
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm

# ✅ 使用 Part 2 最佳情緒模型重新生成 sentiment_score
senti_model_name = sentiment_models[best_senti]
sentiment_pipe = pipeline("sentiment-analysis", model=senti_model_name, truncation=True)
tqdm.pandas()
data['sentiment_score'] = data['text'].astype(str).progress_apply(lambda t: senti_score(t, sentiment_pipe))

# 1️⃣ 建立 TF-IDF 特徵
tfidf_vec = TfidfVectorizer(max_features=200, ngram_range=(1, 2))
tfidf_mat = tfidf_vec.fit_transform(data['clean_text'].fillna(''))
tfidf_df = pd.DataFrame(tfidf_mat.toarray(),
                        columns=tfidf_vec.get_feature_names_out(),
                        index=data.index)

# 2️⃣ Assemble the feature matrix: NER counts (PER/ORG/LOC) + the latest
#    sentiment score + the TF-IDF columns, all aligned on data's index.
# NOTE(review): this rebinds `ner_df`, which earlier in the notebook holds the
# raw NER results (columns 'index' / 'entity' / 'word' / 'score'). Re-running
# the earlier cell that does ner_df.groupby(['index', 'entity']) after this
# one would fail - consider using a distinct name here.
ner_df     = data_with_ner[['PER', 'ORG', 'LOC']].copy()
senti_df   = data[['sentiment_score']]
X_features = pd.concat([ner_df, senti_df, tfidf_df], axis=1)
y_target   = data['label']

# 3️⃣ 定義分類器
classifiers = {
    "LogReg"       : LogisticRegression(max_iter=1000),
    "DecisionTree" : DecisionTreeClassifier(),
    "SVM"          : svm.SVC(probability=True),
    "RandomForest" : RandomForestClassifier(random_state=42)
}

# 4️⃣ Cross-Validation 訓練
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for clf_name, clf_model in classifiers.items():
    print(f"\n=== {clf_name} 分類結果（5-fold） ===")
    y_all_true, y_all_pred, f1s = [], [], []

    for train_idx, test_idx in kf.split(X_features, y_target):
        X_train, X_test = X_features.iloc[train_idx], X_features.iloc[test_idx]
        y_train, y_test = y_target.iloc[train_idx], y_target.iloc[test_idx]

        clf = clone(clf_model)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        y_all_true.extend(y_test)
        y_all_pred.extend(y_pred)
        f1s.append(f1_score(y_test, y_pred, average='weighted'))

    print(classification_report(y_all_true, y_all_pred,
                                target_names=["真新聞", "假新聞"], digits=2))

    results.append({
        "classifier": clf_name,
        "f1_weighted": np.mean(f1s)
    })

# 5️⃣ 輸出總結
result_df = pd.DataFrame(results).sort_values(by='f1_weighted', ascending=False).reset_index(drop=True)

print("\n📊 TF-IDF + NER + Sentiment 分類器比較：")
print(tabulate(result_df, headers="keys", tablefmt="fancy_grid"))

best = result_df.iloc[0]
print(f"\n🏆 最佳分類器為：{best['classifier']}，weighted F1 = {best['f1_weighted']:.4f}")

# ➕ 可選視覺化
plt.figure(figsize=(8,5))
sns.barplot(x='classifier', y='f1_weighted', data=result_df, palette='Set3')
plt.title(f"TF-IDF + NER + Sentiment({best_senti}) 分類器比較")
plt.ylabel('Weighted F1')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 依 label 分群
true_texts = data[data['label'] == 0]['clean_text'].fillna('')
fake_texts = data[data['label'] == 1]['clean_text'].fillna('')

# 建立 TF-IDF 向量器（可使用相同設定以便比較）
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))

# 擬合於真新聞
true_tfidf_matrix = tfidf.fit_transform(true_texts)
true_feature_names = tfidf.get_feature_names_out()
true_scores = true_tfidf_matrix.mean(axis=0).A1
true_top30 = sorted(zip(true_feature_names, true_scores), key=lambda x: x[1], reverse=True)[:30]

# 擬合於假新聞（需重新建一個 vectorizer 才不會共用字典）
tfidf_fake = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
fake_tfidf_matrix = tfidf_fake.fit_transform(fake_texts)
fake_feature_names = tfidf_fake.get_feature_names_out()
fake_scores = fake_tfidf_matrix.mean(axis=0).A1
fake_top30 = sorted(zip(fake_feature_names, fake_scores), key=lambda x: x[1], reverse=True)[:30]

# 將兩個 DataFrame 加上 index 並 reset
true_df = pd.DataFrame(true_top30, columns=["真新聞詞", "真_TF-IDF"]).reset_index(drop=True)
fake_df = pd.DataFrame(fake_top30, columns=["假新聞詞", "假_TF-IDF"]).reset_index(drop=True)

# 合併為一個表格（左右比對）
compare_df = pd.concat([true_df, fake_df], axis=1)

# 顯示結果
from IPython.display import display
print("📊 真新聞 vs 假新聞 前 30 常見關鍵詞（TF-IDF 分數）")
display(compare_df)

In [None]:
# """# 嘗試增加TF-IDF欄位(clean_text)

# # 建立 TF-IDF 向量器（可自訂 ngram 範圍與維度限制）
# tfidf = TfidfVectorizer(max_features=200, ngram_range=(1, 2))
# tfidf_matrix = tfidf.fit_transform(data['clean_text'].fillna(''))

# # 轉為 DataFrame
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out(), index=data.index)

# # 新增tf-idf欄位
# ner_sentiment_df = pd.concat([ner_pred_X, sentiment_pred_X], axis=1)
# combined_X_full = pd.concat([ner_sentiment_df, tfidf_df], axis=1)

# # 分割資料
# X_train, X_test, y_train, y_test = train_test_split(combined_X_full, combined_y, test_size=0.2, random_state=42)

# # Logistic
# clf_lr = LogisticRegression()
# clf_lr.fit(X_train, y_train)
# lr_preds = clf_lr.predict(X_test)

# # Random Forest
# clf_rf = RandomForestClassifier(random_state=42)
# clf_rf.fit(X_train, y_train)
# rf_preds = clf_rf.predict(X_test)

# # 評估
# print("=== Logistic Regression(NER + Sentiment + TF-IDF) ===")
# print(classification_report(y_test, lr_preds))

# print("=== Random Forest(NER + Sentiment + TF-IDF) ===")
# print(classification_report(y_test, rf_preds))

Part5. TF-IDF+NER+Sentiment特徵上，再加入VADER(+Empath)與文字表達Style特徵

In [None]:
!pip install vaderSentiment empath tabulate tqdm seaborn matplotlib scikit-learn

In [None]:
# ── Part 5：TF-IDF+NER+Sentiment特徵上，再加入VADER(+Empath)與文字表達Style特徵 ──────────────────────
# ------------------------------------------------------------------
# 1. 透過卡方檢定 (chi-square) 找出「假新聞 > 真新聞」最具區辨力的 n-gram
# ------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import numpy as np, pandas as pd, re
from tqdm import tqdm

# (1) Bag-of-words model: unigrams + bigrams, English stop words removed,
#     min_df=5 to drop very rare terms.
cv = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=5)
X_bow = cv.fit_transform(data['clean_text'])
feature_names = np.array(cv.get_feature_names_out())

# (2) Chi-square test of each term against the label (label=1 means fake news).
# NOTE(review): the scores are computed from the labels of ALL rows in `data`.
# The click-bait feature derived from this vocabulary is later evaluated with
# cross-validation on the same rows, so those scores may be optimistic
# (label leakage) - consider selecting the terms inside each training fold.
chi_scores, _ = chi2(X_bow, data['label'])

# (3) Keep only terms that occur more often in fake than in real news,
#     then take the top 30 by chi-square score.
fake_mask = (X_bow[data['label'].values==1].sum(axis=0) >
             X_bow[data['label'].values==0].sum(axis=0)).A1
candidate_words = feature_names[fake_mask]
candidate_scores= chi_scores[fake_mask]

top_k = 30
# argsort is ascending; reverse it and keep the first top_k positions.
top_idx = np.argsort(candidate_scores)[::-1][:top_k]
auto_clickbait = set(candidate_words[top_idx])

print(f"🔍 自動偵測到 {len(auto_clickbait)} 個假新聞高相關詞（前 {top_k}）：")
print(sorted(auto_clickbait))

# ------------------------------------------------------------------
# 2. 建立 VADER / Empath / Style 特徵（含「動態 click-bait」）
# ------------------------------------------------------------------
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
try:
    from empath import Empath
    lexicon = Empath(); use_empath = True
except ModuleNotFoundError:
    use_empath = False

tqdm.pandas()

# 2-1 VADER
vader = SentimentIntensityAnalyzer()
vader_df = (data['text'].progress_apply(vader.polarity_scores)
                       .apply(pd.Series).add_prefix('vader_'))
print("\n🧠 VADER 情緒推動特徵（前幾筆）：")
print(vader_df.head())

# 2-2 Empath (a handful of commonly used emotional / social categories)
if use_empath:
    empath_keep = ['positive_emotion','negative_emotion','anger','sadness',
                   'fear','politics','money','fun','love']
    # With normalize=True, Empath's analyze may return None for text that
    # contains no tokens; fall back to an empty dict so the row becomes zeros.
    empath_raw = data['text'].progress_apply(
        lambda t: lexicon.analyze(t, normalize=True) or {}
    )
    # Build on data's index so the later column-wise concat stays aligned, and
    # reindex so every expected category column exists.
    empath_df = (pd.DataFrame(empath_raw.tolist(), index=data.index)
                   .reindex(columns=empath_keep)
                   .fillna(0.0)
                   .add_prefix('empath_'))

    print("\n🎯 NRC-Empath 情緒向量（前幾筆）：")
    print(empath_df.head())
else:
    empath_df = pd.DataFrame(index=data.index)   # empty frame, same index
    print("\n⚠️ 未啟用 Empath（需 pip install empath）")

# 2-3 Style features（大寫比例 / ! 密度 / 自動 click-bait 命中率）
def style_feats(txt:str):
    """Compute simple writing-style features for one document.

    Args:
        txt: Raw (un-normalised) document text.

    Returns:
        A pandas Series with:
        - ``caps_ratio``: share of characters that are upper-case;
        - ``excl_ratio``: number of '!' per character;
        - ``clickbait_ratio``: share of the terms in ``auto_clickbait`` that
          occur (as substrings) in the lower-cased text; 0.0 when that
          vocabulary is empty.
    """
    length = max(len(txt), 1)  # avoid dividing by zero on empty text
    txt_low = txt.lower()
    hit_cnt = sum(1 for w in auto_clickbait if w in txt_low)
    # The chi-square selection can yield no terms at all (e.g. tiny corpus);
    # guard the division instead of raising ZeroDivisionError.
    n_terms = len(auto_clickbait)
    return pd.Series({
        'caps_ratio'      : sum(c.isupper() for c in txt) / length,
        'excl_ratio'      : txt.count('!') / length,
        'clickbait_ratio' : hit_cnt / n_terms if n_terms else 0.0
    })

style_df = data['text'].progress_apply(style_feats)

print("\n📝 文字表達方式特徵（大寫比例 / 感嘆號密度 / Click-bait 命中率）")
print(style_df.describe())

print("\n📊 假新聞與真新聞的 Style 特徵平均比較：")
print(pd.concat([style_df, data['label']], axis=1)
        .groupby('label').mean()
        .rename(index={0: "真新聞", 1: "假新聞"}))

# ------------------------------------------------------------------
# 3. 把新特徵接到既有 X_features（TF-IDF + NER + Best-Sentiment）
# ------------------------------------------------------------------
X_final = pd.concat([X_features, vader_df, empath_df, style_df], axis=1)
y_final = data['label']
print("🔢 新增後特徵維度 :", X_final.shape)

# ------------------------------------------------------------------
# 4. 四個分類器 × 5-fold 交叉驗證
# ------------------------------------------------------------------
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from tabulate import tabulate
import seaborn as sns, matplotlib.pyplot as plt

clfs = {
    "LogReg"      : LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM"         : SVC(probability=True),
    "RandomForest": RandomForestClassifier(random_state=42)
}

kf = StratifiedKFold(5, shuffle=True, random_state=42)
rows = []

for name, base in clfs.items():
    y_t, y_p, f1s = [], [], []
    for tr, te in kf.split(X_final, y_final):
        mdl = clone(base).fit(X_final.iloc[tr], y_final.iloc[tr])
        pred = mdl.predict(X_final.iloc[te])
        y_t.extend(y_final.iloc[te]); y_p.extend(pred)
        f1s.append(f1_score(y_final.iloc[te], pred, average='weighted'))
    print(f"\n=== {name} 報告 (加 VADER / Style) ===")
    print(classification_report(y_t, y_p, target_names=['真新聞','假新聞'], digits=2))
    rows.append({"classifier": name, "f1_weighted": np.mean(f1s)})

res_df = pd.DataFrame(rows).sort_values('f1_weighted', ascending=False)
print("\n📊  加 VADER / Style / 動態 Click-bait 後分類器比較")
print(tabulate(res_df, headers="keys", tablefmt="fancy_grid"))

best_cls = res_df.iloc[0]
print(f"\n🏆  新最佳模型：{best_cls['classifier']}  (Weighted F1 = {best_cls['f1_weighted']:.4f})")

# （可選）長條圖
plt.figure(figsize=(8,4))
sns.barplot(x='classifier', y='f1_weighted', data=res_df, palette='Set2')
plt.title('加入 VADER / Style 特徵後的分類器比較')
plt.ylabel('Weighted F1')
plt.ylim(0,1); plt.tight_layout(); plt.show()


Part6. NER+Sentiment特徵+SBERT向量化

In [None]:
# ────── Part 6：NER+Sentiment特徵+SBERT向量化 ──────────────────────
# ▍1. SBERT 向量化
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')  # 可改其他如 'paraphrase-MiniLM-L6-v2'
sbert_embeddings = model.encode(data['clean_text'].fillna(''), show_progress_bar=True)

sbert_df = pd.DataFrame(sbert_embeddings, index=data.index)
sbert_df.columns = sbert_df.columns.astype(str)
print("📐 向量維度：", sbert_df.shape)

# ▍2. Combine the SBERT embedding columns with the other features
#     (NER counts PER/ORG/LOC + sentiment score), aligned on data's index.
# NOTE(review): this rebinds `ner_df`, which earlier in the notebook holds the
# raw NER results (columns 'index' / 'entity' / 'word' / 'score'). Re-running
# the earlier cell that does ner_df.groupby(['index', 'entity']) after this
# one would fail - consider using a distinct name here.
ner_df    = data_with_ner[['PER', 'ORG', 'LOC']].copy()
senti_df  = data[['sentiment_score']]
X_sbert   = pd.concat([sbert_df, ner_df, senti_df], axis=1)
y_target  = data['label']

# ▍3. 建立分類器組合
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.base import clone
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns

classifiers = {
    "LogReg"       : LogisticRegression(max_iter=1000),
    "DecisionTree" : DecisionTreeClassifier(),
    "SVM"          : SVC(probability=True),
    "RandomForest" : RandomForestClassifier(random_state=42)
}

# ▍4. Cross-validation 比較表現
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for clf_name, clf_model in classifiers.items():
    print(f"\n=== {clf_name} 分類結果（5-fold） ===")
    y_all_true, y_all_pred, f1s = [], [], []

    for train_idx, test_idx in kf.split(X_sbert, y_target):
        X_train, X_test = X_sbert.iloc[train_idx], X_sbert.iloc[test_idx]
        y_train, y_test = y_target.iloc[train_idx], y_target.iloc[test_idx]

        clf = clone(clf_model)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        y_all_true.extend(y_test)
        y_all_pred.extend(y_pred)
        f1s.append(f1_score(y_test, y_pred, average='weighted'))

    print(classification_report(y_all_true, y_all_pred, target_names=["真新聞", "假新聞"], digits=2))

    results.append({
        "classifier": clf_name,
        "f1_weighted": np.mean(f1s)
    })

# ▍5. 顯示比較結果
result_df = pd.DataFrame(results).sort_values(by='f1_weighted', ascending=False).reset_index(drop=True)

print("\n📊 SBERT + NER + Sentiment 分類器比較：")
print(tabulate(result_df, headers="keys", tablefmt="fancy_grid"))

best = result_df.iloc[0]
print(f"\n🏆 最佳分類器為：{best['classifier']}，weighted F1 = {best['f1_weighted']:.4f}")

# ▍6. 視覺化結果
plt.figure(figsize=(8,5))
sns.barplot(x='classifier', y='f1_weighted', data=result_df, palette='Set2')
plt.title("BERT 向量 + NER + Sentiment 分類器比較")
plt.ylabel('Weighted F1')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()


1. 使用all-MiniLM-L6-v2，最佳分類器為：RandomForest，weighted F1 = 0.7197
2. 使用sentence-transformers/paraphrase-MiniLM-L6-v2，最佳分類器為：LogReg，weighted F1 = 0.7467

### Topic model: BERTopic 主題詞來源使用c-TF-IDF頻率導向，挑出詞頻高的詞

In [None]:
# ───────────────────────────────────────────────────────────────
# 0. 前置條件說明
#   - data['clean_text'] 需為清理後文本欄
#   - data['label'] 為真假標記（0=真新聞，1=假新聞）
#   - 若用 "tfidf_style"，需已先算好 X_final
# ───────────────────────────────────────────────────────────────

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

# -------- 1. Choose the vectorization method ---------------------- input from Part 4 / 5 / 6
VEC_CHOICE = "tfidf"       # Part 4: "tfidf" / Part 5: "tfidf_style" / Part 6: "sbert"
texts = data['clean_text'].fillna('')
# SentenceTransformer.encode and BERTopic.fit_transform treat the documents as
# a positional list. A pandas Series is indexed by label instead, so a
# non-contiguous or duplicated index can raise or select the wrong rows.
# Hand them a plain list of strings.
topic_docs = texts.astype(str).tolist()

if VEC_CHOICE == "tfidf":
    vec_model = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words="english")
    embeddings = vec_model.fit_transform(topic_docs)

elif VEC_CHOICE == "tfidf_style":
    # X_final is the feature matrix built in Part 5; it must already exist.
    if "X_final" not in globals():
        raise RuntimeError("⚠️ 找不到 X_final，請先執行 Part 5 建立特徵")
    embeddings = X_final.values

elif VEC_CHOICE == "sbert":
    emb_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
    embeddings = emb_model.encode(topic_docs, show_progress_bar=True)

else:
    raise ValueError("VEC_CHOICE 僅能為 'tfidf' / 'tfidf_style' / 'sbert'")

# -------- 2. Build the BERTopic model ------------------------------
# Embeddings are always precomputed above; the embedding model is only passed
# along for the SBERT variant (the TF-IDF variants have none).
topic_model = BERTopic(
    embedding_model=None if VEC_CHOICE.startswith("tfidf") else emb_model,
    hdbscan_model=HDBSCAN(min_cluster_size=10, min_samples=30),
    vectorizer_model=CountVectorizer(ngram_range=(1, 2), stop_words="english"),
    calculate_probabilities=False,
    verbose=True
)

topics, _ = topic_model.fit_transform(topic_docs, embeddings)
data['topic'] = topics

# Guard: make sure at least one real topic exists (topic -1 is skipped as the outlier bucket).
valid_topics = [t for t in set(topics) if t != -1]
if not valid_topics:
    print("⚠️ 無有效主題（全部為 outlier），請檢查資料筆數或降低 min_cluster_size 設定。")
else:
    # -------- 3. Topic x real/fake distribution --------------------
    data['label_name'] = data['label'].map({0: "True", 1: "Fake"})
    topic_dist = (
        data.groupby(['topic', 'label_name']).size().unstack(fill_value=0)
            # Guarantee both columns exist even when one class is absent from
            # the data; otherwise topic_dist['Fake'] raises KeyError.
            .reindex(columns=["Fake", "True"], fill_value=0)
    )
    topic_dist['Total'] = topic_dist.sum(axis=1)
    topic_dist['Fake_Ratio'] = topic_dist['Fake'] / topic_dist['Total']

    print("▶ 各 Topic 真／假筆數與假新聞比例 (前 10)：")
    display(topic_dist.sort_values('Fake_Ratio', ascending=False).head(10))

    # -------- 4. Collect topic keywords, ordered by fake ratio ------
    kw_rows = []
    for tid, word_scores in topic_model.get_topics().items():
        if tid == -1:
            continue
        for word, score in word_scores:
            kw_rows.append({"topic": tid, "word": word, "c_tf_idf": score})

    kw_df = pd.DataFrame(kw_rows)

    # Attach each topic's real/fake counts and ratio to its keywords.
    merged_kw = kw_df.merge(topic_dist.reset_index(), on="topic")

    # NOTE(review): head(30) is applied per topic, not to the table as a whole.
    # If every topic has at most 30 keywords, both tables below contain the
    # same rows and differ only in topic order - confirm this is intended.
    fake_top_kw = (merged_kw.sort_values(['Fake_Ratio', 'c_tf_idf'], ascending=[False, False])
                            .groupby('topic')
                            .head(30))

    true_top_kw = (merged_kw.sort_values(['Fake_Ratio', 'c_tf_idf'], ascending=[True, False])
                            .groupby('topic')
                            .head(30))

    print("\n🟥 假新聞高比例主題關鍵字 TOP 30")
    display(fake_top_kw[['topic', 'word', 'c_tf_idf', 'Fake_Ratio']])

    print("\n🟦 真新聞高比例主題關鍵字 TOP 30")
    display(true_top_kw[['topic', 'word', 'c_tf_idf', 'Fake_Ratio']])

    # -------- 5. Plot the fake-news ratio of every topic ------------
    plt.figure(figsize=(12, 4))
    sns.barplot(x=topic_dist.index, y=topic_dist['Fake_Ratio'], palette="coolwarm")
    plt.title("Fake-News Ratio per Topic")
    plt.ylabel("Fake Ratio")
    plt.xlabel("Topic ID")
    plt.xticks(rotation=90)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


「先用最佳向量化(SBERT)→BERTopic分群→疊加真假標籤→看每個主題哪邊假新聞高、哪邊真新聞高，以及對應關鍵詞」的全流程。

將兩者疊加，就能得到：
「假新聞最常見的主題有哪些？」/「真新聞裡哪些主題特別突出？」/「各主題的代表關鍵詞」

In [None]:
# '''from sklearn.feature_extraction.text import CountVectorizer

# # 真假新聞進行主題建模
# docs = data['text'].astype(str).tolist()

# # 模型可換成 'all-MiniLM-L6-v2', 'microsoft/Phi-4-mini-instruct' 等
# embedding_model = 'all-MiniLM-L6-v2'

# # 可調整 測試用2000筆
# # min_cluster_size 群集最少需要包含n個點，否則會被視為雜訊（noise）
# # min_samples 包含至少n篇文章的主題才會被承認為主題
# hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=30) # Clustering layer
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model)
# topics, probs = topic_model.fit_transform(docs)


In [None]:
# '''# 建立一個儲存所有主題關鍵詞與 TF-IDF 分數的清單
# all_topics = []

# # 把主題總數拿出來（排除 -1 是未分類主題）
# valid_topics = [topic for topic in topic_model.get_topic_info().Topic if topic != -1]

# # 對每個主題取得詞與 c-TF-IDF 分數
# for topic_id in valid_topics:
#     topic_words = topic_model.get_topic(topic_id)
#     for word, score in topic_words:
#         all_topics.append({
#             "Topic": topic_id,
#             "Word": word,
#             "C-TF-IDF": score
#         })

# # 轉換成 DataFrame 並排序
# topic_tfidf_df = pd.DataFrame(all_topics)
# topic_tfidf_df = topic_tfidf_df.sort_values(by=["Topic", "C-TF-IDF"], ascending=[True, False])

# # 顯示前幾列
# topic_tfidf_df.head(20)

In [None]:

# # 列出文章的BERTopic資訊
# topic_model.get_document_info(docs)

In [None]:
# def visualize_fake_news_ratio_by_topic(model, docs, labels, title="主題的假新聞比例"):
#     doc_info = model.get_document_info(docs).copy()
#     doc_info['label'] = labels

#     # 計算比例與數量
#     topic_fake_ratio = (
#         doc_info[doc_info['Topic'] != -1]
#         .groupby('Topic')['label']
#         .mean()
#         .reset_index()
#         .rename(columns={'label': 'fake_news_ratio'})
#     )
#     topic_counts = (
#         doc_info[doc_info['Topic'] != -1]['Topic']
#         .value_counts()
#         .rename_axis('Topic')
#         .reset_index(name='count')
#     )
#     topic_stats = pd.merge(topic_fake_ratio, topic_counts, on='Topic')

#     # 加上主題名稱
#     topic_names = model.get_topic_info()[['Topic', 'Name']]
#     topic_stats_named = topic_stats.merge(topic_names, on='Topic')

#     # 過濾比例過低的主題
#     topic_stats_named = topic_stats_named[topic_stats_named['fake_news_ratio'] >= 0.1]

#     # 繪圖
#     plt.figure(figsize=(12, 8))
#     ax = sns.barplot(
#         data=topic_stats_named.sort_values(by='fake_news_ratio', ascending=False),
#         x='fake_news_ratio', y='Name', palette='Reds'
#     )
#     plt.title(title)
#     plt.xlabel('假新聞比例 (label=1)')
#     plt.ylabel('主題代表詞')
#     plt.grid(True, axis='x')
#     ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)
#     plt.tight_layout()
#     plt.show()

### representation topic model: 加上語意導向的KeyBERT, 表現方式是語意向量相似的詞

In [None]:
# from bertopic.representation import KeyBERTInspired
# from sentence_transformers import SentenceTransformer

# embedding_model_with_st = SentenceTransformer(embedding_model)  # 或其他你指定的模型
# embeddings = embedding_model_with_st.encode(docs, show_progress_bar=True)

# # 關鍵詞表示模型（非生成式）
# keybert = KeyBERTInspired()

# # 組裝 representation model
# representation_model = {
#     "KeyBERT": keybert
# }

# # 建立 BERTopic 模型（用 KeyBERT 調整主題表示）
# representation_topic_model = BERTopic(
#     embedding_model=embedding_model_with_st,
#     vectorizer_model=vectorizer_model,
#     hdbscan_model=hdbscan_model,
#     representation_model=representation_model,
#     top_n_words=30,
#     verbose=True
# )

# # 訓練模型
# topics, probs = representation_topic_model.fit_transform(docs, embeddings)

# # 查看新的主題表示
# representation_topic_model.get_topic_info()

In [None]:
# # 視覺化主題分布：圓圈大小是主題的大小，圓圈的距離是主題之間的相似度
# topic_model.visualize_topics()

In [None]:
# representation_topic_model.visualize_topics()

In [None]:
# # 原始模型的主題
# visualize_fake_news_ratio_by_topic(topic_model, docs, data['label'], title="原始主題的假新聞比例")

# # 使用 KeyBERT 表示詞的模型主題
# visualize_fake_news_ratio_by_topic(representation_topic_model, docs, data['label'], title="KeyBERT 主題的假新聞比例")

## LLM ##

In [None]:
from langchain_community.chat_models import ChatOllama  # 使用 Ollama 封裝的 LLaMA 模型
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate

# Output schema for a classification result. It is handed to JsonOutputParser
# below, whose format instructions are built from these field descriptions.
class MessageClassification(BaseModel):
    # Final judgement on the message ("Real" or "Fake" per the prompts below).
    verdict: str = Field(description="Verdict whether the message is Real or Fake")
    # Self-reported certainty; kept as free text, encoded later by encode_confidence.
    confidence: str = Field(description="Confidence level of the judgment (e.g., High, Medium, Low)")
    # Short justification for the verdict.
    reason: str = Field(description="Brief explanation of the judgment")

# Chat models accessed through the ChatOllama wrapper, one per role in the
# multi-LLM pipeline.
judge_llm = ChatOllama(model="llama3:8B")  # gives its own opinion and writes the final summary
logic_llm = ChatOllama(model="phi3:3.8B")  # "Expert 1 (Logic-focused model)" in summary_prompt
debater_llm = ChatOllama(model="mistral:7B")  # "Expert 2 (Debate-focused model)" in summary_prompt

# JSON output parser for the MessageClassification schema. In the code visible
# here only its format instructions are used (injected into summary_prompt);
# the actual parsing of model output is done by extract_json.
parser = JsonOutputParser(pydantic_object=MessageClassification)
format_instructions = parser.get_format_instructions()


# Prompt for a single LLM's independent judgement of one message.
# {message} is the only template variable; the doubled braces {{ }} are
# escaped so they reach the model as literal JSON braces.
llm_prompt = PromptTemplate.from_template(
    """
You are a professional fact-checker. Analyze the following message and determine if it is real or fake.

Message:
\"\"\"{message}\"\"\"

Fill in this exact JSON format (no extra text!):

{{
  "verdict": "",        // "Real" or "Fake"
  "confidence": "",     // "High", "Medium", or "Low"
  "reason": ""          // Short explanation (1-2 sentences)
}}

Remember:
- DO NOT add anything outside the JSON.
- DO NOT wrap it in markdown (e.g., ```json).
"""
)

# Prompt that lets judge_llm aggregate the opinions of all models into a final
# verdict. Template variables: message, logic_opinion, debate_opinion,
# your_opinion (the judge model's own first-round answer) and
# format_instructions (produced by the JSON output parser).
summary_prompt = PromptTemplate.from_template(
    """
You are the final arbiter. Three experts have evaluated the message. Please summarize their opinions and give your final decision.

Message:
\"\"\"{message}\"\"\"

Expert 1 (Logic-focused model):
{logic_opinion}

Expert 2 (Debate-focused model):
{debate_opinion}

Expert 3 (Your own opinion):
{your_opinion}

Now summarize the opinions, resolve any conflicts, and provide a final classification in this JSON format:
{format_instructions}
"""
)

In [None]:
import pandas as pd
import concurrent.futures
import re
import json

def extract_json(text: str) -> dict:
    """Extract the first JSON object embedded in an LLM response.

    The text is scanned for each ``{`` and decoded from that position with
    ``json.JSONDecoder.raw_decode``, so nested objects, braces inside string
    values, and surrounding prose or markdown fences are all handled.

    Args:
        text: Raw model output.

    Returns:
        The first JSON object that decodes successfully. If none is found
        (including empty or non-string input), a fallback dict with verdict
        "Unknown", confidence "Low" and an explanatory reason is returned, so
        one bad response cannot abort a batch run.
    """
    decoder = json.JSONDecoder()
    error = "No valid JSON object found in output."

    if isinstance(text, str):
        start = text.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError as e:
                # Remember why this candidate failed, then try the next brace.
                error = e
            else:
                if isinstance(candidate, dict):
                    return candidate
            start = text.find("{", start + 1)

    print(f"\n JSON 解析失敗：{error}")
    print("原始輸出：", text)
    return {
        "verdict": "Unknown",
        "confidence": "Low",
        "reason": "Model did not return valid JSON."
    }

def call_llm(llm, prompt):
    """Invoke ``llm`` with ``prompt`` and return the reply's text.

    Replies that expose a ``content`` attribute are unwrapped to that
    attribute; any other reply is returned unchanged.
    """
    reply = llm.invoke(prompt)
    if hasattr(reply, "content"):
        return reply.content
    return reply

# Multi-LLM analysis of a single message.
def analyze_message_with_multi_llm(message: str):
    """Classify ``message`` as real or fake using three LLMs plus a final arbiter.

    Round 1: logic_llm, debater_llm and judge_llm each answer the same
    llm_prompt concurrently. Round 2: judge_llm receives all three answers via
    summary_prompt and returns the final verdict.

    Args:
        message: The text to fact-check.

    Returns:
        The dict produced by extract_json from the judge's final answer.

    Raises:
        Any exception raised by an LLM call is propagated to the caller.
    """
    # All first-round models get the identical prompt, so build it once.
    # llm_prompt declares only {message}; the unused format_instructions
    # argument is no longer passed, since strict prompt formatters may reject
    # unexpected keyword arguments.
    first_round_input = llm_prompt.format(message=message)

    experts = {"logic": logic_llm, "debate": debater_llm, "judge": judge_llm}
    # The calls are I/O-bound, so threads let the three requests overlap.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(experts)) as executor:
        futures = {
            name: executor.submit(call_llm, llm, first_round_input)
            for name, llm in experts.items()
        }
        # result() blocks until the call finishes and re-raises its exception.
        results = {name: future.result() for name, future in futures.items()}

    summary_input = summary_prompt.format(
        message=message,
        logic_opinion=results["logic"],
        debate_opinion=results["debate"],
        your_opinion=results["judge"],
        format_instructions=format_instructions
    )

    # Go through call_llm here too, so the final reply is unwrapped the same
    # way as the first-round replies.
    final_text = call_llm(judge_llm, summary_input)
    return extract_json(final_text)

def encode_verdict(verdict: str) -> int:
    """Encode an LLM verdict as a binary feature: 1 for "real", 0 otherwise.

    Matching ignores case and surrounding whitespace. Any other value -
    "Fake", "Unknown", or a non-string such as None/NaN from a missing key -
    maps to 0 instead of raising.

    NOTE: the polarity is the opposite of data['label'], where 0 is real news
    and 1 is fake news.
    """
    if not isinstance(verdict, str):
        return 0
    return 1 if verdict.strip().lower() == 'real' else 0

def encode_confidence(conf: str) -> int:
    """Encode an LLM confidence level as an ordinal: low=0, medium=1, high=2.

    Matching ignores case and surrounding whitespace. Unrecognised values, and
    non-strings such as None/NaN from a missing key, get the default level 1
    instead of raising.
    """
    mapping = {'low': 0, 'medium': 1, 'high': 2}
    default_level = 1
    if not isinstance(conf, str):
        return default_level
    return mapping.get(conf.strip().lower(), default_level)

In [None]:
# import random

# # 觀察測試用!!!
# sample_texts = data['text'].sample(10, random_state=42)

# for i, text in enumerate(sample_texts):
#     result = analyze_message_with_multi_llm(text)
#     print("推論結果：", result)

In [None]:
# Run the multi-LLM pipeline on every article: one row in, one result dict out.
# NOTE(review): progress_apply only exists after tqdm.pandas() has been
# registered - presumably done in an earlier cell; confirm.
llm_results = data['text'].progress_apply(analyze_message_with_multi_llm)
llm_df = pd.DataFrame(llm_results.tolist())

# A model may omit a key or return a non-string value. Fill the gaps with
# strings that the encoders map to their defaults (verdict 0, confidence 1).
for field, missing_value in (('verdict', 'Unknown'), ('confidence', '')):
    if field not in llm_df.columns:
        llm_df[field] = missing_value
    llm_df[field] = llm_df[field].fillna(missing_value).astype(str)

# Encode the textual verdict / confidence as numeric features.
llm_df['verdict_encoded'] = llm_df['verdict'].apply(encode_verdict)
llm_df['confidence_encoded'] = llm_df['confidence'].apply(encode_confidence)

llm_feature_cols = ['verdict_encoded', 'confidence_encoded']
if len(llm_df) != len(X_final):
    raise ValueError(
        f"LLM results ({len(llm_df)} rows) and X_final ({len(X_final)} rows) differ in length"
    )
# llm_df carries a fresh 0..n-1 index, and concat(axis=1) aligns on index
# labels. Re-label it with X_final's index so rows are joined by position.
# This assumes X_final rows are in the same order as `data`.
llm_features = llm_df[llm_feature_cols].set_axis(X_final.index, axis=0)

# Drop any columns left over from a previous run so that re-executing this
# cell does not append duplicate feature columns.
X_final = pd.concat(
    [X_final.drop(columns=llm_feature_cols, errors='ignore'), llm_features],
    axis=1
)
y_final = data['label']

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

# Candidate classifiers, all evaluated on the same feature matrix.
classifier_model = {
    "LogReg"       : LogisticRegression(max_iter=1000),
    # Seeded like the forest and the CV splitter so reruns give the same scores.
    "DecisionTree" : DecisionTreeClassifier(random_state=42),
    "SVM"          : SVC(kernel='linear', probability=True),
    "RandomForest" : RandomForestClassifier(random_state=42)
}

# Metrics collected per fold. f1 / precision / recall use the scorers' binary
# defaults, i.e. they are computed for the positive class (label 1).
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

# === init result ===
results = []

# === Stratified K-Fold splitter shared by every model ===
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === Cross-validate each model ===
for name, model in classifier_model.items():
    print(f"\n 訓練模型: {name}")
    # Scaling lives inside the pipeline so it is fitted on each training fold
    # only.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # standardise every feature
        ('clf', model)
    ])
    # Use y_final, the target defined together with X_final in the previous
    # cell, rather than an unrelated `y` that may be stale or undefined.
    scores = cross_validate(pipeline, X_final, y_final, cv=cv, scoring=scoring)
    result = {
        'model': name,
        'accuracy': np.mean(scores['test_accuracy']),
        'f1': np.mean(scores['test_f1']),
        'precision': np.mean(scores['test_precision']),
        'recall': np.mean(scores['test_recall'])
    }
    results.append(result)

# === Collect the fold-averaged metrics into a table, best F1 first ===
result_df = pd.DataFrame(results)
print("\n各模型評估結果：")
print(result_df.sort_values(by='f1', ascending=False))