In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from pprint import pprint
import time
from gensim.models import Phrases, Word2Vec, Doc2Vec
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.matutils import corpus2csc
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Fetch every NLTK resource this notebook relies on:
#   - 'stopwords' : English stop-word list used in `preprocess`
#   - 'wordnet'   : lexical database behind WordNetLemmatizer
#   - 'punkt' / 'punkt_tab' : tokenizer models required by `word_tokenize`
#     (older NLTK releases look for 'punkt', newer ones for 'punkt_tab';
#     requesting a name the installed version does not know only logs an
#     error and returns False, it does not raise).
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Load the raw CSV, then keep only the rows whose `text` field is present.
fakeReport = pd.read_csv("./raw_data/fake.csv")
data = fakeReport.dropna(subset=['text'])
data.head(2)

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps: lowercase, replace every run of non-letter characters with a
    single space, tokenize with NLTK, drop English stop words, lemmatize
    what is left.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    text = text.lower()
    # Substitute a space (not the empty string) for anything that is not a-z:
    # deleting newlines, tabs or punctuation outright would fuse the words on
    # either side of them into one bogus token ("end.\nNext" -> "endnext").
    text = re.sub(r'[^a-z]+', ' ', text)
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# One token list per document, row-aligned with `data`.
token = data['text'].astype(str).apply(preprocess)

In [None]:
# TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# TfidfVectorizer consumes plain strings, so join each token list back into
# one space-separated document.
text_TfidfVectorizer = token.map(' '.join)

# Build the vectorizer and embed every document.
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer_tfidf.fit_transform(text_TfidfVectorizer)

# NOTE(review): `hotel_labels` is not defined in any cell visible in this
# notebook; unless it is created elsewhere, this line fails with NameError on
# Restart & Run All. Because it is split together with X_tfidf it has to be
# row-aligned with `token` — presumably a label column of `data`; confirm
# which one and reference it here.
y = hotel_labels.astype(str)

# Stratified 70/30 train/test split of the TF-IDF features.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [None]:
# Word2Vec

from gensim.models.doc2vec import TaggedDocument

# Train Word2Vec on `token`, the per-document token lists produced by
# `preprocess` — the same corpus the TF-IDF features are built from.
# NOTE(review): per the gensim docs, `seed` alone does not make training
# deterministic when workers > 1; use workers=1 if exact reproducibility matters.
w2v_model = Word2Vec(sentences=token.tolist(), vector_size=100, window=5, min_count=2, workers=4, seed=42)

def average_word2vec(tokens, model, vector_size=100):
    """Average the word vectors of all tokens that `model` knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : object exposing `.wv`
        `.wv` must support `word in model.wv` and `model.wv[word]`.
    vector_size : int, default 100
        Length of the zero vector returned when no token is in `model.wv`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched vectors, or `np.zeros(vector_size)`
        when nothing matched.
    """
    lookup = model.wv
    matched = [lookup[tok] for tok in tokens if tok in lookup]
    if not matched:
        # No known word at all: fall back to an all-zero vector.
        return np.zeros(vector_size)
    return np.mean(matched, axis=0)

# One averaged Word2Vec vector per document in `token`. The model's own
# vector_size is passed so the all-zero fallback always has the right length.
X_w2v = np.vstack([
    average_word2vec(doc_tokens, w2v_model, vector_size=w2v_model.vector_size)
    for doc_tokens in token
])

# Stratified 70/30 split of the Word2Vec features. It reuses the `y`, test_size
# and random_state of the TF-IDF split; the later evaluation relies on that to
# pair these rows with y_train / y_test.
X_train_w2v, X_test_w2v, _, _ = train_test_split(X_w2v, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Doc2Vec

# Tag every document in `token` (the output of `preprocess`) with its
# positional index as a string, then train Doc2Vec on the tagged corpus.
tagged_docs = [TaggedDocument(words=doc_tokens, tags=[str(i)]) for i, doc_tokens in enumerate(token)]
d2v_model = Doc2Vec(documents=tagged_docs, vector_size=100, window=5, min_count=2, workers=4, seed=42)

# Stack the learned document vectors in corpus order, looked up by tag.
X_d2v = np.vstack([d2v_model.dv[doc.tags[0]] for doc in tagged_docs])

# Stratified 70/30 split of the Doc2Vec features (same y / seed as the other splits).
X_train_d2v, X_test_d2v, _, _ = train_test_split(X_d2v, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
from sentence_transformers import SentenceTransformer

model_phi4 = SentenceTransformer('microsoft/Phi-4-mini-instruct')
# Reuse the end-of-sequence token as the padding token
# (presumably the tokenizer ships without a pad token — TODO confirm).
model_phi4.tokenizer.pad_token = model_phi4.tokenizer.eos_token

# Encode one string per document with the sentence transformer.
# NOTE(review): assumes the intended input is the raw `text` column of `data`
# (the same column `token` is derived from) — confirm.
embeddings_phi4 = model_phi4.encode(data['text'].astype(str).tolist(), show_progress_bar=True, batch_size=32)

# Stratified 70/30 split of the Phi-4 embeddings (same y / seed as the other splits).
X_train_phi4, X_test_phi4, _, _ = train_test_split(embeddings_phi4, y, test_size=0.3, random_state=42, stratify=y)

## 開始比較 TF-IDF、Word2Vec、Doc2Vec 在各分類器上的表現

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Classifiers evaluated on every feature representation.
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Linear SVM", LinearSVC())
]

# Feature sets to compare, paired by position with their display names.
vectorizer_names = ['TF-IDF', 'Word2Vec', 'Doc2Vec']
vectorizer_sets = [(X_train_tfidf, X_test_tfidf), (X_train_w2v, X_test_w2v), (X_train_d2v, X_test_d2v)]

# Sorted class labels: the same for every model / feature set, so compute them
# once instead of on every iteration.
class_labels = sorted(np.unique(y))

for vec_name, (X_tr, X_te) in zip(vectorizer_names, vectorizer_sets):
    print(f"\n===== Feature Set: {vec_name} =====")
    for name, clf in models:
        print(f"\n--- Model: {name} ---")
        # fit() is called again for each feature set on the same estimator object.
        clf.fit(X_tr, y_train)
        y_pred = clf.predict(X_te)

        print(classification_report(y_test, y_pred))

        # Rows are true classes, columns are predicted classes, both in class_labels order.
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8,6))
        # Draw on the axes created above rather than the implicit current axes.
        sns.heatmap(cm, annot=True, fmt="d", cmap=plt.cm.Blues, xticklabels=class_labels, yticklabels=class_labels, ax=ax)
        ax.set_title(f"Confusion Matrix: {vec_name} + {name}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        plt.yticks(rotation=0)
        plt.show()
        # Release the figure so the loop does not accumulate open figures.
        plt.close(fig)