# 📂 第1部分：資料讀取與中文預處理

In [None]:
# 匯入套件
import pandas as pd
import jieba
import re
import os
from sklearn.model_selection import train_test_split

# 中文清理
def clean_text(text):
    """Normalize raw text before word segmentation.

    Every character that is not a CJK ideograph (U+4E00-U+9FA5), an ASCII
    letter or an ASCII digit is replaced by a space; runs of whitespace are
    then collapsed to a single space and the result is stripped.

    Args:
        text: The text to clean. Non-string values are converted with str().

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        text = str(text)
    # Single backslashes are required inside the raw strings: with a doubled
    # backslash the class no longer contains the CJK range (so all Chinese
    # text is blanked out) and the whitespace pattern matches a literal
    # backslash followed by "s" instead of whitespace.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 中文斷詞
def tokenize(text):
    """Segment *text* with jieba and return the tokens joined by single spaces."""
    segments = jieba.cut(text)
    return ' '.join(segments)

# Load the dataset
csv_path = 'data/test(2).csv'

# Fail early with a clear message if the data file is missing
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"找不到資料檔案：{csv_path}")

df = pd.read_csv(csv_path)

# Validate every column used below. 'label' is checked too, so a CSV without
# it produces this explicit error instead of a KeyError further down.
required_columns = ('subject', 'message', 'label')
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"CSV檔必須有 'subject'、'message' 和 'label' 欄位！缺少：{missing_columns}"
    )

# Keep only rows whose label is a finite number. Dropping NaN alone is not
# enough: inf and non-numeric labels also make the integer cast fail, so they
# are coerced to NaN / detected here and removed together with missing labels.
labels = pd.to_numeric(df['label'], errors='coerce')
has_valid_label = labels.notna() & ~labels.isin([float('inf'), float('-inf')])
df = df.loc[has_valid_label].copy()
if df.empty:
    raise ValueError("沒有任何具有有效 label 的資料！")

# Convert the label to integer class ids
df['label'] = labels.loc[has_valid_label].astype(int)

# Merge subject and body into one text field, then clean and segment it.
# astype(str) keeps the concatenation valid when a column was parsed as numeric.
subject = df['subject'].fillna('').astype(str)
message = df['message'].fillna('').astype(str)
df['content'] = (subject + ' ' + message).apply(clean_text)
df['content'] = df['content'].apply(tokenize)

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Show the first few training samples (notebook cell output)
X_train[:3]


Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 0.381 seconds.
Prefix dict has been built successfully.


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# 🛠️ 第2部分：TF-IDF + PCA + PSO特徵選擇

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np

# Vectorize the space-separated tokens with TF-IDF.
# The default token_pattern only keeps tokens of two or more characters,
# which would silently discard the single-character Chinese words produced by
# the segmenter; this pattern keeps tokens of any length.
vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\b\w+\b")
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Reduce to at most 100 dimensions with PCA. The component count is capped by
# the matrix shape because PCA rejects n_components larger than
# min(n_samples, n_features), which happens on small datasets/vocabularies.
n_components = min(100, X_train_tfidf.shape[0], X_train_tfidf.shape[1])
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_tfidf)
X_test_pca = pca.transform(X_test_tfidf)

# 粒子群優化 (簡化版)
def pso_feature_selection(X, y, num_particles=30, max_iter=20, select_num=60,
                          random_state=None):
    """Select columns of X with a simplified binary particle swarm search.

    Each particle is a 0/1 mask over the columns of X. The fitness of a mask
    is ``evaluate(X[:, mask == 1], y)``. On every iteration each particle's
    velocity is pulled towards its own best mask and the swarm's best mask,
    and each bit is resampled to 1 with probability ``sigmoid(velocity)``.

    Args:
        X: 2-D NumPy array of shape (n_samples, n_features).
        y: Targets, passed through unchanged to ``evaluate``.
        num_particles: Number of candidate masks in the swarm.
        max_iter: Number of swarm update iterations.
        select_num: Currently unused; accepted only for backward
            compatibility. NOTE(review): the name suggests a cap on the
            number of selected features - confirm the intended behavior.
        random_state: Optional integer seed for a reproducible search. With
            the default ``None`` the global NumPy random state is used, as
            before.

    Returns:
        1-D integer array with the column indices set in the best mask found.

    Raises:
        ValueError: If X has no columns or num_particles is less than 1.
    """
    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError("X must contain at least one feature column")
    if num_particles < 1:
        raise ValueError("num_particles must be at least 1")

    # np.random (global state) and RandomState expose the same
    # randint/uniform/rand methods, so seeding is purely opt-in.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def repair_empty(mask):
        """Switch on one random bit in place if the mask selects nothing."""
        if not mask.any():
            mask[rng.randint(0, n_features)] = 1

    particles = rng.randint(0, 2, (num_particles, n_features))
    velocities = rng.uniform(-1, 1, (num_particles, n_features))
    # The initial masks need the same guard as the updated ones: an all-zero
    # mask would hand an empty feature matrix to evaluate().
    for mask in particles:
        repair_empty(mask)

    p_best = particles.copy()
    p_best_scores = np.array(
        [evaluate(X[:, mask == 1], y) for mask in particles], dtype=float
    )
    # Copy the row: a plain index would be a view that changes as soon as
    # that particle's personal best is overwritten in the middle of a sweep.
    g_best = p_best[np.argmax(p_best_scores)].copy()
    w, c1, c2 = 0.7, 1.5, 1.5  # inertia, cognitive and social coefficients

    for _ in range(max_iter):
        for i in range(num_particles):
            r1 = rng.rand(n_features)
            r2 = rng.rand(n_features)
            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (p_best[i] - particles[i])
                + c2 * r2 * (g_best - particles[i])
            )
            particles[i] = np.where(
                rng.rand(n_features) < sigmoid(velocities[i]), 1, 0
            )
            repair_empty(particles[i])
            score = evaluate(X[:, particles[i] == 1], y)
            if score > p_best_scores[i]:
                p_best[i] = particles[i]
                p_best_scores[i] = score
        # The swarm best is refreshed once per full sweep over the particles.
        g_best = p_best[np.argmax(p_best_scores)].copy()
    return np.where(g_best == 1)[0]

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The value is computed from exp(-|x|), which always lies in (0, 1], so
    large-magnitude inputs cannot overflow the exponential (the direct
    formula overflows for very negative x).

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as x.
    """
    x = np.asarray(x, dtype=float)
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) - algebraically equal.
    out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    return out[()] if out.ndim == 0 else out

def evaluate(X_sub, y):
    """Score a feature subset by 3-fold cross-validated accuracy.

    A LogisticRegression (max_iter=500) is cross-validated on X_sub and the
    mean accuracy over the folds is returned.

    Args:
        X_sub: 2-D array holding only the candidate feature columns.
        y: Class labels for the rows of X_sub.

    Returns:
        Mean accuracy over the 3 folds, or 0.0 when X_sub has no columns.
    """
    # A classifier cannot be fitted on zero features; give an empty subset the
    # worst score so a search discards it instead of crashing.
    if X_sub.shape[1] == 0:
        return 0.0
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X_sub, y, cv=3, scoring='accuracy')
    return np.mean(scores)

# Run the PSO search on the training split, then keep the chosen columns in
# both the training and the test matrices.
selected_idx = pso_feature_selection(X_train_pca, y_train)
X_train_selected, X_test_selected = (
    split[:, selected_idx] for split in (X_train_pca, X_test_pca)
)

# Notebook cell output: shape of the reduced training matrix
X_train_selected.shape


# 🧠 第3部分：建立TensorFlow CNN模型

In [None]:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Size the softmax layer from the training labels: sparse categorical
# cross-entropy needs every label to be < number of output units. The
# original 7 units are kept as a minimum so existing setups are unchanged.
num_classes = max(7, int(y_train.max()) + 1)

# Two unpadded Conv1D(kernel_size=3) layers around MaxPooling1D(2) need at
# least 8 input steps; report that clearly instead of a low-level shape error.
seq_len = X_train_selected.shape[1]
if seq_len < 8:
    raise ValueError(
        f"PSO 選出的特徵數不足：CNN 至少需要 8 個特徵，目前只有 {seq_len} 個"
    )

# 1-D CNN over the selected feature vector, treated as a length-seq_len
# sequence with a single channel.
model = keras.Sequential([
    layers.Input(shape=(seq_len, 1)),
    layers.Conv1D(64, 3, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Conv1D(128, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

# Integer class ids are used as targets, hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


# 🏋️ 第4部分：模型訓練

In [None]:

# Conv1D expects a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train_final = X_train_selected[..., np.newaxis]
X_test_final = X_test_selected[..., np.newaxis]

# Train for 20 epochs; the held-out split is used for per-epoch validation metrics.
history = model.fit(
    x=X_train_final,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test_final, y_test),
)


# 📈 第5部分：模型評估與單筆預測

In [None]:

from sklearn.metrics import classification_report

# Predicted class = index of the highest softmax probability in each row
y_pred = model.predict(X_test_final).argmax(axis=1)
print(classification_report(y_test, y_pred, digits=4))

# 測試單筆
def predict_text(text):
    """Predict the class index for a single raw text.

    The text goes through the same steps as the training data: cleaning,
    segmentation, TF-IDF, PCA, selection of the PSO-chosen columns, and
    finally the CNN.

    Args:
        text: The raw message text.

    Returns:
        The index of the highest-scoring class in the model output.
    """
    segmented = tokenize(clean_text(text))
    reduced = pca.transform(vectorizer.transform([segmented]).toarray())
    # Keep the selected columns and add the channel axis the CNN expects
    model_input = reduced[:, selected_idx][..., np.newaxis]
    probabilities = model.predict(model_input)
    return np.argmax(probabilities)

# Try the single-text path on a sample verification-code message
example = "親愛的用戶，您的驗證碼為123456，請於5分鐘內使用。"
print("預測結果:", predict_text(text=example))
