In [1]:
import os
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Build the TF-IDF + chi-square feature matrix from the labelled CSV and
# persist it, together with the fitted transformers and per-row metadata,
# so that the modelling cells can reload it without re-vectorizing.
#
# NOTE(review): the vectorizer and the chi2 selector are both fitted on the
# FULL corpus (the selector also sees every label) before any train/test
# split is made by the cells that load this pickle. Held-out scores computed
# from these features are therefore optimistic. A leak-free setup would fit
# both transformers on the training rows only (e.g. inside a Pipeline).
source = 'combined_labeled.csv'
df = pd.read_csv(source)

# Empty CSV cells are read as NaN, which TfidfVectorizer rejects as an
# invalid document. Replace them with empty strings so that row alignment
# with labels / file names / post times is preserved.
texts = df['content'].fillna('').astype(str).tolist()
labels = df['label'].tolist()
text_names = df['file_name'].tolist()
post_times = df['post_time'].tolist()

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Requested number of chi2-ranked terms to keep.
k = 1500
# Never ask for more features than the vocabulary contains: depending on the
# scikit-learn version an oversized k either raises or only warns.
n_selected = min(k, X_tfidf.shape[1])
chi2_selector = SelectKBest(chi2, k=n_selected)
X_chi2 = chi2_selector.fit_transform(X_tfidf, labels)

output_dir = "tfidf_vectors"
os.makedirs(output_dir, exist_ok=True)
file_path_chi2 = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")

# Everything downstream cells read back, keyed exactly as they expect.
data_to_save_chi2 = {
    'X_features': X_chi2,
    'labels': labels,
    'text_names': text_names,
    'post_times': post_times,
    'tfidf_vectorizer': tfidf_vectorizer,
    'chi2_selector': chi2_selector
}

joblib.dump(data_to_save_chi2, file_path_chi2, compress=3)
print(f"已儲存「TF-IDF + Chi-square」至 {file_path_chi2}")

已儲存「TF-IDF + Chi-square」至 tfidf_vectors/tfidf_chi2_and_labels.pkl


In [9]:
import os
import joblib
import numpy as np
from collections import Counter
import warnings

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Read back the pickled bundle and pull out the feature matrix and labels.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)
X, y = loaded_data['X_features'], np.array(loaded_data['labels'])
print("成功載入資料")

# Tally the articles per class (0 = bearish, 1 = bullish) and report them.
label_counts = Counter(y)
print("標籤分佈:")
print(f"看跌(0): {label_counts[0]} 篇文章")
print(f"看漲(1): {label_counts[1]} 篇文章")

成功載入資料
標籤分佈:
看跌(0): 549 篇文章
看漲(1): 1056 篇文章


In [7]:
import os
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
#SVM
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Locate and load the pickled feature bundle.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)

# Unpack the selected features, the labels, the per-row metadata and the
# fitted transformers stored alongside them.
X = loaded_data['X_features']
y = np.array(loaded_data['labels'])
text_names, post_times = loaded_data['text_names'], loaded_data['post_times']
tfidf_vectorizer = loaded_data['tfidf_vectorizer']
chi2_selector = loaded_data['chi2_selector']

print("Loaded TF-IDF shape:", X.shape)
print("Labels count:", len(y))

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RBF-kernel SVM baseline; fit() hands back the fitted estimator itself.
clf = SVC(kernel='rbf', C=1.0, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Loaded TF-IDF shape: (1605, 1500)
Labels count: 1605

Accuracy: 0.8691588785046729

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.66      0.77       109
           1       0.85      0.98      0.91       212

    accuracy                           0.87       321
   macro avg       0.89      0.82      0.84       321
weighted avg       0.88      0.87      0.86       321



In [None]:
import os
import joblib
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === 引入 Base Learners ===
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# === StackingClassifier & Meta Learner ===
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Load the selected TF-IDF features and their labels from disk.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)
X, y = loaded_data['X_features'], np.array(loaded_data['labels'])
print("成功載入資料")

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("數據集劃分完成")

# Random forest with a hand-picked weight that up-weights class 0.
rf_params = dict(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    class_weight={0: 6.9, 1: 1.0},
)
model_rf = RandomForestClassifier(**rf_params)

# Fit on the training portion.
print("正在訓練 RandomForest 模型...")
model_rf.fit(X_train, y_train)
print("RandomForest 模型訓練完成")

# Predict the held-out rows.
print("正在進行預測...")
y_pred = model_rf.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics.
print("RandomForest 模型評估完成")
print(f"準確率: {accuracy_score(y_test, y_pred):.4f}")
print("混淆矩陣:\n", confusion_matrix(y_test, y_pred))
print(
    "分類報告:\n",
    classification_report(y_test, y_pred, target_names=['看跌(0)', '看漲(1)']),
)

成功載入資料
數據集劃分完成
正在訓練 RandomForest 模型...
RandomForest 模型訓練完成
正在進行預測...
RandomForest 模型評估完成
準確率: 0.7913
混淆矩陣:
 [[ 62  47]
 [ 20 192]]
分類報告:
               precision    recall  f1-score   support

       看跌(0)       0.76      0.57      0.65       109
       看漲(1)       0.80      0.91      0.85       212

    accuracy                           0.79       321
   macro avg       0.78      0.74      0.75       321
weighted avg       0.79      0.79      0.78       321



In [None]:
import os
import joblib
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier  

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Step 1: load the selected TF-IDF features and their labels from disk.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)
X, y = loaded_data['X_features'], np.array(loaded_data['labels'])
print("成功載入資料")

# Step 2: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("數據集劃分完成")

# Step 3: configure CatBoost. verbose=False mutes CatBoost's own training
# log. class_weights up-weights class 0 by hand; auto_class_weights='Balanced'
# is the built-in alternative to the manual weights.
cb_params = dict(
    iterations=500,
    random_state=42,
    verbose=False,
    class_weights=[2.6, 1.0],
)
model_cb = CatBoostClassifier(**cb_params)

# Step 4: fit on the training portion.
print("正在訓練 CatBoost 模型...")
model_cb.fit(X_train, y_train)
print("CatBoost 模型訓練完成")

# Step 5: predict the held-out rows.
print("正在進行預測...")
y_pred = model_cb.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics.
print("CatBoost 模型評估完成")
print(f"準確率: {accuracy_score(y_test, y_pred):.4f}")
print("混淆矩陣:\n", confusion_matrix(y_test, y_pred))
print(
    "分類報告:\n",
    classification_report(y_test, y_pred, target_names=['看跌(0)', '看漲(1)']),
)


成功載入資料
數據集劃分完成
正在訓練 CatBoost 模型...
CatBoost 模型訓練完成
正在進行預測...
CatBoost 模型評估完成
準確率: 0.7664
混淆矩陣:
 [[ 73  36]
 [ 39 173]]
分類報告:
               precision    recall  f1-score   support

       看跌(0)       0.65      0.67      0.66       109
       看漲(1)       0.83      0.82      0.82       212

    accuracy                           0.77       321
   macro avg       0.74      0.74      0.74       321
weighted avg       0.77      0.77      0.77       321



In [35]:
import os
import joblib
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === 三個基礎模型 ===
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# === VotingClassifier ===
from sklearn.ensemble import VotingClassifier

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# --- 1. Load data ---
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")

loaded_data = joblib.load(file_path)
X = loaded_data['X_features']
y = np.array(loaded_data['labels'])
print("成功載入資料")
print("Loaded TF-IDF shape:", X.shape)
print("Labels count:", len(y))

# --- 2. Train/test split (80/20, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("數據集劃分完成")

# --- 3. Base models ---
# 'hard' = majority vote on predicted labels; 'soft' = average the predicted
# class probabilities and take the argmax.
voting_mode = 'hard'

# SVC only needs probability=True for soft voting. Enabling it makes fit()
# run an extra internal 5-fold cross-validation for calibration, and
# SVC.predict() does not use those probabilities, so for hard voting it is
# pure overhead. Tie the flag to the voting mode instead of always paying it.
model_svm = SVC(
    kernel='rbf',
    C=1.0,
    probability=(voting_mode == 'soft'),
    random_state=42
)

# Class 0 is up-weighted by hand in both tree-based models.
model_rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    class_weight={0: 6.9, 1: 1.0}
)

model_cb = CatBoostClassifier(
    iterations=500,
    random_state=42,
    verbose=False,
    class_weights=[2.6, 1.0]
)

# --- 4. Voting ensemble over the three base models ---
voting_clf = VotingClassifier(
    estimators=[
        ('svm', model_svm),
        ('rf', model_rf),
        ('cb', model_cb)
    ],
    voting=voting_mode
)

# --- 5. Fit the ensemble ---
print("\n正在訓練投票模型 (SVM + RF + CatBoost)...")
voting_clf.fit(X_train, y_train)
print("投票模型訓練完成")

# --- 6. Predict and evaluate on the held-out rows ---
print("正在進行預測...")
y_pred = voting_clf.predict(X_test)

print("\n投票模型 (SVM + RF + CatBoost) 評估結果:")
accuracy = accuracy_score(y_test, y_pred)
print(f"準確率: {accuracy:.4f}")
print("混淆矩陣:\n", confusion_matrix(y_test, y_pred))
print("分類報告:\n", classification_report(y_test, y_pred, target_names=['看跌(0)', '看漲(1)']))


成功載入資料
Loaded TF-IDF shape: (1605, 1500)
Labels count: 1605
數據集劃分完成

正在訓練投票模型 (SVM + RF + CatBoost)...
投票模型訓練完成
正在進行預測...

投票模型 (SVM + RF + CatBoost) 評估結果:
準確率: 0.8318
混淆矩陣:
 [[ 74  35]
 [ 19 193]]
分類報告:
               precision    recall  f1-score   support

       看跌(0)       0.80      0.68      0.73       109
       看漲(1)       0.85      0.91      0.88       212

    accuracy                           0.83       321
   macro avg       0.82      0.79      0.80       321
weighted avg       0.83      0.83      0.83       321



In [37]:
import os
import joblib
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === 引入 Base Learners ===
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# === StackingClassifier & Meta Learner ===
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Step 1: load the selected TF-IDF features and their labels from disk.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)
X, y = loaded_data['X_features'], np.array(loaded_data['labels'])
print("成功載入資料")
print("Loaded TF-IDF shape:", X.shape)
print("Labels count:", len(y))

# Step 2: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("數據集劃分完成")

# Step 3: the three base learners.
# probability=True gives the SVC a predict_proba method, so the stack can
# feed its class probabilities to the meta learner.
model_svm = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)

# Both tree-based learners up-weight class 0 with hand-picked weights.
model_rf = RandomForestClassifier(
    n_estimators=500, random_state=42, n_jobs=-1, class_weight={0: 6.9, 1: 1.0}
)
model_cb = CatBoostClassifier(
    iterations=500, random_state=42, verbose=False, class_weights=[2.6, 1.0]
)

# Step 4: logistic regression as the meta learner.
meta_learner = LogisticRegression(random_state=42)

# Step 5: assemble the stack.
#   stack_method='auto' lets scikit-learn choose each base learner's output
#   method; 'predict_proba', 'decision_function' or 'predict' can be forced.
#   passthrough=True also hands the original features to the meta learner.
base_learners = [
    ('svm', model_svm),
    ('rf', model_rf),
    ('cb', model_cb),
]
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='auto',
    passthrough=True,
    n_jobs=-1,
)

# Step 6: fit the stacked model.
print("\n正在訓練 Stacking 模型 (SVM + RF + CatBoost -> Meta LR)...")
stacking_clf.fit(X_train, y_train)
print("Stacking 模型訓練完成")

# Step 7: predict the held-out rows and report the metrics.
print("正在進行預測...")
y_pred = stacking_clf.predict(X_test)

print("\nStacking 模型 (SVM + RF + CatBoost) 評估結果:")
accuracy = accuracy_score(y_test, y_pred)
print(f"準確率: {accuracy:.4f}")
print("混淆矩陣:\n", confusion_matrix(y_test, y_pred))
print(
    "分類報告:\n",
    classification_report(y_test, y_pred, target_names=['看跌(0)', '看漲(1)']),
)


成功載入資料
Loaded TF-IDF shape: (1605, 1500)
Labels count: 1605
數據集劃分完成

正在訓練 Stacking 模型 (SVM + RF + CatBoost -> Meta LR)...
Stacking 模型訓練完成
正在進行預測...

Stacking 模型 (SVM + RF + CatBoost) 評估結果:
準確率: 0.8442
混淆矩陣:
 [[ 78  31]
 [ 19 193]]
分類報告:
               precision    recall  f1-score   support

       看跌(0)       0.80      0.72      0.76       109
       看漲(1)       0.86      0.91      0.89       212

    accuracy                           0.84       321
   macro avg       0.83      0.81      0.82       321
weighted avg       0.84      0.84      0.84       321



### 以下是建議使用的模型

In [38]:
import os
import joblib
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === 引入 Base Learners ===
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# === StackingClassifier & Meta Learner ===
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Step 1: load the selected TF-IDF features and their labels from disk.
output_dir = "tfidf_vectors"
file_path = os.path.join(output_dir, "tfidf_chi2_and_labels.pkl")
loaded_data = joblib.load(file_path)
X, y = loaded_data['X_features'], np.array(loaded_data['labels'])
print("成功載入資料")
print("Loaded TF-IDF shape:", X.shape)
print("Labels count:", len(y))

# Step 2: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("數據集劃分完成")

# Step 3: the three base learners.
# probability=True gives the SVC a predict_proba method, so the stack can
# feed its class probabilities to the meta learner.
model_svm = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)

# Both tree-based learners use their library's built-in balanced class
# weighting rather than hand-picked weights.
model_rf = RandomForestClassifier(
    n_estimators=500, random_state=42, n_jobs=-1, class_weight='balanced'
)
model_cb = CatBoostClassifier(
    iterations=500, random_state=42, verbose=False, auto_class_weights='Balanced'
)

# Step 4: logistic regression as the meta learner.
meta_learner = LogisticRegression(random_state=42)

# Step 5: assemble the stack.
#   stack_method='auto' lets scikit-learn choose each base learner's output
#   method; 'predict_proba', 'decision_function' or 'predict' can be forced.
#   passthrough=True also hands the original features to the meta learner.
base_learners = [
    ('svm', model_svm),
    ('rf', model_rf),
    ('cb', model_cb),
]
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='auto',
    passthrough=True,
    n_jobs=-1,
)

# Step 6: fit the stacked model.
print("\n正在訓練 Stacking 模型 (SVM + RF + CatBoost -> Meta LR)...")
stacking_clf.fit(X_train, y_train)
print("Stacking 模型訓練完成")

# Step 7: predict the held-out rows and report the metrics.
print("正在進行預測...")
y_pred = stacking_clf.predict(X_test)

print("\nStacking 模型 (SVM + RF + CatBoost) 評估結果:")
accuracy = accuracy_score(y_test, y_pred)
print(f"準確率: {accuracy:.4f}")
print("混淆矩陣:\n", confusion_matrix(y_test, y_pred))
print(
    "分類報告:\n",
    classification_report(y_test, y_pred, target_names=['看跌(0)', '看漲(1)']),
)


成功載入資料
Loaded TF-IDF shape: (1605, 1500)
Labels count: 1605
數據集劃分完成

正在訓練 Stacking 模型 (SVM + RF + CatBoost -> Meta LR)...
Stacking 模型訓練完成
正在進行預測...

Stacking 模型 (SVM + RF + CatBoost) 評估結果:
準確率: 0.8380
混淆矩陣:
 [[ 78  31]
 [ 21 191]]
分類報告:
               precision    recall  f1-score   support

       看跌(0)       0.79      0.72      0.75       109
       看漲(1)       0.86      0.90      0.88       212

    accuracy                           0.84       321
   macro avg       0.82      0.81      0.82       321
weighted avg       0.84      0.84      0.84       321

