In [1]:
import seaborn as sns
import email
from bs4 import BeautifulSoup
import re
import nltk
import warnings
warnings.filterwarnings('ignore') # 忽略警告

In [2]:
import numpy as np
import os
import string
import pandas as pd

# 讀取指定路徑下的所有電子郵件文件
def get_data(path):
    data = []
    files = os.listdir(path)
    for file in files:
        f = open(os.path.join(path, file), encoding="ISO-8859-1")
        words_list = f.read()
        data.append(words_list)
        f.close()
    return data

# 指定資料夾路徑
spam_folder = r'C:\Users\Paul\F64106016_FinalProject\archive\spam_2\spam_2'
easyham_folder = r'C:\Users\Paul\F64106016_FinalProject\archive\easy_ham\easy_ham'
hardham_folder = r'C:\Users\Paul\F64106016_FinalProject\archive\hard_ham\hard_ham'

# 讀取不同類別的電子郵件
easy_ham = get_data(easyham_folder)
hard_ham = get_data(hardham_folder)
ham = easy_ham + hard_ham
spam = get_data(spam_folder)

# 打亂數據順序
np.random.shuffle(ham)
np.random.shuffle(spam)


In [3]:
from nltk.corpus import stopwords # 用於去除停用詞
from nltk.stem import PorterStemmer # 用於詞幹提取
from nltk.stem import WordNetLemmatizer  # 用於詞形還原
from sklearn.base import BaseEstimator, TransformerMixin

# 初始化詞幹提取和詞形還原器
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
class email_to_clean_text(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        text_list = []
        for mail in X:
            b = email.message_from_string(mail)
            body = ""

            # 檢查電子郵件是否為多部分組成
            if b.is_multipart():
                for part in b.walk():
                    ctype = part.get_content_type()
                    cdispo = str(part.get('Content-Disposition'))

                    # 跳過任何 text/plain 的附件
                    if ctype == 'text/plain' and 'attachment' not in cdispo:
                        body = part.get_payload(decode=True)  # 獲取電子郵件正文
                        break
           
            else:
                body = b.get_payload(decode=True) # 獲取電子郵件正文
            
            # 使用 BeautifulSoup 解析 HTML 內容
            soup = BeautifulSoup(body, "html.parser") 
            text = soup.get_text().lower()
            # 去除網址
            text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE) 
            # 去除電子郵件地址
            text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text, flags=re.MULTILINE) 
            # 去除標點符號
            text = text.translate(str.maketrans('', '', string.punctuation)) 
            # 去除數字
            text = ''.join([i for i in text if not i.isdigit()]) 
            # 去除停用詞
            stop_words = stopwords.words('english')
            words_list = [w for w in text.split() if w not in stop_words] 
            # 詞形還原
            words_list = [lemmatizer.lemmatize(w) for w in words_list] #lemmatization
            # 詞幹提取
            words_list = [stemmer.stem(w) for w in words_list] #Stemming
            text_list.append(' '.join(words_list))
        return text_list

# 初始化文本清理轉換器   
email_to_text = email_to_clean_text()

# 對不同類別的電子郵件進行文本清理
text_ham = email_to_text.transform(ham)
text_spam = email_to_text.transform(spam)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# 準備標籤
labels_ham = ['ham'] * len(text_ham)
labels_spam = ['spam'] * len(text_spam)

# 合併數據和標籤
texts = text_ham + text_spam
labels = labels_ham + labels_spam

# 將數據分割成訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 建立機器學習管道
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])

# 訓練模型
pipeline.fit(X_train, y_train)

# 預測
y_pred = pipeline.predict(X_test)

# 評估模型
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9714285714285714
F1 Score: 0.953307392996109
Recall: 0.9423076923076923
Confusion Matrix:
[[571   9]
 [ 15 245]]


In [None]:
import tkinter as tk
from tkinter import scrolledtext

# 建立GUI介面
def classify_email():
    email_text = email_input.get("1.0", tk.END)
    clean_text = email_to_text.transform([email_text])
    prediction = pipeline.predict(clean_text)[0]
    result_text.set(f'The email is classified as: {prediction}')

# 創建主窗口
root = tk.Tk()
root.title("Email Spam Classifier")

# 創建文本輸入框
email_input = scrolledtext.ScrolledText(root, width=80, height=20)
email_input.pack()

# 創建結果顯示標籤
result_text = tk.StringVar()
result_label = tk.Label(root, textvariable=result_text, font=("Helvetica", 14))
result_label.pack()

# 創建分類按鈕
classify_button = tk.Button(root, text="Classify Email", command=classify_email)
classify_button.pack()

# 運行GUI主循環
root.mainloop()