In [5]:
import os
import jieba
from collections import Counter
from itertools import chain
from numpy import array
from sklearn.naive_bayes import MultinomialNB

In [6]:
# Path configuration: directory that the e-mail loaders join with each file name.
folder_path = './data'

# Read e-mail contents
def load_emails(filenames, encoding='utf-8', errors=None, folder=None):
    """Read each named file and return the contents as a list of strings.

    Args:
        filenames: Iterable of file names, resolved relative to ``folder``.
        encoding: Text encoding used to decode every file (default ``'utf-8'``).
        errors: Decoding error policy passed straight to ``open`` (for example
            ``'ignore'``). ``None`` keeps ``open``'s default handling.
        folder: Directory that contains the files. ``None`` falls back to the
            module-level ``folder_path``.

    Returns:
        A list with one string per file, in the same order as ``filenames``.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file cannot be decoded under the chosen
            ``encoding``/``errors`` combination.
    """
    # Only consult the global when no explicit directory was supplied, so the
    # function can be reused (and tested) against any directory.
    base_dir = folder_path if folder is None else folder
    emails = []
    for filename in filenames:
        with open(os.path.join(base_dir, filename), 'r',
                  encoding=encoding, errors=errors) as f:
            emails.append(f.read())
    return emails

# Load stop words
def load_stopwords(path='cn_stopwords.txt'):
    """Return the stop words stored one per line in a UTF-8 text file.

    Args:
        path: Location of the stop-word file.

    Returns:
        A set holding every line with surrounding whitespace stripped. A blank
        line therefore contributes the empty string to the set.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return {entry.strip() for entry in handle}

# Tokenize + remove stop words
def preprocess_emails(raw_emails, stopwords):
    """Split every e-mail into tokens with ``jieba.cut`` and filter them.

    Args:
        raw_emails: Iterable of e-mail texts.
        stopwords: Container of tokens to discard (membership is tested with
            ``in``).

    Returns:
        A list with one entry per e-mail. Note that each entry is a list of
        tokens, not a joined string.
    """
    def is_kept(token):
        # Drop whitespace-only tokens first, then anything listed as a stop word.
        return bool(token.strip()) and token not in stopwords

    return [
        [token for token in jieba.cut(text) if is_kept(token)]
        for text in raw_emails
    ]

# Extract the most frequent words
def getTopNWords(topN, allWords):
    """Return the ``topN`` most frequent words across all token lists.

    Args:
        topN: Maximum number of words to return.
        allWords: Iterable of token sequences, one per e-mail.

    Returns:
        A list of words ordered from most to least frequent.
    """
    flat_words = chain.from_iterable(allWords)
    ranking = Counter(flat_words).most_common(topN)
    return [word for word, _count in ranking]

# Build feature vectors
def build_feature_vectors(allWords, topWords):
    """Turn tokenized e-mails into a matrix of vocabulary word counts.

    Args:
        allWords: Iterable of token sequences, one per e-mail. Tokens must be
            hashable because they are tallied with ``Counter``.
        topWords: Iterable of vocabulary words; column ``j`` of the result
            counts occurrences of the ``j``-th word.

    Returns:
        An integer numpy array of shape ``(number of e-mails, number of
        vocabulary words)``. The array stays two-dimensional even when there
        are no e-mails or no vocabulary words.
    """
    # Materialize once so a one-shot iterable is not exhausted after the
    # first e-mail.
    vocabulary = list(topWords)
    rows = []
    for words in allWords:
        # One pass over the e-mail instead of one ``list.count`` scan per
        # vocabulary word; missing words read back as 0.
        counts = Counter(words)
        rows.append([counts[word] for word in vocabulary])
    # The explicit reshape is a no-op for normal input and fixes the shape
    # (and, with dtype=int, the dtype) of empty results.
    return array(rows, dtype=int).reshape(len(rows), len(vocabulary))

In [7]:
# Step 1: prepare the training data.
# Files 0.txt-126.txt get label 1 and 127.txt-149.txt get label 0; the report
# loop below prints label 1 as spam ('垃圾邮件') and anything else as normal mail.
train_filenames = [f"{i}.txt" for i in range(150)]
y_train = [int(i <= 126) for i in range(150)]

# Step 2: load the raw texts and preprocess them into token lists.
raw_emails = load_emails(train_filenames)
stopwords = load_stopwords('cn_stopwords.txt')
processed_emails = preprocess_emails(raw_emails, stopwords)

# Step 3: pick the 400 most frequent words and build the training matrix.
topWords = getTopNWords(400, processed_emails)
X_train = build_feature_vectors(processed_emails, topWords)

# Step 4: train the naive Bayes model (``fit`` returns the estimator itself).
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Step 5: predict the remaining six e-mails (150.txt-155.txt), reusing the
# stop words and vocabulary obtained from the training set.
test_filenames = [f"{i}.txt" for i in range(150, 156)]
raw_test_emails = load_emails(test_filenames)
processed_test_emails = preprocess_emails(raw_test_emails, stopwords)
X_test = build_feature_vectors(processed_test_emails, topWords)
y_test_pred = nb_classifier.predict(X_test)

# Print one prediction per test file.
for filename, predicted in zip(test_filenames, y_test_pred):
    verdict = '垃圾邮件' if predicted == 1 else '正常邮件'
    print(f"邮件 {filename} 的预测结果: {verdict}")

邮件 150.txt 的预测结果: 正常邮件
邮件 151.txt 的预测结果: 正常邮件
邮件 152.txt 的预测结果: 垃圾邮件
邮件 153.txt 的预测结果: 正常邮件
邮件 154.txt 的预测结果: 正常邮件
邮件 155.txt 的预测结果: 正常邮件


In [8]:
def load_test_emails(filenames):
    """Read files under ``folder_path`` as GBK text, ignoring decode errors.

    Mirrors ``load_emails`` except for the decoding: the files are opened with
    ``encoding='gbk'`` and ``errors='ignore'``.

    Args:
        filenames: Iterable of file names relative to ``folder_path``.

    Returns:
        A list with one string per file, in the same order as ``filenames``.
    """
    def read_one(name):
        full_path = os.path.join(folder_path, name)
        with open(full_path, 'r', encoding='gbk', errors='ignore') as handle:
            return handle.read()

    return [read_one(name) for name in filenames]

# Classify e-mails 156.txt-164.txt with the already-trained model, reusing the
# stop words and vocabulary built from the training set.
test_filenames = [f"{idx}.txt" for idx in range(156, 165)]
raw_test_emails = load_test_emails(test_filenames)

processed_test_texts = preprocess_emails(raw_test_emails, stopwords)
X_test_emails = build_feature_vectors(processed_test_texts, topWords)
y_pred = nb_classifier.predict(X_test_emails)

# Print one prediction per file: label 1 is reported as spam, anything else
# as normal mail.
for filename, predicted in zip(test_filenames, y_pred):
    verdict = '垃圾邮件' if predicted == 1 else '正常邮件'
    print(f"邮件 {filename} 的预测结果: {verdict}")

邮件 156.txt 的预测结果: 垃圾邮件
邮件 157.txt 的预测结果: 正常邮件
邮件 158.txt 的预测结果: 垃圾邮件
邮件 159.txt 的预测结果: 垃圾邮件
邮件 160.txt 的预测结果: 垃圾邮件
邮件 161.txt 的预测结果: 垃圾邮件
邮件 162.txt 的预测结果: 正常邮件
邮件 163.txt 的预测结果: 垃圾邮件
邮件 164.txt 的预测结果: 垃圾邮件
