In [1]:
import os
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory (relative to the working directory) that the loader functions
# join with each file name when opening an email file.
folder_path = './data'

# Step 1: read the raw files
def load_emails(filenames, encoding='utf-8', errors=None, base_dir=None):
    """Read each named file and return its full text.

    Args:
        filenames: Iterable of file names, resolved relative to ``base_dir``.
        encoding: Text encoding used to decode every file. Defaults to
            ``'utf-8'``.
        errors: Decoding error policy forwarded to ``open`` (for example
            ``'ignore'``). ``None`` keeps ``open``'s default strict handling.
        base_dir: Directory containing the files. When ``None`` the
            module-level ``folder_path`` is used.

    Returns:
        A list of strings, one per file, in the same order as ``filenames``.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file cannot be decoded under strict handling.
    """
    # Resolve the global lazily so callers that pass base_dir never need it.
    directory = folder_path if base_dir is None else base_dir
    emails = []
    for filename in filenames:
        full_path = os.path.join(directory, filename)
        with open(full_path, 'r', encoding=encoding, errors=errors) as f:
            emails.append(f.read())
    return emails

# Step 2: Chinese word segmentation + stopword removal
def load_stopwords(path='cn_stopwords.txt'):
    """Load a stopword list with one word per line.

    Args:
        path: Path to a UTF-8 text file (with or without a BOM).

    Returns:
        A set of the non-empty, whitespace-stripped lines of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' strips a leading byte-order mark if present, so the first
    # stopword is not stored as '\ufeff<word>' and silently never matched.
    with open(path, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines so the empty string never becomes a "stopword".
        return {word for word in stripped if word}

def preprocess_emails(raw_emails, stopwords):
    """Segment each email with jieba and drop unwanted tokens.

    Args:
        raw_emails: Iterable of email texts.
        stopwords: Container of tokens to discard (membership is tested
            with ``in``).

    Returns:
        A list with one string per email: the surviving tokens joined by a
        single space, in their original order.
    """
    def _keep(token):
        # Discard whitespace-only tokens first, then stopwords.
        return bool(token.strip()) and token not in stopwords

    return [' '.join(filter(_keep, jieba.cut(text))) for text in raw_emails]

# Step 3: training-set file names and labels
train_filenames = [f"{idx}.txt" for idx in range(150)]  # 0.txt .. 149.txt
y_train = [int(idx <= 126) for idx in range(150)]  # labels: files 0-126 (first 127) are spam (1), the rest 0

# Step 4: load the training emails and preprocess them
raw_emails = load_emails(filenames=train_filenames)
stopwords = load_stopwords(path='cn_stopwords.txt')
processed_texts = preprocess_emails(raw_emails=raw_emails, stopwords=stopwords)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Vejvoda\AppData\Local\Temp\jieba.cache
Loading model cost 0.424 seconds.
Prefix dict has been built successfully.


In [2]:
# Build TF-IDF features from the space-joined jieba tokens.
vectorizer = TfidfVectorizer(
    max_df=0.8,          # drop terms that appear in more than 80% of documents
    min_df=3,            # drop terms that appear in fewer than 3 documents
    max_features=1500,   # cap the vocabulary size to limit dimensionality
    ngram_range=(1, 2),  # use unigrams and bigrams
    # The default token_pattern only keeps tokens of two or more word
    # characters, which would silently discard every single-character word
    # that jieba produced. Accept tokens of any length instead.
    token_pattern=r"(?u)\b\w+\b",
)
X = vectorizer.fit_transform(processed_texts)

In [3]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the TF-IDF matrix X.
# NOTE(review): y_train holds 127 spam labels versus 23 non-spam labels, so
# the learned class prior presumably leans toward spam; confirm on held-out data.
model = MultinomialNB().fit(X, y_train)

In [4]:
# Classify files 150.txt .. 155.txt, which lie outside the 0-149 training range.
test_filenames = [f"{idx}.txt" for idx in range(150, 156)]
raw_test_emails = load_emails(test_filenames)

# Same preprocessing and the already-fitted vectorizer as for training.
processed_test_texts = preprocess_emails(raw_test_emails, stopwords)
X_test_emails = vectorizer.transform(processed_test_texts)
y_pred = model.predict(X_test_emails)

# Label 1 is reported as spam, anything else as a normal email.
for filename, label in zip(test_filenames, y_pred):
    print(f"邮件 {filename} 的预测结果: {'垃圾邮件' if label == 1 else '正常邮件'}")

邮件 150.txt 的预测结果: 垃圾邮件
邮件 151.txt 的预测结果: 垃圾邮件
邮件 152.txt 的预测结果: 垃圾邮件
邮件 153.txt 的预测结果: 垃圾邮件
邮件 154.txt 的预测结果: 垃圾邮件
邮件 155.txt 的预测结果: 垃圾邮件


In [5]:
# Classify files 121.txt .. 129.txt. These fall inside the 0-149 training
# range, so this is a sanity check on seen data rather than a held-out test.
test_filenames = [f"{idx}.txt" for idx in range(121, 130)]
raw_test_emails = load_emails(test_filenames)

# Same preprocessing and the already-fitted vectorizer as for training.
processed_test_texts = preprocess_emails(raw_test_emails, stopwords)
X_test_emails = vectorizer.transform(processed_test_texts)
y_pred = model.predict(X_test_emails)

# Label 1 is reported as spam, anything else as a normal email.
for filename, label in zip(test_filenames, y_pred):
    print(f"邮件 {filename} 的预测结果: {'垃圾邮件' if label == 1 else '正常邮件'}")

邮件 121.txt 的预测结果: 垃圾邮件
邮件 122.txt 的预测结果: 垃圾邮件
邮件 123.txt 的预测结果: 垃圾邮件
邮件 124.txt 的预测结果: 垃圾邮件
邮件 125.txt 的预测结果: 垃圾邮件
邮件 126.txt 的预测结果: 垃圾邮件
邮件 127.txt 的预测结果: 正常邮件
邮件 128.txt 的预测结果: 正常邮件
邮件 129.txt 的预测结果: 正常邮件


In [6]:
def load_test_emails(filenames):
    """Read each named file from ``folder_path`` as GBK text.

    Bytes that cannot be decoded as GBK are dropped (``errors='ignore'``)
    instead of raising.

    Args:
        filenames: Iterable of file names relative to ``folder_path``.

    Returns:
        A list of decoded file contents, in the same order as ``filenames``.
    """
    def _read(name):
        with open(os.path.join(folder_path, name), 'r', encoding='gbk', errors='ignore') as handle:
            return handle.read()

    return [_read(name) for name in filenames]

# Classify files 156.txt .. 165.txt, read through the GBK loader.
test_filenames = [f"{idx}.txt" for idx in range(156, 166)]
raw_test_emails = load_test_emails(test_filenames)

# Same preprocessing and the already-fitted vectorizer as for training.
processed_test_texts = preprocess_emails(raw_test_emails, stopwords)
X_test_emails = vectorizer.transform(processed_test_texts)
y_pred = model.predict(X_test_emails)

# Label 1 is reported as spam, anything else as a normal email.
for filename, label in zip(test_filenames, y_pred):
    print(f"邮件 {filename} 的预测结果: {'垃圾邮件' if label == 1 else '正常邮件'}")

邮件 156.txt 的预测结果: 垃圾邮件
邮件 157.txt 的预测结果: 垃圾邮件
邮件 158.txt 的预测结果: 垃圾邮件
邮件 159.txt 的预测结果: 垃圾邮件
邮件 160.txt 的预测结果: 垃圾邮件
邮件 161.txt 的预测结果: 垃圾邮件
邮件 162.txt 的预测结果: 垃圾邮件
邮件 163.txt 的预测结果: 垃圾邮件
邮件 164.txt 的预测结果: 垃圾邮件
邮件 165.txt 的预测结果: 垃圾邮件
