In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re


# Helper that loads the text of a single e-mail file
def load_email_content(email_path):
    """Return the complete contents of ``email_path`` decoded as UTF-8.

    Args:
        email_path: Path of the e-mail file to read.

    Returns:
        The whole file as one string.
    """
    with open(email_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
    
# Data loading function
def load_data(data_path, label_path):
    """Load every e-mail listed in the label index, together with its label.

    Each non-blank line of the index is parsed as ``<label> <path>``: the
    first whitespace-separated token is the label and the remainder of the
    line is a path that is joined onto ``data_path``.

    Args:
        data_path: Directory that the paths in the index are joined onto.
        label_path: Path of the UTF-8 encoded index file.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists: the full text of
        each e-mail and the label taken from the same index line.

    Raises:
        ValueError: If a non-blank index line contains only one token.
        OSError: If the index or a referenced e-mail cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    labels = []
    # Read the label index line by line
    with open(label_path, 'r', encoding='utf-8') as index_file:
        for line in index_file:
            entry = line.strip()
            if not entry:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # unpacking them would raise ValueError.
                continue
            # Split on the first whitespace run only, so a path that itself
            # contains spaces stays in one piece.
            label, path = entry.split(maxsplit=1)
            with open(os.path.join(data_path, path), 'r', encoding='utf-8') as email:
                email_content = email.read()
            data.append(email_content)
            labels.append(label)

    return data, labels

# Load the e-mails and their labels.
# Raw strings (r'...') keep the Windows backslashes literal.
label_path = r'C:\Users\Ran\Desktop\lab3data\trec06c-utf8\label\index'  # label index file
data_path = r'C:\Users\Ran\Desktop\lab3data\trec06c-utf8\data_cut'  # directory of word-segmented e-mails
data, labels = load_data(data_path, label_path)

# Split into training and test sets (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Text feature extraction: bag-of-words counts, vocabulary capped at 5000 terms.
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = CountVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the Naive Bayes model (fit() returns the estimator itself)
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Predict on the test set
y_pred = nb_model.predict(X_test_vec)

# Compute the performance metrics; 'spam' is treated as the positive class
f1 = f1_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
precision = precision_score(y_test, y_pred, pos_label='spam')
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation results, one metric per line
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


# Measure how the vocabulary size affects the model
feature_sizes = [1000, 5000, 10000]
for size in feature_sizes:
    # A new vectorizer per size: the vocabulary is rebuilt from the training split
    vectorizer = CountVectorizer(max_features=size)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Refit the shared model on the new features, then predict on the test split
    y_pred = nb_model.fit(X_train_vec, y_train).predict(X_test_vec)

    # Same four metrics as the baseline, with 'spam' as the positive class
    f1 = f1_score(y_test, y_pred, pos_label='spam')
    recall = recall_score(y_test, y_pred, pos_label='spam')
    precision = precision_score(y_test, y_pred, pos_label='spam')
    accuracy = accuracy_score(y_test, y_pred)

    # Report this vocabulary size, followed by a separator line
    print(
        f"Feature size: {size}",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        "-" * 50,
        sep="\n",
    )


def extract_email_header(email_path):
    """Return the leading lines of an e-mail file, up to its first blank line.

    The file is read lazily and reading stops at the first line that is empty
    or whitespace-only, so the rest of the file is never loaded.

    Args:
        email_path: Path of the UTF-8 encoded e-mail file.

    Returns:
        The stripped lines that precede the first blank line, joined with
        single spaces. An empty string if the file is empty or starts with a
        blank line; every line of the file if it contains no blank line.
    """
    header_lines = []
    with open(email_path, 'r', encoding='utf-8') as file:
        # Iterate the file object instead of calling readlines(): only the
        # lines before the first blank line are needed.
        for raw_line in file:
            stripped = raw_line.strip()
            if not stripped:
                break
            header_lines.append(stripped)
    return " ".join(header_lines)


# Extract the e-mail header text and combine it with the e-mail content
def _header_from_text(email_text):
    """Return the stripped lines before the first blank line, space-joined."""
    header_lines = []
    # Text read through open(..., 'r') has its line endings normalised to
    # '\n', so splitting on '\n' yields the same lines as iterating the file.
    for raw_line in email_text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            break
        header_lines.append(stripped)
    return " ".join(header_lines)


def load_data_with_headers(data_path, label_path):
    """Load the labelled e-mails, appending each e-mail's header text to it.

    Each non-blank line of the index is parsed as ``<label> <path>``; the
    path is joined onto ``data_path``. Every e-mail is read from disk once
    and its header (the stripped lines before the first blank line) is
    derived from the text already in memory.

    Args:
        data_path: Directory that the paths in the index are joined onto.
        label_path: Path of the UTF-8 encoded index file.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists. Each ``data`` item
        is the full file text, a single space, then the header text. Because
        the full text already contains the header lines, header tokens appear
        twice in each item.

    Raises:
        ValueError: If a non-blank index line contains only one token.
        OSError: If the index or a referenced e-mail cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    labels = []
    with open(label_path, 'r', encoding='utf-8') as index_file:
        for line in index_file:
            entry = line.strip()
            if not entry:
                # Blank index lines carry no record
                continue
            # Split on the first whitespace run only, so paths with spaces survive
            label, path = entry.split(maxsplit=1)
            email_path = os.path.join(data_path, path)
            with open(email_path, 'r', encoding='utf-8') as email:
                email_content = email.read()
            email_header = _header_from_text(email_content)
            data.append(email_content + " " + email_header)  # merge content and header
            labels.append(label)
    return data, labels

# Reload the data with header text appended, then retrain and evaluate
data_with_headers, labels = load_data_with_headers(data_path, label_path)
X_train, X_test, y_train, y_test = train_test_split(data_with_headers, labels, test_size=0.2, random_state=42)

# Build a dedicated vectorizer instead of reusing whatever `vectorizer` the
# preceding vocabulary-size loop left behind (its last setting, 10000).
# Using the baseline's 5000-term cap keeps this run comparable with the
# baseline and independent of edits to `feature_sizes`.
vectorizer = CountVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train and evaluate the model; 'spam' is treated as the positive class
nb_model.fit(X_train_vec, y_train)
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f"Accuracy with header info: {accuracy:.4f}")
print(f"Precision with header info: {precision:.4f}")
print(f"Recall with header info: {recall:.4f}")
print(f"F1 Score with header info: {f1:.4f}")


Accuracy: 0.9934
Precision: 0.9993
Recall: 0.9909
F1 Score: 0.9951
Feature size: 1000
Accuracy: 0.9913
Precision: 0.9958
Recall: 0.9912
F1 Score: 0.9935
--------------------------------------------------
Feature size: 5000
Accuracy: 0.9934
Precision: 0.9993
Recall: 0.9909
F1 Score: 0.9951
--------------------------------------------------
Feature size: 10000
Accuracy: 0.9934
Precision: 0.9995
Recall: 0.9906
F1 Score: 0.9951
--------------------------------------------------


KeyboardInterrupt: 