In [1]:
import pandas as pd, torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

## Load Dataset

In [2]:
# Load the Enron spam CSV, rename the columns to short identifiers, index the
# rows by message id and drop incomplete rows. Everything is done in a single
# method chain (no ``inplace=True``) so the cell always rebuilds the frame
# from the file and never mutates an intermediate result.
data_frame = (
    pd.read_csv('enron_spam_data.csv')
    .drop(columns=['Date'])
    .rename(
        columns={
            'Message ID': 'id',
            'Subject': 'abstract',
            'Message': 'content',
            'Spam/Ham': 'label',
        }
    )
    .set_index('id')
    .dropna(how='any')
    # Encode the target as integers: spam -> 1, ham -> 0.
    .assign(label=lambda frame: frame['label'].map({'spam': 1, 'ham': 0}))
)

# Build one training string per e-mail, tagging each text segment with a field
# marker. Iterating the two columns directly avoids the per-row Series that
# ``iterrows()`` constructs.
data_list = []
for abstract, content in zip(data_frame['abstract'], data_frame['content']):
    # Newlines inside a field are flattened so that every sample is one line.
    abstract = abstract.replace('\n', ' ')
    content = content.replace('\n', ' ')
    data_list.append(f"[ABSTRACT]: {abstract} [CONTENT]: {content}\n")

In [3]:
class SpamDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors and labels.

    Args:
        texts: Indexable collection of raw text strings (list, array or
            pandas Series).
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Object exposing ``encode(text).ids`` and
            ``token_to_id(token)``.
        max_length: Number of token ids each sample is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, position):
        """Return the element at integer ``position``.

        Objects exposing ``.iloc`` (pandas Series/DataFrame) are indexed
        positionally; plain ``values[position]`` on a Series is a label
        lookup and fails or returns the wrong row when the index is not
        ``0..n-1``.
        """
        positional = getattr(values, "iloc", None)
        return values[position] if positional is None else positional[position]

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the sample at ``idx``.

        Raises:
            ValueError: If padding is required but the tokenizer has no
                ``[PAD]`` token.
        """
        text = self._item_at(self.texts, idx)
        # Copy so the list returned by the tokenizer is never mutated.
        token_ids = list(self.tokenizer.encode(text).ids)  # BPE encoding

        # Truncate or pad to the fixed length.
        missing = self.max_length - len(token_ids)
        if missing < 0:
            token_ids = token_ids[:self.max_length]
        elif missing > 0:
            pad_id = self.tokenizer.token_to_id("[PAD]")
            if pad_id is None:
                raise ValueError("tokenizer has no '[PAD]' token; cannot pad sequence")
            token_ids = token_ids + [pad_id] * missing

        label = self._item_at(self.labels, idx)
        return torch.tensor(token_ids), torch.tensor(label)

## Tokenizer

In [4]:
# BPE tokenizer; "[UNK]" is registered as the unknown-token symbol.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Text normalisation applied before tokenisation:
# Unicode NFKC normalisation first, then lowercasing.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKC(), normalizers.Lowercase()]
)
tokenizer.pre_tokenizer = Whitespace()

# Trainer settings: vocabulary size cap, minimum frequency, and the special
# tokens (unknown, padding and the two field markers used in the samples).
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[PAD]", "[ABSTRACT]", "[CONTENT]"],
    min_frequency=2,
    vocab_size=100000,
)

# Fit the tokenizer on the formatted e-mail strings.
tokenizer.train_from_iterator(data_list, trainer=trainer)






## Model

In [5]:
# Attention
class MultiHeadAttention(nn.Module):
    """Placeholder ``nn.Module`` subclass; no layers or forward pass are defined yet."""

class Embedding(nn.Module):
    """Placeholder ``nn.Module`` subclass; no layers or forward pass are defined yet."""

class AttentionModel(nn.Module):
    """Placeholder ``nn.Module`` subclass; no layers or forward pass are defined yet."""

In [None]:
# RNN
class RNN(nn.Module):
    """Skeleton ``nn.Module`` subclass; layers and forward logic are not implemented yet."""

    def __init__(self):
        # ``super(RNN).__init__()`` builds an unbound super object and never
        # runs ``nn.Module.__init__``, leaving the instance without its
        # parameter/module registries. The zero-argument form initialises the
        # module properly.
        super().__init__()

    def forward(self):
        """Stub forward pass; currently takes no inputs and returns ``None``."""
        return None

# GRU
class GRU(nn.Module):
    """Placeholder ``nn.Module`` subclass; no layers or forward pass are defined yet."""

# LSTM
class LSTM(nn.Module):
    """Placeholder ``nn.Module`` subclass; no layers or forward pass are defined yet."""

## Training