In [None]:
import pandas as pd
import torch, torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from torch.nn import Parameter
from torch.functional import F

from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE, WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, WordLevelTrainer

## Load Dataset

In [2]:
# Load the Enron spam CSV, discard the Date column, shorten the column names,
# index by message id, drop every row with a missing value and encode the
# label column as 1 (spam) / 0 (ham).
data_frame = (
    pd.read_csv('enron_spam_data.csv')
    .drop(columns=['Date'])
    .rename(columns={
        'Message ID': 'id',
        'Subject': 'abstract',
        'Message': 'content',
        'Spam/Ham': 'label',
    })
    .set_index('id')
    .dropna(how='any')
    .assign(label=lambda frame: frame['label'].map({'spam': 1, 'ham': 0}))
)

# Text corpus: every subject line first, followed by every message body.
texts = [*data_frame["abstract"].to_list(), *data_frame["content"].to_list()]

In [3]:
class SpamDataset(Dataset):
    """Map-style dataset that yields fixed-length token-id tensors with their labels.

    Args:
        texts: indexable collection of raw strings.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: object providing ``encode(text).ids`` and ``token_to_id(token)``.
        max_length: number of token ids each sample is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the sample at ``idx``."""
        token_ids = self.tokenizer.encode(self.texts[idx]).ids
        missing = self.max_length - len(token_ids)
        if missing < 0:
            # Too long: keep only the leading max_length ids.
            token_ids = token_ids[:self.max_length]
        else:
            # Too short (or exact): right-pad with the id of the "[PAD]" token.
            pad_id = self.tokenizer.token_to_id("[PAD]")
            token_ids = token_ids + [pad_id] * missing
        return torch.tensor(token_ids), torch.tensor(self.labels[idx])

## Tokenizer

In [4]:
# Word-level tokenizer: out-of-vocabulary words map to "[UNK]".
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

# Text is NFKC-normalised and lower-cased, then split with the Whitespace pre-tokenizer.
tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Lowercase()])
tokenizer.pre_tokenizer = Whitespace()

# Vocabulary is capped at 30000 entries, words need at least 20 occurrences,
# and the special tokens are registered in the order [UNK], [PAD].
trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]"],
    min_frequency=20,
    vocab_size=30000,
)

tokenizer.train_from_iterator(texts, trainer=trainer)

In [5]:
# Write the trained tokenizer to tokenizer.json, then show the vocabulary size
# as the cell's output.
tokenizer.save("tokenizer.json")
tokenizer.get_vocab_size()

17387

## Model

In [6]:
# Attention
class MultiHeadAttention(nn.Module):
    """Stub: only the class is declared; no parameters or forward pass are implemented yet."""

class Embedding(nn.Module):
    """Lookup table that maps integer token ids to learned dense vectors.

    Args:
        num_embeddings: number of rows in the table.
        embedding_dim: length of each embedding vector.
        padding_idx: optional id whose row is initialised to zeros and receives
            a zero gradient in the backward pass.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx

        self.weight = Parameter(
            torch.zeros(num_embeddings, embedding_dim),
            requires_grad=True
        )
        nn.init.xavier_uniform_(self.weight)

        # Handle padding_idx: zero the padding row in place. The row cannot be
        # frozen through `self.weight[padding_idx].requires_grad = False`
        # (indexing yields a non-leaf tensor and the assignment raises);
        # the gradient is masked in forward() instead.
        if padding_idx is not None:
            with torch.no_grad():
                self.weight[padding_idx].fill_(0)

    def forward(self, input: torch.LongTensor) -> torch.Tensor:
        """Return the embedding vectors for ``input``; output shape is ``input.shape + (embedding_dim,)``."""
        input = input.to(self.weight.device)
        # functional.embedding performs the same row lookup as `self.weight[input]`
        # and additionally keeps the gradient of the padding row at zero.
        return nn.functional.embedding(input, self.weight, padding_idx=self.padding_idx)

class AttentionModel(nn.Module):
    """Stub: only the class is declared; no parameters or forward pass are implemented yet."""

In [7]:
# RNN
class WildeRNN(nn.Module):
    """Hand-written single-layer tanh RNN classifier over token ids.

    The token ids are embedded, the recurrence
    ``H_t = tanh(X_t @ W_xh + H_{t-1} @ W_hh + b_h)`` is unrolled over the
    sequence, and the last hidden state is projected to ``output_size`` values.

    The forward pass returns raw logits (no sigmoid), matching ``BetterRNN``
    and the ``BCEWithLogitsLoss`` used by ``train``; apply ``torch.sigmoid``
    to the output to obtain probabilities.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_size: size of the recurrent hidden state.
        embedding_dim: size of each token embedding.
        output_size: number of output logits per sample.
    """

    def __init__(self, vocab_size, hidden_size=128, embedding_dim = 256, output_size=2):
        super(WildeRNN, self).__init__()

        self.hidden_size = hidden_size

        self.embedding = Embedding(vocab_size, embedding_dim)

        # Recurrent cell parameters (input-to-hidden, hidden-to-hidden, bias).
        self.W_xh = Parameter(torch.randn(size=(embedding_dim, hidden_size)), requires_grad=True)
        self.W_hh = Parameter(torch.randn(size=(hidden_size, hidden_size)), requires_grad=True)
        self.b_h = Parameter(torch.zeros(hidden_size), requires_grad=True)

        # Output projection parameters.
        self.W_hq = Parameter(torch.randn(hidden_size, output_size), requires_grad=True)
        self.b_q = Parameter(torch.zeros(output_size), requires_grad=True)

        # The random values above are overwritten by these initialisers.
        init.xavier_uniform_(self.W_xh)
        init.orthogonal_(self.W_hh)
        init.xavier_uniform_(self.W_hq)

    def forward(self, inputs: torch.Tensor):
        """Return logits of shape ``(batch, output_size)`` for a ``(batch, seq)`` tensor of token ids."""
        embeded = self.embedding(inputs)
        batch_size, seq_length, _ = embeded.shape
        H = torch.zeros(batch_size, self.hidden_size, device=embeded.device, dtype=embeded.dtype)

        for t in range(seq_length):
            X = embeded[:, t, :]
            H = torch.tanh(torch.mm(X, self.W_xh) + torch.mm(H, self.W_hh) + self.b_h)

        # Dropout must follow the module's train/eval mode; F.dropout defaults to
        # training=True and would otherwise keep dropping units during evaluation.
        H = F.dropout(H, 0.3, training=self.training)
        return torch.mm(H, self.W_hq) + self.b_q
    
class BetterRNN(nn.Module):
    """
    Using Pytorch RNN Module.

    Embeds token ids, runs a single-layer ``nn.RNN`` over the sequence and
    projects one hidden state per sample to ``output_size`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_size: size of the RNN hidden state.
        embedding_dim: size of each token embedding.
        output_size: number of output logits per sample.
        padding_idx: optional id of the padding token. When given, the embedding
            row of that id is treated as padding by ``nn.Embedding`` and the
            classifier reads the hidden state at the last non-padding position
            of each sequence (sequences are assumed to be right-padded). When
            ``None`` (default) the hidden state of the final timestep is used.
    """
    def __init__(self, vocab_size, hidden_size=64, embedding_dim = 64, output_size=2, padding_idx=None):
        super(BetterRNN, self).__init__()

        self.padding_idx = padding_idx

        self.rnn_layer = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx),
            nn.RNN(embedding_dim, hidden_size, batch_first=True, num_layers=1),
        )

        self.linear_layer = nn.Sequential(
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, inputs):
        """Return logits of shape ``(batch, output_size)`` for a ``(batch, seq)`` tensor of token ids."""
        # nn.RNN returns (outputs, h_n); nn.Sequential passes that tuple through.
        rnn_outputs, _ = self.rnn_layer(inputs)

        if self.padding_idx is None:
            last_hidden = rnn_outputs[:, -1, :]
        else:
            # Count the real tokens per row; clamp so an all-padding row reads step 0.
            lengths = (inputs != self.padding_idx).sum(dim=1).clamp(min=1)
            rows = torch.arange(inputs.size(0), device=inputs.device)
            last_hidden = rnn_outputs[rows, lengths - 1]

        return self.linear_layer(last_hidden)

# GRU
class GRU(nn.Module):
    """Stub: only the class is declared; no parameters or forward pass are implemented yet."""

# LSTM
class LSTM(nn.Module):
    """Stub: only the class is declared; no parameters or forward pass are implemented yet."""

## Training

In [8]:
# Construct training data.
# `texts` is every subject followed by every message body, so the label column
# is repeated twice to stay aligned with it.
labels = data_frame["label"].to_list() * 2

# A fixed random_state makes the train/validation split identical on every run.
train_mail, val_mail, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

train_dataset = SpamDataset(train_mail, train_labels, tokenizer)
val_dataset = SpamDataset(val_mail, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [9]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one the model is put in eval mode and gradients are
    disabled. The loss is averaged per sample, so a smaller final batch is not
    over-weighted.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum, correct, total = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            # The model emits logits (the loss is BCEWithLogitsLoss), so convert
            # to probabilities before applying the 0.5 decision threshold.
            predicts = (torch.sigmoid(outputs) > 0.5).float()
            correct += (predicts == labels).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("loader yielded no samples")
    return loss_sum / total, correct / total


def _plot_losses(train_losses, val_losses):
    """Plot both loss curves against the epoch number, save to loss_plot.png and show the figure."""
    epoch_axis = range(1, len(train_losses) + 1)
    plt.figure(figsize=(8, 6))
    plt.plot(epoch_axis, train_losses, label='Train Loss', marker='o')
    plt.plot(epoch_axis, val_losses, label='Validation Loss', marker='x')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig('loss_plot.png')
    plt.show()


def train(model, lr, epochs, train_loader, val_loader, verbose=True, device="cpu", save_path="best_model.pth"):
    """Train a binary classifier that outputs one logit per sample.

    Optimises ``BCEWithLogitsLoss`` with Adam, evaluates on ``val_loader`` after
    every epoch, saves the state dict with the lowest validation loss to
    ``save_path`` and finally plots the loss curves.

    Args:
        model: module mapping a batch of inputs to logits of shape ``(batch, 1)``.
        lr: Adam learning rate.
        epochs: number of passes over ``train_loader``.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        verbose: print per-epoch metrics and checkpoint messages.
        device: device the model and the batches are moved to.
        save_path: file the best state dict is written to.

    Raises:
        ValueError: if either loader yields no samples.
    """
    # Move the model first so the optimizer is built on the on-device parameters.
    model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        avg_train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        avg_val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        if verbose:
            print(f'Epoch [{epoch+1}/{epochs}]: Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}. Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Checkpoint whenever the validation loss improves.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            if verbose:
                print(f"Best model saved at epoch {epoch+1} with Val Loss: {avg_val_loss:.4f}")

    _plot_losses(train_losses, val_losses)

    print("训练完成！最佳模型已保存为:", save_path)

In [None]:
rnn = BetterRNN(vocab_size=tokenizer.get_vocab_size(), output_size=1)
# Use the GPU when one is available; fall back to the CPU instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"
train(rnn, 0.001, 10, train_loader, val_loader, device=device)

Epoch [1/10]: Train Loss: 0.6814, Train Acc: 0.5112. Val Loss: 0.7213, Val Acc: 0.4959
Best model saved at epoch 1 with Val Loss: 0.7213
