## CS310 Natural Language Processing
## Assignment 3 (part 2). Named Entity Recognition with Bi-LSTM

**Total points**: 30 + 20 bonus points

In this assignment, you will train a bidirectional LSTM model on the CoNLL-2003 English named entity recognition dataset and evaluate its performance.

For the bonus questions, submit them as separate notebook files.

### 0. Import Necessary Libraries

In [None]:
### 0. Import Necessary Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import random

# Seed the torch, random and NumPy generators so runs are repeatable.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

### 1. Build the Model

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear projection to per-token vocabulary logits.

    Args:
        vocab_size: Number of embedding rows and size of the output layer.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and to the
            LSTM output, and between LSTM layers when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout *between* layers. With a single layer a
        # non-zero value has no effect and triggers a UserWarning, so pass 0.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Run the model on a batch of token indices.

        Args:
            x: Index tensor of shape ``[batch_size, seq_len]``.
            hidden: Optional ``(h, c)`` LSTM state; a zero state is created
                when omitted.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``[batch_size, seq_len, vocab_size]`` and ``hidden`` is the
            LSTM state after the last time step.
        """
        batch_size = x.size(0)

        if hidden is None:
            hidden = self.init_hidden(batch_size)

        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(x))

        # [batch_size, seq_len, embedding_dim] -> [batch_size, seq_len, hidden_dim]
        output, hidden = self.lstm(embedded, hidden)
        output = self.dropout(output)

        # [batch_size, seq_len, hidden_dim] -> [batch_size, seq_len, vocab_size]
        output = self.fc(output)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h, c)`` state on the same device/dtype as the parameters."""
        weight = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(*state_shape), weight.new_zeros(*state_shape))

def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, device,
                save_path='best_lstm_model.pth'):
    """Train ``model`` and report test perplexity after every epoch.

    Args:
        model: Callable module returning ``(output, hidden)`` for a batch of inputs.
        train_loader: Iterable of ``(inputs, targets)`` batches; must support ``len()``.
        test_loader: Iterable of ``(inputs, targets)`` batches used for evaluation.
        criterion: Loss applied to flattened outputs and targets.
        optimizer: Optimizer stepping ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device that each batch is moved to.
        save_path: Where to write the ``state_dict`` of the best model so far
            (lowest test perplexity). Pass ``None`` to disable checkpointing.

    Returns:
        ``(train_losses, test_perplexities)``: the per-batch training losses
        and the per-epoch test perplexities. An epoch evaluated on an empty
        ``test_loader`` records NaN.
    """
    train_losses = []
    test_perplexities = []
    best_perplexity = float('inf')

    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            output, _ = model(inputs)

            # Flatten batch and time dimensions for the loss. reshape() (unlike
            # view()) also accepts non-contiguous tensors.
            loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))
            loss.backward()

            # Clip the gradient norm before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

            optimizer.step()
            batch_loss = loss.item()

            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch+1}/{num_epochs}, Batch: {batch_idx}/{len(train_loader)}, '
                      f'Loss: {batch_loss:.4f}')

            train_losses.append(batch_loss)

        # Evaluate on the test set.
        model.eval()
        eval_loss_sum = 0.0
        eval_token_count = 0
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                output, _ = model(inputs)
                flat_targets = targets.reshape(-1)

                loss = criterion(output.reshape(-1, output.size(-1)), flat_targets)
                # Weight each batch's loss by its number of target elements
                # (assumes criterion returns a per-element mean -- TODO confirm).
                eval_loss_sum += loss.item() * flat_targets.size(0)
                eval_token_count += flat_targets.size(0)

        if eval_token_count:
            perplexity = np.exp(eval_loss_sum / eval_token_count)
        else:
            # Nothing to evaluate: perplexity is undefined, so avoid dividing by zero.
            perplexity = float('nan')
        test_perplexities.append(perplexity)

        print(f'Epoch: {epoch+1}/{num_epochs}, Test Perplexity: {perplexity:.4f}')

        # Keep the best model seen so far (NaN never compares as smaller).
        if perplexity < best_perplexity:
            best_perplexity = perplexity
            if save_path is not None:
                torch.save(model.state_dict(), save_path)

    return train_losses, test_perplexities

def generate_text(model, vocab, prefix, max_length=50, temperature=1.0):
    """Sample a continuation of ``prefix`` from ``model`` one word at a time.

    Args:
        model: Module called as ``model(indices, hidden)`` and returning
            ``(logits, hidden)``, with logits indexed ``[batch, step, vocab]``.
        vocab: Object exposing ``word2idx`` (a mapping containing ``'<UNK>'``)
            and ``idx2word`` (indexable by integer id).
        prefix: Whitespace-separated seed text; it is lower-cased before lookup.
        max_length: Maximum number of words to generate after the prefix.
        temperature: Positive divisor applied to the logits before sampling.

    Returns:
        The lower-cased prefix words followed by the generated words, joined
        by single spaces. Generation stops early after ``'<EOS>'`` is sampled
        (the ``'<EOS>'`` token itself is included in the result).

    Raises:
        ValueError: If ``temperature`` is not positive or ``prefix`` contains
            no words.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    words = prefix.lower().split()
    if not words:
        # A zero-length input sequence would otherwise fail inside the model.
        raise ValueError("prefix must contain at least one word")

    device = next(model.parameters()).device

    # Map prefix words to indices, falling back to <UNK> for unseen words.
    unk_idx = vocab.word2idx['<UNK>']
    input_indices = [vocab.word2idx.get(word, unk_idx) for word in words]
    input_tensor = torch.tensor([input_indices], dtype=torch.long, device=device)

    generated_words = list(words)
    hidden = None

    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_tensor, hidden)

            # Sample from the prediction at the last time step. softmax is the
            # normalised, overflow-safe form of exp(logits / temperature).
            probs = torch.softmax(output[0, -1] / temperature, dim=-1)
            word_idx = torch.multinomial(probs, 1)

            generated_word = vocab.idx2word[word_idx.item()]
            generated_words.append(generated_word)

            if generated_word == '<EOS>':
                break

            # Feed only the sampled word back in; the context lives in `hidden`.
            input_tensor = word_idx.view(1, 1)

    return ' '.join(generated_words)

### 2. Train and Evaluate

In [None]:
# Hyperparameters and runtime configuration.
EMBEDDING_DIM = 300  # increased embedding dimension
HIDDEN_DIM = 512    # increased hidden-layer dimension
NUM_LAYERS = 2
BATCH_SIZE = 64
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
SEQUENCE_LENGTH = 35
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the data.
# NOTE(review): load_and_preprocess_data is not defined in the visible part of
# this notebook -- confirm it is defined in an earlier cell before running.
file_path = 'Harry_Potter_all_books_preprocessed.txt'
train_dataset, test_dataset, vocab = load_and_preprocess_data(file_path, SEQUENCE_LENGTH)

# Create the data loaders (only the training set is shuffled).
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Initialise the model, loss and optimizer.
model = LSTMModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the model.
train_losses, test_perplexities = train_model(
    model, train_loader, test_loader, criterion, optimizer, 
    NUM_EPOCHS, DEVICE
)

# Generate sample text from a few prefixes.
test_prefixes = [
    "Harry looked at",
    "The castle was",
    "Hermione said",
    "Ron couldn't",
    "Dumbledore smiled"
]

print("\n生成的文本样例:")
for prefix in test_prefixes:
    generated = generate_text(model, vocab, prefix)
    print(f"\n前缀: {prefix}")
    print(f"生成: {generated}")

# Plot the training-loss and test-perplexity curves side by side.
plt.figure(figsize=(12, 4))

# Left panel: per-batch training loss.
plt.subplot(1, 2, 1)
plt.plot(train_losses)
plt.title('训练损失')
plt.xlabel('迭代次数')
plt.ylabel('损失')

# Right panel: per-epoch test perplexity.
plt.subplot(1, 2, 2)
plt.plot(test_perplexities)
plt.title('测试困惑度')
plt.xlabel('轮次')
plt.ylabel('困惑度')

plt.tight_layout()
plt.show()