# 字符级文本预测

我们将使用一个简单的英语单词，例如 "hello"。目标是训练一个 RNN 模型，使其在读入前面的字符后预测下一个字符。
<br>
步骤：
1. 准备数据
2. 定义RNN模型
3. 训练模型
4. 测试模型

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim

## 1. 准备数据

In [28]:
# Build the character vocabulary and the two lookup tables
text = "hello"
chars = sorted(set(text))  # unique characters, in sorted order
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}

In [29]:
# Encode the text as vocabulary indices. The target is the input shifted by
# one character, so each position is paired with the character that follows it.
input_seq = [char_to_idx[ch] for ch in text[:-1]]  # "hell"
target_seq = [char_to_idx[ch] for ch in text[1:]]  # "ello"

In [30]:
# Convert both index lists to tensors and prepend a batch dimension of size 1
input_seq = torch.unsqueeze(torch.tensor(input_seq), dim=0)
target_seq = torch.unsqueeze(torch.tensor(target_seq), dim=0)

## 2. 定义RNN模型

In [31]:
class SimpleRNN(nn.Module):
    """Next-token model: embedding -> single-layer RNN -> linear projection.

    Args:
        vocab_size: Number of distinct input tokens (rows of the embedding table).
        embed_size: Length of the vector each token is embedded into.
        hidden_size: Length of the RNN hidden-state vector.
        output_size: Number of scores produced for every time step.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the RNN consumes and produces (batch, seq, feature)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Score every position of a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices; the RNN is batch-first, so the
                expected layout is (batch, seq).
            hidden: Hidden state returned by a previous call, or ``None``
                (the default) to let ``nn.RNN`` start from its default
                initial state.

        Returns:
            A tuple ``(out, hidden)``: ``out`` holds ``output_size`` scores per
            position, and ``hidden`` is the RNN state after the last position,
            which can be passed to the next call to continue the sequence.
        """
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded, hidden)
        return self.fc(rnn_out), hidden

In [32]:
# Hyperparameters
embed_size = 10    # each character is embedded as a length-10 vector
hidden_size = 20   # length of the RNN hidden-state vector
# The vocabulary is the character set. The output layer needs one score per
# character because the model predicts the next character.
vocab_size = output_size = len(chars)

In [34]:
# Instantiate the model
model = SimpleRNN(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

## 3. 训练模型

In [35]:
# Adam over all model parameters, with cross-entropy as the training loss
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train: every epoch is one full pass over the single (input, target) pair
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # Passing None restarts the sequence from the RNN's default initial state
    output, hidden = model(input_seq, None)

    # Merge the batch and time axes so every position is scored as one example
    loss = criterion(output.reshape(-1, vocab_size), target_seq.reshape(-1))
    loss.backward()
    optimizer.step()

    # Report the loss every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 0.3334
Epoch [20/100], Loss: 0.0502
Epoch [30/100], Loss: 0.0154
Epoch [40/100], Loss: 0.0076
Epoch [50/100], Loss: 0.0050
Epoch [60/100], Loss: 0.0039
Epoch [70/100], Loss: 0.0032
Epoch [80/100], Loss: 0.0028
Epoch [90/100], Loss: 0.0025
Epoch [100/100], Loss: 0.0022


## 4. 测试模型

In [36]:
# Test the model: start from 'h' and feed each prediction back in as the next input
with torch.no_grad():
    input_char = 'h'
    hidden = None
    input_idx = torch.tensor([[char_to_idx[input_char]]])

    for _ in range(len(text) - 1):
        output, hidden = model(input_idx, hidden)
        # Greedy choice: index of the highest score along the last axis
        predicted_idx = output.argmax(dim=2)
        predicted_char = idx_to_char[predicted_idx.item()]
        print(f'Input: {input_char}, Predicted: {predicted_char}')

        # The predicted character becomes the input of the next step
        input_char = predicted_char
        input_idx = torch.tensor([[char_to_idx[input_char]]])

Input: h, Predicted: e
Input: e, Predicted: l
Input: l, Predicted: l
Input: l, Predicted: o
