In [8]:
import numpy as np
import re

# 准备数据
sentence = "The quick brown fox jumps over the lazy dog."
# 按单词分割句子，保留标点符号
words = re.findall(r"\b\w+\b|[^\w\s]", sentence)
print(f"句子分割为单词: {words}")

# 创建多个训练样本：每个样本是(前缀序列, 目标单词)
samples = []
for i in range(1, len(words)):
    prefix = words[:i]  # 前缀序列
    target = words[i]   # 目标单词
    samples.append((prefix, target))

print("\n训练样本:")
for i, (prefix, target) in enumerate(samples):
    print(f"样本 {i+1}: 前缀={prefix}, 目标={target}")

# 提取所有唯一单词并创建映射
vocab = sorted(list(set(words)))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)

print(f"\n词汇表: {vocab}")
print(f"词汇表大小: {vocab_size}")
print("单词到索引的映射:")
for word, idx in word_to_idx.items():
    print(f"  '{word}': {idx}")

# 将单词序列转换为独热编码，并输出独热编码
def words_to_one_hot(words_seq, vocab_size, word_to_idx, verbose=False):
    """将单词序列转换为独热编码序列"""
    one_hot_vectors = []
    for word in words_seq:
        # 创建独热向量
        one_hot = np.eye(vocab_size)[word_to_idx[word]]
        one_hot_vectors.append(one_hot)
        
        # 如果需要，输出独热编码信息
        if verbose:
            idx = word_to_idx[word]
            print(f"  单词 '{word}' 的独热编码: {one_hot}")
    
    return np.array(one_hot_vectors)

# 初始化RNN权重
hidden_size = 16  # 隐藏层大小

# 输入到隐藏层的权重 (词汇表大小 × 隐藏层大小)
W1 = np.random.randn(vocab_size, hidden_size) * 0.01
# 隐藏层到隐藏层的权重 (隐藏层大小 × 隐藏层大小)
W2 = np.random.randn(hidden_size, hidden_size) * 0.01
# 隐藏层到输出层的权重 (隐藏层大小 × 词汇表大小)
W3 = np.random.randn(hidden_size, vocab_size) * 0.01

# 偏置项
b1 = np.zeros((1, hidden_size))  # 输入层到隐藏层的偏置
b2 = np.zeros((1, vocab_size))   # 隐藏层到输出层的偏置

# RNN前向传播
def rnn_forward(inputs, W1, W2, W3, b1, b2, hidden_size):
    """
    RNN前向传播
    inputs: 输入序列的独热编码，形状为(seq_len, vocab_size)
    返回: 输出序列，隐藏状态序列
    """
    seq_len, vocab_size = inputs.shape
    hidden_states = np.zeros((seq_len + 1, hidden_size))  # 初始隐藏状态为0
    outputs = np.zeros((seq_len, vocab_size))
    
    for t in range(seq_len):
        # 计算当前时间步的隐藏状态
        hidden_states[t+1] = np.tanh(
            np.dot(inputs[t], W1) + 
            np.dot(hidden_states[t], W2) + 
            b1
        )
        # 计算当前时间步的输出
        outputs[t] = np.dot(hidden_states[t+1], W3) + b2
    
    return outputs, hidden_states

# 交叉熵损失函数
def cross_entropy_loss(predictions, target):
    """计算交叉熵损失"""
    exp_preds = np.exp(predictions - np.max(predictions))  # 防止数值溢出
    probs = exp_preds / np.sum(exp_preds)
    loss = -np.sum(target * np.log(probs + 1e-10))  # 加小值防止log(0)
    return loss, probs

# 反向传播
def rnn_backward(inputs, outputs, hidden_states, target, W2, W3, hidden_size):
    """RNN反向传播计算梯度"""
    seq_len, vocab_size = inputs.shape
    dW1 = np.zeros_like(W1)
    dW2 = np.zeros_like(W2)
    dW3 = np.zeros_like(W3)
    db1 = np.zeros_like(b1)
    db2 = np.zeros_like(b2)
    
    # 最后一个时间步的输出误差
    exp_preds = np.exp(outputs[-1] - np.max(outputs[-1]))
    probs = exp_preds / np.sum(exp_preds)
    delta_output = probs - target  # 输出层误差
    
    # 隐藏层误差
    delta_hidden = np.dot(delta_output, W3.T) * (1 - hidden_states[-1]**2)
    delta_hidden = delta_hidden.reshape(1, -1)  # 确保是二维数组
    
    # 计算梯度
    dW3 += np.dot(hidden_states[-1].reshape(-1, 1), delta_output.reshape(1, -1))
    db2 += delta_output
    
    dW1 += np.dot(inputs[-1].reshape(-1, 1), delta_hidden)
    dW2 += np.dot(hidden_states[-2].reshape(-1, 1), delta_hidden)
    db1 += delta_hidden
    
    return dW1, dW2, dW3, db1, db2

# 训练模型
learning_rate = 0.01
epochs = 1000  # 训练轮次

for epoch in range(epochs):
    total_loss = 0
    
    # 遍历所有样本进行训练
    for prefix, target_word in samples:
        # 准备输入和目标（训练时不打印独热编码，避免输出过多）
        input_one_hot = words_to_one_hot(prefix, vocab_size, word_to_idx, verbose=False)
        target_idx = word_to_idx[target_word]
        target_one_hot = np.eye(vocab_size)[target_idx]
        
        # 前向传播
        outputs, hidden_states = rnn_forward(
            input_one_hot, W1, W2, W3, b1, b2, hidden_size
        )
        final_output = outputs[-1]
        
        # 计算损失
        loss, _ = cross_entropy_loss(final_output, target_one_hot)
        total_loss += loss
        
        # 反向传播计算梯度
        dW1, dW2, dW3, db1, db2 = rnn_backward(
            input_one_hot, outputs, hidden_states, target_one_hot, W2, W3, hidden_size
        )
        
        # 更新权重
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2
        W3 -= learning_rate * dW3
        b1 -= learning_rate * db1
        b2 -= learning_rate * db2
    
    # 每100轮打印一次损失
    if (epoch + 1) % 100 == 0:
        print(f"轮次 {epoch+1}/{epochs}, 平均损失: {total_loss/len(samples):.4f}")

# 测试模型预测结果
print("\n预测测试及独热编码:")
for prefix, target_word in samples:
    print(f"\n输入前缀: {prefix}")
    # 生成独热编码并打印
    input_one_hot = words_to_one_hot(prefix, vocab_size, word_to_idx, verbose=True)
    
    outputs, _ = rnn_forward(input_one_hot, W1, W2, W3, b1, b2, hidden_size)
    final_output = outputs[-1]
    
    # 计算预测概率
    _, probs = cross_entropy_loss(final_output, np.zeros(vocab_size))
    predicted_idx = np.argmax(probs)
    predicted_word = idx_to_word[predicted_idx]
    
    print(f"预测: {predicted_word}, 实际: {target_word} → {'正确' if predicted_word == target_word else '错误'}")
    

句子分割为单词: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']

训练样本:
样本 1: 前缀=['The'], 目标=quick
样本 2: 前缀=['The', 'quick'], 目标=brown
样本 3: 前缀=['The', 'quick', 'brown'], 目标=fox
样本 4: 前缀=['The', 'quick', 'brown', 'fox'], 目标=jumps
样本 5: 前缀=['The', 'quick', 'brown', 'fox', 'jumps'], 目标=over
样本 6: 前缀=['The', 'quick', 'brown', 'fox', 'jumps', 'over'], 目标=the
样本 7: 前缀=['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the'], 目标=lazy
样本 8: 前缀=['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy'], 目标=dog
样本 9: 前缀=['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'], 目标=.

词汇表: ['.', 'The', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'over', 'quick', 'the']
词汇表大小: 10
单词到索引的映射:
  '.': 0
  'The': 1
  'brown': 2
  'dog': 3
  'fox': 4
  'jumps': 5
  'lazy': 6
  'over': 7
  'quick': 8
  'the': 9
轮次 100/1000, 平均损失: 2.2499
轮次 200/1000, 平均损失: 2.1991
轮次 300/1000, 平均损失: 1.9324
轮次 400/1000, 平均损失: 0.5396
轮次 500/1000, 平均损失: 0.1114
轮次 600/1000, 平均损失: 0.0528
轮次 