# 含隐藏状态的循环神经网络

In [1]:
import torch
X,W_xh = torch.randn(3,1),torch.randn(1,4)
H,W_hh = torch.randn(3,4),torch.randn(4,4)
torch.matmul(X,W_xh)+torch.matmul(H,W_hh)

D:\Anaconda\envs\torch\lib\site-packages\numpy\.libs\libopenblas.JPIJNSWNNAN3CE6LLI5FWSPHUT2VXMTH.gfortran-win_amd64.dll
D:\Anaconda\envs\torch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


tensor([[-3.6244,  0.8535,  3.6138, -1.4100],
        [-3.5620,  0.7390,  1.3317, -0.8977],
        [ 2.0508, -0.7742, -2.4926,  0.5569]])

In [2]:
torch.matmul(torch.cat((X,H),dim = 1),torch.cat((W_xh,W_hh),dim = 0))

tensor([[-3.6244,  0.8535,  3.6138, -1.4100],
        [-3.5620,  0.7390,  1.3317, -0.8977],
        [ 2.0508, -0.7742, -2.4926,  0.5569]])

# 语言模型数据集

## 建立字符索引

In [3]:
import random
import zipfile

with zipfile.ZipFile('./data/jaychou_lyrics.txt.zip') as zin:
    with zin.open("jaychou_lyrics.txt") as f:
        corpus_chars = f.read().decode("utf-8")
corpus_chars[:40]


'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [4]:
corpus_chars = corpus_chars.replace("\n"," ").replace("\r"," ")
corpus_chars = corpus_chars[0:10000]

In [5]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char,i) for i,char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
print(vocab_size)

1027


In [6]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print("chars:","".join([idx_to_char[idx] for idx in sample]))
print("indices:",sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [881, 398, 1017, 546, 473, 814, 262, 881, 398, 319, 634, 766, 439, 357, 655, 747, 262, 881, 398, 319]


## 时序数据采样

假设时间步数为5，样本序列为5个字符，即“想”“要”“有”“直”“升”。该样本的标签序列为这些字符分别在训练集中的下一个字符，即“要”“有”“直”“升”“机”。

### 随机采样

我们无法用**一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态**。在训练模型时，每次随机采样前都需要重新初始化隐藏状态。

In [7]:
def data_iter_random(corpus_indices,batch_size,num_steps,device = None):
    num_examples = (len(corpus_indices) - 1)//num_steps
    epoch_size = num_examples//batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)
    
    def _data(pos):
        return corpus_indices[pos:pos+num_steps]
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    for i in range(epoch_size):
        i = i * batch_size
        batch_indices = example_indices[i:i+batch_size]
        X = [_data(j*num_steps) for j in batch_indices]
        Y = [_data(j*num_steps+1) for j in batch_indices]
        
        yield torch.tensor(X,dtype = torch.float32,device = device),\
        torch.tensor(Y,dtype = torch.float32,device = device)
    

In [8]:
my_seq = list(range(30))
for X,Y in data_iter_random(my_seq , batch_size = 8 , num_steps = 6):
    print("X:",X,"\nY:",Y,"\n")

### 相邻采样

可以令相邻的两个随机小批量在原始序列上的位置相毗邻。

可以**用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态**，从而使下一个小批量的输出也取决于当前小批量的输入，并如此循环下去。

这对实现循环神经网络造成了两方面影响：

1.  在训练模型时，**只需在每一个迭代周期开始时初始化隐藏状态**；


2. 当多个相邻小批量通过传递隐藏状态串联起来时，模型参数的梯度计算将依赖所有串联起来的小批量序列。同一迭代周期中，随着迭代次数的增加，梯度的计算开销会越来越大。 为了使模型参数的梯度计算只依赖一次迭代读取的小批量序列，**可以在每次读取小批量前将隐藏状态从计算图中分离出来。**

In [9]:
def data_iter_consecutive(corpus_indices,batch_size,num_steps,device = None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    corpus_indices = torch.tensor(corpus_indices,dtype = torch.float32,device = device)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0:batch_size*batch_len].view(batch_size,batch_len)
    epoch_size = (batch_len - 1)//num_steps
    
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i : i + num_steps]
        Y = indices[:, i + 1 : i + num_steps + 1]
        yield X, Y
    

In [10]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]], device='cuda:0') 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]], device='cuda:0') 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]], device='cuda:0') 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]], device='cuda:0') 



# 循环神经网络的从零开始实现

在本节中，我们将从零开始实现一个基于字符级循环神经网络的语言模型，并在周杰伦专辑歌词数据集上训练一个模型来进行歌词创作。首先，我们读取周杰伦专辑歌词数据集：

In [11]:
import time
import math
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()

## one-hot向量
假设词典中不同字符的数量为`N`（即词典大小vocab_size），每个字符已经同一个从0到`N-1`的连续整数值索引一一对应。

In [12]:
def one_hot(x,n_class,dtype = torch.float32):
    # X shape: (batch) , output shape : (batch,n_class)
    x = x.long()
    res = torch.zeros(x.shape[0] , n_class , dtype = dtype ,device = x.device)
    res.scatter_(1 , x.view(-1,1) , 1)
    
    return res

x = torch.tensor([0, 2])
one_hot(x, vocab_size)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])

我们每次采样的小批量的形状是(批量大小, 时间步数)。下面的函数将这样的小批量变换成数个可以输入进网络的形状为(批量大小, 词典大小)的矩阵，矩阵个数等于时间步数。也就是说，时间步`t`的输入为$\boldsymbol{X}_t \in \mathbb{R}^{n \times d}$
 ，其中`n`为批量大小，`d`为输入个数，即`one-hot`向量长度（词典大小）。

In [13]:
def to_onehot(X,n_class):
    # X shape: (batch, seq_len), output: seq_len elements of (batch, n_class)
    return [one_hot(X[:,i],n_class) for i in range(X.shape[1])]

X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)

5 torch.Size([2, 1027])


## 初始化模型参数