In [14]:
import torch
import torch.nn as nn
import zipfile
import random
import os

In [4]:
os.getcwd()

'/Users/litao/Desktop/program/pytorch/chapter03'

In [5]:
with zipfile.ZipFile("../data/jaychou_lyrics.txt.zip") as zin:
    with zin.open("jaychou_lyrics.txt") as f:
        corpus_chars=f.read().decode("utf-8")
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

## 数据处理

In [6]:
# 将换行符号替换成空格
corpus_chars=corpus_chars.replace("\n"," ").replace("\r"," ")

In [12]:
characters=list(set(corpus_chars))
char2index=dict([(char,i) for i,char in enumerate(characters)])
index2char=dict([(i,char) for i,char in enumerate(characters)])

In [13]:
# 查看是否转换成功
corpus_indices = [char2index[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([index2char[idx] for idx in sample]))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [2391, 913, 829, 2029, 2075, 2066, 972, 2391, 913, 1214, 522, 48, 1611, 1934, 2008, 1923, 972, 2391, 913, 1214]


## 时序数据采样的两种方式

### 1. 随机采样
每个样本是原始序列上任意截取的一段序列。相邻的两个随机小批量在原始序列上的位置不一定相毗邻。因此，我们无法用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态。在训练模型时，每次随机采样前都需要重新初始化隐藏状态。

In [15]:
def data_iter_random(corpus_indices, batch_size, num_steps,shuffle=True,device=None):
    """
    :param corpus_indices :original sequence
    :param batch_size :batch size
    :param num_steps:how many character one step contains 
    """
    # 减1是因为输出的索引x是相应输入的索引y加1
    # 可以划分为多少个num_steps，即总共需要计算的次数
    num_examples=(len(corpus_indices)-1)//num_steps
    # epoch_size的计算方式，每一次需要计算的个数
    epoch_size=num_examples//batch_size
    example_indices=list(range(num_examples))
    if shuffle:
        random.shuffle(example_indices)
    
    def _data(pos):
        """
        :param pos :return a seq which start at pos and length is num_steps
        """
        return corpus_indices[pos:pos+num_steps]
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # read seq
    for i in range(epoch_size):
        i=i * batch_size
        batch_indices=example_indices[i:i+batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield torch.tensor(X, dtype=torch.float32, device=device), torch.tensor(Y, dtype=torch.float32, device=device)

In [16]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')

X:  tensor([[12., 13., 14., 15., 16., 17.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]]) 
Y: tensor([[13., 14., 15., 16., 17., 18.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]]) 

X:  tensor([[18., 19., 20., 21., 22., 23.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 



### 2. 相邻采样
我们还可以令相邻的两个随机小批量在原始序列上的位置相毗邻。这时候，我们就可以用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态，从而使下一个小批量的输出也取决于当前小批量的输入，并如此循环下去。

In [187]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    # 数据总长度1000
    data_len = len(corpus_indices)
    # 每个batch的长度
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)
    # epoch的计算方式
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [174]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 



## 使用RNN/LSTM来定义模型

In [202]:
import sys
sys.path.append("..")
from torch_utils.utils import *
import torch.nn.functional as F
import numpy as np
import time
import math

In [126]:
# 设定随机数种子，使结果可浮现
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = False
setup_seed(0)

In [24]:
# 导入Jayzhou歌词文件
corpus_chars,vocab_size,char2index,index2char=load_data_jay_lyrics()

In [29]:
nn.LSTM?

In [139]:
num_hiddens=256
rnn_layer=nn.RNN(input_size=vocab_size,hidden_size=num_hiddens,batch_first=True)

In [140]:
# 创建数据，测试输出的shape
batch_size=16
num_steps=35
state=None

X = torch.rand(batch_size, num_steps, vocab_size)
X.shape

torch.Size([16, 35, 2582])

In [141]:
Y, state_new = rnn_layer(X, state)
# output:batch_size,num_steps,hidden_dim
# h:batch_size,hidden_dim 
print(Y.shape, len(state_new), state_new[0].shape)

torch.Size([16, 35, 256]) 1 torch.Size([16, 256])


## 定义RNN model

In [188]:
class RNNModel(nn.Module):
    def __init__(self,vocab_size,hidden_dim,bidirectional=False):
        super(RNNModel,self).__init__()
        self.rnn_layer=nn.RNN(input_size=vocab_size,hidden_size=hidden_dim)
        self.hidden_dim=hidden_dim*1 if not bidirectional else hidden_dim*2
        self.vocab_size=vocab_size
        self.fc=nn.Linear(self.hidden_dim,self.vocab_size)
        # 起始状态
        self.state=None
    def forward(self,X,state):
        # X:(batch,seq_len)
        X=F.one_hot(X,self.vocab_size).float()
        Y, self.state = self.rnn_layer(X, state)
        #(num_steps * batch_size, num_hiddens)，它的输出形状为(num_steps * batch_size, vocab_size)
        output = self.fc(Y.view(-1, Y.shape[-1]))
        return output, self.state

## 模型测试

In [189]:
def predict_rnn(prefix, num_chars, model, device, idx_to_char,char_to_idx):
    state=None
    # 记录起始的角标
    output=[char_to_idx[prefix[0]]]
    for t in range(num_chars+len(prefix)-1):
        X=torch.tensor([output[-1]],device=device).view(1,1)
        if state is not None:
            # LSTM instance
            if isinstance(state,tuple):
                state = (state[0].to(device), state[1].to(device))
            # RNN instance
            else:
                state=state.to(device)
        Y,state=model(X,state)
        if t<len(prefix)-1:
            output.append(char_to_idx[prefix[t + 1]])
        # 获取概率最大的角标
        else:
            output.append(int(Y.argmax(dim=1).item()))
    # 转变为字符输出
    return ''.join([idx_to_char[i] for i in output])

In [190]:
# 初步进行测试
device="cuda" if torch.cuda.is_available() else "cpu"
model = RNNModel(vocab_size,hidden_dim=256).to(device)
predict_rnn('分开', 10, model, device, index2char, char2index)

'分开蜂鸣混悄登與警登芙登'

In [191]:
# 超参数定义
num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e-3, 1e-2
num_steps=35

In [192]:
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

## 模型训练

In [206]:
def train_rnn(model,corpus,idx_to_char,char_to_idx,vocab_size,
                    pred_period, pred_len, prefixes,device):
    state=None
    model.to(device)
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter=data_iter_consecutive(corpus, batch_size, num_steps, device)
        for X,y in data_iter:
            X=X.to(torch.int64)
            if state is not None:
                if isinstance (state, tuple): # LSTM, state:(h, c)  
                    state = (state[0].detach(), state[1].detach())
                else:   
                    state = state.detach()
            # output:(batch_size*num_steps,vocab_size)
            output,state=model(X,state)
            
            # Y的形状是(batch_size, num_steps)，转置后再变成长度为batch * num_steps 的向量
            y=torch.transpose(y,0,1).contiguous().view(-1)
            loss = criterion(output, y.long())
            # 梯度清零
            optimizer.zero_grad()
            #  反向传播
            loss.backward()
            #grad_clipping(model.parameters(), clipping_theta, device)
            optimizer.step()
            l_sum += loss.item() * y.shape[0]
            n += y.shape[0]
        # 困惑度计算
        try:
            perplexity = math.exp(l_sum / n)
        except OverflowError:
            perplexity = float('inf')
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, perplexity, time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, model, device, idx_to_char,
                    char_to_idx))

In [207]:
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_rnn(model,corpus_indices, index2char, char2index,vocab_size,pred_period, pred_len, prefixes,device)

epoch 50, perplexity 3.124914, time 5.12 sec
 - 分开    不过小在多外在你 不 向 的看在我 不   安了 忘不小把的 在决  的小太 的 很 不着本
 - 不分开    让少地在用 笑分你 不  的 洒国美远我的无 我 带 回 飞是有舍不我 到 听的 转的 我 
epoch 100, perplexity 1.448145, time 5.10 sec
 - 分开   下的 的 别 相 一松你惯象我上的  你在对有忘笑让 让不狠所在 青我 如 不声是千离  沼 
 - 不分开    让过这颗想感专和麻打蜜 不听 们回小 停大是旁感风 我 说 不欣 之我看 你是 史 的堡 情
epoch 150, perplexity 1.442334, time 5.09 sec
 - 分开 就  不调你了不是忆我靠他的是面那 烟去 有有身我看  就你儿这 心 听爷夜不缘 说绿能空是的望风
 - 不分开 风  让种遇直面所当 节找地在说的你  色圈 过一因我 因的 　a的在  的面 为来气 山预悲后们
epoch 200, perplexity 3.957064, time 5.11 sec
 - 分开 不  界 戏 多脑下远杂那我了样 是　声 名英力扑情  多单输的我下子  些 谁黄了呼  子在 铃
 - 不分开 这 中不都你抬许在我些 的黄下的 忆碎难是的  上我不傲芽将笑不 　的家把都让故那的明我起你不  
epoch 250, perplexity 1.043246, time 5.09 sec
 - 分开 放 着不 你在的了我孤想在这上你我受一 事落满一 那待代 只因啦她 面的忆 泪 你 微 场无音还妳
 - 不分开 化 不不是能呆才 在能  明的了你的你打 只也像  恼 成后不一 的地的终我温  过没笔了细一不一


# GRU network

In [209]:
class GRUModel(nn.Module):
    def __init__(self,vocab_size,hidden_dim,bidirectional=False):
        super(GRUModel,self).__init__()
        self.gru_layer=nn.GRU(input_size=vocab_size,hidden_size=hidden_dim)
        self.hidden_dim=hidden_dim*1 if not bidirectional else hidden_dim*2
        self.vocab_size=vocab_size
        self.fc=nn.Linear(self.hidden_dim,self.vocab_size)
        # 起始状态
        self.state=None
    def forward(self,X,state):
        # X:(batch,seq_len)
        X=F.one_hot(X,self.vocab_size).float()
        Y, self.state = self.gru_layer(X, state)
        #(num_steps * batch_size, num_hiddens)，它的输出形状为(num_steps * batch_size, vocab_size)
        output = self.fc(Y.view(-1, Y.shape[-1]))
        return output, self.state

# LSTM network

In [None]:
class GRUModel(nn.Module):
    def __init__(self,vocab_size,hidden_dim,bidirectional=False):
        super(GRUModel,self).__init__()
        self.lstm_layer=nn.LSTM(input_size=vocab_size,hidden_size=hidden_dim)
        self.hidden_dim=hidden_dim*1 if not bidirectional else hidden_dim*2
        self.vocab_size=vocab_size
        self.fc=nn.Linear(self.hidden_dim,self.vocab_size)
        # 起始状态
        self.state=None
    def forward(self,X,state):
        # X:(batch,seq_len)
        X=F.one_hot(X,self.vocab_size).float()
        Y, self.state = self.gru_layer(X, state)
        #(num_steps * batch_size, num_hiddens)，它的输出形状为(num_steps * batch_size, vocab_size)
        output = self.fc(Y.view(-1, Y.shape[-1]))
        return output, self.state