# 语言模型

学习目标
- 学习语言模型，以及如何训练一个语言模型
- 学习torchtext的基本使用方法
    - 构建 vocabulary
    - word to inde 和 index to word
- 学习torch.nn的一些基本模型
    - Linear
    - RNN
    - LSTM
    - GRU
- RNN的训练技巧
    - Gradient Clipping
- 如何保存和读取模型

我们会使用 [torchtext](https://github.com/pytorch/text) 来创建vocabulary, 然后把数据读成batch的格式。请大家自行阅读README来学习torchtext。

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [48]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

# 为了保证实验结果可以复现，我们经常会把各种random seed固定在某一个值
seed = 100
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if USE_CUDA:
    torch.cuda.manual_seed(seed)

BATCH_SIZE = 50
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 50000

- 我们会继续使用上次的text8作为我们的训练，验证和测试数据
- torchtext提供了LanguageModelingDataset这个class来帮助我们处理语言模型数据集
- BPTTIterator可以连续地得到连贯的句子

In [49]:
TEXT = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(path=".", 
    train="text8.train.txt", validation="text8.dev.txt", test="text8.test.txt", text_field=TEXT)
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print("vocabulary size: {}".format(len(TEXT.vocab)))

device = torch.device("cuda:1" if USE_CUDA else "cpu")
VOCAB_SIZE = len(TEXT.vocab)
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=device, bptt_len=32, repeat=False, shuffle=True)
# bptt_len 回传长度/单词数(seq_len)

vocabulary size: 50002


50000+2(unk,pad)

In [11]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

In [12]:
TEXT.vocab.stoi['of']

3

模型的输入是一串文字，模型的输出也是一串文字，他们之间相差一个位置，因为语言模型的目标是根据之前的单词预测下一个单词。

In [50]:
it = iter(train_iter)
batch = next(it)
batch.text.shape # bptt_len * batch_size

torch.Size([32, 50])

In [None]:
print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:,1].data]))#第2个sequence(句子)
print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:,1].data]))

In [None]:
for i in range(5):
    print(i)
    batch = next(it)
    print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:,2].data]))
    print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:,2].data]))

### 定义模型

- 继承nn.Module
- __init__函数
- forward函数
- 其余可以根据模型需要定义相关的函数

In [5]:
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    """ 一个简单的循环神经网络"""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        ''' 该模型包含以下几层:
            - 一个词嵌入层
            - 一个循环神经网络层(RNN, LSTM, GRU)
            - 一个线性层, 从hidden state到输出单词表, 不需要激活函数
            - 一个dropout层, 用来做regularization
            - ntoken = vocab_size
            - ninp = embedding_size
            - nhid = hidden_size
        '''
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        
        if rnn_type in ['LSTM', 'GRU']:
            # getattr: 在nn中检索对应rnn_type
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
            
        self.decoder = nn.Linear(nhid, ntoken) # hidden_size -> vocab_size
        self.init_weights()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def forward(self, input, hidden): #hidden: hidden state
        ''' Forward pass:
            - word embedding
            - 输入循环神经网络
            - 一个线性层从hidden state转化为输出单词表
            - input(text): seq_len * batch_size(torchtext setting) # seq_len = bptt_len
        '''
        emb = self.drop(self.encoder(input)) # seq_len * batch_size * embed_size
        output, hidden = self.rnn(emb, hidden) # hidden:(n_layers * batch_size * hidden_size, n_layers * batch_size * hidden_size) 1: 1layer, 1 direction
        output = self.drop(output) # seq_len * batch_size * hidden_size
        #decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        #转成二维，便于decoder linear操作 (seq_len * batch_size) * hidden_size
        decoded = self.decoder(output.view(-1, output.size(2))) # (seq_len * batch_size) * vocab_size
        #转回三维，seq_len * batch_size * vocab_size(其实没必要)
        return decoded.view(output.size(0), output.size(1), decoded.size(-1)), hidden

    def init_hidden(self, bsz, requires_grad=True): # bsz:batch_size;requires_grad要不要bp到hidden state
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM': # LSTM: h & c, 2 hidden states
            return (weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad)

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

初始化一个模型

In [36]:
n_layers = 2
model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, n_layers, dropout=0.5)
if USE_CUDA:
    model = model.cuda()

- 我们首先定义评估模型的代码。
- 模型的评估和模型的训练逻辑基本相同，唯一的区别是我们只需要forward pass，不需要backward pass/grad

In [31]:
def evaluate(model, data):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(data)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            hidden = repackage_hidden(hidden)
            with torch.no_grad():
                output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            total_count += np.multiply(*data.size())
            total_loss += loss.item()*np.multiply(*data.size())
            
    loss = total_loss / total_count
    # 切回train，进入下一个iteration
    model.train()
    return loss

我们需要定义下面的一个function，帮助我们把一个hidden state和计算图之前的历史分离。

In [8]:
# Remove this part
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach() # h跟之前节点的联系切断
    else: # multiple hidden states
        return tuple(repackage_hidden(v) for v in h)

In [51]:
it = iter(train_iter)
hidden = model.init_hidden(BATCH_SIZE)
hidden = repackage_hidden(hidden)

In [63]:
hidden[0].shape # n_layers * batch_size * nhid

torch.Size([2, 50, 650])

In [53]:
batch = next(it)

In [54]:
batch.text.shape #seq_len * batch_size

torch.Size([32, 50])

In [55]:
data = batch.text
data = data.cuda()
emb = model.encoder(data)
emb.shape #seq_len * batch_size * embedding_size

torch.Size([32, 50, 650])

In [64]:
output, (hidden, cell) = model.rnn(emb, hidden)

In [58]:
output.shape # seq_len * batch_size * embed_size

torch.Size([32, 50, 650])

In [65]:
hidden.shape # n_layers * batch_size * embed_size

torch.Size([2, 50, 650])

In [66]:
output.view(-1, output.size(2)).shape

torch.Size([1600, 650])

In [68]:
decoded = model.decoder(output.view(-1, output.size(2)))
decoded.shape

torch.Size([1600, 50002])

In [69]:
decoded.view(output.size(0), output.size(1), decoded.size(-1)).shape

torch.Size([32, 50, 50002])

In [70]:
decoded.view(-1, VOCAB_SIZE).shape

torch.Size([1600, 50002])

定义loss function和optimizer


In [33]:
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)

训练模型：
- 模型一般需要训练若干个epoch
- 每个epoch我们都把所有的数据分成若干个batch
- 把每个batch的输入和输出都包装成cuda tensor
- forward pass，通过输入的句子预测每个单词的下一个单词
- 用output和target计算cross entropy loss
- 清空模型当前gradient
- backward pass
- gradient clipping，防止梯度爆炸
- 更新模型参数
- 每隔一定的iteration输出模型在当前iteration的loss，以及在验证集上做模型的评估

In [37]:
import copy
GRAD_CLIP = 1.
NUM_EPOCHS = 2

val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        hidden = repackage_hidden(hidden)
        
        model.zero_grad()
        output, hidden = model(data, hidden) #hidden state传遍该epoch所有iteration
        
        optimizer.zero_grad()
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1)) #2维-1维
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)#grad clipping
        optimizer.step()
        
        if i % 1000 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())
    
        if i % 10000 == 0:
            val_loss = evaluate(model, val_iter)
            
            if len(val_losses) == 0 or val_loss < min(val_losses):
                print("best model, val loss: ", val_loss)
                torch.save(model.state_dict(), "lm-best.th")
            else:
                # learning rate decay
                scheduler.step()
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
            val_losses.append(val_loss)

epoch 0 iter 0 loss 10.822037696838379
best model, val loss:  10.822260043438908
epoch 0 iter 1000 loss 10.825000762939453
epoch 0 iter 2000 loss 10.824640274047852
epoch 0 iter 3000 loss 10.822461128234863
epoch 0 iter 4000 loss 10.82335376739502
epoch 0 iter 5000 loss 10.825164794921875
epoch 0 iter 6000 loss 10.818838119506836
epoch 0 iter 7000 loss 10.82194709777832
epoch 0 iter 8000 loss 10.821283340454102
epoch 0 iter 9000 loss 10.823173522949219
epoch 0 iter 10000 loss 10.822760581970215
epoch 0 iter 11000 loss 6.590358734130859
epoch 0 iter 12000 loss 6.119932174682617
epoch 0 iter 13000 loss 5.953525543212891
epoch 0 iter 14000 loss 5.9914422035217285
epoch 1 iter 0 loss 6.077768325805664
best model, val loss:  5.618270756366516
epoch 1 iter 1000 loss 5.9252214431762695
epoch 1 iter 2000 loss 5.803649425506592
epoch 1 iter 3000 loss 5.932034969329834
epoch 1 iter 4000 loss 5.37612771987915
epoch 1 iter 5000 loss 5.721917629241943
epoch 1 iter 6000 loss 5.6660261154174805
epoch

In [None]:
best_model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5)
if USE_CUDA:
    best_model = best_model.cuda()
best_model.load_state_dict(torch.load("lm-best.th"))

### 使用最好的模型在valid数据上计算perplexity

In [None]:
val_loss = evaluate(best_model, val_iter)
print("perplexity: ", np.exp(val_loss))

### 使用最好的模型在测试数据上计算perplexity

In [None]:
test_loss = evaluate(best_model, test_iter)
print("perplexity: ", np.exp(test_loss))

使用训练好的模型生成一些句子。

In [None]:
hidden = best_model.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):#依次预测下一个单词
    output, hidden = best_model(input, hidden)#output: 1*1*50002
    word_weights = output.squeeze().exp().cpu()#squeeze:去除所有维度=1的维度 exp:output分布更加极端(softmax)
    word_idx = torch.multinomial(word_weights, 1)[0]#也可以直接softmax->argmax返回预测结果
    input.fill_(word_idx)#append
    word = TEXT.vocab.itos[word_idx]
    words.append(word)
print(" ".join(words))