In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import pandas as pd 
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

import torchtext
from torchtext.vocab import Vectors

USE_CUDA = torch.cuda.is_available()

# 设置随机数的seed，这样保证每次测试的数据一致
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)

# 语言模型
1、学习语言模型，以及如何训练一个语言模型
2、学习torchtext的基本使用方法
    2-1、构建vocabulary
    2-2、word to inde 和 index to word
3、学习torch.nn的一些基本模型
    3-1、Linear
    3-2、RNN
    3-3、LSTM
    3-4、GRU
4、RNN的训练技巧
    4-1、Gradient Clipping # 防止梯度爆炸的限制操作
5、如何保存和读取模型

# 参数初始化

In [97]:
# 超参数hyper paramerters 
BATCH_SIZE = 32
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 100
MAX_VOCAB_SIZE = 50000
NUM_EPOCHS = 2
LEARNING_RATE = 0.001
GRAD_CLIP = 5.0 # 为了防止梯度爆炸，设置的权重上下限的绝对值

# 使用torchtext来进行数据预处理
1、我们继续使用text8作为训练，验证，测试数据
2、TorchText的一个重要概念是Field，它决定了你的数据会如何被处理。我们使用TEXT这个field来处理文本数据。
   我们的TEXT field有lower=True这个参数，所以所有的单词都会被lowercase(小写)
3、torchtext提供了LanguageModelingDataset这个class来帮助我们处理语言模型数据集。
4、build_vocab可以根据我们提供的训练数据集来创建最高频单词的单词表，max_size帮助我们限定单词总量
5、BPTTIterator可以连续地得到连贯的句子，BPTT的全过程是back propagation through time。

In [44]:
TEXT = torchtext.data.Field(lower=True) #lower 意味着全部都小写
train,val,test = torchtext.datasets.LanguageModelingDataset.splits(path="/Users/zhenwuzhou/.keras/datasets/text8/",
                                                 train="text8.train.txt",
                                                 validation="text8.dev.txt",
                                                 test="text8.test.txt",text_field=TEXT)

In [88]:
# 利用torchtext的Field来构建单词表
TEXT.build_vocab(train,max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)

In [11]:
len(TEXT.vocab)

50002

In [13]:
# 用itos代表的是index to String；来查看前10个值
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

In [19]:
# stoi 表示String to index；通过单词来查它的对应索引位置
TEXT.vocab.stoi["the"]

2

In [45]:
# 定义要使用的device
device = torch.device("cuda" if USE_CUDA else "cpu")
#Back Propagation Through Time（常简称为 BPTT，可译为“时序反向传播算法”）
#bptt_len表示取字符串的长度，repeat表示不重复取字符串
#batch_size和batch_sizes是两个不一样的参数，batch_size是三个集合采用一样的batch_size,
#而batch_sizes接收的是长度为3的list:[32,16,128]:分别代表训练集，验证集，测试集上的batch_size
train_iter,val_iter,test_iter = torchtext.data.BPTTIterator.splits(
    (train,val,test),batch_size=BATCH_SIZE,device=device,
    bptt_len=50,repeat=False,shuffle=True)

In [46]:
len(train_iter)

9566

In [52]:
# 在执行这个之前一定要调用：TEXT.build_vocab(train,max_size=MAX_VOCAB_SIZE)
# 不然是无法进行next(it)
it = iter(train_iter)
batch = next(it)

In [58]:
batch #  batch由两个部分组成，一部分是text，一部分是target；50是句子的长度，32是batch_size


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.LongTensor of size 50x32]
	[.target]:[torch.LongTensor of size 50x32]

In [60]:
print(batch.text.shape) # batch.text是一个有50个词组成的句子，32是batch_size，代表每个batch有32个句子
print(batch.text) # 这里batch里面的文本数据
batch.text[:,0] # 这个是吧第一个句子取出来，即为第一列

torch.Size([50, 32])
tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [   8,   34,  522,  ..., 5237,    3,   12],
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027]])


tensor([ 4815,  3143,    13,     7,   196,     3,  3017,    48,    61,   157,
          129,   743,   463, 10546,   135,     2, 25882,     3,     2,   110,
          835,     4,     2, 16433,     0,     3,     2,   154,   835,  3500,
            2,   196,    12,   188,    61,     6,     7, 10669,   215,     8,
         1334,   104,   439,    21,    61,  2773,   357,     8,  3628,     2])

In [59]:
print(batch.target.shape) # batch.target是一个有50个词组成的句子，32是batch_size
print(batch.target)

torch.Size([50, 32])
tensor([[3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        [   7,  328,   62,  ..., 9289,  231, 1367],
        ...,
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027],
        [ 711,    3,  620,  ...,    7,   16,   30]])


In [65]:
# 把句子打印出来itos[i] 是吧index转换成句子
# 打印出来的text
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
print()
# 打印出来的target
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))

# 从打印结果可以看出，每个target和text差了一个单词，说明我们希望每一次预测的就是下一个句子

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the

originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


In [66]:
# 我们继续迭代5次，看看后面5个有什么规律
for i in range(5):
    batch = next(it)
    print(i)
    print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
    print()
    print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))
# 我们发现每一个batch的第一句话是连续的：interpretations - as - however- ropotkin- stoic

0
organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing

of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations
1
interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or <unk> but rather a harmonious anti authoritarian society in place of what are regarded

of what this means anarchism also refers to rela

In [67]:
# 我们再来看看batch中的第二个句子，发现第二个句子和第一个句子并没有什么关系
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,1].data.cpu()))
print()
# 打印出来的target
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,1].data.cpu()))

theories of how atoms combine which explains how atoms first combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most

of how atoms combine which explains how atoms first combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical


In [None]:
# 注意我们的TEXT.build_vocab的单词表是50002个单词，自动赠送了两个特殊token:
# <unk> 表示未知的单词，<pad>表示padding
# 模型的输入是一串文字，模型的输出也是一串文字，他们之间相差一个位置，
# 因为语言模型的目标是根据之前的单词预测下一个单词

# 定义模型
1、继承nn.Module
2、初始化函数
3、forward函数
4、其余可以根据模型需要定义相关的函数

In [78]:
class RNNModel(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_size):
        super(RNNModel,self).__init__()
        # 首先还是需要把单词的one-hot进行embedding
        self.embed = nn.Embedding(vocab_size,embed_size)
        # 定义循环神经层，其中循环神经单元为LSTM，它接收两个参数，一个是单词embedding后的size，
        # 另外一个是隐藏层的size
        # 可以使用batch_first=True把batch_size变成第一个维度，因为默认第一个维度是句子中单词的数量seq_length
        self.lstm = nn.LSTM(embed_size,hidden_size)
        # 最受的输出层还是要转为one-hot的维度来对应具体的输出单词；（
        # 这就是所谓的输出层词嵌入矩阵，但是一般词嵌入矩阵采用的都是输入层的词嵌入矩阵）
        self.linear = nn.Linear(hidden_size,vocab_size)
        
        self.hidden_size = hidden_size
    
    def forward(self,text,hidden):
        # forward pass
        # text:[seq_length * batch_size] ： 注意每一列是一个句子
        emb = self.embed(text)# [seq_length*batch_size*embed_size]
        # lstm会把最终的每一个预测的输出和hidden
        output , hidden = self.lstm(emb,hidden) 
        # output: [seq_length * batch_size * hidden_size]
        # 这列的num_layers是为1的
        # hidden: [num_layers*batch_size*hidden_size],[num_layers * batch_size * hidden_size]
        # 这里要做下reshape操作，因为后面的全连接层是不支持三维数据的
        # 把最后的hidden_size的维度保留，前面两个维度合并
        output_reshape = output.view(-1,output.shape[2])#[(seq_length * batch_size)*hidden_size]
        
        # 最后的输出每一个位置预测的单词
        out_vocab = self.linear(output_reshape) # [(seq_length * batch_size)*vocab_size]
        # 最后要reshape成三维的，对应的应该是和output的前两个维度是一样的，最后一个维度是one-hot词向量的维度
        # [seq_length * batch_size * vocab_size]
        out_vocab_reshape = out_vocab.view(output.size(0),output.size(1),out_vocab.size(-1))
        
        return out_vocab_reshape,hidden
    
    # 自定义初始化hidden的权重矩阵
    def init_hidden(self,batch_size,requires_grad=True):
        weight = next(self.parameters()) # 这里先从模型中把所有的参数都取出来
        # 这里要返回两个全0的
        return (weight.new_zeros((1,batch_size,self.hidden_size),requires_grad=requires_grad),
                weight.new_zeros((1,batch_size,self.hidden_size),requires_grad=requires_grad))

#  初始化一个模型

In [89]:
model = RNNModel(vocab_size=VOCAB_SIZE,
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    model = model.to(device)

In [90]:
next(model.parameters())

Parameter containing:
tensor([[ 1.6914, -0.1955, -0.0534,  ...,  0.1854, -0.9802,  1.0892],
        [-1.8782, -0.2626, -0.4415,  ...,  0.3620,  2.5087,  0.5897],
        [-0.6326,  0.3006, -0.2671,  ..., -0.6473,  0.4356, -0.4093],
        ...,
        [-1.2651,  1.2818, -0.5194,  ..., -0.3482, -0.8409,  0.2882],
        [-1.9637,  1.2868,  0.4306,  ..., -0.6555, -1.0185,  1.4404],
        [ 0.7882, -2.0590,  2.1683,  ...,  0.7193,  1.1773, -0.0051]],
       requires_grad=True)

In [101]:
# 定义loss_fn和optimizer
loss_fn = nn.CrossEntropyLoss() # softmax多分类问题的损失函数
optimizer = torch.optim.Adam(model.parameters(),lr=LEARNING_RATE)

# 定义一个scheduler 来实现模型的learning——rate的动态下降
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.5) #0.5表示learning_rate降一半

# 训练模型：
1、模型需要若干个epoch
2、每个epoch我们都把所有的的数据分成若干个batch
3、把每个batch的输入和输出都包装成cuda tensor
4、forward pass,通过输入的句子预测每个单词的下一个单词
5、用模型的预测和正确的下一个单词计算cross entropy loss
6、清空模型当前的gradient
7、backward pass
8、gradient clipping，防止梯度爆炸
9、更新模型参数
10、每隔一定的iteration输出模型在当前iteration的loss，以及在验证集上做模型的评估

In [92]:
hiddentest = model.init_hidden(BATCH_SIZE)

In [93]:
hiddentest

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]], requires_grad=True),
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]], requires_grad=True))

In [94]:
# 这是为了把两个hidden的值记录下来传递给下面一个
def repackage_hidden(h):
    if isinstance(h,torch.Tensor):
        return h.detach() # detach()可以把前面的h隐藏车的权重矩阵值给赋值下来，而不去管
    else: 
        # 如果有若干个hidde，就把每个都repackage_hidden一下，我们传入的就是一个元组
        # 所以要把元组中的每个hidden都repackage_hidden，最后再把新的元组返回回去
        return tuple(repackage_hidden(v) for v in h) 

In [112]:
def evaluate(model, eva_data):
    model.eval() # 改为预测模式
    total_loss = 0.
    total_count = 0.
    it = iter(eva_data)
    with torch.no_grad(): # 评测时不需要进行梯度下降
       hidden = model.init_hidden(BATCH_SIZE,requires_grad=False) # 这里要把梯度下降的开关关上
       for i,batch in enumerate(it):
            data, target = batch.text, batch.target
           
            hidden = repackage_hidden(hidden)
        
            # 进行前向传播
            output,hidden = model(data,hidden)
        
            loss_predict = output.view(-1,VOCAB_SIZE) # 预测值
            loss_target = target.view(-1) # 真实值
            loss = loss_fn(loss_predict,loss_target) 
            
            # 因为loss计算的是平均值，所以要计算total_loss=loss*总数
            total_count = np.multiply(*data.size())
            total_loss = loss.item()*total_count
            
            if i == 3:
                break
    
    loss = total_loss / total_count
    model.train() # 模型评测完了之后在改为训练模式
    return loss

In [126]:
printshape = True
val_losses = [] # 存放loss的值 
for epoch in range(NUM_EPOCHS):
    model.train()
    # train_iter使用torchtext通过train数据处理后的结构化数据，里面有text和target
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i,batch in enumerate(it):
        # 先把text和target取出
        data, target = batch.text, batch.target
        
        # 因为语言模型我们前面看到前面一个batch的0位置的文章是和下一个batch的0位置文章接起来的
        # 所以我们可以吧上一个训练的hidden权重值传递给下一个batch；
        # 这样相当于我们的hidden使用所有数据训练的，这是语言预测模型的一个特殊的地方
        # 其他翻译模型，或者情感分类问题就不用这么做了，因为前后两个batch并没有什么关系
        # 在每次前向传播把之前梯度下降的计算出的hidden的值取出来
        hidden = repackage_hidden(hidden)
        
        # 进行前向传播
        output,hidden = model(data,hidden)
        
        
        # 进行loss计算
        # 这个属于N分类问题，
        # output 预测需要输入0-N每一种的可能性百分百，
        # optt_len 是句子的长度
        # 所以output的shape为[(batch_size*optt_len) *VOCAB_SIZE ];
        # target中只需要每个词数字0-N,所以shape只需要为[batch_size*optt_len]
        loss_predict = output.view(-1,VOCAB_SIZE) # 预测值
        loss_target = target.view(-1) # 真实值
        loss = loss_fn(loss_predict,loss_target)
        
        if(printshape):
            print(loss_predict.shape)#torch.Size([128, 4])
            print(loss_target.shape)#torch.Size([128])
            printshape = False
        
        # 把grad清零
        optimizer.zero_grad()
        
        # 反向传播
        loss.backward()
        
        # 为了防止梯度爆炸，我们需要在梯度下降前，对权重矩阵的值设置上下限
        torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
        
        #梯度下降
        optimizer.step()
        
        # 打印loss
        if i % 10 == 0:
            print("loss:",loss.item())
            
        
        # 存储模型
        if i % 10 == 0:
            # 用验证数据来去对模型进行评价
            val_loss = evaluate(model,val_iter)
            print("val_loss:",val_loss)
            # 如果是第一次迭代到1000或者val_loss比之前的loss都要好，我们就把模型保存下来
            if len(val_losses) == 0 or val_loss < min(val_losses):
                # 注意这里是save的是model还是model.state_dict(),一定别忘了加()
                torch.save(model.state_dict(),"/Users/zhenwuzhou/.keras/models/text8/1m_dict.pth")
                torch.save(model,"/Users/zhenwuzhou/.keras/models/text8/1m.pth")
                print("best model saved to /Users/zhenwuzhou/.keras/models/text8/1m.pth")
            else:
                # 发现模型的loss无法下降了
                # 我们可以调整learning_rate:
                print("leatnig_rate decay")
                scheduler.step() # 这里用scheduler来进行把optimizer的learning_rate进行下降
            val_losses.append(val_loss)
                

torch.Size([1600, 50002])
torch.Size([1600])
loss: 6.4924421310424805
val_loss: 7.410778999328613
best model saved to /Users/zhenwuzhou/.keras/models/text8/1m.pth


KeyboardInterrupt: 

# 模型保存

In [100]:
model.state_dict()
# torch.save(model.state_dict,"/Users/zhenwuzhou/.keras/models/text8/1m.pth")

OrderedDict([('embed.weight',
              tensor([[ 1.6611, -0.2188, -0.0851,  ...,  0.1512, -1.0154,  1.0818],
                      [-1.8782, -0.2626, -0.4415,  ...,  0.3620,  2.5087,  0.5897],
                      [-0.6337,  0.2750, -0.2720,  ..., -0.6772,  0.4118, -0.3973],
                      ...,
                      [-1.2651,  1.2818, -0.5194,  ..., -0.3482, -0.8409,  0.2882],
                      [-1.9637,  1.2868,  0.4306,  ..., -0.6555, -1.0185,  1.4404],
                      [ 0.7882, -2.0590,  2.1683,  ...,  0.7193,  1.1773, -0.0051]])),
             ('lstm.weight_ih_l0',
              tensor([[ 0.0121,  0.0105,  0.0924,  ...,  0.0238, -0.0003, -0.0184],
                      [ 0.0208,  0.0203, -0.0756,  ...,  0.0249,  0.0014, -0.0071],
                      [ 0.0675, -0.0209, -0.0174,  ..., -0.0327, -0.0212, -0.0648],
                      ...,
                      [-0.0174,  0.0620, -0.1052,  ..., -0.0327, -0.0500, -0.0674],
                      [ 0.0380, -0.051

# 模型重新load

In [127]:
best_model = RNNModel(vocab_size=VOCAB_SIZE,
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    best_model = best_model.to(device)
best_model.load_state_dict(torch.load("/Users/zhenwuzhou/.keras/models/text8/1m_dict.pth"))

<All keys matched successfully>

In [129]:
torch.load("/Users/zhenwuzhou/.keras/models/text8/1m_dict.pth")

OrderedDict([('embed.weight',
              tensor([[ 1.7062, -0.2245, -0.0810,  ...,  0.1508, -1.0192,  1.1120],
                      [-1.8782, -0.2626, -0.4415,  ...,  0.3620,  2.5087,  0.5897],
                      [-0.6297,  0.2410, -0.2347,  ..., -0.7184,  0.4643, -0.3540],
                      ...,
                      [-1.2651,  1.2818, -0.5194,  ..., -0.3482, -0.8409,  0.2882],
                      [-1.9637,  1.2868,  0.4306,  ..., -0.6555, -1.0185,  1.4404],
                      [ 0.7882, -2.0590,  2.1683,  ...,  0.7193,  1.1773, -0.0051]])),
             ('lstm.weight_ih_l0',
              tensor([[-0.0038,  0.0273,  0.0748,  ...,  0.0407, -0.0176, -0.0031],
                      [ 0.0328,  0.0358, -0.0641,  ...,  0.0410, -0.0140,  0.0070],
                      [ 0.0502, -0.0377, -0.0354,  ..., -0.0258, -0.0046, -0.0483],
                      ...,
                      [-0.0089,  0.0744, -0.1241,  ..., -0.0028, -0.0544, -0.0874],
                      [ 0.0563, -0.037

In [132]:
# 加载训练好的模型来进行预测perplexity(混乱度)，值越小越好
test_val_loss = evaluate(best_model,val_iter)
print(test_val_loss)
print("perplexity",np.exp(test_val_loss))

7.410778999328613
perplexity 1653.7140873886542


In [134]:
# 用训练好的模型来预测文章
hidden = best_model.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input = torch.randint(VOCAB_SIZE,(1,1),dtype= torch.long).to(device)
words = []
for i in range(100):
    output,hidden = best_model(input,hidden)
    word_weights = output.squeeze().exp().cpu()
    # num_samples =1 相当于贪心搜索，不为1时属于集束搜索
    word_idx = torch.multinomial(word_weights,1)[0] # greedy (argmax)
    input.fill_(word_idx)
    word = TEXT.vocab.itos[word_idx]
    words.append(word)
print(" ".join(words))

meticulously to knights one local thermal economics no not wet metal seven in one place from et unknown the almost more it of clergy intelligence nine <unk> was between eight four and in operators s draft camp theoretician important fertility an one government four zero what one eight cells island three in italian history the unemployment lines difficulty to though with for advertising could cannot their lists in of this and the as combination their seven calculate a remained collected in damage most the two and in icftu the a celebrated the scientific he containing current the citrus the the
