In [1]:
%matplotlib inline

递归神经网络 - Recurrent Neural Network
====


本节主要参考：
http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py

In [2]:
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import xrange

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

torch.manual_seed(1)

<torch._C.Generator at 0x7f9b341e0110>

1.N-Gram Language Modeling <br>
probability{ x_(i-N),x_(i-N+1),x_(i-N+2),...,x_(i-1) -> x_(i) }

based on word embedding

In [4]:
# hyper-parameter
training_epoch = 10000
learning_rate = 1e-3
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

In [5]:
# We will use Shakespeare Sonnet 2
test_sentence = """
Deep learning (also known as deep structured learning or hierarchical learning)
is part of a broader family of machine learning methods based on learning data
representations, as opposed to task-specific algorithms. Learning can be supervised,
semi-supervised or unsupervised. Deep learning models are loosely related to information
processing and communication patterns in a biological nervous system, such as neural
coding that attempts to define a relationship between various stimuli and associated
neuronal responses in the brain. Deep learning architectures such as deep neural
networks, deep belief networks and recurrent neural networks have been applied to
fields including computer vision, speech recognition, natural language processing,
audio recognition, social network filtering, machine translation, bioinformatics
and drug design, where they have produced results comparable to and in some
cases superior to human experts.
""".split()

vocab = set(test_sentence)
word2ind = {word: i for i, word in enumerate(vocab)}
ind2word = dict(zip(word2ind.values(), word2ind.keys()))

data_num = len(test_sentence) - CONTEXT_SIZE
x = [[word2ind[test_sentence[i]], word2ind[test_sentence[i + 1]]]
          for i in range(len(test_sentence) - CONTEXT_SIZE)]
y = [[word2ind[test_sentence[i]]] for i in range(CONTEXT_SIZE, len(test_sentence))]

In [6]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.hidden_size = 32
        self.num_layers = 2
        self.BiLSTM = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_size,
                              num_layers=self.num_layers, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(context_size * self.hidden_size * 2, vocab_size)

    def forward(self, x):
        embeds = self.embeddings(x)
        out, _ = self.BiLSTM(embeds)
        out = out.contiguous().view(embeds.size(0), -1)
        out = self.fc(out)
        return out

# 查看Pytorch是否支持GPU
GPU_FLAG = torch.cuda.is_available()
print('CUDA available?', GPU_FLAG)

model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
if GPU_FLAG:
    model.cuda()
else:
    pass
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

CUDA available? True
NGramLanguageModeler(
  (embeddings): Embedding(95, 10)
  (BiLSTM): LSTM(10, 32, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=95, bias=True)
)


In [7]:
if GPU_FLAG:
    x = Variable(torch.LongTensor(x)).cuda()
    y = Variable(torch.LongTensor(y)).cuda()
else:
    x = Variable(torch.LongTensor(x))
    y = Variable(torch.LongTensor(y))

for epoch in range(1, 1 + training_epoch):
    model.zero_grad()
    y_ = model(x)
    loss = criterion(y_, y.view(-1))
    loss.backward()
    optimizer.step()

    # 因为cost value不太明显，显示cost之和
    if epoch % 1000 == 0:
        print('Epoch %s / %s, training cost: %s' % (epoch, training_epoch, float(loss)))


Epoch 1000 / 10000, training cost: 2.837620735168457
Epoch 2000 / 10000, training cost: 1.5012502670288086
Epoch 3000 / 10000, training cost: 0.8125035762786865
Epoch 4000 / 10000, training cost: 0.486420214176178
Epoch 5000 / 10000, training cost: 0.32615959644317627
Epoch 6000 / 10000, training cost: 0.24045604467391968
Epoch 7000 / 10000, training cost: 0.1903339922428131
Epoch 8000 / 10000, training cost: 0.15868350863456726
Epoch 9000 / 10000, training cost: 0.1373818963766098
Epoch 10000 / 10000, training cost: 0.12229006737470627


In [8]:
# generating text
model.eval()
context_idxs = [word2ind['Deep'], word2ind['Learning']]
logue = context_idxs
for i in xrange(data_num):
    if GPU_FLAG:
        context_var = Variable(torch.LongTensor(context_idxs).view(1, -1)).cuda().cuda()
        context_idxs = model(context_var).topk(1)[1].cpu().numpy()[0, 0]
    else:
        context_var = Variable(torch.LongTensor(context_idxs).view(1, -1)).cuda()
        context_idxs = model(context_var).topk(1)[1].numpy()[0, 0]

    logue.append(context_idxs)
    context_idxs = logue[-2:]

pred_sentence = ' '.join([ind2word[i] for i in logue])

In [9]:
import editdistance

print('Distance between these two sentences is %s' % (editdistance.eval(' '.join(test_sentence), pred_sentence)))
print("\033[1;31;40m %s \033[0m" % (' '.join(test_sentence)))
print(pred_sentence)

Distance between these two sentences is 583
[1;31;40m Deep learning (also known as deep structured learning or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms. Learning can be supervised, semi-supervised or unsupervised. Deep learning models are loosely related to information processing and communication patterns in a biological nervous system, such as neural coding that attempts to define a relationship between various stimuli and associated neuronal responses in the brain. Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics and drug design, where they have produced results comparable to and in some cases superior to human experts. [0m
D