In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
import torch.utils.data.dataloader as dataloader
import random
import tqdm
import gensim
import time

In [2]:
with open('luxun.txt') as f:
    corpus = f.read()

corpus = corpus.replace('\n', ' ').replace('\r', ' ')

In [3]:
idx_to_char = list(set(corpus))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

3759

In [4]:
corpus_indices = [char_to_idx[char] for char in corpus]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 『鲁迅杂文经典全集（全本）/作者:鲁迅』
indices: [1251, 3538, 1582, 2159, 2215, 1707, 743, 1195, 3707, 2999, 1195, 3532, 1133, 1588, 2715, 1131, 231, 3538, 1582, 246]


In [5]:
def data_iter_random(corpus_indices, batch_size, num_steps):
    # 减一是因为输出的索引是相应输入的索引加一。
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)
    # 返回从 pos 开始的长为 num_steps 的序列
    _data = lambda pos: corpus_indices[pos: pos + num_steps]
    for i in range(epoch_size):
        # 每次读取 batch_size 个随机样本。
        i = i * batch_size
        batch_indices = example_indices[i: i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield torch.tensor(X), torch.tensor(Y)

In [6]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps):
    corpus_indices = torch.tensor(corpus_indices)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size*batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [7]:
weight = torch.nn.Embedding(vocab_size, 400).weight.data

In [8]:
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('zh_text_char.vector',
                                                          binary=False, encoding='utf-8')

In [9]:
for i in range(len(wvmodel.index2word)):
    try:
        index = word_to_idx[wvmodel.index2word[i]]
    except:
        continue
    weight[index, :] = torch.from_numpy(wvmodel.get_vector(
        idx_to_word[word_to_idx[wvmodel.index2word[i]]]))

In [10]:
class lyricNet(nn.Module):
    def __init__(self, hidden_dim, embed_dim, num_layers, weight,
                 num_labels, bidirectional, dropout=0, **kwargs):
        super(lyricNet, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_labels = num_labels
        self.bidirectional = bidirectional
        if num_layers <= 1 and dropout != 0:
            self.dropout = 0
        else:
            self.dropout = dropout
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
#         self.gru = nn.GRU(input_size=self.embed_dim, hidden_size=self.hidden_dim,
#                           num_layers=self.num_layers, bidirectional=self.bidirectional,
#                           dropout=self.dropout)
        self.lstm = nn.LSTM(input_size=self.embed_dim, hidden_size=self.hidden_dim,
                            num_layers=self.num_layers, bidirectional=self.bidirectional,
                            dropout=self.dropout)
        if self.bidirectional:
            self.decoder = nn.Linear(hidden_dim * 2, self.num_labels)
        else:
            self.decoder = nn.Linear(hidden_dim, self.num_labels)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        states, _ = self.lstm(embeddings.permute([1, 0, 2]))
#         encoding = states[-1]
        outputs = self.decoder(states.reshape((-1, states.shape[-1])))
        return(outputs)

In [11]:
embedding_dim = 400
hidden_dim = 100
lr = 0.1
momentum = 0.9
num_epoch = 100
use_gpu = True
num_layers = 4
dropout = 0.5
bidirectional = True
batch_size = 128
device = torch.device('cuda:1')
loss_function = nn.CrossEntropyLoss()

In [12]:
model = lyricNet(hidden_dim=hidden_dim, embed_dim=embedding_dim, num_layers=num_layers,
                 num_labels=vocab_size, weight=weight, bidirectional=bidirectional,
                 dropout=dropout)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
if use_gpu:
    model = model.to(device)

In [13]:
def eval_acc(y_pred, y_true):
    return int(sum(y_pred.argmax(dim=1) == y_true.t().reshape((-1,))).cpu()) / y_pred.shape[0]

In [14]:
for epoch in range(num_epoch):
    start = time.time()
    num, total_loss, total_acc = 0, 0, 0
    data = data_iter_random(corpus_indices, batch_size, num_steps=500)
#     if epoch == 50:
#         optimizer.param_groups[0]['lr'] = lr * 0.1
    for X, Y in data:
        num += 1
        if use_gpu:
            X = X.to(device)
            Y = Y.to(device)
        model.zero_grad()
        output = model(X)
        l = loss_function(output, Y.t().reshape((-1,))).mean()
        l.backward()
        total_acc += eval_acc(output, Y)
        optimizer.step()
        total_loss += l.item()
    end = time.time()
    print('epoch %d/%d, loss %.4f, acc %.4f, time %.4f'
          %(epoch+1, num_epoch, total_loss / num, total_acc / num, end-start))

epoch 1/100, loss 8.2254, acc 0.0003, time 7.1786
epoch 2/100, loss 8.1978, acc 0.0011, time 5.2194
epoch 3/100, loss 8.1503, acc 0.0026, time 4.6997
epoch 4/100, loss 8.0851, acc 0.0016, time 4.7651
epoch 5/100, loss 7.9975, acc 0.0017, time 4.8398
epoch 6/100, loss 7.8700, acc 0.0025, time 4.7612
epoch 7/100, loss 7.6717, acc 0.0023, time 4.8045
epoch 8/100, loss 7.3915, acc 0.0023, time 4.7217
epoch 9/100, loss 7.2782, acc 0.0020, time 4.6693
epoch 10/100, loss 7.1087, acc 0.0021, time 4.8869
epoch 11/100, loss 7.0237, acc 0.0026, time 4.7287
epoch 12/100, loss 6.9119, acc 0.0023, time 4.8144
epoch 13/100, loss 6.7787, acc 0.0017, time 4.7893
epoch 14/100, loss 6.7043, acc 0.0017, time 4.8017
epoch 15/100, loss 6.6180, acc 0.0016, time 4.9248
epoch 16/100, loss 6.5457, acc 0.0020, time 4.7953
epoch 17/100, loss 6.4659, acc 0.0025, time 4.7910
epoch 18/100, loss 6.3909, acc 0.0026, time 4.7207
epoch 19/100, loss 6.3326, acc 0.0016, time 4.8124
epoch 20/100, loss 6.2608, acc 0.0011, t

In [15]:
def predict_rnn(prefix, num_chars, model, device, idx_to_char, char_to_idx):
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix)):
#         X = torch.tensor(output).to(device).reshape((1, len(output)))
        X = torch.tensor([output[-1]]).to(device).reshape((1, 1))
        pred = model(X)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(pred.argmax(dim=1)[0]))
    return(''.join([idx_to_char[i] for i in output]))

In [16]:
predict_rnn('分开', 50, model, device, idx_to_char, char_to_idx)

'分开的                                                  '

In [17]:
predict_rnn('其实', 4, model, device, idx_to_char, char_to_idx)

'其实，，，，，'

In [18]:
predict_rnn('悲剧将人生的有价值的东西毁灭给人看', 10, model, device, idx_to_char, char_to_idx)

'悲剧将人生的有价值的东西毁灭给人看           '