In [9]:
import random
from collections import defaultdict
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torchtext
import tqdm


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(2020)
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

# 数据读取与处理

In [2]:
# 如何处理文本数据
text_field = torchtext.data.Field(lower=True)

# 构建语言模型数据集
train_data, valid_data, test_data = torchtext.datasets.LanguageModelingDataset.splits(
    path=Path('/media/bnu/data/nlp-practice/language-model'),
    train='text8.train.txt',
    validation='text8.dev.txt',
    test='text8.test.txt',
    text_field=text_field
)

# 构建词汇表 
text_field.build_vocab(train_data, max_size=50000)

In [3]:
# 实际单词表大小
print('Vocab Size:', len(text_field.vocab))
print('-' * 60)

# Index -> Word
print('Index to Word Sample:')
print(text_field.vocab.itos[:10])
print('-' * 60)

# Word -> Index
print('Word to Index Sample:')
print(list(text_field.vocab.stoi.items())[:10])
print('-' * 60)

Vocab Size: 50002
------------------------------------------------------------
Index to Word Sample:
['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']
------------------------------------------------------------
Word to Index Sample:
[('<unk>', 0), ('<pad>', 1), ('the', 2), ('of', 3), ('and', 4), ('one', 5), ('in', 6), ('a', 7), ('to', 8), ('zero', 9)]
------------------------------------------------------------


In [4]:
# 生成BPTT迭代器
train_iter, valid_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(32, 32, 32),
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True
)

In [5]:
# 查看维度 (seq_len * batch_size)
batch_data = next(iter(train_iter))
print('Batch Data:')
print(batch_data)
print('-' * 60)

# 查看输入
print('Input:')
print(' '.join([text_field.vocab.itos[i] for i in batch_data.text[:, 1]]))

# 查看目标
print('Target:')
print(' '.join([text_field.vocab.itos[i] for i in batch_data.target[:, 1]]))

Batch Data:

[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
------------------------------------------------------------
Input:
combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility
Target:
in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility of


# 定义模型

In [13]:
class RNNModel(nn.Module):
    
    def __init__(self, rnn_type, n_token, n_embed, n_hidden, 
                 n_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.embed = nn.Embedding(n_token, n_embed)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(
                n_embed, n_hidden, n_layers, dropout=dropout)
        else:
            raise ValueError("rnn_type must in ['LSTM', 'GRU']")
        self.linear = nn.Linear(n_hidden, n_token)
        
    def forward(self, inputs):
        # inputs shape: (seq_len, batch_size)
        # x_emb shape: (seq_len, batch_size, embed_size)
        x_emb = self.drop(self.embed(inputs))
        
        # x_rnn shape: (seq_len, batch_size, hidden_size)
        x_rnn, _ = self.rnn(x_emb)
        x_rnn = self.drop(x_rnn)
        
        # outputs shape: (seq_len, batch_size, vocab_size)
        return self.linear(x_rnn)

In [14]:
model = RNNModel(
    rnn_type='LSTM', 
    n_token=len(text_field.vocab),
    n_embed=300,
    n_hidden=500,
    n_layers=2,
    dropout=0.5
)
model.to(device)
model

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (embed): Embedding(50002, 300)
  (rnn): LSTM(300, 500, num_layers=2, dropout=0.5)
  (linear): Linear(in_features=500, out_features=50002, bias=True)
)

# 模型训练

In [15]:
torch.cuda.empty_cache()

num_epochs = 2
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
history = defaultdict(list)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    pbar = tqdm.notebook.tqdm(train_iter)
    pbar.set_description(f'Epoch {epoch+1} --> Train')
    
    for i, batch in enumerate(pbar):
        inputs, targets = batch.text.to(device), batch.target.to(device)
        outputs = model(inputs)
        
        # outputs shape: (seq_len * batch_size, vocab_size)
        outputs = outputs.view(-1, outputs.size(2))
        # targets shape: (seq_len * batch_size)
        targets = targets.view(-1)
        
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        pbar.set_postfix(loss=loss.item())
    
    train_loss = total_loss / len(train_iter)
    history['train_loss'].append(train_loss)

    model.eval()
    total_loss = 0.0

    pbar = tqdm.notebook.tqdm(valid_iter)
    pbar.set_description(f'Epoch {epoch+1} --> Valid')
    
    with torch.no_grad():
        for i, batch in enumerate(pbar):
            inputs, targets = batch.text.to(device), batch.target.to(device)
            outputs = model(inputs)
            
            outputs = outputs.view(-1, outputs.size(2))
            targets = targets.view(-1)
            
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            pbar.set_postfix(loss=loss.item())
            
    valid_loss = total_loss / len(valid_iter)
    history['valid_loss'].append(valid_loss)

HBox(children=(FloatProgress(value=0.0, max=9571.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=532.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9571.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=532.0), HTML(value='')))




# 计算Perplexity

In [17]:
print('Perplexity in Valid:', np.exp(history['valid_loss'][-1]))

Perplexity in Valid: 215.8343580612766
