# Language Modeling : LSTM을 이용한 텍스트 생성기

In [1]:
import argparse
import os
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import Tensor
from torchtext import data as d
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
is_cuda = torch.cuda.is_available()
is_cuda

True

# Data 준비하기 : WikiText2 데이터셋을 이용한다

torchtext를 이용하여 WikiText2 데이터셋을 다운로드하고 사용한다.

In [3]:
TEXT = d.Field(lower=True, batch_first=True,)
train, valid, test = datasets.WikiText2.splits(TEXT,root='data')

downloading wikitext-2-v1.zip


data\wikitext-2\wikitext-2-v1.zip: 100%|██████████████████████████████████████████| 4.48M/4.48M [00:03<00:00, 1.19MB/s]


extracting


In [4]:
print(len(train[0].text)) # 다음을 통해 볼 수 있듯이 텍스트 데이터를 하나의 긴 텐서에 저장했다.

2088628


In [5]:
print(train[0].text[:100])

['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no', 'valkyria', '3', ':', '<unk>', 'chronicles', '(', 'japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'valkyria', 'of', 'the', 'battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'media.vision', 'for', 'the', 'playstation', 'portable', '.', 'released', 'in', 'january', '2011', 'in', 'japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', '.', '<unk>', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the']


# 여러가지 파라미터들 정의

In [10]:
batch_size=20
bptt_len=30
clip = 0.25
lr = 20
log_interval = 400

# 배치 처리기 생성

In [11]:
(len(train[0].text)//batch_size)*batch_size # 배치처리를 하기 위해 남은 8개를 없애야 함을 알 수 있음

2088620

In [12]:
# 배치 처리를 위해 배치에 들어가지 못 하는 나머지 데이터들을 없앰
train[0].text = train[0].text[:(len(train[0].text)//batch_size)*batch_size]
valid[0].text = valid[0].text[:(len(valid[0].text)//batch_size)*batch_size]
test[0].text = test[0].text[:(len(valid[0].text)//batch_size)*batch_size]

In [13]:
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

train.fields {'text': <torchtext.data.field.Field object at 0x000001C3840C0910>}
len(train) 1
vars(train[0]) ['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no']


In [14]:
TEXT.build_vocab(train)

In [15]:
print('len(TEXT.vocab)', len(TEXT.vocab))

len(TEXT.vocab) 28913


In [34]:
train_iter, valid_iter, test_iter = d.BPTTIterator.splits((train, valid, test), batch_size=batch_size, bptt_len=bptt_len, device=0,repeat=False)

# bptt_len은 모델이 기억해야 하는 시퀀스 길이이다.
# bptt_len이 30이고 batch_size가 20이므로 배치 처리된 데이터를 분할할 때, 크기가 30인 데이터를 20개씩 묶어서 하나의 배치로 본다.
# 다음 코드를 통해 그 형상을 확인하자.



In [42]:
for i, batch in enumerate(train_iter):
    print(i)
    print("batch.text shape : ", batch.text.shape)
    print("batch.taget shape : ", batch.target.shape)
    break

0
batch.text shape :  torch.Size([20, 30])
batch.taget shape :  torch.Size([20, 30])


# 모델 정의하기

In [43]:
class LSTMModel(nn.Module):
    def __init__(self,ntoken,ninp,nhid,nlayers,dropout=0.5,tie_weights=False):
        # ntoken : 어휘의 단어의 수, ninp : LSTM에 입력되는 단어의 임베딩 차원,
        # nlayer : LSTM에 사용될 레이어 수, dropout : 드롭아웃 비율
        # tie_weights : encoder와 decoder에 같은 가중치를 사용할지
        super().__init__()
        self.drop = nn.Dropout()
        self.encoder = nn.Embedding(ntoken,ninp)
        self.rnn = nn.LSTM(ninp,nhid,nlayers,dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)
        if tie_weights:
            self.decoder.weight = self.encoder.weight
        
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange,initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange,initrange)
        
    def forward(self,input,hidden): 
        
        emb = self.drop(self.encoder(input))
        output,hidden = self.rnn(emb,hidden)
        output = self.drop(output)
        s = output.size()
        decoded = self.decoder(output.view(s[0]*s[1],s[2]))
        return decoded.view(s[0],s[1],decoded.size(1)),hidden
    
    def init_hidden(self,bsz):
        weight = next(self.parameters()).data
        return(Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()),Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()))

In [44]:
criterion = nn.CrossEntropyLoss()

In [45]:
emsize = 200
nhid=200
nlayers=2
dropout = 0.2

ntokens = len(TEXT.vocab)
lstm = LSTMModel(ntokens, emsize, nhid,nlayers, dropout, 'store_true')
if is_cuda:
    lstm = lstm.cuda()

In [46]:
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Tensor:
        return h.detach().cuda()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [47]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    lstm.eval()
    total_loss = 0   
    hidden = lstm.init_hidden(batch_size)
    for batch in data_source:        
        data, targets = batch.text,batch.target.view(-1)
        output, hidden = lstm(data.cuda(), hidden)
        output_flat = output.view(-1, ntokens)
        
        if is_cuda :
            targets = targets.cuda()
        
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item()/(len(data_source.dataset[0].text)//batch_size)

In [48]:
def trainf():
    # Turn on training mode which enables dropout.
    lstm.train()
    total_loss = 0
    start_time = time.time()
    hidden = lstm.init_hidden(batch_size)
    for  i,batch in enumerate(train_iter):
        data, targets = batch.text,batch.target.view(-1)
        if is_cuda :
            data = data.cuda()
            targets = targets.cuda()
        
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        lstm.zero_grad()
        output, hidden = lstm(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(lstm.parameters(), clip)
        for p in lstm.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss.item() / log_interval
            elapsed = time.time() - start_time
            (print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(epoch, i, len(train_iter), lr,elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss))))
            total_loss = 0
            start_time = time.time()

In [49]:
# Loop over epochs.
best_val_loss = None
epochs = 40

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    trainf()
    val_loss = evaluate(valid_iter)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                   val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0

RuntimeError: Expected hidden[0] size (2, 30, 200), got (2, 20, 200)

In [None]:
# 왜 안 되는지 모르겠다...