In [1]:
import os
import pickle
import shutil
import re
import time
import math
import operator

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np

from konlpy.tag import Twitter

from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
torch.__version__

'0.2.0_2'

In [3]:
USE_CUDA = torch.cuda.is_available()
USE_CUDA = True
GPU_ID = 3

print(USE_CUDA)

True


# 1. Preprocess Dataset

In [4]:
NUM_SYMBOL = 4

PAD_TOK = '<PAD>'
SOS_TOK = '<SOS>'
EOS_TOK = '<EOS>'
UNK_TOK = '<UNK>'

PAD_IDX = 0
SOS_IDX = 1
EOS_IDX = 2
UNK_IDX = 3

In [5]:
SRC_VOCAB_SIZE = 5000
TGT_VOCAB_SIZE = 5000

MAX_LENGTH = 20

In [6]:
SRC_FILENAME = 'dataset/korean_dialog/hangul_src.txt'
TGT_FILENAME = 'dataset/korean_dialog/hangul_tgt.txt'

In [7]:
class Data:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            SOS_IDX:SOS_TOK,
            EOS_IDX:EOS_TOK,
            PAD_IDX:PAD_TOK,
            UNK_IDX:UNK_TOK
        }
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2count:
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
    def makeVocabDict(self, vocab_size):
        sorted_vocab = sorted(self.word2count.items(), key=operator.itemgetter(1), reverse=True)[:vocab_size]
        
        sorted_i2w = {i+NUM_SYMBOL:sorted_vocab[i][0] for i in range(vocab_size)}
        sorted_w2i = {sorted_vocab[i][0]:i for i in range(vocab_size)}
        
        self.index2word.update(sorted_i2w)
        self.word2index.update(sorted_w2i)
        
        self.n_words = vocab_size

In [8]:
def normalizeString(s):
    s = re.sub('[^가-힝0-9a-zA-Z\\s]', '', s)
    return s

In [9]:
def readData(src_fileName, tgt_fileName, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    src_lines = open(src_fileName, 'r', encoding='utf-8').readlines()
    tgt_lines = open(tgt_fileName, 'r', encoding='utf-8').readlines()
    
    # Split every line into pairs and normalize
    pairs = [[normalizeString(src_lines[i][:-1]), normalizeString(tgt_lines[i][:-1])] for i in range(len(src_lines))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_data = Data()
        output_data = Data()
    else:
        input_data = Data()
        output_data = Data()

    print("Success!")
    
    return input_data, output_data, pairs

In [10]:
def filterPair(p):
    return len(p[0]) < MAX_LENGTH and \
        len(p[1]) < MAX_LENGTH
    
def filterPairs(pairs, tagger=Twitter()):
    pairs = [[tagger.morphs(pair[0]), tagger.morphs(pair[1])] for pair in pairs]
    return [pair for pair in pairs if filterPair(pair)]

In [11]:
def prepareData(src_fileName, tgt_fileName, reverse=False):
    input_data, output_data, pairs = readData(src_fileName, tgt_fileName)
    print("Read %s sentence pairs" % len(pairs))
    
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    print("Counting words...")
    for pair in pairs:
        input_data.addSentence(pair[0])
        output_data.addSentence(pair[1])
        
    org_input_n_words = input_data.n_words
    org_output_n_words = output_data.n_words
        
    input_data.makeVocabDict(SRC_VOCAB_SIZE)
    output_data.makeVocabDict(TGT_VOCAB_SIZE)    
    
    print("Num of reduced words :")
    print("- Input data  :", org_input_n_words - input_data.n_words)
    print("- Output data :", org_output_n_words - output_data.n_words)
    
    return input_data, output_data, pairs

In [12]:
def indexesFromSentence(data, sentence):
    return [data.word2index[word] if word in data.word2index else UNK_IDX for word in sentence]

In [13]:
def paddingSeqIndexes(seq):
    pad_num = MAX_LENGTH - len(seq)
    return seq + [PAD_IDX]*pad_num

def variableFromSentence(data, sentence, isPadding=False):
    indexes = indexesFromSentence(data, sentence)
    indexes.append(EOS_IDX)
    
    if isPadding:
        indexes = paddingSeqIndexes(indexes)
    
    #result = Variable(torch.LongTensor(indexes).view(-1, 1))
    result = Variable(torch.LongTensor(indexes))
    if USE_CUDA:
        return result.cuda(GPU_ID)
    else:
        return result

In [14]:
def variablesFromBatch(batch):
    input_batch = []
    target_batch = []
    for pair in batch:
        input_batch.append(variableFromSentence(input_data,pair[0], isPadding=True))
        target_batch.append(variableFromSentence(output_data, pair[1], isPadding=True))
        
    if USE_CUDA:
        return (torch.stack(input_batch, dim=0).cuda(GPU_ID), torch.stack(target_batch, dim=0).cuda(GPU_ID))
    else:
        return (torch.stack(input_batch, dim=0), torch.stack(target_batch, dim=0))

In [15]:
input_data, output_data, pairs = prepareData(SRC_FILENAME, TGT_FILENAME)

Reading lines...
Success!
Read 92192 sentence pairs
Trimmed to 87099 sentence pairs
Counting words...
Num of reduced words :
- Input data  : 13799
- Output data : 13806


In [16]:
train_size = int(len(pairs)*0.8)
val_size = int(len(pairs)*0.1)
test_size = int(len(pairs)*0.1)

train_pairs = pairs[:train_size]
val_pairs = pairs[train_size:train_size+val_size]
test_pairs = pairs[train_size+val_size:]

In [17]:
def getBatch(data=train_pairs, batch_size=128):
    num = len(data)
    for idx in range(0, num, batch_size):
        batch = data[idx:min(idx + batch_size, num)]
        batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
        batch_len = [len(d[0]) for d in batch]
        yield (batch, batch_len)

# 3. Define Model

In [18]:
INPUT_SIZE = input_data.n_words  # Num of Words
HIDDEN_SIZE = 256  # Embedding Dimension
OUTPUT_SIZE = output_data.n_words

BATCH_SIZE = 512

In [19]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_seqs, input_lens, hidden):
        embedded = self.embedding(input_seqs)
        packed = pack_padded_sequence(embedded, input_lens, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden

    def initHidden(self, cur_batch_size):
        result = Variable(torch.zeros(self.n_layers, cur_batch_size, self.hidden_size))
        if USE_CUDA:
            return result.cuda(GPU_ID)
        else:
            return result

In [20]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()
        
    def forward_step(self, input_var, hidden, cur_batch_size):
        embedded  = self.embedding(input_var)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(output.view(-1, self.hidden_size))
        output = self.softmax(output)
        output = output.view(cur_batch_size, self.output_size, -1)
        
        return output, hidden

    def forward(self, inputs, hidden, cur_batch_size):        
        decoder_outputs = []
        sequence_symbols = []
        #lengths = np.array([max_length] * batch_size)
        
        def decode(step, step_output):
            decoder_outputs.append(step_output)
            symbols = decoder_outputs[-1].topk(1)[1]
            sequence_symbols.append(symbols)
            return symbols
        
        decoder_input = inputs[:, 0].unsqueeze(1)
        for di in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, hidden, cur_batch_size)
            step_output = decoder_output.squeeze(2)
            symbols = decode(di, step_output)
            decoder_input = symbols
        
        sequence_symbols = torch.stack(sequence_symbols, dim=1).squeeze(2)
        decoder_outputs = torch.stack(decoder_outputs, dim=0)
        
        return decoder_outputs, decoder_hidden, sequence_symbols

    def initHidden(self, encoder_hidden):
        result = Variable(torch.zeros(1, BATCH_SIZE, self.hidden_size))
        if USE_CUDA:
            return result.cuda(GPU_ID)
        else:
            return result

In [21]:
def train(input_variable, input_lengths, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion, max_length=MAX_LENGTH):
    cur_batch_size = input_variable.size()[0]
    encoder_hidden = encoder.initHidden(cur_batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_variable.size()[0]

    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda(GPU_ID) if USE_CUDA else encoder_outputs

    loss = 0

    encoder_outputs, encoder_hidden = encoder(input_variable, input_lengths, encoder_hidden)

    decoder_input = target_variable
    decoder_input = decoder_input.cuda(GPU_ID) if USE_CUDA else decoder_input

    decoder_hidden = encoder_hidden

    decoder_outputs, decoder_hidden, generated_sequence = decoder(decoder_input, decoder_hidden, cur_batch_size)

    for step, step_output in enumerate(decoder_outputs):
        loss += criterion(step_output, target_variable[:, step])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0]

In [22]:
def trainEpochs(encoder, decoder, n_epoch, print_every=1, plot_every=1, learning_rate=1e-3, w_decay=1e-5):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=w_decay)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(size_average=True)

    for epoch in range(1, n_epoch + 1):
        for batch, batch_len in getBatch(batch_size=BATCH_SIZE):
            input_variable, target_variable = variablesFromBatch(batch)
    
            loss = train(input_variable, batch_len, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss
    
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epoch),
                                            epoch, epoch / n_epoch * 100, print_loss_avg))
        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
        
    showPlot(plot_losses)

In [23]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [24]:
def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [25]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [26]:
def sampleResponce(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=5):
    input_variable = variableFromSentence(input_data, sentence)
    input_length = input_variable.size()[0]
    encoder_hidden = encoder.initHidden(1)

    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda(GPU_ID) if USE_CUDA else encoder_outputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei], [input_length], encoder_hidden)
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

        
    decoder_hidden = encoder_hidden
    
    score_board = [[[SOS_IDX], 0]]  #  list of [[sequence] and score]
    
    # with beam search
    while len(score_board[0][0]) != max_length + 1:
        # Select each candidate
        for cur, cur_score in score_board: # [[sequence], score].
            candidate = cur[-1]
        
            # Find beams
            decoder_input = Variable(torch.LongTensor([[candidate]]))
            decoder_input = decoder_input.cuda(GPU_ID) if USE_CUDA else decoder_input
        
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topVecs, topIdxs = decoder_output.data.topk(beam_size)
        
            for next, next_score in zip(topIdxs.tolist()[0], topVecs.tolist()[0]):
                # Append beams to score board
                score = cur_score + next_score  # log softmax
                score_board.append([cur+[next], score])
            
        # select top 5
        score_board = sorted(score_board, key=operator.itemgetter(1), reverse=True)[:5]

    decoded_words = []
    
    for idx in score_board[0][0]:
        if idx == EOS_IDX:
            decoded_words.append(EOS_TOK)
            break
        else:
            decoded_words.append(output_data.index2word[idx])

    return decoded_words

In [27]:
def sampleResponces(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = sampleResponces(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [28]:
NUM_EPOCH = 200

In [29]:
encoder = EncoderRNN(INPUT_SIZE, HIDDEN_SIZE)
decoder = DecoderRNN(HIDDEN_SIZE, OUTPUT_SIZE)
if USE_CUDA:
    encoder.cuda(GPU_ID)
    decoder.cuda(GPU_ID)

In [None]:
trainEpochs(encoder, decoder, NUM_EPOCH, print_every=1, plot_every=1, learning_rate=1e-3, w_decay=1e-5)

0m 27s (- 91m 19s) (1 0%) 9793.2724
0m 54s (- 90m 2s) (2 1%) 8679.7166
1m 20s (- 88m 38s) (3 1%) 8550.7817
1m 48s (- 88m 22s) (4 2%) 8450.9292
2m 15s (- 88m 2s) (5 2%) 8365.5565
2m 43s (- 87m 51s) (6 3%) 8296.2859
3m 10s (- 87m 33s) (7 3%) 8238.3299
3m 38s (- 87m 24s) (8 4%) 8181.3219
4m 6s (- 87m 10s) (9 4%) 8132.8841
4m 34s (- 86m 52s) (10 5%) 8080.6116
5m 2s (- 86m 31s) (11 5%) 8037.0322
5m 30s (- 86m 10s) (12 6%) 7991.5559
5m 57s (- 85m 44s) (13 6%) 7946.0123
6m 25s (- 85m 15s) (14 7%) 7902.1171
6m 52s (- 84m 51s) (15 7%) 7861.7362
7m 20s (- 84m 26s) (16 8%) 7812.6940
7m 48s (- 83m 59s) (17 8%) 7768.6701
8m 15s (- 83m 34s) (18 9%) 7723.9856
8m 43s (- 83m 8s) (19 9%) 7679.9727
9m 11s (- 82m 42s) (20 10%) 7641.3998
9m 39s (- 82m 15s) (21 10%) 7596.3208
10m 6s (- 81m 49s) (22 11%) 7555.4060
10m 34s (- 81m 20s) (23 11%) 7523.8567
11m 1s (- 80m 54s) (24 12%) 7493.0720
11m 29s (- 80m 26s) (25 12%) 7487.1430
11m 57s (- 80m 0s) (26 13%) 7469.8981
12m 24s (- 79m 32s) (27 13%) 7439.5530
12m 

In [None]:
torch.save(encoder.state_dict(), 'encoder.model')
torch.save(decoder.state_dict(), 'decoder.model')

In [None]:
BATCH_SIZE = 512

def evalModel(encoder, decoder):
    
    loss = 0

    criterion = nn.NLLLoss(size_average=True)

    for batch, batch_len in getBatch(data=test_pairs, batch_size=BATCH_SIZE):
        input_variable, target_variable = variablesFromBatch(batch)
    
        cur_batch_size = input_variable.size()[0]
        encoder_hidden = encoder.initHidden(cur_batch_size)
        
        target_length = target_variable.size()[0]
        
        encoder_outputs = Variable(torch.zeros(MAX_LENGTH, encoder.hidden_size))
        encoder_outputs = encoder_outputs.cuda(GPU_ID) if USE_CUDA else encoder_outputs
        
        encoder_outputs, encoder_hidden = encoder(input_variable, batch_len, encoder_hidden)
        
        decoder_input = target_variable
        decoder_input = decoder_input.cuda(GPU_ID) if USE_CUDA else decoder_input
        
        decoder_hidden = encoder_hidden
        decoder_outputs, decoder_hidden, generated_sequence = decoder(decoder_input, decoder_hidden, cur_batch_size)
        
        for step, step_output in enumerate(decoder_outputs):
            loss += criterion(step_output, target_variable[:, step]).data[0]/cur_batch_size
        
    print('loss = ', loss)

In [None]:
evalModel(encoder, decoder)

In [None]:
evaluateRandomly(encoder, decoder)

In [None]:
for test_pair in getBatch(data=train_pairs, batch_size=128):
    print("input  : ", test_pair[0])
    print("output : ", evaluate(encoder, decoder, test_pair[0]))
    print()

In [None]:
for i in range(5):
    input_sentence = input()
    print("output : ", evaluate(encoder, decoder, input_sentence))
    print()