# 아래의 깃허브 데이터를 이용하여 연습

### 데이터 
https://github.com/songys/Chatbot_data

### 모델
https://tutorials.pytorch.kr/beginner/chatbot_tutorial.html

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools
import math
from torch.jit import script, trace
from torch import optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
corpus_name = 'chatData'
corpus = os.path.join(r'C:\Users\abc\jupyter\pytorch\chatbot_Test', corpus_name)

def printLines(file, n=10):
    with open(file, 'r', encoding='utf-8') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)
        
printLines(os.path.join(corpus, 'ChatbotData.csv'))

Q,A,label

12시 땡!,하루가 또 가네요.,0

1지망 학교 떨어졌어,위로해 드립니다.,0

3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0

3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0

PPL 심하네,눈살이 찌푸려지죠.,0

SD카드 망가졌어,다시 새로 사는 게 마음 편해요.,0

SD카드 안돼,다시 새로 사는 게 마음 편해요.,0

SNS 맞팔 왜 안하지ㅠㅠ,잘 모르고 있을 수도 있어요.,0

SNS 시간낭비인 거 아는데 매일 하는 중,시간을 정하고 해보세요.,0



In [3]:
def loadLines(fileName, fields):
    lines = []
    with open(fileName, 'r', encoding='utf-8') as f:
        next(f)
        for line in f:
            values = line.split(',')
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lineObj['text'] = lineObj['Question'] + '\t' + lineObj['Answer']
            lines.append(lineObj['text'])
    return lines
     

lines = []
LINES_FIELDS = ['Question', 'Answer', 'text']
lines = loadLines(os.path.join(corpus, 'ChatbotData.csv'), LINES_FIELDS)
print(lines[:5])
print(len(lines))

['12시 땡!\t하루가 또 가네요.', '1지망 학교 떨어졌어\t위로해 드립니다.', '3박4일 놀러가고 싶다\t여행은 언제나 좋죠.', '3박4일 정도 놀러가고 싶다\t여행은 언제나 좋죠.', 'PPL 심하네\t눈살이 찌푸려지죠.']
11823


In [4]:
datafile = os.path.join(corpus, "formatted_lines.txt")

delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in lines:
        writer.writerow([pair])

# 몇 줄을 예제 삼아 출력해 봅니다
print("\nSample lines from file:")
printLines(datafile)


Writing newly formatted file...

Sample lines from file:
"12시 땡!	하루가 또 가네요."

"1지망 학교 떨어졌어	위로해 드립니다."

"3박4일 놀러가고 싶다	여행은 언제나 좋죠."

"3박4일 정도 놀러가고 싶다	여행은 언제나 좋죠."

"PPL 심하네	눈살이 찌푸려지죠."

"SD카드 망가졌어	다시 새로 사는 게 마음 편해요."

"SD카드 안돼	다시 새로 사는 게 마음 편해요."

"SNS 맞팔 왜 안하지ㅠㅠ	잘 모르고 있을 수도 있어요."

"SNS 시간낭비인 거 아는데 매일 하는 중	시간을 정하고 해보세요."

"SNS 시간낭비인데 자꾸 보게됨	시간을 정하고 해보세요."



In [5]:
PAD_token = 0  # 패딩 토큰
SOS_token = 1  # 시작 토큰
EOS_token = 2  # 끝 토큰

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False 
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # SOS, EOS, PAD를 센 것

    # 단어 분리
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    # 단어 집합 생성
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # 등장 횟수가 기준 이하인 단어를 정리합니다
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))

        # 사전을 다시 초기화힙니다
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 

        for word in keep_words:
            self.addWord(word)

In [6]:
MAX_LENGTH = 10  

# 질의/응답 쌍을 읽어서 voc 객체를 반환합니다
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # 파일을 읽고, 쪼개어 lines에 저장합니다
    lines = open(datafile, encoding='utf-8').read().strip().split('\n')
    pairs = [l.split('\t') for l in lines]
    # pairs = sum(pairs, [[]]) # 2차원에서 1차원으로
    voc = Voc(corpus_name)
    return voc, pairs

def filterPair(p):
    # EOS 토큰을 위해 입력 시퀀스의 마지막 단어를 보존해야 합니다
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# 조건식 filterPair에 따라 pairs를 필터링합니다
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name) # voc : 단어집합, pairs : 질문 쌍
    print("Read {!s} sentence pairs".format(len(pairs)))
    #print(pairs)
    pairs = filterPairs(pairs)
    #print(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs

save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)

for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 11823 sentence pairs
Trimmed to 11649 sentence pairs
Counting words...
Counted words: 23478
['"12시 땡!', '하루가 또 가네요."']
['"1지망 학교 떨어졌어', '위로해 드립니다."']
['"3박4일 놀러가고 싶다', '여행은 언제나 좋죠."']
['"3박4일 정도 놀러가고 싶다', '여행은 언제나 좋죠."']
['"PPL 심하네', '눈살이 찌푸려지죠."']
['"SD카드 망가졌어', '다시 새로 사는 게 마음 편해요."']
['"SD카드 안돼', '다시 새로 사는 게 마음 편해요."']
['"SNS 맞팔 왜 안하지ㅠㅠ', '잘 모르고 있을 수도 있어요."']
['"SNS 시간낭비인 거 아는데 매일 하는 중', '시간을 정하고 해보세요."']
['"SNS 시간낭비인데 자꾸 보게됨', '시간을 정하고 해보세요."']


In [7]:
MIN_COUNT = 2    

def trimRareWords(voc, pairs, MIN_COUNT):

    voc.trim(MIN_COUNT)

    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True

        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break

        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs

pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 9920 / 23475 = 0.4226
Trimmed from 11649 pairs to 3532, 0.3032 of total


In [8]:
def indexesFromSentence(voc, sentence):
    # 문장을 단어집합에 저장된 수로 바꾼후 마지막에 EOS추가
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    # itertools. zip_longest(*iterables, fillvalue=None) : 길이가 다른 자료형 zip할때
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# 문장 tensor로 변경
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths


def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch]) # 가장 길이가 긴것 추출
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList) # mask : 이진으로 이루어진 tensor, 크기 동일
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# pair로 입력받아서 inputVar, outputVar 진행
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len



small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[9862, 5227, 5240, 2348, 6368],
        [ 499,  343, 5241, 2349,  383],
        [5169, 5228, 2054,    2,    2],
        [4105,  284,    2,    0,    0],
        [ 874,    2,    0,    0,    0],
        [8905,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([7, 5, 4, 3, 3])
target_variable: tensor([[7204, 3675, 1184,  130, 3184],
        [9157,  602, 5242, 2351,  972],
        [ 246,    2,    2,    2, 3185],
        [9817,    0,    0,    0,    2],
        [   2,    0,    0,    0,    0]])
mask: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 0]], dtype=torch.uint8)
max_target_len: 5


In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, 
                          dropout = (0 if n_layers == 1 else dropout), bidirectional=True)
        
    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq) # input_seq : shape=(max_length, batch_size)
        # print(embedded.shape) : torch.Size([10, 64, 500]) [max_length, batch_size, hidden_size(은닉상태 크기)]
        
        # nn.utils.rnn.pack_padded_sequence : 패딩연산처리 쉽게하기 위해 중간에 빈공간 제거(형태 : tensor)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths) # input_lengths : shape=(batch_size)
        # print(packed.batch_sizes) : tensor([64, 64, 64, 58, 52, 45, 38, 17,  8,  2])
        
        outputs, hidden = self.gru(packed, hidden) # 입력hidden : shape=(n_layers * num_directions, batch_size, hidden_size)
        # print(outputs.batch_sizes) : tensor([64, 64, 63, 52, 47, 34, 24, 18, 12,  6])
        # print(hidden.shape) : torch.Size([4, 64, 500]) [층 * 양방향이면2 아니면1, batch_size, hidden_size]
        
        # nn.utils.rnn.pad_packed_sequence : 패딩연산이 끝난 것을 다시 원래대로 (형태 : torch)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # print(outputs.shape)# : torch.Size([10, 64, 1000]) # [max_length, batch_size, hidden_size(양방향으로 진행했으면 *2)]
        
        # 양방향 GRU의 출력을 합산합니다
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # print(outputs.shape) : torch.Size([10, 64, 500])
        
        # hidden : GRU의 최종 은닉 상태
        return outputs, hidden

In [10]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, 'is not an appropriate attention method.')
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))
            
    # 가중치 계산을 dot-product로 계산
    def dot_score(self, hidden, encoder_output):
        # print(torch.sum(hidden * encoder_output, dim=2).shape) : torch.Size([10, 64]) 10개 생성[max_length, batch_size]
        return torch.sum(hidden * encoder_output, dim=2)
    
    # 
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        # print(energy.shape) : torch.Size([10, 64, 500]) 10개 생성[max_length, batch_size, hidden_size]
        
        # print(torch.sum(hidden * energy, dim=2).shape) : torch.Size([10, 64]) 10개 생성[max_length, batch_size]
        return torch.sum(hidden * energy, dim=2)
    
    
    def concat_score(self, hidden, encoder_output):
        # cat : 합칠 때 차원은 2차원으로 / expand : 확장
        # Tanh 함수는 함수값을 [-1, 1]로 제한시킴
        # print((hidden.expand(encoder_output.size(0), -1, -1).shape)) : torch.Size([10, 64, 500])
        # print(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2).shape) : torch.Size([10, 64, 1000])
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        # print(energy.shape) : torch.Size([10, 64, 500]) 10개 생성[max_length, batch_size, hidden_size]
        return torch.sum(self.v * energy, dim=2)
    
    def forward(self, hidden, encoder_outputs):
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)
            
        attn_energies = attn_energies.t() # t() : 행과 열을 바꿔서 저장[1, 2, 3], [4, 5, 6] -> [1, 4, 7], [2, 5, 8]
        
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [11]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # 참조를 보존해 둡니다
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # 레이어를 정의합니다
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # 주의: 한 단위 시간에 대해 한 단계(단어)만을 수행합니다
        # 현재의 입력 단어에 대한 임베딩을 구합니다   
        embedded = self.embedding(input_step) # input_step : 입력 시퀀스 배치에 대한 한 단위 시간(한 단어). shape=(1, batch_size)
        embedded = self.embedding_dropout(embedded)
        # print(embedded.shape) : torch.Size([1, 64, 500])
        
        # 양방향x
        # last_hidden : GRU의 마지막 은닉 레이어. shape=(n_layers * num_directions, batch_size, hidden_size)
        # print(last_hidden.shape) : torch.Size([2, 64, 500]) 
        rnn_output, hidden = self.gru(embedded, last_hidden) 
        # print(rnn_output.shape) : torch.Size([1, 64, 500])
        # print(hidden.shape) : torch.Size([2, 64, 500])

        # attention 가중치
        attn_weights = self.attn(rnn_output, encoder_outputs) # encoder_outputs : 인코더 모델 출력 shape=(max_length, batch_size, hidden_size)
        # print(attn_weights.shape) : torch.Size([64, 1, 10]) 

        # 인코더 출력에 어텐션을 곱하여 새로운 context vector생성
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # print(context.shape) : torch.Size([64, 1, 500])

        rnn_output = rnn_output.squeeze(0) # print(rnn_output.shape) : torch.Size([64, 500])
        context = context.squeeze(1) # print(context.shape) : torch.Size([64, 500])
        concat_input = torch.cat((rnn_output, context), 1) # print(concat_input.shape) : torch.Size([64, 1000])
        concat_output = torch.tanh(self.concat(concat_input))
        # print(concat_output.shape) : torch.Size([64, 500])

        # output : 각 단어가 디코딩된 시퀀스에서 다음 단어로 사용되었을 때 적합할 확률을 나타내는 정규화된 softmax 텐서. 
        # shape=(batch_size, voc.num_words)
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)

        return output, hidden

In [12]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [13]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding, encoder_optimizer, decoder_optimizer,
         batch_size, clip, max_length = MAX_LENGTH):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    
    loss = 0
    print_losses = []
    n_totals = 0
    
    # EncoderRNN의 forward부분 실행
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
    
    # 초기 디코더 입력을 생성(각 문장을 SOS 토큰으로 시작)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    
    # 디코더의 초기 은닉 상태를 인코더의 마지막 은닉 상태로
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    
    # teacher_forcing : Decoder부분에서 앞 단어가 잘못 추측되었을 경우 뒤에도 달라지니 정답을 입력해 주는 것
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for t in range(max_target_len):
            # LuongAttnDecoderRNN의 forward로 실행
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            
             # Teacher forcing 사용: 다음 입력을 현재의 목표로 둡니다
            decoder_input = target_variable[t].view(1, -1)
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            
            # Teacher forcing 미사용: 다음 입력을 디코더의 출력으로 둡니다
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
            
    loss.backward()
    
    # clip_grad_norm_: 그라디언트를 제자리에서 수정합니다
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return sum(print_losses) / n_totals

In [14]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # 각 단계에 대한 배치 설정
    # batch2TrainData : return inp, lengths, output, mask, max_target_len
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]


    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1


    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        
        input_variable, lengths, target_variable, mask, max_target_len = training_batch


        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        
        print_loss += loss


        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Checkpoint를 저장
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [15]:
# 탐욕적 디코딩(Greedy decoding) : 각 단계에 대해 단순히 decoder_output 에서 가장 높은 softmax값을 갖는 단어를 선택하는 방식
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):

        # EncoderRNN의 forward부분 실행
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)

        # encoder의 마지막 hidden이 decoder의 처음 hidden
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        
        # decoder의 처음입력을 SOS로 초기화
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token

        # 디코더가 단어를 덧붙여 나갈 텐서를 초기화
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)

        for _ in range(max_length):
            # LuongAttnDecoderRNN의 forward로 실행
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)

            # 가장 가능성 높은 단어 토큰과 그 softmax 점수를 구합니다
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)

            # 토큰, 점수 기록
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)

            # 현재의 토큰을 디코더의 다음 입력으로 준비시킵니다(차원을 증가시켜서)
            decoder_input = torch.unsqueeze(decoder_input, 0)

        return all_tokens, all_scores

In [16]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    # indexes_batch : 문장을 단어집합에 저장된 수로 바꾼후 마지막에 EOS추가하는 함수
    indexes_batch = [indexesFromSentence(voc, sentence)]
    
    # lengths 텐서를 만듭니다
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    
    # 배치의 차원을 뒤집어서 모델이 사용하는 형태로 만듭니다
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)

    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    
    # searcher를 이용하여 문장을 디코딩합니다
    tokens, scores = searcher(input_batch, lengths, max_length)
    
    # 인덱스 -> 단어
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # 입력 문장을 받아옵니다
            input_sentence = input('> ')
            # 종료 조건인지 검사합니다
            if input_sentence == 'q' or input_sentence == 'quit': break
            # 문장을 평가합니다
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # 응답 문장을 형식에 맞춰 출력합니다
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [17]:
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64


loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# loadFilename이 제공되는 경우에는 모델을 불러옵니다
if loadFilename:
    # 모델을 학습할 때와 같은 기기에서 불러오는 경우
    checkpoint = torch.load(loadFilename)
    # GPU에서 학습한 모델을 CPU로 불러오는 경우
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')

embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)

encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [18]:
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 1000
print_every = 1
save_every = 500

# Dropout 레이어를 학습 모드로 둡니다
encoder.train()
decoder.train()

# Optimizer를 초기화합니다
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)


for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# 학습 단계를 수행합니다
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 1; Percent complete: 0.1%; Average loss: 9.2027
Iteration: 2; Percent complete: 0.2%; Average loss: 9.1231
Iteration: 3; Percent complete: 0.3%; Average loss: 8.9860
Iteration: 4; Percent complete: 0.4%; Average loss: 8.7660
Iteration: 5; Percent complete: 0.5%; Average loss: 8.2781
Iteration: 6; Percent complete: 0.6%; Average loss: 7.9006
Iteration: 7; Percent complete: 0.7%; Average loss: 7.4855
Iteration: 8; Percent complete: 0.8%; Average loss: 7.6309
Iteration: 9; Percent complete: 0.9%; Average loss: 8.1920
Iteration: 10; Percent complete: 1.0%; Average loss: 8.3932
Iteration: 11; Percent complete: 1.1%; Average loss: 8.3776
Iteration: 12; Percent complete: 1.2%; Average loss: 8.2994
Iteration: 13; Percent complete: 1.3%; Average loss: 7.6083
Iteration: 14; Percent complete: 1.4%; Average loss: 7.5981
Iteration: 15; Percent complete: 1.5%; Average loss: 7.1096
Iteration: 16; Percent complete: 1.6%; Average loss: 7.1135
Iteration: 17; Percent complete: 1.7%; Average lo

Iteration: 138; Percent complete: 13.8%; Average loss: 5.8257
Iteration: 139; Percent complete: 13.9%; Average loss: 5.5978
Iteration: 140; Percent complete: 14.0%; Average loss: 5.5835
Iteration: 141; Percent complete: 14.1%; Average loss: 5.7740
Iteration: 142; Percent complete: 14.2%; Average loss: 5.7726
Iteration: 143; Percent complete: 14.3%; Average loss: 5.4883
Iteration: 144; Percent complete: 14.4%; Average loss: 5.8711
Iteration: 145; Percent complete: 14.5%; Average loss: 5.7530
Iteration: 146; Percent complete: 14.6%; Average loss: 5.5723
Iteration: 147; Percent complete: 14.7%; Average loss: 5.7174
Iteration: 148; Percent complete: 14.8%; Average loss: 5.6991
Iteration: 149; Percent complete: 14.9%; Average loss: 5.6039
Iteration: 150; Percent complete: 15.0%; Average loss: 5.8640
Iteration: 151; Percent complete: 15.1%; Average loss: 5.7663
Iteration: 152; Percent complete: 15.2%; Average loss: 5.7373
Iteration: 153; Percent complete: 15.3%; Average loss: 5.8279
Iteratio

Iteration: 272; Percent complete: 27.2%; Average loss: 4.9375
Iteration: 273; Percent complete: 27.3%; Average loss: 4.9849
Iteration: 274; Percent complete: 27.4%; Average loss: 5.0724
Iteration: 275; Percent complete: 27.5%; Average loss: 5.1005
Iteration: 276; Percent complete: 27.6%; Average loss: 4.7670
Iteration: 277; Percent complete: 27.7%; Average loss: 5.1934
Iteration: 278; Percent complete: 27.8%; Average loss: 4.6743
Iteration: 279; Percent complete: 27.9%; Average loss: 5.1069
Iteration: 280; Percent complete: 28.0%; Average loss: 5.1421
Iteration: 281; Percent complete: 28.1%; Average loss: 5.2762
Iteration: 282; Percent complete: 28.2%; Average loss: 5.0297
Iteration: 283; Percent complete: 28.3%; Average loss: 5.0669
Iteration: 284; Percent complete: 28.4%; Average loss: 5.0440
Iteration: 285; Percent complete: 28.5%; Average loss: 5.0057
Iteration: 286; Percent complete: 28.6%; Average loss: 5.1427
Iteration: 287; Percent complete: 28.7%; Average loss: 5.0173
Iteratio

Iteration: 405; Percent complete: 40.5%; Average loss: 4.0497
Iteration: 406; Percent complete: 40.6%; Average loss: 4.1443
Iteration: 407; Percent complete: 40.7%; Average loss: 4.1473
Iteration: 408; Percent complete: 40.8%; Average loss: 3.8640
Iteration: 409; Percent complete: 40.9%; Average loss: 4.2450
Iteration: 410; Percent complete: 41.0%; Average loss: 3.9478
Iteration: 411; Percent complete: 41.1%; Average loss: 4.1852
Iteration: 412; Percent complete: 41.2%; Average loss: 3.9033
Iteration: 413; Percent complete: 41.3%; Average loss: 4.0484
Iteration: 414; Percent complete: 41.4%; Average loss: 4.2536
Iteration: 415; Percent complete: 41.5%; Average loss: 4.2222
Iteration: 416; Percent complete: 41.6%; Average loss: 3.8807
Iteration: 417; Percent complete: 41.7%; Average loss: 4.1016
Iteration: 418; Percent complete: 41.8%; Average loss: 3.8034
Iteration: 419; Percent complete: 41.9%; Average loss: 3.8387
Iteration: 420; Percent complete: 42.0%; Average loss: 3.8683
Iteratio

Iteration: 538; Percent complete: 53.8%; Average loss: 2.6291
Iteration: 539; Percent complete: 53.9%; Average loss: 2.7672
Iteration: 540; Percent complete: 54.0%; Average loss: 3.0544
Iteration: 541; Percent complete: 54.1%; Average loss: 2.9850
Iteration: 542; Percent complete: 54.2%; Average loss: 2.8469
Iteration: 543; Percent complete: 54.3%; Average loss: 2.9322
Iteration: 544; Percent complete: 54.4%; Average loss: 2.8742
Iteration: 545; Percent complete: 54.5%; Average loss: 2.8179
Iteration: 546; Percent complete: 54.6%; Average loss: 2.6806
Iteration: 547; Percent complete: 54.7%; Average loss: 2.6815
Iteration: 548; Percent complete: 54.8%; Average loss: 3.0526
Iteration: 549; Percent complete: 54.9%; Average loss: 2.9765
Iteration: 550; Percent complete: 55.0%; Average loss: 2.7991
Iteration: 551; Percent complete: 55.1%; Average loss: 2.8291
Iteration: 552; Percent complete: 55.2%; Average loss: 2.8004
Iteration: 553; Percent complete: 55.3%; Average loss: 2.7617
Iteratio

Iteration: 672; Percent complete: 67.2%; Average loss: 1.7603
Iteration: 673; Percent complete: 67.3%; Average loss: 1.7978
Iteration: 674; Percent complete: 67.4%; Average loss: 1.5383
Iteration: 675; Percent complete: 67.5%; Average loss: 1.7327
Iteration: 676; Percent complete: 67.6%; Average loss: 1.7933
Iteration: 677; Percent complete: 67.7%; Average loss: 2.0163
Iteration: 678; Percent complete: 67.8%; Average loss: 1.7083
Iteration: 679; Percent complete: 67.9%; Average loss: 1.6469
Iteration: 680; Percent complete: 68.0%; Average loss: 1.9637
Iteration: 681; Percent complete: 68.1%; Average loss: 1.6398
Iteration: 682; Percent complete: 68.2%; Average loss: 1.6787
Iteration: 683; Percent complete: 68.3%; Average loss: 2.0036
Iteration: 684; Percent complete: 68.4%; Average loss: 1.8235
Iteration: 685; Percent complete: 68.5%; Average loss: 1.9071
Iteration: 686; Percent complete: 68.6%; Average loss: 1.7570
Iteration: 687; Percent complete: 68.7%; Average loss: 1.6561
Iteratio

Iteration: 806; Percent complete: 80.6%; Average loss: 1.2993
Iteration: 807; Percent complete: 80.7%; Average loss: 0.9775
Iteration: 808; Percent complete: 80.8%; Average loss: 0.8790
Iteration: 809; Percent complete: 80.9%; Average loss: 1.0362
Iteration: 810; Percent complete: 81.0%; Average loss: 0.9483
Iteration: 811; Percent complete: 81.1%; Average loss: 1.0617
Iteration: 812; Percent complete: 81.2%; Average loss: 0.9959
Iteration: 813; Percent complete: 81.3%; Average loss: 1.0066
Iteration: 814; Percent complete: 81.4%; Average loss: 0.9373
Iteration: 815; Percent complete: 81.5%; Average loss: 0.8771
Iteration: 816; Percent complete: 81.6%; Average loss: 0.9682
Iteration: 817; Percent complete: 81.7%; Average loss: 1.0598
Iteration: 818; Percent complete: 81.8%; Average loss: 0.9549
Iteration: 819; Percent complete: 81.9%; Average loss: 0.8964
Iteration: 820; Percent complete: 82.0%; Average loss: 1.0014
Iteration: 821; Percent complete: 82.1%; Average loss: 1.0335
Iteratio

Iteration: 940; Percent complete: 94.0%; Average loss: 0.6017
Iteration: 941; Percent complete: 94.1%; Average loss: 0.6961
Iteration: 942; Percent complete: 94.2%; Average loss: 0.5969
Iteration: 943; Percent complete: 94.3%; Average loss: 0.4912
Iteration: 944; Percent complete: 94.4%; Average loss: 0.4203
Iteration: 945; Percent complete: 94.5%; Average loss: 0.5456
Iteration: 946; Percent complete: 94.6%; Average loss: 0.6472
Iteration: 947; Percent complete: 94.7%; Average loss: 0.5135
Iteration: 948; Percent complete: 94.8%; Average loss: 0.6468
Iteration: 949; Percent complete: 94.9%; Average loss: 0.5999
Iteration: 950; Percent complete: 95.0%; Average loss: 0.5529
Iteration: 951; Percent complete: 95.1%; Average loss: 0.5712
Iteration: 952; Percent complete: 95.2%; Average loss: 0.5622
Iteration: 953; Percent complete: 95.3%; Average loss: 0.4213
Iteration: 954; Percent complete: 95.4%; Average loss: 0.5444
Iteration: 955; Percent complete: 95.5%; Average loss: 0.5212
Iteratio

In [19]:
# Dropout 레이어를 평가 모드로 설정합니다
encoder.eval()
decoder.eval()

# 탐색 모듈을 초기화합니다
searcher = GreedySearchDecoder(encoder, decoder)

# 채팅을 시작합니다 (다음 줄의 주석을 제거하면 시작해볼 수 있습니다)
evaluateInput(encoder, decoder, searcher, voc)

> 안녕
Bot: 맘고생 많았어요." 기다려지겠네요." 기다려지겠네요." 기다려지겠네요."
> 안녕?
Error: Encountered unknown word.
> 대화가 이상해
Bot: 그래도 배울 점이 있으면 피하세요."
> 뭔소리야
Error: Encountered unknown word.


KeyboardInterrupt: Interrupted by user