In [1]:
#!pip3 install jieba

In [2]:
#!python preprocess_translation/token_zh_en.py

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import operator
from torch.utils.data import Dataset
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import sacrebleu
from masked_cross_entropy import *

import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD_token = 0
SOS_token = 1
EOS_token = 2
vocab_size = 60000
hidden_size = 256
# emb_size = 256
MAX_LENGTH = 100 # since 99% source sentence is <= 100
# MAX_LENGTH_1 = max(len(pair[0].split(" ")) for pair in pairs)
# MAX_LENGTH_2 = max(len(pair[1].split(" ")) for pair in pairs)
dropout_p = 0.1
teacher_forcing_ratio = 0.5
BATCH_SIZE = 32

## Loading Data

In [9]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2:"EOS", 3:"UNK"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
def normalizeEnString(s):
#    s = unicodeToAscii(s.lower().strip())
#    s = re.sub(r"&apos;", r" ", s)
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s
def normalizeChString(s):
#    s = re.sub(r"([。！？])", r" \1", s)
#    s = re.sub(r"[.-*]+", r" ", s)
    return s.strip() 

In [11]:
normalizeEnString("It &apos;s very pretty , and it has rapidly started to overgrow the once very rich biodiversity of the northwestern Mediterranean .")

'It apos s very pretty and it has rapidly started to overgrow the once very rich biodiversity of the northwestern Mediterranean .'

In [12]:
def readLangs(lang1, lang2, data='train'):
    #data: train/dev/test
    print("Reading lines...")

    # Read the file and split into lines
    zh_lines = open('iwslt-zh-en/{}.tok.zh'.format(data)).read().split('\n')
    en_lines = open('iwslt-zh-en/{}.tok.en'.format(data)).read().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeChString(element[0]), normalizeEnString(element[1])] for element in zip(zh_lines, en_lines)]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [13]:
def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
#    pairs = filterPairs(pairs)
#    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


In [111]:
def build_topwordVocab(lang, vocab_size):
    print("Build vocabulary by top {} frequent word...".format(vocab_size))
    sorted_word2Count = sorted(lang.word2count.items(),
        key=operator.itemgetter(1),
        reverse=True)
    sorted_words = [x[0] for x in sorted_word2Count[:vocab_size]]
    
    lang.word2index = {}

    for ind, word in enumerate(sorted_words):
            lang.word2index[word] = ind + 4

#     lang.word2index = {}
    lang.index2word = {}
    lang.index2word[0] = "PAD"
    lang.index2word[1] = "SOS"
    lang.index2word[2] = "EOS"
    lang.index2word[3] = "UNK"

    for ind, word in enumerate(sorted_words):
        lang.index2word[ind + 4] = word
    
    lang.n_words = len(lang.index2word)
    
    print(lang.name, lang.n_words)
    return lang

input_lang, output_lang, pairs = prepareData('ch', 'eng')

input_lang = build_topwordVocab(input_lang,vocab_size=85000)
output_lang = build_topwordVocab(output_lang, vocab_size=85000)
print(random.choice(pairs))

Reading lines...
Read 213378 sentence pairs
Counting words...
Counted words:
ch 91144
eng 59373
Build vocabulary by top 85000 frequent word...
ch 85004
Build vocabulary by top 85000 frequent word...
eng 59373
['最 开始 我要 追溯到   我 还 在 俄亥俄州 巴伯 顿 的 欧克 代尔 学校 上 三年级 的 时候', 'I apos m going to start way back in the third grade at Oakdale School in Barberton Ohio .']


In [112]:
input_lang.n_words

85004

In [16]:
_, _, val_pairs = readLangs('ch', 'eng', 'dev')

Reading lines...


In [17]:
val_pairs = val_pairs[:-1] # since last line is '',''

In [18]:
print(random.choice(val_pairs))

['我 欣然同意 。', 'And I said quot Yes . quot ']


In [19]:
val_input = [normalizeChString(line) for line in open('iwslt-zh-en/dev.tok.zh').read().split('\n')]
val_output = [normalizeEnString(line) for line in open('iwslt-zh-en/dev.tok.en').read().split('\n')]
val_inputs = []
val_outputs = []
for element in zip(val_input, val_output):
    val_inputs.append(element[0])
    val_outputs.append(element[1])

In [104]:
# sorted_word2Count = sorted(output_lang.word2count.items(),
#     key=operator.itemgetter(1),
#     reverse=True)

In [105]:
# output_lang.word2index ###标点符号排第一 之后要改掉

# Preparing Training Data

In [121]:
def indexesFromSentence(lang, sentence):
    idxs = []
    for word in sentence.split(' '):
        try:
            idxs.append(lang.word2index[word])
        except KeyError:
            idxs.append(3)  # 3 is the id of 'UNK'
    idxs.append(EOS_token)
    return idxs

# def tensorFromSentence(lang, sentence):
#     indexes = indexesFromSentence(lang, sentence)
#     indexes.append(EOS_token)
#     return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


# def tensorsFromPair(pair):
#     input_tensor = tensorFromSentence(input_lang, pair[0])
#     target_tensor = tensorFromSentence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

class VocabDataset(Dataset):
    def __init__(self, pairs):
#         pairs = [tensorsFromPair(pair) for pair in pairs]
#         self.source_sent_list = [i[0] for i in pairs]
#         self.target_sent_list = [i[1] for i in pairs]
        
        self.source_sent_list = [indexesFromSentence(input_lang,pair[0]) for pair in pairs]
        self.target_sent_list = [indexesFromSentence(output_lang,pair[1]) for pair in pairs]
        
    def __len__(self):
        return len(self.source_sent_list)
        
    def __getitem__(self, key):
        token1_idx = self.source_sent_list[key][:MAX_LENGTH]
        token2_idx = self.target_sent_list[key][:MAX_LENGTH]
        return [token1_idx,token2_idx, len(token1_idx), len(token2_idx)]

    
def Vocab_collate_func(batch):
    source_sent_list = []
    target_sent_list = []
    source_len_list = []
    target_len_list = []

    for datum in batch:   ### batch = sample
        source_len_list.append(datum[2])
        target_len_list.append(datum[3])

    # padding
    for datum in batch:
        
        # source sentence processing
        padded_source = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH-datum[2])),          ### 0代表左边没有pad,右边的值代表右边pad的个数
                                mode="constant", constant_values=PAD_token)
        source_sent_list.append(padded_source)
        
        # target sentence processing
        padded_target = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_LENGTH-datum[3])),          ### 0代表左边没有pad,右边的值代表右边pad的个数
                                mode="constant", constant_values=PAD_token)
        target_sent_list.append(padded_target)
        
    return [torch.tensor(source_sent_list,device = device), 
            torch.tensor(target_sent_list,device = device),
            torch.LongTensor(source_len_list,device = device), 
            torch.LongTensor(target_len_list,device = device)]

train_dataset = VocabDataset(pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=Vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                        batch_size=BATCH_SIZE,
                                        collate_fn=Vocab_collate_func,
                                        shuffle=False)

In [122]:
for i,(inputs, outputs, len1, len2) in enumerate(train_loader):
    a, b = outputs, len1
    break

In [123]:
a

tensor([[ 1594,  9489,  6646,  ...,     0,     0,     0],
        [  542,  6291,     4,  ...,     0,     0,     0],
        [   32,    20,   537,  ...,     0,     0,     0],
        ...,
        [ 3787,    81, 57886,  ...,     0,     0,     0],
        [   56,     6,    41,  ...,     0,     0,     0],
        [ 2988,   439,  3670,  ...,     0,     0,     0]])

In [124]:
a[0,:]

tensor([1594, 9489, 6646,  286,   61,    5,  565, 2454,   83, 4014,    7, 5938,
         588,    9,   18,    6,  169,  200,  173,  588,  513,  421,   16,   62,
           9, 9328,   16,   26,  215,   29,    6,   31, 4050,   28,  115, 2709,
          10, 2035,    4,    2,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

# Build Encoder-Decoder

In [125]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
#        self.linear = nn.Linear(2*hidden_size, hidden_size)
    def forward(self, input, hidden, batch_size):
        embedded = self.embedding(input).view(1, batch_size, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
#        output = self.linear(output)

        output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        return output, hidden
        #output: eq_len, batch, num_directions * hidden_size
        #hidden: num_layers * num_directions, batch, hidden_size)
        
    def initHidden(self, batch_size):
        return torch.zeros(2, batch_size, self.hidden_size, device=device)

In [126]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0, n_layers=1,max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.n_layers = n_layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size, padding_idx=0)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, batch_size):
        embedded = self.embedding(input).view(1, batch_size, -1)
        embedded = self.dropout(embedded)
        
#         embedded: torch.Size([1, 32, 256])
#         hidden: torch.Size([1, 32, 256])
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)   
#         attn_weights:torch.Size([32, 100])
        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs.transpose(0,1))
#         encoder_outputs: 100*32*256 attn_applied: 32*1*256
        output = torch.cat((embedded[0], attn_applied.transpose(0,1)[0]), 1)

        # output: 32*768
        output = self.attn_combine(output).unsqueeze(0)
        # output 1*32*256
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
#         output: torch.Size([32, 69126])
#         hidden: torch.Size([1, 32, 256])
        return output, hidden, attn_weights

# Testing the models

# Training Model

In [127]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [128]:
# this is just one sentence input, could be batchlized 
def train(input_tensor, target_tensor, input_lengths, target_lengths, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, clip=50.0, max_len=MAX_LENGTH, mask = None):
    encoder_optimizer.zero_grad()  # zero out the accumulated gradient over mini-batch
    decoder_optimizer.zero_grad()
    
    batch_size = input_tensor.size(1)

#     input_length = input_tensor.size(0) # length of source sentence
#     target_length = target_tensor.size(0)


#    encoder_outputs, encoder_hidden = encoder(input_tensor, input_lengths, 0)
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_outputs = torch.zeros(max_len, batch_size, encoder.hidden_size, device=device) 
 
    # feed-forward layer resulting encoder outputs, ei refers to each word token in input sentence
    for ei in range(max_len):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden, batch_size)  
        encoder_outputs[ei] = encoder_output[0] 

    #encoder_outputs:  # max_len x batch_size x hidden_size
    #hidden: n_layers * 2 x batch_size x hidden_size
    loss = 0

    
    
    decoder_input = torch.tensor([[SOS_token]*batch_size], device=device)  # decoder_input: torch.Size([1, 32])
    # init decoder hidden 
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
#    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_len, batch_size, decoder.output_size))
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(max_len):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs, batch_size)
            
            decoder_input = target_tensor[di]  # Teacher forcing
            all_decoder_outputs[di] = decoder_output
#            temp_loss = criterion(decoder_output, target_tensor[di])
#            loss += temp_loss * mask[di:di+1].float()  
#            loss += temp_loss.float()
#            ave_loss = loss.sum()/batch_size 
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(max_len):
            # decoder_input: torch.Size([1, 32])
            # decoder_hidden: torch.Size([1, 32, 256]) 1 token * batch * hidden size
            # encoder_outputs: torch.Size([100, 32, 512])
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs, batch_size)
            topv, topi = decoder_output.topk(1)
            # topv: 32*1
            # topi: 32*1

            decoder_input = topi.squeeze().detach()  # detach from history as input
            all_decoder_outputs[di] = decoder_output
            # decoder_input: 32
            # target_tensor: 100*32
            # decoder_output: 32*69127 
#            temp_loss = criterion(decoder_output, target_tensor[di])
#            loss += temp_loss * mask[di:di+1].float()
#            loss += temp_loss.float()
            # loss size 1*32
#            ave_loss = loss.sum()/batch_size  
            
    # Loss calculation and backpropagation

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_tensor.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths
    )
    loss.backward()
    #    ave_loss.backward()
    
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)


    
    encoder_optimizer.step()   # update parameters
    decoder_optimizer.step()

    return loss.data[0]

In [129]:
def trainIters(encoder, decoder, n_iters, print_every=100, plot_every=100, learning_rate=0.005):
    start = time.time()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(reduction='none') ##!!!!!!!!!!1 这个loss是否要换成crossentropy

    for epoch in range(1, n_iters + 1):
        plot_losses = []
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
            input_tensor = input_sentences.transpose(0,1)   # 13*100 to 100*13
            target_tensor = target_sentences.transpose(0,1)
            mask = target_tensor.ge(1)   # 100 * 13
            loss = train(input_tensor, target_tensor, len1, len2, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion, mask = mask)
            print_loss_total += loss
            plot_loss_total += loss

            if i > 0 and i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                bleu_score, (sys_sents, ref_sents) = test_model(encoder, decoder, val_loader)
                print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}, Validation Score: {} \n Predicted sentence: {} \n Reference sentence: {}'.format(
                    timeSince(start, i + 1/len(train_loader)), epoch, n_iters, i, 
                    len(train_loader),print_loss_avg, bleu_score, sys_sents, ref_sents))

            if i > 0 and i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
        print(plot_losses)
        showPlot(plot_losses)

# Plotting results

In [130]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

# Evaluation

In [131]:
def get_batch_outputs(encoder, decoder, input_sentences, max_length=MAX_LENGTH): 
    with torch.no_grad():
        input_tensor = input_sentences.transpose(0,1)   # 32*100 to 100*32
        batch_size = input_tensor.size(1)
        encoder_hidden = encoder.initHidden(batch_size)

#        input_length = input_tensor.size(0) # length of source sentence
        encoder_outputs = torch.zeros(max_length, batch_size, encoder.hidden_size, device=device) 
        
        for ei in range(max_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], 
                                                     encoder_hidden, batch_size)  
            encoder_outputs[ei] = encoder_output[0] 
            

            
        decoder_input = torch.tensor([[SOS_token]*batch_size], device=device)  # decoder_input: torch.Size([1, 32])
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        
        decoded_words = np.empty((max_length, batch_size), dtype=object)
        
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs, batch_size)
            topv, topi = decoder_output.topk(1)

#            topi_lst = topi.squeeze().detach().tolist()
            decoder_input = topi.squeeze().detach()  # detach from history as input
            
            decoded_words[di:] = np.array(['<EOS>' if idx==EOS_token else output_lang.index2word[idx] for idx in decoder_input.tolist()])
        
        return decoded_words.transpose()
        

In [150]:
def test_model(encoder, decoder, loader):
    score = []
    for i, (input_sentences, target_sentences, len1, len2) in enumerate(loader):
        batch_size = input_sentences.size(0)
#        print(batch_size)
        sys_sentences = []
        for sentence in get_batch_outputs(encoder, decoder, input_sentences):
            try:
                end_idx = sentence.tolist().index('<EOS>')
                sys_sentences.append(' '.join(sentence[:end_idx]))
            except ValueError:
                sys_sentences.append(' '.join(sentence))
#        sys_sentences = [' '.join(sentence) for sentence in get_batch_outputs(encoder, decoder, input_sentences)]
        ref_sentences = [val_pair[1] for val_pair in val_pairs[i*batch_size:(i+1)*batch_size]]
#        print(ref_sentences)
        score_batch = [sacrebleu.corpus_bleu([sys],[[ref]]).score for sys, ref in zip(sys_sentences, ref_sentences)]
        score.append(sum(score_batch)/len(score_batch))
    idx = random.randint(0,min(len(sys_sentences),len(ref_sentences)))
    return sum(score)/len(score), (sys_sentences[idx], ref_sentences[idx])


In [151]:
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0).to(device)

# test_model(encoder1, attn_decoder1, val_loader)

(0.1807314790419282,
 ('house Dunlop forging Berner solid rough expertise Starters Writing accounted thickening Dentistry Dentistry Dentistry Grasshopper Dentistry Dentistry Dentistry Dentistry looking teardrop expertise types chlamydia reenacted pulses Lithuania complex expertise Starters Writing Berner expertise solid Starters Writing accounted thickening thickening frequent wrecked Donaldson Miwa accounted thickening complex expertise types chlamydia reenacted pulses Lithuania complex expertise Starters Writing Berner expertise solid Starters Writing accounted thickening thickening frequent wrecked Donaldson Miwa accounted thickening complex expertise types chlamydia reenacted pulses Lithuania complex expertise Starters Writing Berner expertise solid Starters Writing accounted thickening thickening frequent wrecked Donaldson Miwa accounted thickening complex expertise types chlamydia reenacted',
  'So I set it up in my home two years ago and since then we have never experienced any 

In [134]:
# def evaluateRandomly(encoder, decoder, n=10):
#     for i in range(n):
#         pair = random.choice(pairs)
#         print('>', pair[0])
#         print('=', pair[1])
#         output_words = generate_output(encoder, decoder, pairs)
#         output_sentence = ' '.join(output_words)
#         print('<', output_sentence)
#         print('')

# TRAINING AND EVALUATING

In [None]:
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0).to(device)

trainIters(encoder1, attn_decoder1, 3, print_every=20, plot_every=1)

torch.save(encoder1.state_dict(), "saved_model/encoder_hiddenSize{}".format(hidden_size))
torch.save(attn_decoder1.state_dict(), "saved_model/attn_decoder_hiddenSize{}".format(hidden_size))



Time: 11m 47s (- -12m 47s), Epoch: [1/3], Step: [20/6669], Train Loss: 1.673552219390869, Validation Score: 0.272699926481706 
 Predicted sentence: . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
 Reference sentence: Thanks .
