In [1]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
import html
from io import open
import unicodedata
import string
import re
import random
import time
import math
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
from sacrebleu import raw_corpus_bleu, corpus_bleu

In [2]:
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Data Pre-processing

In [4]:
data_dir = 'data'
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
VOCAB_SIZE = 50000

In [5]:
def pkl_loader(file_name):
    with open(file_name+'.p', 'rb') as f:
        objct = pkl.load(f)
    return(objct)

def pkl_dumper(obj, file_name):
    with open(file_name+'.p', 'wb') as f:
        pkl.dump(obj, f, protocol=None)

In [6]:
def load_pretrained_wordvec(lan):
    if lan == 'zh':
        filename = 'wiki.zh.vec' #Chinese
    elif lan == 'en':
        filename = 'wiki-news-300d-1M.vec' #English
    else:
        filename = 'wiki.vi.vec' #Vietnamese
    with open(os.path.join(data_dir, filename),encoding='utf-8') as f:
        word_vecs = np.zeros((VOCAB_SIZE+4, 300))
        word_vecs[UNK_IDX] = np.random.normal(scale=0.6, size=(300, ))
        word_vecs[SOS_token] = np.random.normal(scale=0.6, size=(300, ))
        word_vecs[EOS_token] = np.random.normal(scale=0.6, size=(300, ))

        words_ft = {'<pad>': PAD_IDX,
                   '<unk>': UNK_IDX, 
                   '<sos>': SOS_token,
                   '<eos>': EOS_token}
        idx2words_ft = {PAD_IDX:'<pad>', UNK_IDX: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
        ordered_words_ft = ['<sos>', '<eos>', '<pad>', '<unk>']
        count = 0
        for i, line in enumerate(f):
            if len(idx2words_ft) >= VOCAB_SIZE+4: 
                break
            s = line.split()
            if (np.asarray(s[1:]).size == 300):
                word_vecs[count+4, :] = np.asarray(s[1:])
                words_ft[s[0]] = count+4
                idx2words_ft[count+4] = s[0]
                count += 1
    word_vecs = torch.FloatTensor(word_vecs)
    pkl_dumper(word_vecs, os.path.join(data_dir, lan + '_word_vecs'))
    pkl_dumper(words_ft, os.path.join(data_dir, lan + '_words_ft'))
    pkl_dumper(idx2words_ft, os.path.join(data_dir, lan + '_idx2words_ft'))


In [7]:
# load_pretrained_wordvec('zh')
# load_pretrained_wordvec('en')
# load_pretrained_wordvec('vi')

In [8]:
#load pretrained vectors
word_vecs = {}
word2index = {}
index2word = {}

word_vecs['en'] = pkl_loader(data_dir+'/en_word_vecs')
word_vecs['zh'] = pkl_loader(data_dir+'/zh_word_vecs')
word_vecs['vi'] = pkl_loader(data_dir+'/vi_word_vecs')
word2index['en'] = pkl_loader(data_dir+'/en_words_ft')
word2index['zh'] = pkl_loader(data_dir+'/zh_words_ft')
word2index['vi'] = pkl_loader(data_dir+'/vi_words_ft')
index2word['en'] = pkl_loader(data_dir+'/en_idx2words_ft')
index2word['zh'] = pkl_loader(data_dir+'/zh_idx2words_ft')
index2word['vi'] = pkl_loader(data_dir+'/vi_idx2words_ft')

VOCAB_SIZE = len(word2index['en'])

In [9]:
class Lang:
    def __init__(self, name, index2word, word2index):
        self.name = name
        self.word2index = word2index
        self.index2word = index2word
        self.n_words = len(index2word)


In [10]:
def normalizeString(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = html.unescape(s)
    return s

In [11]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [12]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open(data_dir+'/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang), encoding='utf-8') as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open(data_dir+'/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang), encoding='utf-8') as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    pairs = [pair for pair in pairs if (len(pair[0])>0 and len(pair[1])>0)]
    print('Read %s sentence pairs'%(len(input_ls)))
    if sourcelang == 'zh':
        input_lang = Lang(sourcelang, index2word['zh'], word2index['zh'])
    else:
        input_lang = Lang(sourcelang, index2word['vi'], word2index['vi']) ####
    output_lang = Lang(targetlang, index2word['en'], word2index['en'])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [15]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counted words:
zh 50004
en 50004
Reading lines...
Read 1261 sentence pairs
Counted words:
zh 50004
en 50004
Reading lines...
Read 1397 sentence pairs
Counted words:
zh 50004
en 50004


### Dataset

In [16]:
MAX_SENT_LEN = 38
BATCH_SIZE = 64

In [17]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])


In [18]:
class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

In [19]:
train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_tra, target_tra, pairs_val)
test_data = NMTDataset(source_tra, target_tra, pairs_tes)

### Dataloader

In [20]:
#collate function

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    
    for datum in batch: 
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
    
    max_length = [np.max(src_len), np.max(tar_len)]
    for datum in batch: 
        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0, max_length[0]-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0, max_length[1]-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        
        
    ind_dec_order = np.argsort(src_len)[::-1]
    src_data = np.array(src_data)[ind_dec_order]
    src_len = np.array(src_len)[ind_dec_order]
    tar_data = np.array(tar_data)[ind_dec_order]
    tar_len = np.array(tar_len)[ind_dec_order]
    return [torch.from_numpy(np.array(src_data)).to(device),torch.from_numpy(np.array(tar_data)).to(device),
                torch.LongTensor(np.array(src_len)).to(device),torch.LongTensor(np.array(tar_len)).to(device)]
    

In [21]:
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

val_loader = torch.utils.data.DataLoader(val_data,
                                           batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_func)


In [22]:
#WO ATTN
class EncoderCNN(nn.Module):
    def __init__(self, input_size, hidden_size, emb_size=300, pretrained=False, ker_size=3, dropout_p=0.1):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        if pretrained==True:
            word_vec = word_vecs[source_tra.name]
            self.embedding = nn.Embedding(input_size, emb_size, padding_idx=PAD_IDX)
            self.embedding.weight = nn.Parameter(word_vec)
            self.embedding.requires_grad = False
        else:
            self.embedding = nn.Embedding(input_size, emb_size, padding_idx=PAD_IDX)
          
          
        self.seq1 = nn.Sequential(nn.Conv1d(emb_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2),
                                nn.ReLU())
        self.seq2 = nn.Sequential(nn.Conv1d(hidden_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2),
                                nn.ReLU())
        self.dropout = nn.Dropout(p=dropout_p)
        self.fc = nn.Linear(hidden_size, hidden_size)
    
    def forward(self, input, hidden):
        seq_len, batch_size = input.size()
        # input size for conv1d is , N is a batch size, C denotes a number of channels, L is a length of signal sequence.
        output = self.embedding(input) ## SL*BS*ES
        output = output.permute(1, 2, 0) 
        output = self.seq1(output)
        output = self.seq2(output)
        hidden = torch.sum(output, dim=2)
        hidden = hidden.unsqueeze(0)

        output = output.permute(2, 0, 1)
        hidden = self.fc(hidden)
        hidden = F.relu(self.dropout(hidden))
        hidden = self.fc(hidden)
        #print(hidden.size(), output.size()) #output: 1*BS*HS; hidden: SL*BS*HS
        return output, hidden
    
    def initHidden(self,batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [23]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx = PAD_IDX)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        input = input.view(1,-1)
        batch_size = input.size()[1]
        #print('in Decoder, batch_size is {}'.format(batch_size))
        
        #print('in Decoder, input before embedded layer is {}, dimension is {}'.format(input,input.size()))
        output = self.embedding(input).view(1, batch_size, -1)
        #print('in Decoder, output after embedded is {}, dimension is {} \n'.format(output, output.size()))
        output = F.relu(output)
        #print('in Decoder, output after relu is {}, dimension is {} \n'.format(output, output.size()))
        #print('in Decoder, the initial hidden is {}, dimension is {}'.format(hidden, hidden.size()))
        
        output, hidden = self.gru(output, hidden)
        
        #print('in Decoder, output of GRU is {}, dimension is {}'.format(output, output.size()))
        #print('in Decoder, hidden of GRU is {}, dimension is {}'.format(hidden, hidden.size()))
        
        output = self.softmax(self.out(output[0]))
        #print('in Decoder, output after softmax is {}, dimension is {}'.format(output, output.size()))
        return output, hidden

    def initHidden(self, batch_size):
        initHidden = torch.zeros(1, batch_size, self.hidden_size, device=device)
        #print('in Decoder, initHidden is {}, dimension is {} \n'.format(initHidden, initHidden.size()))
        return initHidden

In [24]:
#function to mask loss after EOS
def mask_ind(arr):
    arr = arr.cpu().numpy()
    batch_size = arr.shape[1]

    for i in range(batch_size):
        if 1 in arr[:,i]:
            ind = np.where(arr[:,i]== 1)[0][0]
        
            arr[:,i][:ind+1]=1
            arr[:,i][ind+1:]=0
        else:
            arr[:,i]=1  
    return arr, np.count_nonzero(arr)

In [32]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer):
    
    batch_size = input_tensor.size()[1]
    #print('in train, batch size is {}'.format(batch_size))
    encoder_hidden = encoder.initHidden(batch_size)
    #print('in train, initial encoder hidden is {}, dimension is {}'.format(encoder_hidden, encoder_hidden.size()))
    
    encoder_optimizer.zero_grad()  
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size()[0]
    #print('in train, input_length is {}'.format(input_length))
    target_length = target_tensor.size()[0]
    #print('in train, target_length is {}'.format(target_length))
   
    _, context = encoder(input_tensor, encoder_hidden)
    #print('in train encoder_hidden[0] is {}, dimension is {}'.format(encoder_hidden[0],encoder_hidden[0].size()))
    #print('in train after concatenating encoder_hidden[0] and [1] is {}, dimension is {}'.format(torch.cat((encoder_hidden[0].cpu().data,encoder_hidden[1].cpu().data),dim = 1), torch.cat((encoder_hidden[0].cpu().data,encoder_hidden[1].cpu().data),dim = 1).size()))
   
    
    decoder_input = torch.tensor([[SOS_token]*batch_size], device=device)  # decoder_input: torch.Size([1, 32])
    decoder_hidden = context.to(device)
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    #print('target_tensor is {}, dimension is {}'.format(target_tensor, target_tensor.size()))
    
    
    #print('target_tensor is {}, dimension is {}'.format(target_tensor, target_tensor.size()))
    #print('sentence 3 in this batch is {}, dimension is {}'.format(convert_idx_2_sent_new(target_tensor[:,2], target_tra)))
    #print('sentence 3 in this batch is {}'.format(convert_idx_2_sent_new(target_tensor[:,2], target_tra)))
    
    if use_teacher_forcing:
        loss = 0 
        criterion = nn.NLLLoss(reduce = True, ignore_index = 2, reduction='elementwise_mean') 

        for di in range(target_length):
            #print('in teacher_forcing, step {}'.format(di))
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            
            decoder_input = target_tensor[di]  
            #print('in teacher forcing, decoder_output at current timestep is {}, dimension is {}'.format(decoder_output, decoder_output.size()))
            #print('predicted target at current timestep is {}, dimension is {}'.format(torch.argmax(decoder_output, dim=1), torch.argmax(decoder_output, dim=1).size()))
            #print('true target at current timestep is {}, dimension is {}'.format(target_tensor[i], target_tensor[i].size()))
            #print('predicted target at current timestep is {}, dimension is {}'.format(decoder_output, decoder_output.size()))
            
            temp_loss = criterion(decoder_output, target_tensor[di])
            #print ('in teacher forcing, temp loss at current step is {}'.format(temp_loss))
            #print('temp_loss for current batch, current token is {}, dimension is {}'.format(temp_loss, temp_loss.size()))
            
            loss += temp_loss
            #loss += temp_loss * mask[di:di+1].float()  
            #print('loss is {}, dimension is {}'.format(loss, loss.size()))
            #ave_loss = loss.sum()/batch_size
        ave_loss = loss/target_length
            
    else:
        loss = None 
        criterion = nn.NLLLoss(reduction='none') 
        prediction = None

        for di in range(target_length):
            #print('in non_teacher forcing, step {}'.format(di))
            
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            #print('in non_teacher forcing, topi is {}, dimension is {}'.format(topi, topi.size()))
            
            if prediction is None:
                prediction = topi.view(1,-1)
            else:
                prediction = torch.cat((prediction, topi.view(1,-1)), dim=0)
            
            #print('at current step, cumulative prediction is {}, dimension is {}'.format(prediction, prediction.size()))
            
                            
            decoder_input = topi.transpose(0,1).detach()  # detach from history as input
            #print('in non_teacher forcing, input of the current step is {}, dimension is {}'.format(topi.transpose(0,1),topi.transpose(0,1).size()))
            #print('in non_teacher forcing decoder_output at current timestep is {}, dimension is {}'.format(decoder_output, decoder_output.size()))
            
            #print('predicted target at current timestep is {}, dimension is {}'.format(torch.argmax(decoder_output, dim=1), torch.argmax(decoder_output, dim=1).size()))

            #print('true target at current timestep is {}, dimension is {}'.format(target_tensor[i], target_tensor[i].size()))
            
            temp_loss = criterion(decoder_output, target_tensor[di])
            if loss is None:
                loss = temp_loss.view(1,-1)
            else:
                loss = torch.cat((loss, temp_loss.view(1,-1)),dim=0)
            #print('temp_loss for current batch, current token is {}, dimension is {}'.format(temp_loss, temp_loss.size()))
    
    #print('Final prediction is {}'.format(prediction))
        mask, count = mask_ind(prediction)
        total_loss = torch.sum(loss * torch.from_numpy(mask).float().to(device))
        ave_loss = total_loss/count
    #print('total_loss is {}, dimension is{}'.format(total_loss, total_loss.size()))        
    ave_loss.backward()
    encoder_optimizer.step()   
    decoder_optimizer.step()
    
    #print('total valid predicted token is {}'.format(count))
    #print('ave_loss type is {}'.format(type(ave_loss)))
    #print('ave_loss.item() type is {}'.format(type(ave_loss.item())))
    
    return ave_loss.item()

### Evaluation

In [26]:
def convert_idx_2_sent_new(idx_tensor, lang_obj):
    word_list = []
    for i in idx_tensor:
        if i.item() not in set([PAD_IDX,EOS_token,SOS_token]):
            word_list.append(lang_obj.index2word[i.item()])
    sent = (' ').join(word_list)
    return sent

In [27]:
def bleu_new(corpus,truths):
    n = len(corpus)
    pred_ls = []
    true_ls = []
    for i in range(n):
        pred, true = corpus[i], truths[i]
        pred_ls.append( [convert_idx_2_sent_new(sent, target_tra) for sent in pred])
        true_ls.append([convert_idx_2_sent_new(sent, target_tra) for sent in true])
    flattened_pred  = [val for sublist in pred_ls for val in sublist]
    flattened_true  = [val for sublist in true_ls for val in sublist]
    bleu= corpus_bleu(flattened_pred, [flattened_true]).score
    return bleu

In [28]:
def evaluate(encoder, decoder, data_loader, max_length=MAX_SENT_LEN):
    start = time.time()
    encoder.eval()
    decoder.eval()
    inputs = []
    corpus = []
    truths = []
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(data_loader):
        inputs.append(input_sentences.to(device))#put into inputs: batch*seq: each row is a sentence
        input_tensor = input_sentences.transpose(0,1).to(device)
        truths.append(target_sentences.to(device))#put into truths: batch*seq: each row is a sentence
        target_tensor = target_sentences.transpose(0,1).to(device) 
        input_length = input_tensor.size()[0]
        batch_size = input_tensor.size()[1]
    
        
        encoder_hidden = encoder.initHidden(batch_size)
       
        _, context = encoder(input_tensor, encoder_hidden)
       
        decoder_hidden = context.to(device)
        decoder_input = torch.tensor([[SOS_token]*batch_size], device=device) 
        decoded_words = torch.zeros(batch_size, max_length)
    
    

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            decoded_words[:,di] = topi.squeeze()  #put into decoded_words: batch*seq
            decoder_input = topi.transpose(0,1).detach()
            #print('true target is {}, dimension is {}'.format(target_tensor[:,di],target_tensor[di].size()))
            #print('before transpose, topi is {}, dimension is {}'.format(topi, topi.size()))
            #print('after transpose, topi is {}, dimension is {}'.format(topi.transpose(0,1),topi.transpose(0,1).size()))
        corpus.append(decoded_words)
        
        #print('last: decoded_words is {}, dimension is {}'.format(decoded_words, decoded_words.size()))
        #print('last: inputs is {}, dimension is {}'.format(inputs, len(inputs)))
        #print('last: truths is {}, dimension is {}'.format(truths, len(truths)))
        #print(inputs[0].size(), corpus[0].size(), truths[0].size())
    return inputs, corpus, truths



### Train

In [29]:
hyper = {
    'HIDDEN_SIZE': 512,
    'LR': 0.0004,
    'EVA_EVERY': 200,
    'DROP_OUT': 0.3,
    'TEACHER_RATIO': 0.9,
    'N_LAYERS': 1,
    'KER_SIZE': 3,
    'NUM_EPOCHS': 20   
}

In [37]:
hidden_size = hyper['HIDDEN_SIZE']
learning_rate = hyper['LR']
dropout_p = hyper['DROP_OUT']
teacher_forcing_ratio = hyper['TEACHER_RATIO']
n_layers = hyper['N_LAYERS']
ker_size = hyper['KER_SIZE']
num_epoch = hyper['NUM_EPOCHS']
eva_every = hyper['EVA_EVERY']
early_stopping = True
patience = 30
required_progress = 0.0001
best_score = None
count = 0
start_epoch=0
filename = 'best'

encoder1 = EncoderCNN(source_tra.n_words,hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, target_tra.n_words).to(device)

start = time.time()

encoder_optimizer = optim.Adam(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder1.parameters(), lr=learning_rate)
# decoder_scheduler = ExponentialLR(decoder_optimizer, gamma=0.95)
# encoder_scheduler = ExponentialLR(decoder_optimizer, gamma=0.95)

whole_train_loss = []
whole_val_bleu = []
for epoch in range(1, num_epoch + 1):
    print_bleu_score_val = []
    print_losses = []
    print_loss_total = 0    
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
        encoder1.train()
        decoder1.train()
        input_tensor = input_sentences.transpose(0,1)   
        target_tensor = target_sentences.transpose(0,1)
        loss = train(input_tensor, target_tensor, encoder1,
                     decoder1, encoder_optimizer, decoder_optimizer)
        print_loss_total += loss
        
        if i > 0 and i % eva_every == 0:
            inputs, corpus, truths = evaluate(encoder1, decoder1, val_loader, max_length=MAX_SENT_LEN)
            bleu_score_val_avg = bleu_new(corpus, truths)

            print_loss_avg = print_loss_total / eva_every
            print_loss_total = 0
            print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}, BLEU: {}'.format(
                timeSince(start, i + 1/len(train_loader)), epoch, num_epoch, i, 
                len(train_loader),print_loss_avg,bleu_score_val_avg))
            
            print_bleu_score_val.append(bleu_score_val_avg)
            if best_score is None:
                  best_score = bleu_score_val_avg
            if bleu_score_val_avg < best_score + required_progress:
                  count += 1
            elif bleu_score_val_avg > best_score:
                state = {'epoch': start_epoch + epoch + 1, 
                           'state_dict_enc': encoder1.state_dict(),
                           'state_dict_dec': decoder1.state_dict(), 
                           'best_accuracy': best_score, 
                           'optimizer_enc': encoder_optimizer.state_dict(),
                          'optimizer_dec': decoder_optimizer.state_dict()}
                print ('new best achieved')
                torch.save(state, filename+'.pth.tar')
                best_score = bleu_score_val_avg
                count = 0
            if early_stopping:
                if count >= patience:
                    print("earily stop triggered")


            print('\nInput1:> %s'%(' '.join([source_tra.index2word[i.item()] for i in inputs[0][3] if i.item() not in set([PAD_IDX,EOS_token,SOS_token])])))
            print('\nTarget1:= %s'%(convert_idx_2_sent_new(truths[0][3], target_tra)),
                    '\nPredict1:< %s' %(convert_idx_2_sent_new(corpus[0][3], target_tra)))
            
            print('\nInput2:> %s'%(' '.join([source_tra.index2word[i.item()] for i in inputs[1][3] if i.item() not in set([PAD_IDX,EOS_token,SOS_token])])))
            print('\nTarget2:= %s'%(convert_idx_2_sent_new(truths[1][3], target_tra)),
                    '\nPredict2:< %s' %(convert_idx_2_sent_new(corpus[1][3], target_tra)))
    if early_stopping:
        if count >= patience:
            break

    whole_train_loss.append(print_loss_avg)
    whole_val_bleu.append(print_bleu_score_val)
    pkl.dump(whole_val_bleu, open(data_dir+'NOATTNCNN_bleu_score_list.pkl','wb'))
    pkl.dump(whole_train_loss,open(data_dir+'NOATTNCNN_train_loss.pkl','wb'))
    pkl.dump(truths, open(data_dir+'NOATTNCNN_truths.pkl','wb'))
    pkl.dump(corpus,open(data_dir+'NOATTNCNN_corpus.pkl','wb'))

Time: 1m 2s (- -2m 57s), Epoch: [1/20], Step: [200/3327], Train Loss: 6.2429876494407655, BLEU: 2.6154890736512644

Input1:> 他 出身 阿富汗 <unk> <unk> 地区 有着 与 他人 不同 的 见解 他 坚持 让 他 的 女儿 我 的 母亲 去 上学 并 因此 被迫 与 他 的 父亲 断绝 父子 <unk> <unk> 关系

Target1:= A total maverick from a remote province of Afghanistan , he insisted that his daughter , my mom , go to school , and for that he was <unk> by his father <unk> 
Predict1:< And the the of the of the , and the , and the , and the , and the world , and the world , and the world , and the world , and the <unk> <unk>

Input2:> 我 本 以为 只有 我们 意大利 <unk> 大利 <unk> 在 非洲 栽 了 <unk> 但 当 我 知道 美国 国人 的 情况 英国 <unk> 国人 的 情况 法国 <unk> 国人 的 情况 当 看 了 他们 的 所作

Target2:= I thought it was only us Italians <unk> around Africa , but then I saw what the Americans were doing , what the English were doing , what the French were doing , and after seeing what they 
Predict2:< And I , the , and , and , and , and , and the , and , and the , and , and the , and the , and the world , and 

KeyboardInterrupt: 