In [1]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
import os
from io import open
import unicodedata
import string
import re
import random
import torch
import sacrebleu
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import ExponentialLR
import time
import math
import html
from sacrebleu import corpus_bleu, raw_corpus_bleu

device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [2]:
data_dir = '/home/yh1844/data'
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
VOCAB_SIZE = 50000
BATCH_SIZE = 64

In [3]:
def pkl_loader(file_name):
    with open(file_name+'.p', 'rb') as f:
        objct = pkl.load(f)
    return(objct)

def pkl_dumper(obj, file_name):
    with open(file_name+'.p', 'wb') as f:
        pkl.dump(obj, f, protocol=None)

In [10]:
def load_pretrained_wordvec(lan):
    if lan == 'zh':
        filename = 'wiki.zh.vec'
    elif lan == 'en':
        filename = 'wiki-news-300d-1M.vec'
    else:
        filename = 'wiki.vi.vec' #Vietnamese
    with open(os.path.join(data_dir, filename),encoding='utf-8') as f:
        word_vecs = np.zeros((VOCAB_SIZE+4, 300))
        word_vecs[UNK_IDX] = np.random.normal(scale=0.6, size=(300, ))
        word_vecs[SOS_token] = np.random.normal(scale=0.6, size=(300, ))
        word_vecs[EOS_token] = np.random.normal(scale=0.6, size=(300, ))

        words_ft = {'<pad>': PAD_IDX,
                   '<unk>': UNK_IDX, 
                   '<sos>': SOS_token,
                   '<eos>': EOS_token}
        idx2words_ft = {PAD_IDX:'<pad>', UNK_IDX: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
        ordered_words_ft = ['<sos>', '<eos>', '<pad>', '<unk>']
        count = 0
        for i, line in enumerate(f):
            if i == 0:
                continue
            if len(idx2words_ft) >= VOCAB_SIZE: 
                break
            s = line.split()
            if (np.asarray(s[1:]).size != 300):
                print(lan, i, np.asarray(s[1:]).size)
                continue
            word_vecs[count+4, :] = np.asarray(s[1:])
            words_ft[s[0]] = count+4
            idx2words_ft[count+4] = s[0]
            ordered_words_ft.append(s[0])
            count += 1
    word_vecs = torch.FloatTensor(word_vecs)
    pkl_dumper(word_vecs, os.path.join(data_dir, lan + '_word_vecs'))
    pkl_dumper(words_ft, os.path.join(data_dir, lan + '_words_ft'))
    pkl_dumper(idx2words_ft, os.path.join(data_dir, lan + '_idx2words_ft'))
    pkl_dumper(ordered_words_ft, os.path.join(data_dir, lan + '_ordered_words_ft'))


In [4]:
# load_pretrained_wordvec('zh')
# load_pretrained_wordvec('en')
# load_pretrained_wordvec('vi')

In [5]:
#load pretrained vectors
word_vecs = {}
word2index = {}
index2word = {}

word_vecs['en'] = pkl_loader(data_dir+'/en_word_vecs')
word_vecs['zh'] = pkl_loader(data_dir+'/zh_word_vecs')
word_vecs['vi'] = pkl_loader(data_dir+'/vi_word_vecs')
target_pre_trained_emb = word_vecs['en']
word2index['en'] = pkl_loader(data_dir+'/en_words_ft')
word2index['zh'] = pkl_loader(data_dir+'/zh_words_ft')
word2index['vi'] = pkl_loader(data_dir+'/vi_words_ft')
index2word['en'] = pkl_loader(data_dir+'/en_idx2words_ft')
index2word['zh'] = pkl_loader(data_dir+'/zh_idx2words_ft')
index2word['vi'] = pkl_loader(data_dir+'/vi_idx2words_ft')

VOCAB_SIZE = len(word2index['en'])

In [7]:
class Lang:
    def __init__(self, name, index2word, word2index):
        self.name = name
        self.word2index = word2index
        self.index2word = index2word
        self.n_words = len(index2word)

In [8]:
def normalizeString(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = html.unescape(s)
    return s

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [9]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open(data_dir+'/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang), encoding='utf-8') as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open(data_dir+'/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang), encoding='utf-8') as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    pairs = [pair for pair in pairs if (len(pair[0])+len(pair[1]))>0]
    print('Read %s sentence pairs'%(len(input_ls)))
    if sourcelang == 'zh':
        input_lang = Lang(sourcelang, index2word['zh'], word2index['zh'])
    else:
        input_lang = Lang(sourcelang, index2word['vi'], word2index['vi'])
    output_lang = Lang(targetlang, index2word['en'], word2index['en'])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [12]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counted words:
zh 50004
en 50004
Reading lines...
Read 1261 sentence pairs
Counted words:
zh 50004
en 50004
Reading lines...
Read 1397 sentence pairs
Counted words:
zh 50004
en 50004


In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])

class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_tra, target_tra, pairs_val)
test_data = NMTDataset(source_tra, target_tra, pairs_tes)


#collate function


def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    
    for datum in batch: 
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
    
    max_length = [np.max(src_len), np.max(tar_len)]
    for datum in batch: 
        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0, max_length[0]-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0, max_length[1]-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        
        
    ind_dec_order = np.argsort(src_len)[::-1]
    src_data = np.array(src_data)[ind_dec_order]
    src_len = np.array(src_len)[ind_dec_order]
    tar_data = np.array(tar_data)[ind_dec_order]
    tar_len = np.array(tar_len)[ind_dec_order]
    return [torch.from_numpy(np.array(src_data)).to(device),torch.from_numpy(np.array(tar_data)).to(device),
                torch.LongTensor(np.array(src_len)).to(device),torch.LongTensor(np.array(tar_len)).to(device)]

train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True, collate_fn=collate_func)
val_loader = torch.utils.data.DataLoader(val_data,
                                           batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_func)

In [15]:
MAX_SENT_LEN = 38

In [49]:
hyper = {
    'HIDDEN_SIZE': 512,
    'LR': 0.0004,
    'EVA_EVERY': 200,
    'DROP_OUT': 0,
    'TEACHER_RATIO': 0,
    'N_LAYERS': 1,
    'KER_SIZE': 3,
    'NUM_EPOCHS': 20   
}

In [50]:
class EncoderCNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, pretrained=True, ker_size=3, dropout_p=0.1):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        if pretrained==True:
            word_vec = word_vecs[source_tra.name]
            self.embedding = nn.Embedding(input_size, emb_size, padding_idx=PAD_IDX)
            self.embedding.weight = nn.Parameter(word_vec)
            self.embedding.requires_grad = False
        else:
            self.embedding = nn.Embedding(input_size, emb_size, padding_idx=PAD_IDX)
          
          
        self.seq1 = nn.Sequential(nn.Conv1d(emb_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2),
                                nn.ReLU())
        self.seq2 = nn.Sequential(nn.Conv1d(hidden_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2),
                                nn.ReLU())
                                #nn.MaxPool1d(kernel_size=ker_size, stride=1, padding=(ker_size-1)//2))
        self.dropout = nn.Dropout(p=dropout_p)
        self.fc = nn.Linear(hidden_size, hidden_size)
    
    def forward(self, input, hidden=None): 
        seq_len, batch_size = input.size() # input size is torch.Size([38, 64])
        # input size for conv1d is , N is a batch size, C denotes a number of channels, L is a length of signal sequence.
        output = self.embedding(input) #
#         print('output size is', output.size())
        output = output.permute(1, 2, 0)  #
#         print('output size is', output.size())
        #.permute()view(batch_size, -1, seq_len)
        output = self.seq1(output)
#         print('output size is', output.size())
        output = self.seq2(output)
        hidden = torch.sum(output, dim=2)

        hidden = hidden.unsqueeze(0)


        output = output.permute(2, 0, 1)
        hidden = self.fc(hidden)
        hidden = F.relu(self.dropout(hidden))
        hidden = self.fc(hidden)


        return output, hidden
    

In [52]:
"Some code was inspired by Bahdanau's work.(Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio. 'Neural machine translation by jointly learning to align and translate.' arXiv preprint arXiv:1409.0473 (2014)."
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.normal_(mean=0, std=stdv)

    def forward(self, hidden, encoder_outputs):
        
        max_len = encoder_outputs.size(0)
        Hid_copy = hidden.repeat(max_len,1,1).transpose(0,1)
        encoder_outputs = encoder_outputs.transpose(0,1) 
        attn_energies = self.score(Hid_copy,encoder_outputs) 
        return F.softmax(attn_energies, dim=1).unsqueeze(1) 

    def score(self, hidden, encoder_outputs):
        score = F.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        score = score.transpose(2,1) 
        v = self.v.repeat(encoder_outputs.data.shape[0],1).unsqueeze(1) 
        score = torch.bmm(v,score) 
        return score.squeeze(1) 

class AttnDecoderRNN(nn.Module):
    def __init__(self, hid_size, emb_size, out_size, pretra = False,n_layers=1):
        super(AttnDecoderRNN, self).__init__()
       
        self.n_layers = n_layers
        self.hid_size = hid_size
        self.emb_size = emb_size
        self.out_size = out_size
          
        if pretra:
            emb_mat = torch.from_numpy(target_pre_trained_emb.numpy()).float()
            self.embedding = nn.Embedding.from_pretrained(emb_mat,freeze = True)
        else:
            self.embedding = nn.Embedding(out_size, emb_size)
       
        self.attn = Attn('concat', hid_size)
        self.gru = nn.GRU(hid_size + emb_size, hidden_size, n_layers, dropout=0.1)
      
        self.out = nn.Linear(hid_size, out_size)



    def forward(self, word_input, last_hidden, encoder_outputs):
        
        
#         print('dec, word input', word_input.size())
        word_embedded = self.embedding(word_input.squeeze()).view(1, -1, self.emb_size) 
       
#         print('dec, wembd', word_embedded.size())
        
#         print('dec, last hidden', last_hidden.size())
#         print('dec, enc outp', encoder_outputs.size())
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  
        context = context.transpose(0, 1) 
#         print('dec, context', context.size())
#         print('dec, wembd', word_embedded.size())
        
        rnn_input = torch.cat((word_embedded, context), 2)
#         print('dec rnn input', rnn_input.size())
#         print('dec last hidden', last_hidden.size())
        
        output, hidden = self.gru(rnn_input, last_hidden[-1].unsqueeze(0))
        output = output.squeeze(0)  
        
        output = F.log_softmax(self.out(output))
        
        return output, hidden, attn_weights

In [20]:
def train(input_tensor, target_tensor, input_len, target_len, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_SENT_LEN):
    
    hidden_size = hyper['HIDDEN_SIZE']
    learning_rate = hyper['LR']
    dropout_p = hyper['DROP_OUT']
    teacher_forcing_ratio = hyper['TEACHER_RATIO']
    n_layers = hyper['N_LAYERS']
    ker_size = hyper['KER_SIZE']
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    scheduler_encoder = ExponentialLR(encoder_optimizer, 0.95) 
    scheduler_decoder = ExponentialLR(decoder_optimizer, 0.95) 

    max_input_len = max(input_len)
    max_target_len = max(target_len)
    
    batch_size = input_tensor.size()[1]
    
    
    loss = 0
    
    # feed-forward layer resulting encoder outputs, ei refers to each word token in input sentence
    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_input = torch.tensor([[SOS_token]*batch_size], device=device) 
    
    decoder_hidden = encoder_hidden#torch.cat([encoder_hidden[0, :, :].unsqueeze(0), encoder_hidden[1, :, :].unsqueeze(0)], dim = 0)

     
    #print('input to decoder:', decoder_input.size(), decoder_hidden.size(), encoder_outputs.size())
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing: 
        for di in range(max_target_len):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)

            decoder_input = target_tensor[di].view(batch_size,1)  
            loss += criterion(decoder_output, target_tensor[di])    
    else:
  
        for di in range(max_target_len):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().unsqueeze(1)
    
            loss += criterion(decoder_output, target_tensor[di])

    loss.backward()
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), 3)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), 3)
    encoder_optimizer.step() 
    decoder_optimizer.step()

    return loss.item() / float(max_target_len)

In [21]:
def bleu_new(corpus,truths):
    n = len(corpus)
    pred_ls = []
    true_ls = []
    for i in range(n):
        pred, true = corpus[i], truths[i]
        pred_ls.append( [convert_idx_2_sent_new(sent, target_tra) for sent in pred])
        true_ls.append([convert_idx_2_sent_new(sent, target_tra) for sent in true])
    flattened_pred  = [val for sublist in pred_ls for val in sublist]
    flattened_true  = [val for sublist in true_ls for val in sublist]
    bleu= corpus_bleu(flattened_pred, [flattened_true]).score
    return bleu

In [22]:
def convert_idx_2_sent_new(idx_tensor, lang_obj):
    word_list = []

    for i in idx_tensor:
        if i.item() not in set([PAD_IDX,EOS_token,SOS_token]):
            word_list.append(lang_obj.index2word[i.item()])

    sent = (' ').join(word_list)
    return sent

In [23]:
def evaluate(encoder, decoder, data_loader, mode_enc, mode_dec, max_length=MAX_SENT_LEN):
    start = time.time()
    hidden_size = hyper['HIDDEN_SIZE']
    learning_rate = hyper['LR']
    dropout_p = hyper['DROP_OUT']
    teacher_forcing_ratio = hyper['TEACHER_RATIO']
    n_layers = hyper['N_LAYERS']
    ker_size = hyper['KER_SIZE']
    encoder.eval()
    decoder.eval()
    inputs = []
    corpus = []
    truths = []
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(data_loader):
        
        input_tensor = input_sentences.transpose(0,1).to(device) #L*B
        target_tensor = target_sentences.transpose(0,1).to(device)
    
        inputs.append(input_sentences)
        truths.append(target_sentences)
        
        max_input_len = max(len1)
        max_target_len = max(len2)
        
        
        batch_size = input_tensor.size()[1]

        encoder_outputs, encoder_hidden = encoder(input_tensor, len1)

        decoder_input = torch.tensor([[SOS_token]*batch_size], device=device) 
        
        decoder_hidden = encoder_hidden
        
        decoded_words = torch.zeros(batch_size, int(max_input_len*1.5))
    
        for di in range(int(max_input_len*1.5)):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)        
            topv, topi = decoder_output.topk(1) #topi B*1
            decoded_words[:,di] = topi.squeeze()
            decoder_input = topi.squeeze().detach().unsqueeze(1)
            
        corpus.append(decoded_words)
#         print(inputs[0].size(), corpus[0].size(), truths[0].size()) # all B*L
    return inputs, corpus, truths

In [56]:
def evaluate_beam(encoder, decoder, data_loader, mode_enc, mode_dec, max_length=MAX_SENT_LEN, beam=True, beam_width=3):
    start = time.time()
    hidden_size = hyper['HIDDEN_SIZE']
    learning_rate = hyper['LR']
    dropout_p = hyper['DROP_OUT']
    teacher_forcing_ratio = hyper['TEACHER_RATIO']
    n_layers = hyper['N_LAYERS']
    ker_size = hyper['KER_SIZE']
    encoder.eval()
    decoder.eval()
    inputs = []
    corpus = []
    truths = []
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(data_loader):

        input_tensor = input_sentences.transpose(0,1).to(device) #L*B
        target_tensor = target_sentences.transpose(0,1).to(device)
    
        inputs.append(input_sentences)
        truths.append(target_sentences)
        
        max_input_len = max(len1)
        max_target_len = max(len2)
        
        
        batch_size = input_tensor.size()[1]

        encoder_outputs, encoder_hidden = encoder(input_tensor, len1)

        decoder_input = torch.tensor([[SOS_token]*batch_size], device=device) 
        
        decoder_hidden = encoder_hidden
        max_length = int(max_input_len*1.5)
        decoded_words = torch.zeros(batch_size, max_length)
        if beam == False:
            
            for di in range(int(max_input_len*1.5)):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)        
                topv, topi = decoder_output.topk(1) #topi B*1
                decoded_words[:,di] = topi.squeeze()
                decoder_input = topi.squeeze().detach().unsqueeze(1)

            corpus.append(decoded_words)
    
            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)        
                topv, topi = decoder_output.topk(1) #topi B*1
                decoded_words[:, di] = topi.squeeze()
                #print('topi is {}, size is {}'.format(topi, topi.size()))
                decoder_input = topi.detach()

            corpus.append(decoded_words)
#         print(inputs[0].size(), corpus[0].size(), truths[0].size()) # all B*L
        else:
            completed_sents = []
            caches = [[] for i in range(max_length)]
            caches[0] = [([SOS_token],0, encoder_hidden)]
            for di in range(1,max_length):
                for (prev_tokens, log_prob, decoder_hidden) in caches[di-1]:
                    decoder_input = torch.tensor(prev_tokens[-1], device=device) 
                    decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
                    topv, topi = decoder_output.topk(beam_width)
                    log_prob_cur = [prob for prob in topv.squeeze().detach()]
                    idxs_cur = [ind.item() for ind in topi.squeeze().detach()]
                    for i in range(beam_width):
                        caches[di].append((prev_tokens+[idxs_cur[i]], log_prob_cur[i]+log_prob, decoder_hidden))
                        if idxs_cur[i]==EOS_token:
                            completed_sents.append((prev_tokens+[idxs_cur[i]], log_prob_cur[i]+log_prob))
                    caches[di].sort(key=lambda x: x[1], reverse=True)
                    caches[di] = caches[di][:beam_width]
            caches[max_length-1].sort(key=lambda x: x[1], reverse=True)
            for i in range(beam_width):
                completed_sents.append((caches[max_length-1][i][0], caches[max_length-1][i][1]))
            completed_sents.sort(key=lambda x: x[1]/(len(x[0])**0.6), reverse=True)
            decoded_words = torch.tensor(completed_sents[0][0]).to(device)
            
            corpus.append(decoded_words.unsqueeze(0))
        
    return inputs, corpus, truths

In [55]:
start = time.time()
hidden_size = hyper['HIDDEN_SIZE']
learning_rate = hyper['LR']
eva_every = hyper['EVA_EVERY']
dropout_p = hyper['DROP_OUT']
teacher_forcing_ratio = hyper['TEACHER_RATIO']
n_layers = hyper['N_LAYERS']
ker_size = hyper['KER_SIZE']
num_epoch = hyper['NUM_EPOCHS']
early_stopping = False
patience = 3
required_progress = 0.01

encoder = EncoderCNN(source_tra.n_words, 300, hidden_size).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)

decoder = AttnDecoderRNN(hidden_size, 300, target_tra.n_words, n_layers=1).to(device)

decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss() 
plot_bleu_score_val = []
plot_losses = []
loss_total = 0 
best_score = None
count = 0
filename = 'best_cnn' #########
whole_val_bleu = []
whole_train_loss = []
for epoch in range(1, num_epoch + 1): 
    plot_bleu_score_val = []
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
        encoder.train()
        decoder.train()
        input_tensor = input_sentences.transpose(0,1).to(device)    
        target_tensor = target_sentences.transpose(0,1).to(device)
        loss = train(input_tensor, target_tensor, len1, len2, encoder,
                   decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss_total += loss
        
        if i > 0 and i % eva_every == 0:
            inputs, corpus, truths = evaluate(encoder, decoder, val_loader, max_length=MAX_SENT_LEN, mode_enc='rnn', mode_dec='noattn')

            bleu_score_val_avg = bleu_new(corpus, truths)
            loss_avg = loss_total / eva_every
            loss_total = 0
            plot_losses.append(loss_avg)
            plot_bleu_score_val.append(bleu_score_val_avg)
            if best_score is None:
                best_score = bleu_score_val_avg
            if bleu_score_val_avg < best_score + required_progress:
                count += 1
            elif bleu_score_val_avg > best_score:
                state = {'epoch': epoch + 1, 
                           'state_dict_enc': encoder.state_dict(),
                           'state_dict_dec': decoder.state_dict(), 
                           'best_accuracy': best_score, 
                           'optimizer_enc': encoder_optimizer.state_dict(),
                          'optimizer_dec': decoder_optimizer.state_dict()}
                print ('new best achieved')
                torch.save(state, filename+'.pth')
                pkl_dumper(inputs,os.path.join(data_dir,'attn_inputs_cnn'))
                pkl_dumper(corpus,os.path.join(data_dir,'attn_pred_corpus_cnn'))
                pkl_dumper(truths,os.path.join(data_dir, 'attn_target_truths_cnn')) 
                best_score = bleu_score_val_avg
                count = 0
                if early_stopping:
                    if count >= patience:
                        print("earily stop triggered")
                        break
            print('-----------------------------------------')
            print('Time: {0}, Epoch: [{1}/{2}], Step: [{3}/{4}], Train Loss: {5}, BLEU score: {6}'.format(
                  timeSince(start, i + 1/len(train_loader)), epoch, num_epoch, i, 
                  len(train_loader), loss_avg, bleu_score_val_avg))
            print('\nInput> %s'%(' '.join([source_tra.index2word[i.item()] for i in inputs[0][3] if i.item() not in set([PAD_IDX,EOS_token,SOS_token])])))
            print('\nTarget= %s'%(convert_idx_2_sent_new(truths[0][3], target_tra)),
              '\nPredict< %s' %(convert_idx_2_sent_new(corpus[0][3], target_tra)))

            print('\nInput2> %s'%(' '.join([source_tra.index2word[i.item()] for i in inputs[0][13] if i.item() not in set([PAD_IDX,EOS_token,SOS_token])])))
            print('\nTarget2= %s'%(convert_idx_2_sent_new(truths[0][13], target_tra)),
              '\nPredict2< %s' %(convert_idx_2_sent_new(corpus[0][13], target_tra)))
    if early_stopping:
        if count >= patience:
            print("earily stop triggered")
            break
    whole_train_loss.append(plot_losses)
    whole_val_bleu.append(plot_bleu_score_val)
    print('-----------------------------------------')
    pkl_dumper(whole_train_loss,os.path.join(data_dir,'attn_train_loss_cnn'))
    pkl_dumper(whole_val_bleu,os.path.join(data_dir, 'attn_val_bleu_cnn'))



-----------------------------------------
Time: 1m 19s (- -2m 41s), Epoch: [1/20], Step: [200/3332], Train Loss: 3.7572551938107126, BLEU score: 2.796375772080839

Input> 他 出身 阿富汗 <unk> <unk> 地区 有着 与 他人 不同 的 见解 他 坚持 让 他 的 女儿 我 的 母亲 去 上学 并 因此 被迫 与 他 的 父亲 断绝 父子 <unk> <unk> 关系

Target= A total maverick from a remote province of Afghanistan , he insisted that his daughter , my mom , go to school , and for that he was <unk> by his father <unk> 
Predict< So , , , , , , , , , , , , , , , , , , , , , , , , ,

Input2> 对 我 来说 阿富汗 充满 了 希望 和 无限 的 可能 可能性 每 一天 在 <unk> 读书 的 女孩 <unk> <unk> 都 这么 提醒 着 我

Target2= To me , Afghanistan is a country of hope and boundless possibilities , and every single day the girls of <unk> remind me of that <unk> 
Predict2< So , , , , , , , , , , , , , , , , , , ,
new best achieved
-----------------------------------------
Time: 2m 39s (- -3m 21s), Epoch: [1/20], Step: [400/3332], Train Loss: 3.3756724618610594, BLEU score: 3.9549592413894037

Input> 他 出身 阿富汗 <unk> <unk>

KeyboardInterrupt: 