## Prep

In [0]:
!pip3 install jieba

In [0]:
#!python preprocess_translation/token_zh_en.py

In [0]:
!pip3 install sacrebleu

In [0]:

from google.colab import drive
drive.mount('/content/drive/')

In [0]:
# Install a specific PyTorch wheel matching this interpreter, then report
# the installed version and whether CUDA is usable.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel platform tag built from implementation abbreviation, version and ABI tag;
# it is interpolated into the wheel file name below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Pick the CUDA 8.0 build when the nvidia-smi binary is present, else the CPU build.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): this pins torch 0.4.1; the rest of the notebook is written against that API.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch
print(torch.__version__)
print(torch.cuda.is_available())

In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import operator
from torch.utils.data import Dataset
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sacrebleu import corpus_bleu, TOKENIZERS, DEFAULT_TOKENIZER
#from masked_cross_entropy import *
from torch.optim.lr_scheduler import ExponentialLR

import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


from utils import *
from satt import *


In [0]:
#device = torch.device('cpu')

In [0]:
import torch
from torch.nn import functional
from torch.autograd import Variable

def sequence_mask(sequence_length, max_len=None):
    """Build a mask that is true on the valid (non-padded) steps of each sequence.

    Code paraphrased from
    https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/masked_cross_entropy.py

    Args:
        sequence_length: 1-D integer tensor of size (batch,) with the length
            of every sequence in the batch.
        max_len: Width of the returned mask. Defaults to the largest value
            found in ``sequence_length``.

    Returns:
        A (batch, max_len) mask tensor living on the same device as
        ``sequence_length``; entry [b, t] is true when ``t < sequence_length[b]``.
    """
    if max_len is None:
        max_len = int(sequence_length.max())
    # Create the position index directly on the lengths' device so the
    # comparison never mixes CPU and GPU tensors and no global is needed.
    positions = torch.arange(0, max_len, device=sequence_length.device).long()
    # (1, max_len) < (batch, 1) broadcasts to (batch, max_len).
    return positions.unsqueeze(0) < sequence_length.unsqueeze(1)


def masked_cross_entropy(logits, target, length):
    """Cross-entropy averaged over the non-padded target positions only.

    Code paraphrased from
    https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/masked_cross_entropy.py

    Args:
        logits: FloatTensor of size (batch, max_len, num_classes) holding the
            unnormalized score of every class at every step.
        target: LongTensor of size (batch, max_len) holding the index of the
            true class at every step.
        length: Sequence lengths of size (batch,), given as a list, array or
            tensor of integers.

    Returns:
        loss: Scalar tensor, the summed per-token negative log-likelihood of
        the unmasked positions divided by the total number of such positions.
    """
    # Accept any integer container and place it next to the logits, instead
    # of depending on a module-level `device`.
    length = torch.as_tensor(length, dtype=torch.long, device=logits.device)

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1) -- pick the log-prob of the true class
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len), true where the step index is below the length
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    # Normalise by the number of real tokens, not by batch * max_len.
    loss = losses.sum() / length.float().sum()
    return loss


In [0]:
# ---- Reserved vocabulary ids (id 3 is used for 'UNK' by the vocabulary code) ----
PAD_token = 0
SOS_token = 1
EOS_token = 2

# ---- Data: sentence-length filter, vocabulary caps, batching ----
MIN_LENGTH = 1
MAX_LENGTH = 55
vocab_size = 19000       # source-side vocabulary cap
tar_vocab_size = 22000   # target-side vocabulary cap
BATCH_SIZE = 64

# ---- Model ----
hidden_size = 512
n_layers = 2
dropout_p = 0.1
attn_model = 'dot'
Attention = True
sentence_ratio = True    # presumably consumed by the beam search below -- TODO confirm

# ---- Optimisation ----
teacher_forcing_ratio = 0.1
lr_rate_en = 0.0001      # encoder learning rate
lr_rate_de = 0.0005      # decoder learning rate
lr_decay = False
gamma_encoder = 0.9
gamma_decoder = 0.9
n_epochs = 50

# ---- Reporting cadence, in mini-batches ----
plot_every = 20
print_every = 100
evaluate_every = 600

In [0]:
device

## Loading Data

In [0]:
class Lang:
    '''
    Vocabulary of one language: word <-> index tables plus word frequencies.

    Indices 0-3 are reserved for the PAD, SOS, EOS and UNK markers, so the
    first real word receives index 4.

    Some codes are paraphrased from
    https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
    '''
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
        # Next free index == number of entries so far (the four reserved markers).
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        '''Register every single-space-separated token of `sentence`.'''
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        '''Count `word`, assigning it the next free index on first sight.'''
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [0]:
# ASCII punctuation that normalizeViString strips: everything in
# string.punctuation except '!', '.', '?', ',' and '-'.
other_punctuations = string.punctuation.replace('!','').replace('.','').replace('?','').replace(',','').replace('-','')

def normalizeEnString(s):
    """Clean one English sentence.

    Drops the "&apos" / "&quot" entity prefixes, replaces every run of
    characters other than ASCII letters, digits and ``, . ! ?`` with a single
    space, then collapses whitespace and trims both ends.
    """
    s = s.replace("&apos", "").replace("&quot","")
    # Anything outside the whitelist (including the ';' left over from an
    # entity such as "&apos;") becomes a space.
    s = re.sub(r"[^a-zA-Z,.!?0-9]+", r" ", s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def normalizeViString(s):
    """Clean one Vietnamese sentence.

    Drops the "&apos" / "&quot" entity prefixes, underscores and hyphens,
    deletes every character listed in ``other_punctuations``, then collapses
    whitespace and trims both ends.
    """
    s = s.replace("&apos", "").replace("&quot","").replace("_","").replace('-','')
    # Escape the punctuation before placing it in a character class: unescaped,
    # the sequence `\]` is read as an escaped bracket, so the backslash itself
    # was never removed.
    s = re.sub('[{}]'.format(re.escape(other_punctuations)), '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s 

In [0]:
normalizeEnString("It &apos;s very pretty , and it has rapidly started to overgrow the once very rich biodiversity of the northwestern Mediterranean .")

In [0]:
def readLangs(lang1, lang2, data='train'):
    '''
    Read one split of the parallel corpus and return normalized sentence pairs.

    Some codes are paraphrased from
    https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

    Args:
        lang1: Name given to the (empty) source-side Lang that is returned.
        lang2: Name given to the (empty) target-side Lang that is returned.
        data: Split to load: train/dev/test. It is substituted into the
            "<split>.tok.vi" / "<split>.tok.en" file names.

    Returns:
        (input_lang, output_lang, pairs) where pairs is a list of
        [source_sentence, target_sentence] lists and both Lang objects are
        still empty.
    '''
    print("Reading lines...")

    # Read both files line by line; the context managers close the handles,
    # and the explicit encoding avoids depending on the platform default.
    with open('/content/drive/My Drive/Neural-Machine-Translation/data/iwslt-vi-en/{}.tok.vi'.format(data), encoding='utf-8') as vi_file:
        vi_lines = vi_file.read().split('\n')
    with open('/content/drive/My Drive/Neural-Machine-Translation/data/iwslt-vi-en/{}.tok.en'.format(data), encoding='utf-8') as en_file:
        en_lines = en_file.read().split('\n')

    # Pair up line i of both files (zip stops at the shorter file) and normalize.
    pairs = [[normalizeViString(vi_line), normalizeEnString(en_line)]
             for vi_line, en_line in zip(vi_lines, en_lines)]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [0]:
def filter_pairs(pairs):
    """Return the pairs whose two sentences both have a whitespace-token
    count between MIN_LENGTH and MAX_LENGTH (inclusive)."""
    def _length_ok(sentence):
        return MIN_LENGTH <= len(sentence.split()) <= MAX_LENGTH

    return [pair for pair in pairs
            if _length_ok(pair[0]) and _length_ok(pair[1])]


In [0]:
def prepareData(lang1, lang2):
    """Load the training split, drop pairs outside the length limits and
    count every word of the remaining pairs into the two vocabularies.

    Returns (input_lang, output_lang, pairs) with pairs already filtered.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filter_pairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for source_sentence, target_sentence in kept_pairs:
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, kept_pairs


In [0]:
def build_topwordVocab(lang, vocab_size):
    """Restrict `lang` to its `vocab_size` most frequent words.

    Rebuilds lang.word2index / lang.index2word from lang.word2count (which is
    left untouched): ids 0-3 stay PAD/SOS/EOS/UNK and the kept words are
    numbered from 4 in descending frequency order. Updates lang.n_words and
    returns the same `lang` object.
    """
    print("Build vocabulary by top {} frequent word...".format(vocab_size))
    by_frequency = sorted(lang.word2count.items(),
                          key=lambda item: item[1],
                          reverse=True)
    kept_words = [word for word, _ in by_frequency[:vocab_size]]

    lang.word2index = {word: position
                       for position, word in enumerate(kept_words, start=4)}

    index2word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
    index2word.update((position, word)
                      for position, word in enumerate(kept_words, start=4))
    lang.index2word = index2word

    lang.n_words = len(lang.index2word)

    print(lang.name, lang.n_words)
    return lang

# Load and filter the training pairs, counting words on both sides.
# NOTE(review): 'zh' / 'eng' are only the names stored on the Lang objects;
# the files actually read by readLangs are the iwslt-vi-en ".tok.vi"/".tok.en" ones.
input_lang, output_lang, pairs = prepareData('zh', 'eng')

# Cap each vocabulary to its most frequent words; anything else maps to UNK later.
input_lang = build_topwordVocab(input_lang,vocab_size=vocab_size)
output_lang = build_topwordVocab(output_lang, vocab_size=tar_vocab_size)
# Sanity check: show one random (source, target) training pair.
print(random.choice(pairs))

In [0]:
input_lang.n_words

In [0]:
# Load the dev and test splits with the same normalization and length filter
# as the training data. The Lang objects returned by readLangs are empty and
# discarded here; the training vocabularies are reused for these splits.
_, _, val_pairs = readLangs('ch', 'eng', 'dev')
val_pairs = filter_pairs(val_pairs)
_, _, test_pairs = readLangs('ch', 'eng', 'test')
test_pairs = filter_pairs(test_pairs)

In [0]:
print(random.choice(val_pairs))

# Preparing Training Data

In [0]:
def indexesFromSentence(lang, sentence):
    """Map the single-space-separated words of `sentence` to their ids in
    `lang`, using the UNK id for unknown words, and append EOS_token."""
    unk_id = 3  # 3 is the id of 'UNK'
    idxs = [lang.word2index.get(word, unk_id) for word in sentence.split(' ')]
    idxs.append(EOS_token)
    return idxs

# def tensorFromSentence(lang, sentence):
#     indexes = indexesFromSentence(lang, sentence)
#     indexes.append(EOS_token)
#     return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


# def tensorsFromPair(pair):
#     input_tensor = tensorFromSentence(input_lang, pair[0])
#     target_tensor = tensorFromSentence(output_lang, pair[1])
#     return (input_tensor, target_tensor)

class VocabDataset(Dataset):
    """Dataset of sentence pairs stored as lists of token ids.

    Each item is [source_ids, target_ids, len(source_ids), len(target_ids)];
    both id lists end with EOS_token (added by indexesFromSentence).
    """
    def __init__(self, pairs):
        def _encode(lang, column):
            # Encode one side (0 = source, 1 = target) of every pair.
            return [indexesFromSentence(lang, pair[column]) for pair in pairs]

        self.source_sent_list = _encode(input_lang, 0)
        self.target_sent_list = _encode(output_lang, 1)

    def __len__(self):
        return len(self.source_sent_list)

    def __getitem__(self, key):
        source_ids = self.source_sent_list[key]
        target_ids = self.target_sent_list[key]
        return [source_ids, target_ids, len(source_ids), len(target_ids)]

    
def Vocab_collate_func(batch):
    """Collate VocabDataset items into padded, length-sorted batch tensors.

    Every source (target) sentence is right-padded with PAD_token up to the
    longest source (target) sentence in the batch, then all rows are reordered
    by decreasing source length.

    Returns a list of five tensors:
        padded source ids, padded target ids (both moved to `device`),
        source lengths, target lengths (LongTensors, not moved),
        and a LongTensor mask on `device` that is 1 where the padded source
        differs from PAD_token.
    """
    source_len_list = [datum[2] for datum in batch]   ### batch = sample
    target_len_list = [datum[3] for datum in batch]
    max_len_src = max(source_len_list)
    max_len_trg = max(target_len_list)

    def _pad_right(token_ids, current_len, wanted_len):
        ### pad_width=(0, k): 0 means nothing is padded on the left,
        ### k is the number of PAD tokens appended on the right
        return np.pad(np.array(token_ids),
                      pad_width=(0, wanted_len - current_len),
                      mode="constant", constant_values=PAD_token)

    # padding
    padded_sources = [_pad_right(datum[0], datum[2], max_len_src) for datum in batch]
    padded_targets = [_pad_right(datum[1], datum[3], max_len_trg) for datum in batch]
    source_masks = [padded != PAD_token for padded in padded_sources]

    # sort sentences for the batch: longest source first
    order = sorted(range(len(source_len_list)),
                   key=lambda position: source_len_list[position],
                   reverse=True)
    sorted_sources = np.array(padded_sources)[order]
    sorted_targets = np.array(padded_targets)[order]
    sorted_source_lens = np.array(source_len_list)[order]
    sorted_target_lens = np.array(target_len_list)[order]
    sorted_masks = np.array(source_masks)[order].tolist()

    return [torch.tensor(sorted_sources).to(device),
            torch.tensor(sorted_targets).to(device),
            torch.LongTensor(sorted_source_lens),
            torch.LongTensor(sorted_target_lens),
            torch.LongTensor(sorted_masks).to(device)]

# Wrap each split in a VocabDataset and a DataLoader that pads/sorts every
# batch with Vocab_collate_func. Only the training loader shuffles.
train_dataset = VocabDataset(pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=Vocab_collate_func,
                                           shuffle=True)

# Validation and test keep the file order so outputs line up with references.
val_dataset = VocabDataset(val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                        batch_size=BATCH_SIZE,
                                        collate_fn=Vocab_collate_func,
                                        shuffle=False)
test_dataset = VocabDataset(test_pairs)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                        batch_size=BATCH_SIZE,
                                        collate_fn=Vocab_collate_func,
                                        shuffle=False)

In [0]:
for i,(inputs, outputs, len1, len2, src_mask) in enumerate(val_loader):
    print(inputs, len1)
    print(outputs, len2)
    print(src_mask)
    break

# Build Encoder-Decoder

In [0]:
class Attn(nn.Module):
    """Attention scorer producing one weight per encoder step.

    Supported `method` values are 'dot' (inner product between the decoder
    state and each encoder output) and 'concat' (a linear layer over their
    concatenation followed by a learned vector `v`). Any other value yields
    uniform weights.

    Some code is paraphrased from
    https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb
    """
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) hands back uninitialised memory (it can
            # contain NaN/inf); start from small random values instead.
            self.v = nn.Parameter(torch.randn(hidden_size) * hidden_size ** -0.5)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape B x 1 x S.

        Args:
            hidden: decoder state of shape 1 x B x N.
            encoder_outputs: encoder states of shape S x B x N.
        """
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        encoder_outputs_bsn = encoder_outputs.permute(1, 0, 2)  # B x S x N

        if self.method == 'dot':
            # (B x S x N) @ (B x N x 1) -> B x S x 1. Drop only the trailing
            # axis: a bare squeeze() would also remove B or S when either is
            # 1, and the softmax below would then normalise the wrong axis.
            attn_energies = torch.matmul(encoder_outputs_bsn, hidden.permute(1, 2, 0)).squeeze(2)
        elif self.method == 'concat':
            hidden_expand = hidden.expand(max_len, -1, -1).permute(1, 0, 2)  # shape of (B, S, N)
            enc_cat_hid = torch.cat([encoder_outputs_bsn, hidden_expand], dim=-1)  # shape of (B, S, 2*N)
            # After nn.Linear(2*N, N), enc_cat_hid has shape (B, S, N)
            # v is shape of (N)
            attn_energies = torch.matmul(self.attn(enc_cat_hid), self.v)  # shape of (B, S)
        else:
            # Unrecognised method: all-zero energies, i.e. uniform attention.
            attn_energies = torch.zeros(this_batch_size, max_len, device=encoder_outputs.device)  # B x S

        # Normalize energies to weights in range 0 to 1, resize to B x 1 x S
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
    

In [0]:
a = torch.Tensor(3)
a.size()
len(a.size())

In [0]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder with Luong-style attention, run one target step at a time.

    Some code is paraphrased from
    https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb

    Args:
        attn_model: Scoring method handed to Attn; with 'none' no attention
            module is created (forward() then cannot be used, since it always
            calls self.attn).
        hidden_size: Size N of embeddings, GRU state and attentional vector.
        output_size: Target vocabulary size (number of output classes).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability for the embedding and between GRU layers.
    """
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        # GRU dropout only acts between stacked layers; passing a non-zero
        # value with a single layer has no effect and only triggers a warning.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(dropout if n_layers > 1 else 0))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Decode a single step for the whole batch.

        Args:
            input_seq: LongTensor of B token ids (the previous output words).
            last_hidden: previous GRU state, n_layers x B x N.
            encoder_outputs: encoder states, S x B x N.

        Returns:
            (output, hidden, attn_weights): unnormalised vocabulary scores
            B x output_size, the new GRU state, and attention weights B x 1 x S.
        """
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N

        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

# Testing the models

# Training Model

In [0]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a job started at `since`.

    `since` is a time.time() timestamp and `percent` the completed fraction
    of the job; the remaining time is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [0]:
# One optimisation step on a single mini-batch.
def train(input_tensor, target_tensor, input_lengths, target_lengths, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, clip=10.0, mask = None, src_mask=None):
    """Encode a batch, decode it step by step, back-propagate and update.

    Args:
        input_tensor: source ids; its first dimension is used as the batch size.
        target_tensor: target ids indexed by time step first (target_tensor[di]
            is the batch of tokens at step di).
        input_lengths: unused; kept for interface compatibility.
        target_lengths: tensor with the true length of every target sentence.
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: unused; the loss is computed with masked_cross_entropy.
        clip: maximum gradient norm, applied to each model separately.
        mask: unused; kept for interface compatibility.
        src_mask: forwarded to the encoder as `src_mask`.

    Returns:
        The batch loss as a Python float.
    """
    encoder_optimizer.zero_grad()  # zero out the accumulated gradient over mini-batch
    decoder_optimizer.zero_grad()

    batch_size = input_tensor.size(0)
    input_tensor = input_tensor.to(device)
    target_tensor = target_tensor.to(device)

    # Put the time axis first, then use the encoder output at the last time
    # step as the initial decoder state (a single layer: 1 x batch x hidden).
    encoder_outputs = encoder(input_tensor, src_mask = src_mask).transpose(0,1).contiguous()
    encoder_hidden = encoder_outputs[-1].view(1, -1, hidden_size).contiguous()

    decoder_input = torch.tensor([SOS_token]*batch_size).to(device)  # one SOS per sentence
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    max_target_len = int(target_lengths.max())
    all_decoder_outputs = torch.zeros(max_target_len, batch_size, decoder.output_size).to(device)

    # Decide once per batch whether to feed the gold tokens back in.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(max_target_len):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        all_decoder_outputs[di] = decoder_output

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input.
            # squeeze(1) drops only the top-k axis so a batch of one sentence
            # still yields a 1-D tensor (a bare squeeze() would return a
            # 0-dim tensor and break the next decoder step).
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze(1).detach()  # detach from history as input

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_tensor.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths
    )
    loss.backward()

    # Clip gradient norms
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()   # update parameters
    decoder_optimizer.step()

    return loss.item()

In [0]:
def trainIters(encoder, decoder, n_iters, lr_decay=True, gamma_encoder=0.9, gamma_decoder=0.9, print_every=100, plot_every=100, learning_rate_encoder=0.0005, learning_rate_decoder=0.002,evaluate_every=3000):
    """Train encoder and decoder for `n_iters` epochs over `train_loader`.

    Args:
        encoder, decoder: models to train (moved to `device` here).
        n_iters: number of epochs.
        lr_decay: when true, both exponential LR schedulers are stepped at
            the start of every epoch.
        gamma_encoder, gamma_decoder: decay factors of the two schedulers.
        print_every: log the running average loss every this many batches.
        plot_every: record/plot the average loss and save the score history
            every this many batches.
        learning_rate_encoder, learning_rate_decoder: Adam learning rates.
        evaluate_every: compute validation BLEU every this many batches and
            checkpoint the models whenever it improves.
    """
    start = time.time()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate_encoder)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate_decoder)
    # Only handed through to train(), which computes its loss with
    # masked_cross_entropy and does not use this criterion.
    criterion = nn.NLLLoss(reduction='none', ignore_index=PAD_token)
    scheduler_encoder = ExponentialLR(encoder_optimizer, gamma_encoder, last_epoch=-1) 
    scheduler_decoder = ExponentialLR(decoder_optimizer, gamma_decoder, last_epoch=-1) 
    
    encoder.to(device)
    decoder.to(device)
    score_max = 0
    validation_scores = []
    plot_losses = []
    steps_per_epoch = len(train_loader)
    total_steps = n_iters * steps_per_epoch
    for epoch in range(1, n_iters + 1):
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        if lr_decay:
            scheduler_encoder.step()
            scheduler_decoder.step()
        
        for i, (input_sentences, target_sentences,len1,len2, src_mask) in enumerate(train_loader): 
            input_tensor = input_sentences   
            # train() indexes the target by time step, so put that axis first.
            target_tensor = target_sentences.transpose(0,1)
            mask = target_tensor.ge(1)   # true on non-PAD target positions
            loss = train(input_tensor, target_tensor, len1, len2, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion, mask = mask, src_mask = src_mask)
            print_loss_total += loss
            plot_loss_total += loss
            
            if i > 0 and i % evaluate_every == 0:
                bleu_score, (src_sents, sys_sents, ref_sents) = test_model(encoder, decoder, val_loader)
                print('Validation Score: {} \n source sentence {} \n predicted sentence {} \n Reference sentence: {}'.format(bleu_score, src_sents, sys_sents, ref_sents))
                validation_scores.append(bleu_score)
                
                # Checkpoint only when the validation BLEU improves.
                if bleu_score > score_max:
                    score_max = bleu_score
                
                    torch.save({
                                'epoch': epoch,
                                'encoder': encoder.state_dict(),
                                'encoder_optimizer': encoder_optimizer.state_dict(),
                                'decoder': decoder.state_dict(),
                                'decoder_optimizer': decoder_optimizer.state_dict()
                                }, "/content/drive/My Drive/Neural-Machine-Translation/saved_model/attnIs{}_hiddenSize{}_nLayer{}_batchSize{}_epoch{}_srcVocSize{}_lrDecay{}_teacherF{}"\
                        .format(Attention,hidden_size,n_layers,BATCH_SIZE,n_iters,vocab_size,
                                lr_decay,teacher_forcing_ratio))   
                  
            if i > 0 and i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                # Fraction of the whole run completed so far. The previous
                # expression `i + 1/len(train_loader)` was a precedence slip
                # and is not a fraction, so the estimated remaining time was
                # meaningless.
                steps_done = (epoch - 1) * steps_per_epoch + i + 1
                print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}'.format(
                    timeSince(start, steps_done / total_steps), epoch, n_iters, i, 
                    len(train_loader),print_loss_avg))

            if i > 0 and i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
                showPlot(plot_losses)
                # Persist the curves so a run can be inspected after the fact.
                torch.save({
                            'plot_losses': plot_losses,
                            'validation_scores': validation_scores
                            }, "/content/drive/My Drive/Neural-Machine-Translation/saved_scores/attnIs{}_hiddenSize{}_nLayer{}_batchSize{}_epoch{}_srcVocSize{}_lrDecay{}_teacherF{}"\
                    .format(Attention,hidden_size,n_layers,BATCH_SIZE,n_iters,vocab_size,
                            lr_decay,teacher_forcing_ratio))   

                torch.cuda.empty_cache()    
        showPlot(plot_losses)
        showPlot(validation_scores)

# Plotting results

In [0]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line on a fresh figure with y ticks every 0.2."""
    # plt.subplots() already creates the figure; the extra plt.figure() that
    # used to precede it opened a second, empty figure on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

# Evaluation

In [0]:
# out, attn = evaluate(encoder1, attn_decoder1, '我 11 岁 那年 ，   记得 有 一天 早晨 醒来 ， 听见 家里 有 愉悦 的 声音 。')

In [0]:
def get_batch_outputs(encoder, decoder, input_sentences, input_lengths, output_lengths): 
    """Greedily decode one batch and return the predicted words.

    Args:
        encoder, decoder: trained models.
        input_sentences: source id tensor; its first dimension is the batch size.
        input_lengths: unused; kept for interface compatibility.
        output_lengths: tensor of reference lengths; decoding runs for
            output_lengths.max() steps.

    Returns:
        Object array of shape (batch, steps) holding the predicted word at
        every step, with '<EOS>' standing in for the EOS token.
    """
    with torch.no_grad():
        input_tensor = input_sentences.to(device)
        batch_size = input_tensor.size(0)
        # NOTE(review): train() calls the encoder with src_mask while this
        # call does not -- confirm that the difference is intended.
        encoder_outputs = encoder(input_tensor).transpose(0,1).contiguous()
        # Encoder output at the last time step seeds the decoder state.
        encoder_hidden = encoder_outputs[-1].view(1, -1, hidden_size).contiguous()

        decoder_input = torch.tensor([SOS_token]*batch_size).to(device)  # one SOS per sentence
        decoder_hidden = encoder_hidden[:decoder.n_layers]

        max_steps = int(output_lengths.max())
        decoded_words = np.empty((max_steps, batch_size), dtype=object)
        for di in range(max_steps):
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            _, topi = decoder_output.topk(1)
            # squeeze(1) keeps the batch axis even for a single sentence, so
            # tolist() below always yields a list of ids.
            decoder_input = topi.squeeze(1).detach().to(device)  # detach from history as input

            # Fill only the row of this step; writing `[di:]` re-filled every
            # remaining row on each iteration for the same final result.
            decoded_words[di] = np.array(['<EOS>' if idx==EOS_token else output_lang.index2word[idx] for idx in decoder_input.tolist()])

        return decoded_words.transpose()
        

In [0]:
def test_model(encoder, decoder, loader):
    """Greedy-decode every batch of `loader` and score it with corpus BLEU.

    Both models are switched to evaluation mode for the duration of the call
    and put back into training mode afterwards, even if decoding fails.

    Returns:
        (score, (sources, hypotheses, references)) where score is the corpus
        BLEU and each list holds the first three sentences of its kind.
    """
    def _ids_to_sentence(id_row, lang):
        # Map ids to words with `lang`, stopping at the first EOS token.
        words = []
        for idx in id_row:
            token_id = idx.item()
            if token_id == EOS_token:
                break
            words.append(lang.index2word[token_id])
        return ' '.join(words)

    src_sentences = []
    sys_sentences = []
    ref_sentences = []
    encoder.train(False)
    decoder.train(False)
    try:
        for i, (input_sentences, target_sentences,len1,len2, src_mask) in enumerate(loader):
            for sentence in target_sentences:
                ref_sentences.append(_ids_to_sentence(sentence, output_lang))
            for sentence in input_sentences:
                src_sentences.append(_ids_to_sentence(sentence, input_lang))

            for sentence in get_batch_outputs(encoder, decoder, input_sentences, len1, len2):
                try:
                    # Cut the hypothesis at the first predicted end-of-sentence.
                    end_idx = sentence.tolist().index('<EOS>')
                    sys_sentences.append(' '.join(sentence[:end_idx]))
                except ValueError:
                    # No '<EOS>' was produced: keep all decoded words.
                    sys_sentences.append(' '.join(sentence))
    finally:
        # Leave the models trainable no matter how evaluation ended.
        encoder.train(True)
        decoder.train(True)

    score = corpus_bleu(sys_sentences,[ref_sentences], smooth="floor", smooth_floor=0.01, lowercase=False, use_effective_order=True, tokenize=DEFAULT_TOKENIZER).score
    return score, (src_sentences[:3], sys_sentences[:3], ref_sentences[:3])


In [0]:
# Build the encoder (make_model comes from the star imports at the top of the
# notebook) and a single-layer attention decoder. The decoder uses one layer
# because train() derives a one-layer initial hidden state from the encoder.
encoder1 = make_model(input_lang.n_words).to(device)
attn_decoder1 = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.n_words, dropout=dropout_p, n_layers=1).to(device)
# gamma_encoder now receives the encoder's own decay factor; it was wired to
# gamma_decoder, which left the gamma_encoder setting without any effect.
trainIters(encoder1, attn_decoder1, n_iters=n_epochs, print_every=print_every, plot_every=plot_every, evaluate_every=evaluate_every, learning_rate_encoder=lr_rate_en, learning_rate_decoder=lr_rate_de, lr_decay=lr_decay, gamma_encoder=gamma_encoder,
           gamma_decoder=gamma_decoder)


In [0]:
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size, n_layers=2).to(device)

# test_model(encoder1, attn_decoder1, val_loader)

# TRAINING AND EVALUATING

In [0]:
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size, n_layers=n_layers).to(device)
# attn_decoder1 = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.n_words, dropout=dropout_p, n_layers=n_layers).to(device)

# trainIters(encoder1, attn_decoder1, n_iters=n_epochs, print_every=print_every, plot_every=plot_every, evaluate_every=evaluate_every, learning_rate_encoder=lr_rate_en, learning_rate_decoder=lr_rate_de, lr_decay=lr_decay, gamma_encoder=gamma_decoder,
#           gamma_decoder=gamma_decoder)


In [0]:
class beam_search(object):
    """
    Beam-search decoding of a single source sentence with an attention decoder.

    Some code is paraphrased from
    https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/beam.py

    Args:
        encoder: kept on the instance; ``search`` itself never calls it.
        decoder: called as ``decoder(input, hidden, encoder_outputs)`` and
            unpacked as ``(output, hidden, attention)``.
        max_length: step limit used when ``sentence_ratio`` is falsy.
        beam_size: number of hypotheses opened after the first decoder step.
        attention: must be truthy; ``search`` raises otherwise.
        sentence_ratio: if truthy, the step limit is ``2 * src_len`` instead
            of ``max_length``.
    """
    def __init__(self, encoder, decoder, max_length, beam_size, attention = True,sentence_ratio = False):
        super(beam_search, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.max_length = max_length
        self.beam_size = beam_size
        self.sentence_ratio = sentence_ratio

    def search(self, encoder_outputs, decoder_input, decoder_hidden, src_len):
        """
        Run beam search for one sentence.

        Args:
            encoder_outputs: passed unchanged to every decoder call.
            decoder_input: first decoder input (the caller supplies the
                start-of-sentence token).
            decoder_hidden: initial decoder hidden state.
            src_len: source length; only used when ``sentence_ratio`` is set.

        Returns:
            ``(bestSent, bestScore)``: parallel lists holding, for every
            hypothesis that produced ``EOS_token``, its token-id list and its
            accumulated log-probability. If no hypothesis finished, both lists
            contain the single highest-scoring unfinished hypothesis.

        Raises:
            NotImplementedError: if the instance was created with a falsy
                ``attention`` flag.
        """
        if not self.attention:
            # Fail fast: there is no decoding path without attention below.
            raise NotImplementedError("Only available when attention = True")

        prob = {k:0 for k in range(self.beam_size)}
        bestSent = []
        bestScore = []
        decoder_word_choices = {k:[] for k in range(self.beam_size)}
        decoder_hidden_choices = {}
        decoder_input_choices = {}

        # Initialize beam search: one decoder step from the start token.
        decoder_output, decoder_hidden, _ = self.decoder(decoder_input.contiguous(), decoder_hidden.contiguous(), encoder_outputs)
        decoder_output = F.log_softmax(decoder_output, dim=1)
        topv, topi = decoder_output.data.topk(self.beam_size)

        # Initialize beam candidates: every beam starts from the same hidden
        # state but a different top-k first token.
        first_ids = topi.squeeze()
        first_scores = topv.squeeze()
        for i in range(self.beam_size):
            decoder_word_choices[i].append(first_ids[i].item())
            decoder_input_choices[i] = first_ids[i].detach()
            decoder_hidden_choices[i] = decoder_hidden
            prob[i] += first_scores[i].detach()

        ## running beam search
        cur_len = 0
        max_length = 2*src_len if self.sentence_ratio else self.max_length

        while decoder_hidden_choices and cur_len <= max_length:
            cur_len += 1
            topi = {}
            key_list = list(decoder_hidden_choices.keys())
            # Number of live beams; it stays constant for the whole step.
            n_live = len(key_list)
            scores = []
            for key in key_list:
                decoder_output, decoder_hidden_choices[key], _ = self.decoder(decoder_input_choices[key].unsqueeze(0), decoder_hidden_choices[key], encoder_outputs)
                log_probs = F.log_softmax(decoder_output, dim=1)
                topv, topi[key] = log_probs.data.topk(n_live)
                # Candidate score = beam score so far + log-prob of the new token.
                scores.extend((topv+prob[key]).tolist()[0])

            # `scores` is laid out as n_live consecutive groups of n_live
            # candidates, one group per entry of key_list.
            scores = np.array(scores)
            max_candidate_score = scores.argsort()[-n_live:][::-1]
            decoded_sent_score = scores[max_candidate_score]

            choice_sentence = {}
            choiceHidden = {}
            trashOfKeys = []

            for j in range(len(max_candidate_score)):
                flat_idx = max_candidate_score[j]
                # Group number -> parent beam, offset in group -> chosen token.
                prev_choice_idx = key_list[int(flat_idx // n_live)]
                candidates = topi[prev_choice_idx].squeeze()
                if candidates.dim() == 0:
                    # Only one live beam: squeeze() collapsed to a scalar.
                    next_idx = candidates
                else:
                    next_idx = candidates[flat_idx % n_live]

                s_choice = decoder_word_choices[prev_choice_idx].copy()
                s_choice.append(next_idx.item())
                choice_sentence[j] = s_choice
                choiceHidden[j] = decoder_hidden_choices[prev_choice_idx]
                decoder_input_choices[j] = next_idx.detach()
                prob[j] = decoded_sent_score[j]

            decoder_word_choices = choice_sentence
            decoder_hidden_choices = choiceHidden

            # Retire every hypothesis that has emitted EOS.
            for key, s in decoder_word_choices.items():
                if EOS_token in s:
                    bestSent.append(s)
                    bestScore.append(prob[key])
                    trashOfKeys.append(key)

            for k in trashOfKeys:
                decoder_hidden_choices.pop(k)
                decoder_word_choices.pop(k)

        if len(bestScore) == 0:
            # Nothing finished within the step limit: fall back to the
            # best-scoring unfinished hypothesis.
            max_prob = prob[0]
            max_prob_idx = 0
            for k in prob.keys():
                if prob[k] > max_prob:
                    max_prob_idx = k
                    max_prob = prob[k]
            bestScore.append(max_prob)
            bestSent.append(decoder_word_choices[max_prob_idx])

        return bestSent, bestScore

In [0]:
def get_beam_batch_outputs(encoder, decoder, input_sentences, input_lengths): #####
    """
    Translate a batch with beam search and return one string per sentence.

    Relies on the notebook-level names ``device``, ``hidden_size``,
    ``beam_size``, ``sentence_ratio``, ``SOS_token``, ``EOS_token`` and
    ``output_lang``.

    Args:
        encoder: called as ``encoder(input_tensor)``.
        decoder: attention decoder; its ``n_layers`` attribute is read here.
        input_sentences: token-id tensor indexed as (batch, sequence).
        input_lengths: per-sentence source lengths; ``input_lengths[i].item()``
            is forwarded to the beam search.

    Returns:
        list of str: for each sentence, the highest-scoring hypothesis as
        space-joined target words, cut before the first ``EOS_token``.
    """
    with torch.no_grad():
        input_tensor = input_sentences.to(device)
        batch_size = input_tensor.size(0)
        # Transposed so that the batch is on dim 1 for the decoder.
        encoder_outputs = encoder(input_tensor).transpose(0,1).contiguous()
        # The last slice of the encoder output seeds the decoder hidden state.
        encoder_hidden = encoder_outputs[-1].view(1, -1, hidden_size).contiguous()
        decoder_hidden = encoder_hidden[:decoder.n_layers].to(device)

        # The step limit is the padded source length. The previous value,
        # input_sentences.max(), was the largest token id in the batch.
        my_beam_search = beam_search(encoder, decoder, input_tensor.size(1), beam_size, True, sentence_ratio)

        batch_sentences = []
        for i in range(batch_size):
            decoder_input = torch.tensor([SOS_token], device=device, requires_grad=False)
            sentences, probs = my_beam_search.search(encoder_outputs[:,i,:].unsqueeze(1), decoder_input,
                                                     decoder_hidden[:,i,:].unsqueeze(1), input_lengths[i].item())

            best = sentences[probs.index(max(probs))]
            # Cut on the token id (as test_model does for references) so the
            # result does not depend on how EOS is spelled in index2word, and
            # unfinished hypotheses are returned without padding words.
            if EOS_token in best:
                best = best[:best.index(EOS_token)]
            batch_sentences.append(' '.join(output_lang.index2word[k] for k in best))

    return batch_sentences

In [0]:
def test_model(encoder, decoder, loader, search_method = 'greedy'):
    
    encoder.eval()
    decoder.eval()
    
    score = []
    src_sentences = []
    sys_sentences = []
    ref_sentences = []
    encoder.train(False)
    decoder.train(False)
    for i, (input_sentences, target_sentences, len1, len2, src_mask) in enumerate(loader):
        for sentence in target_sentences:
            trg_list = []
            for idx in sentence:
                if idx.item() == EOS_token:
                    break
                else:
                    trg_list.append(output_lang.index2word[idx.item()])
            ref_sentences.append(' '.join(trg_list))
        for sentence in input_sentences:
            src_list = []
            for idx in sentence:
                if idx.item() == EOS_token:
                    break
                else:
                    src_list.append(input_lang.index2word[idx.item()])
            src_sentences.append(' '.join(src_list))

        #ref_sentences.append(' '.join(sent) for sent in target_sentences)
        #src_sentences.append(' '.join(sent) for sent in input_sentences)
        batch_size = input_sentences.size(0)
        if search_method == 'greedy':
            for sentence in get_batch_outputs(encoder, decoder, input_sentences, len1, len2):
                try:
                    end_idx = sentence.tolist().index('<EOS>')
                    sys_sentences.append(' '.join(sentence[:end_idx]))
                except ValueError:
                    sys_sentences.append(' '.join(sentence))
                    
        elif search_method == 'beam':
            translation_output = get_beam_batch_outputs(encoder, decoder, input_sentences, len1)
            sys_sentences.extend(translation_output)
            
    encoder.train(True)
    decoder.train(True) 
    
    score = corpus_bleu(sys_sentences,[ref_sentences], smooth="floor", smooth_floor=0.01, lowercase=False, use_effective_order=True, tokenize=DEFAULT_TOKENIZER).score
    return score, (src_sentences[0:5], sys_sentences[0:5], ref_sentences[0:5])


In [0]:
# Rebuild the architecture, then restore the trained weights for evaluation.
encoder2 = make_model(input_lang.n_words).to(device)
decoder2 = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.n_words, dropout=dropout_p, n_layers=1).to(device)

# map_location remaps stored tensors onto the active device, so a checkpoint
# written on a GPU runtime also loads on a CPU-only one.
checkpoint = torch.load('/content/drive/My Drive/Neural-Machine-Translation/saved_model/attnIsTrue_hiddenSize512_nLayer2_batchSize64_epoch50_srcVocSize19000_lrDecayFalse_teacherF1', map_location=device)
encoder2.load_state_dict(checkpoint['encoder'])
decoder2.load_state_dict(checkpoint['decoder'])

# Switch both networks to evaluation mode before running inference.
encoder2.eval()
decoder2.eval()
# # encoder1.train()
# # attn_decoder1.train()

In [0]:
# test_model(encoder2, decoder2, val_loader)

In [0]:
# Sweep the beam width on the validation and test sets.
# `beam_size` has to be a notebook-level name: get_beam_batch_outputs reads it
# as a global. It only takes effect with the 'beam' search method; with
# 'greedy' every iteration of this loop would repeat the same evaluation.
search_method = 'beam'
for beam_size in range(2,15):
    print("beam_size: ",beam_size)
    print(test_model(encoder2, decoder2, val_loader,search_method))
    print(test_model(encoder2, decoder2, test_loader,search_method))
    print()

In [0]:
# Load the score history saved for this run.
scores_path = '/content/drive/My Drive/Neural-Machine-Translation/saved_scores/attnIsTrue_hiddenSize512_nLayer2_batchSize64_epoch50_srcVocSize19000_lrDecayFalse_teacherF1'
scores = torch.load(scores_path)

In [0]:
# Plot the recorded training-loss curve with labelled axes so the figure
# can be read on its own.
training_loss = scores['plot_losses']
fig, ax = plt.subplots()
ax.plot(training_loss)
ax.set(title='Training loss', xlabel='plot_losses index', ylabel='loss')
plt.show()

In [0]:
# Number of entries recorded in the saved loss history.
len(scores['plot_losses'])

In [0]:

# Mean training loss over 15 consecutive windows of 63 recorded points each.
loss = np.array(
    [np.average(training_loss[63*i:63*(i + 1)]) for i in range(15)],
    dtype=float,
)
list(loss)