In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable
import operator
import warnings
warnings.filterwarnings('ignore')
import os
from queue import PriorityQueue

# Target device for model parameters and freshly created tensors: the GPU when
# PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Reserved vocabulary indices; they mirror the first four entries of Lang.index2word.
PAD_TOKEN = 0  # padding (the collate function pads batches with zeros)
SOS_TOKEN = 1  # start-of-sentence, fed to the decoder as its first input
EOS_TOKEN = 2  # end-of-sentence, appended by indexesFromSentence
UNK_TOKEN = 3  # reserved for out-of-vocabulary words
BATCH_SIZE = 3  # mini-batch size used by the DataLoader and the models
# Manual switch read by the `if CUDA:` branches below. It is independent of
# `device`, which is chosen automatically from torch.cuda.is_available().
CUDA = False

class Lang:
    """Vocabulary for one language: word <-> index maps plus word frequencies.

    Indices 0-3 are pre-assigned to the special tokens PAD, SOS, EOS and UNK,
    so the first real word receives index 4.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: 'PAD', 1: 'SOS', 2:'EOS', 3:'UNK'}
        # Next free index; starts after the four special tokens.
        self.n_words = 4

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks (Unicode category 'Mn') from `s`.

    The string is first decomposed with NFD so that accented letters split
    into a base character plus combining marks; only the marks are dropped.
    Characters without such a decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

#Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase and trim `s`, strip combining marks, and put a space before . ! ?

    No character filtering is done here: the ASCII-letters-only filter
    (r"[^a-zA-Z.!?]+") is deliberately left out because it would wipe out
    Chinese text. Callers that know a string is English apply it themselves.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    return re.sub(r"([.!?])", r" \1", cleaned)


import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a task started at `since`.

    `since` is a time.time() timestamp and `percent` the fraction of work
    already done; the remaining time is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# def readLangs(lang1, lang2, reverse=False):
#     print("Reading lines...")

#     # Read the file and split into lines
#     lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
#         read().strip().split('\n')

#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

#     # Reverse pairs, make Lang instances
#     if reverse:
#         pairs = [list(reversed(p)) for p in pairs]
#         input_lang = Lang(lang2)
#         output_lang = Lang(lang1)
#     else:
#         input_lang = Lang(lang1)
#         output_lang = Lang(lang2)

#     return input_lang, output_lang, pairs

def indexesFromSentence(lang, sentence):
    """Convert `sentence` into a list of vocabulary indices ending with EOS_TOKEN.

    The sentence is split on single spaces, matching Lang.addSentence. Words
    that are missing from `lang.word2index` map to UNK_TOKEN instead of raising
    KeyError, so sentences from held-out data can be indexed with the
    training vocabulary.
    """
    word2index = lang.word2index
    return [word2index.get(word, UNK_TOKEN) for word in sentence.split(' ')] + [EOS_TOKEN]


def tensorFromSentence(lang, sentence):
    """Return `sentence` as a (length x 1) LongTensor of indices on `device`.

    indexesFromSentence already terminates the sequence with EOS_TOKEN, so it
    is not appended again here (doing so produced two EOS tokens per sentence).
    """
    indexes = indexesFromSentence(lang, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair, src_lang=None, tgt_lang=None):
    """Convert a [source_sentence, target_sentence] pair into two index tensors.

    src_lang / tgt_lang: vocabularies (Lang) for pair[0] and pair[1]. When
    omitted, the module-level names `input_lang` and `output_lang` are used,
    which is the original behaviour; pass them explicitly when the vocabularies
    live under other names (e.g. input_zh / output_zh_en).

    Returns (input_tensor, target_tensor).
    """
    if src_lang is None:
        src_lang = input_lang
    if tgt_lang is None:
        tgt_lang = output_lang
    input_tensor = tensorFromSentence(src_lang, pair[0])
    target_tensor = tensorFromSentence(tgt_lang, pair[1])
    return (input_tensor, target_tensor)

# Sentence-length limit; in the visible code it is referenced only by the
# commented-out filterPair helper below.
MAX_LENGTH = 10

# eng_prefixes = (
#     "i am ", "i m ",
#     "he is", "he s ",
#     "she is", "she s",
#     "you are", "you re ",
#     "we are", "we re ",
#     "they are", "they re "
# )


# def filterPair(p):
#     return len(p[0].split(' ')) < MAX_LENGTH and \
#         len(p[1].split(' ')) < MAX_LENGTH and \
#         p[1].startswith(eng_prefixes)


# def filterPairs(pairs):
#     return [pair for pair in pairs if filterPair(pair)]

### Process Data for project

In [33]:
def readLangs(lang1, lang2, category, reverse = False):
    """Read one split of a parallel corpus and return fresh vocabularies plus pairs.

    lang1, lang2: language codes; files are read from
        data/iwslt-<lang1>-en/<category>.tok.<lang1> and ...<category>.tok.<lang2>.
    category: split name, one of 'train', 'dev', 'test'.
    reverse: when True, each pair is flipped and the two Lang objects swap roles.

    Returns (input_lang, output_lang, pairs); the Lang objects are empty and
    each pair is [sentence_in_lang1, sentence_in_lang2] (flipped if `reverse`).

    Raises ValueError when the two files have different numbers of lines,
    because the sentences could then no longer be aligned.
    """
    def _read_lines(lang):
        # `with` guarantees the file handle is closed once the text is read.
        path = 'data/iwslt-' + lang1 + '-en/' + category + '.tok.' + lang
        with open(path, encoding = 'utf-8') as handle:
            return handle.read().strip().split('\n')

    print('Reading lines:')
    data1 = [normalizeString(l) for l in _read_lines(lang1)]
    data2 = [normalizeString(l) for l in _read_lines(lang2)]
    #Given that data2 is english hence we further normalize
    data2 = [re.sub(r"[^a-zA-Z.!?]+", r" ", data) for data in data2]
    print('Generating pairs')
    print(len(data1), len(data2))
    if len(data1) != len(data2):
        raise ValueError('Parallel files differ in length: %d lines for %s, %d lines for %s'
                         % (len(data1), lang1, len(data2), lang2))
    pairs = [[source, target] for source, target in zip(data1, data2)]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


In [34]:
#Data Preparation for CHN to ENG
def prepareData(lang1, lang2, category, reverse = False):
    """Load one corpus split and build both vocabularies from it.

    Arguments are forwarded unchanged to readLangs. Every sentence of every
    pair is then added to the matching Lang, and the vocabulary sizes are
    printed as a quick sanity check.

    Returns (input_lang, output_lang, pairs).
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, category, reverse)
    print('Read %s sentence pairs' % len(pairs))

    # Build the vocabularies pair by pair.
    print('Counting words')
    for source_sentence, target_sentence in pairs:
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    # Basic info about the data.
    print('Counted Words')
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)
    return input_lang, output_lang, pairs

In [35]:
# Project root that contains the `data/` directory read by readLangs.
# Set the NMT_PROJECT_DIR environment variable to run this notebook on another
# machine; the default is the original author's local checkout.
PROJECT_DIR = os.environ.get('NMT_PROJECT_DIR',
                             '/Users/Hengyu/Desktop/MasterWork/1014/Neural-Machine-Translation')
os.chdir(PROJECT_DIR)

In [36]:
# Chinese -> English data. Only the vocabularies built from the training split
# are kept; the Lang objects built for dev/test are discarded via `_`.
input_zh, output_zh_en, train_zh_pairs = prepareData('zh', 'en', 'train')
_,_, val_zh_pairs = prepareData('zh', 'en', 'dev')
_,_, test_zh_pairs = prepareData('zh','en','test')
# Show one random training pair as a sanity check.
print(random.choice(train_zh_pairs))

Reading lines:
Generating pairs
213376 213376
Read 213376 sentence pairs
Counting words
Counted Words
zh 88426
en 50970
Reading lines:
Generating pairs
1261 1261
Read 1261 sentence pairs
Counting words
Counted Words
zh 6133
en 3671
Reading lines:
Generating pairs
1397 1397
Read 1397 sentence pairs
Counting words
Counted Words
zh 5214
en 3220
['就是 这么 一小 小点 基因   小 而 卑劣 的 基因', 'it apos s that little gene . it apos s small and it apos s mean .']


In [37]:
# Vietnamese -> English data. Only the vocabularies built from the training
# split are kept; the Lang objects built for dev/test are discarded via `_`.
input_vi, output_vi_en, train_vi_pairs = prepareData('vi', 'en', 'train')
_,_, val_vi_pairs = prepareData('vi', 'en', 'dev')
_,_, test_vi_pairs = prepareData('vi','en','test')
# Show one random training pair as a sanity check.
print(random.choice(train_vi_pairs))

Reading lines:
Generating pairs
133317 133317
Read 133317 sentence pairs
Counting words
Counted Words
vi 30768
en 41272
Reading lines:
Generating pairs
1268 1268
Read 1268 sentence pairs
Counting words
Counted Words
vi 3050
en 3572
Reading lines:
Generating pairs
1553 1553
Read 1553 sentence pairs
Counting words
Counted Words
vi 2899
en 3408
['đung , co ay noi nhu the  . ok , đieu đo co nghia_la chung_ta moi đi đuoc nua cau_chuyen  .', 'yes she said something like that . ok that means we are at half of the story .']


In [10]:
# Inspect one normalised Vietnamese/English training pair.
train_vi_pairs[1000]

['vi_vay chung_ta nen dung ngay su u_me nay lai , dung su tho_o , dung su ky_thi che_nhao , va dung su im_lang nay , va pha_bo nhung đieu cam ky , nhin thang vao su_that , va bat_đau tro_chuyen , boi_vi cach duy_nhat đe đanh_bai mot van_đe ma ca_nhan moi nguoi phai tu minh chien_đau đo la cung manh_me vung_vang đung_lai gan nhau , cung manh_me vung_vang đung_lai gan nhau  .',
 'so we need to stop the ignorance stop the intolerance stop the stigma and stop the silence and we need to take away the taboos take a look at the truth and start talking because the only way we apos re going to beat a problem that people are battling alone is by standing strong together by standing strong together .']

In [11]:
# Inspect one normalised Chinese/English training pair.
train_zh_pairs[419]

['作为 领导 领导人   我们 不可 可能 总是 对 的', 'we apos re not always right as leaders .']

In [12]:
class NMTDataset(Dataset):
    """Dataset over already-indexed sentence pairs.

    `pairs` must hold index sequences (not raw strings): pairs[i][0] is the
    source sequence and pairs[i][1] the target sequence. The two Lang objects
    are only stored for reference.
    """

    def __init__(self, pairs, input_lang, output_lang):
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        # Split the pairs once so item lookup is a plain list access.
        self.input_seqs = [pair[0] for pair in pairs]
        self.output_seqs = [pair[1] for pair in pairs]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return [source_seq, len(source_seq), target_seq, len(target_seq)]."""
        source = self.input_seqs[index]
        target = self.output_seqs[index]
        return [source, len(source), target, len(target)]
    
def vocab_collate_func(batch):
    """Collate NMTDataset items into padded, length-sorted batch tensors.

    batch: list of [input_seq, input_len, output_seq, output_len] items as
        returned by NMTDataset.__getitem__ (the stored lengths are recomputed).

    Pairs are sorted by decreasing source length (the order expected by
    pack_padded_sequence in the encoder); target sequences follow their source
    and are therefore not sorted by their own length. Sequences are padded
    with 0 (PAD_TOKEN) up to the longest sequence on their side.

    Returns [input_list, input_length, output_list, output_length] with
    input_list / output_list of shape batch x max_len and the lengths as 1-D
    LongTensors. All four are moved to the GPU when CUDA is set.
    """
    #Reference: lab8_3_mri
    def _pad_sequences(seqs):
        lens = [len(seq) for seq in seqs]
        padded_seqs = torch.zeros(len(seqs), max(lens)).long()
        for i, seq in enumerate(seqs):
            padded_seqs[i, :lens[i]] = torch.LongTensor(seq)
        return padded_seqs, lens

    sorted_pairs = sorted(((datum[0], datum[2]) for datum in batch),
                          key=lambda pair: len(pair[0]), reverse = True)
    in_seq_sorted, out_seq_sorted = zip(*sorted_pairs)

    # _pad_sequences already returns LongTensors, so no numpy round trip is needed.
    input_list, input_lens = _pad_sequences(in_seq_sorted)
    output_list, output_lens = _pad_sequences(out_seq_sorted)
    input_length = torch.LongTensor(input_lens)
    output_length = torch.LongTensor(output_lens)

    if CUDA:
        input_list = input_list.cuda()
        output_list = output_list.cuda()
        input_length = input_length.cuda()
        # Was `out_length.cuda()`: an undefined name that crashed the CUDA path.
        output_length = output_length.cuda()

    return [input_list, input_length, output_list, output_length]

In [13]:
# Convert every training pair from words to vocabulary indices (EOS included).
index_zh_pairs = [[indexesFromSentence(input_zh, source_sentence),
                   indexesFromSentence(output_zh_en, target_sentence)]
                  for source_sentence, target_sentence in train_zh_pairs]
index_vi_pairs = [[indexesFromSentence(input_vi, source_sentence),
                   indexesFromSentence(output_vi_en, target_sentence)]
                  for source_sentence, target_sentence in train_vi_pairs]
'''
NMTDataset needs index pairs, need to call indexesFromPairs functions beforehand
The dataLoader is sorted according to length of the input_length, and padded to
max length of input and output list repectively
TODO: output_list is not sorted, hence need to sort (maybe) in the rnn sequence.
'''
train_zh_dataset = NMTDataset(index_zh_pairs, input_zh, output_zh_en)
train_vi_dataset = NMTDataset(index_vi_pairs, input_vi, output_vi_en)

# Shuffled loader; vocab_collate_func sorts and pads each batch.
train_zh_loader = torch.utils.data.DataLoader(
    dataset = train_zh_dataset,
    batch_size = BATCH_SIZE,
    collate_fn = vocab_collate_func,
    shuffle = True,
)
#Input_batch in size Batch x maxLen


In [14]:
#Get a batch for testing purpose
# Pull exactly one batch out of the loader and stop; input_list, input_length,
# output_list and output_length stay bound afterwards and are reused by the
# cells below.
for i, (input_list, input_length, output_list, output_length) in enumerate(train_zh_loader):
    if i== 0:
        break

In [None]:
# Shape of the padded source batch: batch x longest source sentence.
input_list.size()

In [None]:
# Full padded source batch (zeros are padding).
input_list

In [None]:
# First row of the batch, i.e. the longest source sentence after sorting.
input_list[0]

In [None]:
# First token index of every sentence in the batch.
input_list[:,0]

In [None]:
# Transpose to (max_len x batch), the layout the encoder feeds to its embedding;
# row 0 then holds the first token of every sentence.
input_test_list = input_list.transpose(0,1)
input_test_list[0]

### End of Data Processing


### Single batch train method

In [15]:
#Here for the constant definition
# MAX_SENTENCE_LENGTH and max_length are not referenced by the code visible in
# this notebook section.
MAX_SENTENCE_LENGTH = 10
hidden_size = 256  # width of the embeddings and GRU hidden states
max_length = 10
BATCH_SIZE = 3  # redefines the value set near the top of the notebook (same value)
TEST_BATCH_SIZE = 3
CLIP = 50  # max gradient norm used for clipping in batch_train
TEACHER_RATIO = 0.5  # not read by batch_train, which hard-codes teacher forcing on


In [22]:
def rnn_mask(length_list):
    """Build a 0/1 padding mask from a 1-D tensor of sequence lengths.

    length_list: LongTensor of shape (batch,) holding the true lengths.

    Returns a LongTensor of shape (batch, max(length_list)) on the same device
    as `length_list`; row i has ones in its first length_list[i] positions and
    zeros afterwards.

    The batch size comes from `length_list` itself rather than the global
    BATCH_SIZE, so a shorter final batch no longer raises IndexError.
    """
    max_length = int(length_list.max().item())
    # positions: 1 x max_length, lengths: batch x 1 -> broadcast comparison.
    positions = torch.arange(max_length, device=length_list.device).unsqueeze(0)
    return (positions < length_list.unsqueeze(1)).long()
        
def rnn_mask_loss(decoder_outputs, output_list, output_length):
    '''
    Masked negative log-likelihood, averaged over the non-padded target tokens.

    decoder_outputs: per-step scores with the vocabulary on the last axis;
                     batch_train passes it as batch x max_len x vocab_size
    output_list: batch x max_len target indices
    output_length: batch, true target lengths
    '''
    batch_size, _ = output_list.size()
    vocab_size = decoder_outputs.size(-1)
    flat_scores = decoder_outputs.view(-1, vocab_size)  # (batch*max_len) x V
    flat_targets = output_list.view(-1, 1)  # (batch*max_len) x 1
    # Score assigned to the correct token at every position, negated.
    picked = torch.gather(flat_scores, 1, flat_targets)
    per_token_loss = (-picked).view(batch_size, -1)  # batch x max_len

    # Zero out positions beyond each sentence's true length.
    keep = rnn_mask(output_length).float()
    total = (per_token_loss * keep).sum()
    return total / output_length.float().sum()
    
    
    
    
        

In [28]:
def batch_train(input_list, input_length, output_list, output_length,
                batch_encoder, batch_decoder, encoder_optimizer, decoder_optimizer,
                attention, criterion):
    '''
    Run one optimisation step on a single padded mini-batch and return its loss.

    input_list / output_list: batch x max_len index tensors from the loader
    input_length / output_length: batch, true sequence lengths
    batch_encoder / batch_decoder: models to train (both are put in train mode)
    encoder_optimizer / decoder_optimizer: their optimizers
    attention: Boolean; True when batch_decoder takes encoder_outputs and
               returns (output, hidden, attention_weights), False when it
               takes only (input, hidden) and returns (output, hidden)
    criterion: kept for interface compatibility; the loss is always computed
               by rnn_mask_loss and this argument is not used

    Returns the masked loss of the batch as a Python float.
    '''
    batch_encoder.train()
    batch_decoder.train()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    curr_batch = input_list.size(0)  # actual batch size (last batch may be short)
    max_output_length = output_length.max().item()

    init_hidden = batch_encoder.initHidden()
    if init_hidden.size(1) != curr_batch:
        # initHidden is sized for the encoder's configured batch size; rebuild
        # the all-zero state for the batch actually being processed.
        init_hidden = init_hidden.new_zeros(init_hidden.size(0), curr_batch, init_hidden.size(2))
    encoder_outputs, encoder_hidden = batch_encoder(input_list, input_length, init_hidden)

    # Initialize the decoding process: every sentence starts from SOS.
    decoder_input = torch.tensor([[SOS_TOKEN]], device=device).repeat(curr_batch, 1)
    # The encoder is bidirectional, so encoder_hidden has 2 * n_layers slices;
    # only the first batch_decoder.n_layers of them seed the decoder.
    # NOTE(review): for a multi-layer bidirectional GRU these leading slices are
    # presumably layer 0's two directions rather than one direction of each
    # layer - confirm this is the intended initialisation.
    decoder_hidden = encoder_hidden[:batch_decoder.n_layers]
    # One row of log-probabilities per time step: max_len x batch x vocab.
    decoder_outputs = torch.zeros(max_output_length, curr_batch, batch_decoder.output_size)

    # Move new tensors to CUDA
    if CUDA:
        decoder_input = decoder_input.cuda()
        decoder_outputs = decoder_outputs.cuda()

    #use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    use_teacher_forcing = True

    for di in range(max_output_length):
        if attention:
            decoder_output, decoder_hidden, decoder_attention = batch_decoder(
                decoder_input, decoder_hidden, encoder_outputs)
        else:
            decoder_output, decoder_hidden = batch_decoder(
                decoder_input, decoder_hidden)
        # Always record the prediction (the free-running branches used to store
        # decoder_input here, which is the wrong tensor and the wrong shape).
        decoder_outputs[di] = decoder_output

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = output_list[:, di].unsqueeze(1)
        else:
            # Without teacher forcing: use its own predictions as the next input
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.detach()  # detach from history as input: batch x 1
            # Stop once every sentence predicted EOS. The check must look at the
            # predicted token ids, not at the log-probabilities.
            # NOTE(review): the remaining rows of decoder_outputs stay zero after
            # this break and still enter the masked loss.
            if (topi == EOS_TOKEN).sum().item() == topi.size(0):
                break

    loss = rnn_mask_loss(decoder_outputs.transpose(0, 1).contiguous(),
                         output_list.contiguous(), output_length)

    loss.backward()
    # In-place variant; the non-underscore clip_grad_norm is deprecated.
    torch.nn.utils.clip_grad_norm_(batch_encoder.parameters(), CLIP)
    torch.nn.utils.clip_grad_norm_(batch_decoder.parameters(), CLIP)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

### Encoders and Decoders

In [29]:
class BatchEncoderRNN(nn.Module):
    """Bidirectional GRU encoder for padded, length-sorted batches.

    input_size: source vocabulary size
    hidden_size: embedding width and GRU hidden width
    batch_size: default batch size used by initHidden()
    n_layers: number of stacked GRU layers
    dropout: dropout probability passed to nn.GRU
    """

    def __init__(self, input_size, hidden_size, batch_size, n_layers=1, dropout=0.1):
        super(BatchEncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional = True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        '''
        input_seqs: B x L index tensor, sorted by decreasing length
        input_lengths: true lengths of the B sequences
        hidden: optional initial state, (2 * n_layers) x B x H

        Returns (outputs, hidden): outputs is L x B x H with the two directions
        summed; hidden is the GRU's final state, (2 * n_layers) x B x H, with
        the directions kept separate.
        '''
        embedded = self.embedding(input_seqs.transpose(0,1))  # B x L -> L x B x H
        # Lengths arrive sorted from the loader, as pack_padded_sequence expects.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)
        # outputs is L x B x 2H here: sum the forward and backward halves.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        return outputs, hidden

    def initHidden(self, batch_size=None):
        """Return an all-zero initial state of shape (2 * n_layers) x B x H.

        batch_size: number of sequences B; defaults to the batch size given to
            the constructor. Pass the actual size for a shorter final batch.

        The tensor is created on the device that holds the module's parameters,
        so it matches the model whether or not the model was moved to the GPU.
        """
        if batch_size is None:
            batch_size = self.batch_size
        # Factor 2 because the GRU is bidirectional.
        return torch.zeros(self.n_layers * 2, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

In [30]:
# Smoke test: run the cached batch through a fresh 2-layer encoder and look at
# the resulting shapes.
batch_encoder = BatchEncoderRNN(input_zh.n_words, hidden_size, BATCH_SIZE, n_layers = 2, dropout = 0.1)
init_hidden = batch_encoder.initHidden()
encoder_outputs, encoder_hidden = batch_encoder(input_list, input_length, init_hidden)
# init_hidden and encoder_hidden keep one slice per layer and direction
# (2 * n_layers); only encoder_outputs has the two directions summed.
init_hidden.size(), encoder_outputs.size(), encoder_hidden.size()

(torch.Size([4, 3, 256]), torch.Size([77, 3, 256]), torch.Size([4, 3, 256]))

In [45]:
# RNN decoder with no attention used, batch implemented
# RNN decoder take one token at a time
class DecoderRNN(nn.Module):
    """GRU decoder without attention; decodes one token per call for a whole batch.

    emb_size: embedding width
    hidden_size: GRU hidden width
    output_size: target vocabulary size
    batch_size: stored for reference; forward() uses the actual batch size of
        its input
    n_layers: number of stacked GRU layers
    dropout_p: dropout probability for the embedding and passed to nn.GRU
    """

    def __init__(self, emb_size, hidden_size, output_size, batch_size, n_layers=1, dropout_p = 0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.batch_size = batch_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(output_size, emb_size,padding_idx = PAD_TOKEN)
        self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout = dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        input: B x 1 tensor of token indices
        hidden: n_layers x B x H previous hidden state

        Returns (out, hidden): out is B x output_size log-probabilities,
        hidden the updated n_layers x B x H state.
        """
        # Take the batch size from the input so a batch that differs from the
        # constructor's batch_size (e.g. the last one of an epoch) still works.
        batch_size = input.size(0)
        embedded = self.embedding(input).view(1, batch_size, -1)  # 1 x B x E
        embedded = self.dropout(embedded)
        output = F.relu(embedded)
        output, hidden = self.gru(output, hidden)  # output 1 x B x H
        out = self.out(output[0])
        out = self.softmax(out)
        return out, hidden
    
#     def initHidden(self):
#         return torch.zeros(self.n_layers, self.batch_size, self.hidden_size, device = device)

In [133]:
class Attn(nn.Module):
    """Luong-style global attention over the encoder outputs.

    method: scoring function, one of 'dot', 'general' or 'concat'
    hidden_size: width H of the decoder state and of the encoder outputs

    Raises ValueError for any other `method`.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        if method not in ('dot', 'general', 'concat'):
            raise ValueError("Unknown attention method: %r" % (method,))

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Explicit initialisation: a bare FloatTensor(1, H) holds whatever
            # happened to be in memory and can contain NaN/inf.
            bound = 1.0 / (hidden_size ** 0.5)
            self.v = nn.Parameter(torch.empty(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for one decoding step.

        hidden: 1 x B x H current decoder output
        encoder_outputs: S x B x H

        Returns B x 1 x S weights that sum to 1 over S. All B * S scores are
        computed with batched tensor operations on the inputs' own device.
        """
        keys = encoder_outputs.transpose(0, 1)  # B x S x H
        query = hidden.transpose(0, 1)  # B x 1 x H

        if self.method == 'dot':
            attn_energies = torch.bmm(query, keys.transpose(1, 2))  # B x 1 x S
        elif self.method == 'general':
            attn_energies = torch.bmm(query, self.attn(keys).transpose(1, 2))  # B x 1 x S
        else:  # 'concat'
            expanded = query.expand(-1, keys.size(1), -1)  # B x S x H
            energy = torch.tanh(self.attn(torch.cat((expanded, keys), 2)))  # B x S x H
            attn_energies = torch.sum(energy * self.v, dim=2).unsqueeze(1)  # B x 1 x S

        # Normalize energies to weights in range 0 to 1 over the source positions.
        return F.softmax(attn_energies, dim=2)

    def score(self, hidden, encoder_output):
        """Score a single decoder state against a single encoder output.

        Both arguments are flattened to 1-D vectors of length H. Returns a
        scalar tensor that matches the energy forward() computes for that pair.
        """
        hidden = hidden.reshape(-1)
        encoder_output = encoder_output.reshape(-1)
        if self.method == 'dot':
            return hidden.dot(encoder_output)
        if self.method == 'general':
            return hidden.dot(self.attn(encoder_output))
        # 'concat': v . tanh(W [h; h_s]); the vectors are 1-D, so concatenate on dim 0.
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_output), 0)))
        return self.v.reshape(-1).dot(energy)

In [149]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder with Luong attention; decodes one token per call for a batch.

    attn_model: scoring method handed to Attn ('dot', 'general', 'concat'), or
        'none' to build the module without an attention layer (forward()
        requires one)
    emb_size: embedding width (previously ignored in favour of hidden_size;
        behaviour is unchanged when the two are equal)
    hidden_size: GRU hidden width, must match the encoder output width
    output_size: target vocabulary size
    n_layers: number of stacked GRU layers
    dropout: dropout probability for the embedding and passed to nn.GRU
    """

    def __init__(self, attn_model, emb_size, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, emb_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Decode one step.

        input_seq: B x 1 token indices (the previous output words)
        last_hidden: n_layers x B x H previous hidden state
        encoder_outputs: S x B x H

        Returns (output, hidden, attn_weights): B x output_size
        log-probabilities, the new n_layers x B x H state, and the B x 1 x S
        attention weights (for visualization).
        """
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.emb_size)  # S=1 x B x E

        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)        # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Finally predict the next token (Luong eq. 6) as log-probabilities
        # over the vocabulary axis; the axis is now explicit.
        output = self.out(concat_output)
        output = F.log_softmax(output, dim=1)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

### Test for no attention decoder

In [135]:
#Inside the batch_train_function for testing purpose
# Smoke test of the attention-free decoder: one teacher-forced pass over the
# cached batch, mirroring the decoding loop of batch_train.
encoder1 = BatchEncoderRNN(input_zh.n_words, hidden_size, BATCH_SIZE).to(device)
# DecoderRNN takes (emb_size, hidden_size, output_size, batch_size); the old
# call passed attn_model as an extra first argument and shifted the rest.
no_batch_decoder = DecoderRNN(hidden_size, hidden_size, output_zh_en.n_words, BATCH_SIZE).to(device)
no_attn_decoder = no_batch_decoder  # the name this cell originally assigned

max_output_length = output_length.max().item()
print(max_output_length)


init_hidden = encoder1.initHidden()
encoder_outputs, encoder_hidden = encoder1(input_list, input_length, init_hidden)

curr_batch = input_list.size(0)#Take the current batch size
decoder_input = torch.tensor([[SOS_TOKEN]], device=device).repeat(curr_batch,1)
# The encoder is bidirectional; keep only as many hidden slices as the decoder has layers.
decoder_hidden = encoder_hidden[:no_batch_decoder.n_layers]
#encoder_outputs : L x B x H
decoder_outputs = torch.zeros(max_output_length, curr_batch, no_batch_decoder.output_size)
for di in range(max_output_length):
    # The attention-free decoder takes no encoder outputs and returns two values.
    decoder_output, decoder_hidden = no_batch_decoder(decoder_input, decoder_hidden)
    decoder_outputs[di] = decoder_output
    decoder_input = output_list[:,di].unsqueeze(1)

53
In function
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
3
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])
torch.Size([1, 3, 256])

In [153]:
# Sanity check of the loss: one training step with freshly initialised models.
# For an untrained model the value should be close to ln(target vocabulary size).
hidden_size = 256
attn_model = 'dot'
encoder1 = BatchEncoderRNN(input_zh.n_words, hidden_size, BATCH_SIZE).to(device)
batch_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, hidden_size, output_zh_en.n_words).to(device)
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(batch_decoder.parameters(), lr=learning_rate)
# criterion is accepted by batch_train but not used there; the loss comes from
# rnn_mask_loss.
criterion = torch.nn.CrossEntropyLoss().cuda()


# attention=True because batch_decoder is the Luong attention decoder.
loss= batch_train(input_list, input_length, output_list, output_length, 
            encoder1,batch_decoder, encoder_optimizer, 
           decoder_optimizer, True, criterion)
print(loss)

10.84076976776123


In [151]:
# Sanity check of the loss (second run, separate decoder instance): one
# training step with freshly initialised models. For an untrained model the
# value should be close to ln(target vocabulary size).
#RUN THIS!!!!
hidden_size = 256
attn_model = 'dot'
encoder1 = BatchEncoderRNN(input_zh.n_words, hidden_size, BATCH_SIZE).to(device)
batch_decoder1 = LuongAttnDecoderRNN(attn_model, hidden_size, hidden_size, output_zh_en.n_words).to(device)
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(batch_decoder1.parameters(), lr=learning_rate)
# criterion is accepted by batch_train but not used there; the loss comes from
# rnn_mask_loss.
criterion = torch.nn.CrossEntropyLoss().cuda()

# attention=True because batch_decoder1 is the Luong attention decoder.
loss= batch_train(input_list, input_length, output_list, output_length, 
            encoder1, batch_decoder1, encoder_optimizer, 
           decoder_optimizer, True, criterion)
print(loss)

10.827942848205566


In [None]:
#Overfitting test for batch_size = 3
init_hidden = batch_encoder.initHidden()
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(no_batch_decoder.parameters(), lr=learning_rate)
#criterion = torch.nn.NLLLoss()
criterion = torch.nn.CrossEntropyLoss().cuda()

train_loss_list = []
for i in range(1000):
    print('The current round is {}'.format(int(i)))
    loss = batch_train(input_list, input_length, output_list, output_length, 
            encoder1, no_batch_decoder, encoder_optimizer, 
           decoder_optimizer, criterion)
    train_loss_list.append(loss)
    
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline
fig, ax = plt.subplots(figsize = (12,10))
ax.plot(train_loss_list)