In [1]:
from attention import Attention
from loss import NLLLoss
import torch

In [2]:
import torch
import torch.nn as nn
from generalRnn import BaseCoder

class Encoder(BaseCoder):
    def __init__(self,vocab_size, hidden_size, embedding_size, input_dropout=0.0,output_dropout=0.0, n_layers=1, bidirectional=True,rnn="lstm"):
        super(Encoder, self).__init__(vocab_size, hidden_size, embedding_size, input_dropout,output_dropout, n_layers, rnn)
        self.embedding = nn.Embedding(vocab_size,embedding_size)

        # TODO: add pretrained embeddings

        self.rnn = self.baseModel(input_size=embedding_size, hidden_size=hidden_size, num_layers=n_layers,
                    batch_first=True, bidirectional=bidirectional, dropout=output_dropout)

    def forward(self, input_seq, input_lengths=None):
        embedded = self.embedding(input_seq)
        # embedded = self.input_dropout(embedded)
        # embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True)
        output, hidden = self.rnn(embedded)
        # output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        return output, hidden

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from generalRnn import BaseCoder
from attention import Attention

import numpy as np

class Decoder(BaseCoder):
    def __init__(self, vocab_size, hidden_size, embedding_size, input_dropout=0.0, output_dropout=0.0, n_layers=1, bidirectional=False,rnn="lstm"):
        super(Decoder,self).__init__(vocab_size, hidden_size,embedding_size,input_dropout,output_dropout, n_layers, rnn)
        self.rnn = self.baseModel(input_size=embedding_size, hidden_size=hidden_size, num_layers=n_layers, 
                    batch_first=True,dropout=output_dropout)
        self.output_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # temporary set attention embedding size to hidden size
        self.attention = Attention(self.hidden_size)

        self.wsm = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_seq, encoder_hidden, encoder_outputs, func=F.log_softmax):
        # batch_size = input_seq.size(0)
        max_length = input_seq.size(1)

        # using cuda or not
        inputs = input_seq
        
        # for bidrectional encoder
        # encoder_hidden: (num_layers * num_directions, batch_size, hidden_size)
        decoder_hidden = tuple([torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2) for h in encoder_hidden])

        outputs = []
        # lengths = np.array([max_length] * batch_size)

        prev = inputs[:, 0].unsqueeze(1)
        for i in range(max_length):
            softmax, decoder_hidden, attention = self.forward_helper(prev, decoder_hidden,encoder_outputs ,func)
            output_seq = softmax.squeeze(1) # batch * seq_length
            outputs.append(output_seq)
            prev = output_seq.topk(1)[1] # max probability index

        return outputs,decoder_hidden
            


    # could insert one parameter like: src_matrix
    def forward_helper(self, decoder_input, decoder_hidden, encoder_outputs, func):
        batch_size = decoder_input.size(0)
        output_size = decoder_input.size(1)
        embedded = self.embedding(decoder_input)
        # embedded = self.input_dropout(embedded)
        output,hidden = self.rnn(embedded, decoder_hidden)
        output, attention = self.attention(output, encoder_outputs) # attention
        softmax = func(self.wsm(output.view(-1, self.hidden_size)), dim=1).view(batch_size,output_size,-1)
        return softmax, hidden, attention

In [32]:
src_sents = torch.LongTensor(5,10).random_(0,10) # batch * seqlen
src_sents

tensor([[9, 6, 7, 1, 7, 6, 4, 0, 1, 1],
        [1, 1, 0, 6, 5, 7, 8, 5, 8, 6],
        [9, 6, 8, 7, 9, 6, 8, 5, 8, 4],
        [6, 9, 1, 4, 2, 4, 6, 0, 6, 0],
        [4, 5, 7, 9, 0, 2, 9, 1, 6, 2]])

In [37]:
tgt_sents = torch.LongTensor(5,10).random_(0,10)
tgt_sents

tensor([[7, 1, 9, 0, 3, 0, 6, 8, 1, 4],
        [8, 9, 7, 2, 1, 2, 4, 5, 6, 2],
        [7, 9, 7, 0, 6, 1, 7, 5, 0, 9],
        [7, 2, 1, 8, 4, 1, 2, 5, 6, 6],
        [8, 6, 4, 8, 0, 4, 8, 9, 6, 4]])

In [26]:
encoder = Encoder(10,10,5)

In [16]:
encoder_outputs, encoder_hidden = encoder.forward(input_var)

In [9]:
encoder_outputs.size()

torch.Size([5, 10, 20])

In [18]:
decoder = Decoder(10,2*10,5)

In [19]:
result = decoder.forward(output_var,encoder_hidden,encoder_outputs)



In [49]:
def sent_padding(src_sents, tgt_sents):
    batch_size = len(src_sents)

    max_src_len = max([len(sent) for sent in src_sents])
    max_tgt_len = max([len(sent) for sent in tgt_sents])

    padded_src_sents = np.zeros((batch_size, max_src_len))
    padded_Yinput = np.zeros((batch_size, max_tgt_len))
    padded_Ytarget = np.zeros((batch_size, max_tgt_len))

    src_lens = []
    tgt_lens = []

    for i, sent in enumerate(zip(src_sents, tgt_sents)):
        src_sent = sent[0]
        y_input = sent[1][:-1]
        y_target = sent[1][1:]

        src_len = len(src_sent)
        tgt_len = len(y_input)

        padded_src_sents[i, :src_len] = src_sent
        padded_Yinput[i, :tgt_len] = y_input
        padded_Ytarget[i, :tgt_len] = y_target

        src_lens.append(src_len)
        tgt_lens.append(tgt_len)

    return torch.LongTensor(padded_src_sents), src_lens, \
           torch.LongTensor(padded_Yinput), torch.LongTensor(padded_Ytarget), tgt_lens

In [52]:
class NMT(object):

    def __init__(self, embed_size, hidden_size, nvocab_src, nvocab_tgt, dropout_rate=0.2):
        super(NMT, self).__init__()

#         nvocab_src = len(vocab.src)
#         nvocab_tgt = len(vocab.tgt)
#         self.vocab = vocab
        self.encoder = Encoder(nvocab_src, hidden_size, embed_size, input_dropout=dropout_rate, n_layers=1)
        self.decoder = Decoder(nvocab_tgt, 2*hidden_size, embed_size,output_dropout=dropout_rate, n_layers=1)
        LAS_params = list(self.encoder.parameters()) + list(self.decoder.parameters())
        self.optimizer = optim.Adam(LAS_params, lr=0.001)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.5)
        weight = torch.ones(nvocab_tgt)
        self.loss = NLLLoss(weight=weight, mask=0)

        if torch.cuda.is_available():
            # Move the network and the optimizer to the GPU
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()
            self.loss.cuda()


    def __call__(self, src_sents, tgt_sents):
        """
        take a mini-batch of source and target sentences, compute the log-likelihood of 
        target sentences.

        Args:
            src_sents: list of source sentence tokens
            tgt_sents: list of target sentence tokens, wrapped by `<s>` and `</s>`

        Returns:
            scores: a variable/tensor of shape (batch_size, ) representing the 
                log-likelihood of generating the gold-standard target sentence for 
                each example in the input batch
        """
        src_sents = self.vocab.src.words2indices(src_sents)
        tgt_sents = self.vocab.tgt.words2indices(tgt_sents)
        src_sents, src_len, y_input, y_tgt, tgt_len  = sent_padding(src_sents, tgt_sents)
        src_encodings, decoder_init_state = self.encode(src_sents,src_len)
        scores = self.decode(src_encodings, decoder_init_state, [y_input, y_tgt])

        return scores

    def encode(self, src_sents, input_lengths):
        """
        Use a GRU/LSTM to encode source sentences into hidden states

        Args:
            src_sents: list of source sentence tokens

        Returns:
            src_encodings: hidden states of tokens in source sentences, this could be a variable 
                with shape (batch_size, source_sentence_length, encoding_dim), or in orther formats
            decoder_init_state: decoder GRU/LSTM's initial state, computed from source encodings
        """
        encoder_outputs, encoder_hidden = self.encoder(src_sents,input_lengths)

        return encoder_outputs, encoder_hidden


    def decode(self, src_encodings, decoder_init_state, tgt_sents):
        """
        Given source encodings, compute the log-likelihood of predicting the gold-standard target
        sentence tokens

        Args:
            src_encodings: hidden states of tokens in source sentences
            decoder_init_state: decoder GRU/LSTM's initial state
            tgt_sents: list of gold-standard target sentences, wrapped by `<s>` and `</s>`

        Returns:
            scores: could be a variable of shape (batch_size, ) representing the 
                log-likelihood of generating the gold-standard target sentence for 
                each example in the input batch
        """
        tgt_input,tgt_target = tgt_sents
        decoder_outputs, decoder_hidden = self.decoder(tgt_input, decoder_init_state, src_encodings)
        loss = self.loss
        loss.reset()
        for step, step_output in enumerate(decoder_outputs):
            batch_size = tgt_input.size(0)
            loss.eval_batch(step_output.contiguous().view(batch_size, -1), tgt_target[:, step])
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        scores = loss.get_loss()

        return scores


    def decode_without_bp(self, src_encodings, decoder_init_state, tgt_sents):
        """
        Given source encodings, compute the log-likelihood of predicting the gold-standard target
        sentence tokens

        Args:
            src_encodings: hidden states of tokens in source sentences
            decoder_init_state: decoder GRU/LSTM's initial state
            tgt_sents: list of gold-standard target sentences, wrapped by `<s>` and `</s>`

        Returns:
            scores: could be a variable of shape (batch_size, ) representing the
                log-likelihood of generating the gold-standard target sentence for
                each example in the input batch
        """
        tgt_input,tgt_target = tgt_sents
        decoder_outputs, decoder_hidden = self.decoder(tgt_input, decoder_init_state, src_encodings)
        loss = self.loss
        loss.reset()
        for step, step_output in enumerate(decoder_outputs):
            batch_size = tgt_input.size(0)
            loss.eval_batch(step_output.contiguous().view(batch_size, -1), tgt_target[:, step])

        scores = loss.get_loss()

        return decoder_outputs, scores

    # TODO: sent_padding for only src
    # def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
    def beam_search(self, src_sent, beam_size, max_decoding_time_step):
        """
        Given a single source sentence, perform beam search

        Args:
            src_sent: a single tokenized source sentence
            beam_size: beam size
            max_decoding_time_step: maximum number of time steps to unroll the decoding RNN

        Returns:
            hypotheses: a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """

        hypotheses = 0
        return hypotheses
    
    # def evaluate_ppl(self, dev_data: List[Any], batch_size: int=32):
    def evaluate_ppl(self, dev_data, batch_size):
        """
        Evaluate perplexity on dev sentences

        Args:
            dev_data: a list of dev sentences
            batch_size: batch size
        
        Returns:
            ppl: the perplexity on dev sentences
        """

        cum_loss = 0.
        count = 0

        # you may want to wrap the following code using a context manager provided
        # by the NN library to signal the backend to not to keep gradient information
        # e.g., `torch.no_grad()`

        ref_corpus = []
        hyp_corpus = []
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            ref_corpus.extend(tgt_sents)
            src_sents = self.vocab.src.words2indices(src_sents)
            tgt_sents = self.vocab.tgt.words2indices(tgt_sents)
            src_sents, src_len, y_input, y_tgt, tgt_len = sent_padding(src_sents, tgt_sents)
            src_encodings, decoder_init_state = self.encode(src_sents, src_len)
            decoder_outputs, loss = self.decode_without_bp(src_encodings, decoder_init_state, [y_input, y_tgt])
            cum_loss += loss
            count += 1

            # decoder outputs to word sequence
            hyp_np = np.zeros((len(tgt_sents), len(decoder_outputs), len(self.vocab.tgt)))

            for step in range(len(decoder_outputs)):
                tmp = decoder_outputs[step].cpu().data.numpy()
                # print(tmp.shape)
                hyp_np[:, step, :] = tmp
            # print(hyp_np.shape)

            # converting softmax to word string
            for b in range(hyp_np.shape[0]):
                word_seq = []
                for step in range(hyp_np.shape[1]):
                    pred_idx = np.argmax(hyp_np[b,step,:])
                    # print(pred_idx)
                    if pred_idx == self.vocab.tgt.word2id['</s>']:
                        break
                    word_seq.append(self.vocab.tgt.id2word[pred_idx])
                hyp_corpus.append(word_seq)

            # tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting the leading `<s>`
            # cum_tgt_words += tgt_word_num_to_predict

        # ppl = np.exp(cum_loss / cum_tgt_words)
        for r, h in zip(ref_corpus, hyp_corpus):
            print(r)
            print(h)
            print()
        bleu = compute_corpus_level_bleu_score(ref_corpus, hyp_corpus)
        print('bleu score: ', bleu)

        return cum_loss / count

    # @staticmethod
    def load(self, model_path):

        self.encoder.load_state_dict(torch.load(model_path + '-encoder'))
        self.decoder.load_state_dict(torch.load(model_path + '-decoder'))

    def save(self, model_save_path):
        """
        Save current model to file
        """
        torch.save(self.encoder.state_dict(), model_save_path + '-encoder')
        torch.save(self.decoder.state_dict(), model_save_path + '-decoder')

In [54]:
nmt = NMT(5,10,10,10)

  "num_layers={}".format(dropout, num_layers))


NameError: name 'optim' is not defined