In [191]:
from attention import Attention
from loss import Perplexity
import torch

In [39]:
# Random token ids in [0, 10) standing in for a source batch, laid out as (batch=5, seq_len=10).
input_var = torch.empty(5, 10, dtype=torch.long).random_(0, 10)

In [40]:
# Random token ids in [0, 10) standing in for a target batch, laid out as (batch=5, seq_len=10).
output_var = torch.empty(5, 10, dtype=torch.long).random_(0, 10)

In [41]:
# Toy encoder for the smoke-test cells below.
# NOTE(review): the fourth positional argument of the original call binds to `dropout` in the
# Encoder signature defined in this notebook, and 5 is not a probability — confirm this call
# is not left over from an older signature.
encoder = Encoder(vocab_size=10, hidden_size=10, embedding_size=10, dropout=5)

In [42]:
# Every row of input_var is full length, so pass seq_len once per example. Encoder.forward
# hands these lengths to pack_padded_sequence, which cannot work with the default of None.
# The module is called directly (not via .forward), matching how NMT.encode invokes it.
encoder_outputs, encoder_hidden = encoder(input_var, [input_var.size(1)] * input_var.size(0))

In [53]:
# Toy decoder for the smoke-test cells below.
# NOTE(review): with the Decoder signature defined in this notebook these positionals bind to
# hidden_size=10, embedding_size=20 and dropout=5 — confirm that is intended and not left
# over from an older argument order.
decoder = Decoder(vocab_size=10, hidden_size=10, embedding_size=2 * 10, dropout=5)

In [54]:
# Invoke the module itself (as NMT.decode does) rather than calling .forward directly.
result = decoder(output_var, encoder_hidden, encoder_outputs)



In [192]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from generalRnn import BaseCoder
from attention import Attention

import numpy as np

class Decoder(BaseCoder):
    """Recurrent decoder with attention that unrolls greedily, one token per step.

    The recurrent layer is built from ``self.baseModel``, which is supplied by
    ``BaseCoder`` (presumably the LSTM/GRU class selected by ``rnn`` — confirm in
    generalRnn). ``self.hidden_size`` is likewise expected to be set by ``BaseCoder``.
    """

    def __init__(self, vocab_size, hidden_size, embedding_size, dropout=0.0, n_layers=1, bidirectional=False,attention=False,rnn="lstm"):
        """
        Args:
            vocab_size: size of the target vocabulary; also the width of the output layer
            hidden_size: hidden size of the decoder recurrent layer
            embedding_size: dimension of the target token embeddings
            dropout: dropout between recurrent layers (forced to 0 when n_layers == 1)
            n_layers: number of stacked recurrent layers
            bidirectional: stored on the instance only; it is not passed to the recurrent layer
            attention: stored as ``attention_usage`` only; attention is always applied
            rnn: forwarded to ``BaseCoder``
        """
        super(Decoder,self).__init__(vocab_size, hidden_size,embedding_size,
                dropout, n_layers, rnn)
        self.bidirectional = bidirectional
        self.rnn = self.baseModel(input_size=embedding_size, hidden_size=hidden_size, num_layers=n_layers,
                    batch_first=True,dropout=(0 if n_layers == 1 else dropout))
        self.output_size = vocab_size
        self.attention_usage = attention
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.attention = Attention(self.hidden_size)

        # Projects the attended hidden state onto the vocabulary.
        self.wsm = nn.Linear(self.hidden_size, self.output_size)

    @staticmethod
    def _merge_directions(state):
        """Concatenate the even and odd slices of dim 0 along the feature dim.

        Turns (num_layers * 2, batch, hidden) into (num_layers, batch, 2 * hidden).
        """
        return torch.cat([state[0:state.size(0):2], state[1:state.size(0):2]], 2)

    def forward(self, input_seq, encoder_hidden, encoder_outputs, func=F.log_softmax):
        """Greedily decode ``input_seq.size(1)`` steps starting from column 0 of ``input_seq``.

        Args:
            input_seq: tensor of token ids indexed as (batch, seq_len). Only column 0 is fed
                to the network; every later step is fed the previous step's top-1 prediction.
                The width only determines how many steps are unrolled.
            encoder_hidden: final encoder state, either a tuple/list of tensors (e.g. LSTM
                ``(h, c)``) or a single tensor (e.g. GRU ``h``). Each tensor is assumed to be
                (num_layers * 2, batch, hidden) as produced by a bidirectional encoder; a
                unidirectional state is not supported by the merge below.
            encoder_outputs: encoder outputs handed to the attention module at every step
            func: normalisation applied over the vocabulary dimension (called with ``dim=1``)

        Returns:
            (outputs, decoder_hidden): ``outputs`` is a list with one (batch, vocab_size)
            tensor per step; ``decoder_hidden`` is the recurrent state after the last step.
        """
        max_length = input_seq.size(1)

        # Merge the two encoder directions. A single tensor must be merged as a whole:
        # iterating over it would slice along the layer dimension instead.
        if isinstance(encoder_hidden, (tuple, list)):
            decoder_hidden = tuple(self._merge_directions(h) for h in encoder_hidden)
        else:
            decoder_hidden = self._merge_directions(encoder_hidden)

        outputs = []

        prev = input_seq[:, 0].unsqueeze(1)
        for _ in range(max_length):
            softmax, decoder_hidden, _ = self.forward_helper(prev, decoder_hidden, encoder_outputs, func)
            step_scores = softmax.squeeze(1) # batch * vocab_size
            outputs.append(step_scores)
            prev = step_scores.topk(1)[1] # index of the highest-scoring token, shape (batch, 1)

        return outputs,decoder_hidden

    # could insert one parameter like: src_matrix
    def forward_helper(self, decoder_input, decoder_hidden, encoder_outputs, func):
        """Run the recurrent layer and attention over ``decoder_input`` and score the vocabulary.

        Args:
            decoder_input: token ids indexed as (batch, steps)
            decoder_hidden: recurrent state passed to ``self.rnn``
            encoder_outputs: encoder outputs passed to ``self.attention``
            func: normalisation applied over the vocabulary dimension (called with ``dim=1``)

        Returns:
            (scores, hidden, attention): ``scores`` has shape (batch, steps, vocab_size);
            ``hidden`` is the new recurrent state; ``attention`` is the second value
            returned by ``self.attention``.
        """
        batch_size = decoder_input.size(0)
        steps = decoder_input.size(1)
        embedded = self.embedding(decoder_input)
        output,hidden = self.rnn(embedded, decoder_hidden)
        output, attention = self.attention(output, encoder_outputs) # attention
        # .contiguous() guards .view() in case the attention output is not laid out contiguously.
        flat = output.contiguous().view(-1, self.hidden_size)
        softmax = func(self.wsm(flat), dim=1).view(batch_size, steps, -1)
        return softmax, hidden, attention


In [193]:
import torch
import torch.nn as nn
from generalRnn import BaseCoder
class Encoder(BaseCoder):
    """Embeds source token ids and runs them through a recurrent layer.

    The recurrent layer is built from ``self.baseModel``, which is supplied by
    ``BaseCoder`` (presumably the LSTM/GRU class selected by ``rnn`` — confirm in generalRnn).
    """

    def __init__(self,vocab_size, hidden_size, embedding_size, dropout=0.0, n_layers=1, bidirectional=True,rnn="lstm"):
        """
        Args:
            vocab_size: size of the source vocabulary
            hidden_size: hidden size of the recurrent layer (per direction)
            embedding_size: dimension of the source token embeddings
            dropout: dropout between recurrent layers (forced to 0 when n_layers == 1)
            n_layers: number of stacked recurrent layers
            bidirectional: whether the recurrent layer runs in both directions
            rnn: forwarded to ``BaseCoder``
        """
        super(Encoder, self).__init__(vocab_size, hidden_size,embedding_size,
                dropout, n_layers, rnn)
        self.embedding = nn.Embedding(vocab_size,embedding_size)

        self.rnn = self.baseModel(input_size=embedding_size, hidden_size=hidden_size, num_layers=n_layers,
                    batch_first=True,dropout=(0 if n_layers == 1 else dropout), bidirectional=bidirectional)

    def forward(self, input_seq, input_lengths=None):
        """Encode a batch of token ids.

        Args:
            input_seq: tensor of token ids indexed as (batch, seq_len)
            input_lengths: per-example sequence lengths used to pack the padded batch.
                They are passed unchanged to ``pack_padded_sequence``, which by default
                expects them in descending order. When ``None``, every row is treated
                as full length and no packing is done.

        Returns:
            (output, hidden): the recurrent layer's outputs (batch first) and its final state.
        """
        embedded = self.embedding(input_seq)

        # Without lengths there is nothing to pack; run the layer on the dense batch.
        if input_lengths is None:
            output, hidden = self.rnn(embedded)
            return output, hidden

        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True)
        packed_output, hidden = self.rnn(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        return output, hidden

In [194]:
# Load the parallel corpus and pair each source sentence with its target sentence.
train_data_src = read_corpus("data/train.de-en.en.wmixerprep", source='src')
train_data_tgt = read_corpus("data/train.de-en.de.wmixerprep", source='tgt')
train_data = list(zip(train_data_src, train_data_tgt))

# NOTE: unpickling can execute arbitrary code — only load a vocabulary file you built yourself.
# The context manager closes the file handle, which the bare open() call left open.
with open("VOCAB_FILE", 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [235]:
import math
import pickle
import sys
import time
from collections import namedtuple

import numpy as np
from typing import List, Tuple, Dict, Set, Union
from docopt import docopt
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction

from utils import read_corpus, batch_iter
from vocab import Vocab, VocabEntry

import torch
import torch.nn as nn
from torch.autograd import Variable

from encoder import Encoder
from decoder import Decoder
from torch import optim

from loss import Perplexity
from optim import Optimizer


Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
class NMT(object):
    """Encoder/decoder translation model bundled with its optimizer and loss.

    Calling the model on a batch runs one full training step (forward, backward,
    optimizer update) and returns the loss reported by ``self.loss``.
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """
        Args:
            embed_size: embedding dimension used by both encoder and decoder
            hidden_size: encoder hidden size; the decoder is built with 2 * hidden_size
            vocab: object exposing ``src`` and ``tgt`` entries that support ``len()`` and
                ``words2indices()``
            dropout_rate: dropout value forwarded to the encoder
        """
        super(NMT, self).__init__()

        # Keep the vocabulary on the instance so __call__ does not depend on a notebook global.
        self.vocab = vocab
        nvocab_src = len(vocab.src)
        nvocab_tgt = len(vocab.tgt)
        self.encoder = Encoder(nvocab_src, hidden_size, embed_size, dropout=dropout_rate, n_layers=1)
        # The decoder concatenates the two encoder directions, hence the doubled hidden size.
        self.decoder = Decoder(nvocab_tgt, 2*hidden_size, embed_size, n_layers=1)
        params = list(self.encoder.parameters()) + list(self.decoder.parameters())
        self.optimizer = optim.Adam(params, lr=0.01)
        weight = torch.ones(nvocab_tgt)
        self.loss = Perplexity(weight, 0)

    def __call__(self, src_sents, tgt_sents):
        """
        Take a mini-batch of source and target sentences, run one training step on it
        (see `decode`) and return the resulting loss.

        Args:
            src_sents: list of source sentence tokens
            tgt_sents: list of target sentence tokens, wrapped by `<s>` and `</s>`

        Returns:
            scores: the value returned by `self.loss.get_loss()` for this batch
        """
        src_ids = self.vocab.src.words2indices(src_sents)
        tgt_ids = self.vocab.tgt.words2indices(tgt_sents)
        src_padded, src_len, y_input, y_tgt, _ = sent_padding(src_ids, tgt_ids)
        src_encodings, decoder_init_state = self.encode(src_padded, src_len)
        scores = self.decode(src_encodings, decoder_init_state, [y_input, y_tgt])

        return scores

    def encode(self, src_sents,input_lengths):
        """
        Use a GRU/LSTM to encode source sentences into hidden states

        Args:
            src_sents: padded tensor of source token ids
            input_lengths: length of each source sentence, forwarded to the encoder

        Returns:
            src_encodings: the encoder's per-token outputs
            decoder_init_state: the encoder's final state, used to initialise the decoder
        """
        encoder_outputs, encoder_hidden = self.encoder(src_sents,input_lengths)

        return encoder_outputs, encoder_hidden

    def decode(self, src_encodings, decoder_init_state, tgt_sents):
        """
        Decode the target batch, accumulate the loss over all steps and apply one
        optimizer update.

        Side effects: resets `self.loss`, clears the parameter gradients, back-propagates
        and steps `self.optimizer`.

        Args:
            src_encodings: hidden states of tokens in source sentences
            decoder_init_state: decoder GRU/LSTM's initial state
            tgt_sents: pair `[tgt_input, tgt_target]` of padded id tensors; `tgt_target`
                is indexed column by column against the decoder's per-step outputs

        Returns:
            scores: the value returned by `self.loss.get_loss()`
        """
        tgt_input, tgt_target = tgt_sents
        decoder_outputs, _ = self.decoder(tgt_input, decoder_init_state, src_encodings)
        loss = self.loss
        loss.reset()
        batch_size = tgt_input.size(0)
        for step, step_output in enumerate(decoder_outputs):
            loss.eval_batch(step_output.contiguous().view(batch_size, -1), tgt_target[:, step])

        # Gradients accumulate across backward() calls, so clear the previous batch's
        # gradients before back-propagating this one.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        scores = loss.get_loss()

        return scores

    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """
        Given a single source sentence, perform beam search

        Args:
            src_sent: a single tokenized source sentence
            beam_size: beam size
            max_decoding_time_step: maximum number of time steps to unroll the decoding RNN

        Returns:
            hypotheses: a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence

        Raises:
            NotImplementedError: always; beam search has not been written yet.
        """
        # The stub previously returned an undefined name; fail with an explicit error instead.
        raise NotImplementedError("beam_search is not implemented yet")

#     def evaluate_ppl(self, dev_data: List[Any], batch_size: int=32):
#         """
#         Evaluate perplexity on dev sentences

#         Args:
#             dev_data: a list of dev sentences
#             batch_size: batch size
        
#         Returns:
#             ppl: the perplexity on dev sentences
#         """

#         cum_loss = 0.
#         cum_tgt_words = 0.

#         # you may want to wrap the following code using a context manager provided
#         # by the NN library to signal the backend to not to keep gradient information
#         # e.g., `torch.no_grad()`

#         for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
#             loss = -model(src_sents, tgt_sents).sum()

#             cum_loss += loss
#             tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting the leading `<s>`
#             cum_tgt_words += tgt_word_num_to_predict

#         ppl = np.exp(cum_loss / cum_tgt_words)

#         return ppl

#     @staticmethod
#     def load(model_path: str):
#         """
#         Load a pre-trained model

#         Returns:
#             model: the loaded model
#         """

#         return model

#     def save(self, path: str):
#         """
#         Save current model to file
#         """

#         raise NotImplementedError()

In [236]:
# Build the translation model: 128-dim embeddings, 256 hidden units, using the loaded vocabulary.
model = NMT(embed_size=128, hidden_size=256, vocab=vocab)



In [245]:
# Smoke test of the preprocessing path: take the first shuffled batch of 100 pairs,
# run both sides through words2indices, pad them, then stop after that one batch.
# The return value of sent_padding is discarded; src_sents / tgt_sents stay bound
# to the converted batch after the cell finishes.
# NOTE(review): sent_padding is defined in a later cell of this notebook, so this cell
# only works once that cell has been run.
for src_sents, tgt_sents in batch_iter(train_data, batch_size=100, shuffle=True):
#     src_length = [len(sent) for sent in src_sents]
#     tgt_length = [len(sent) for sent in tgt_sents]
#     model(src_sents, tgt_sents)
    src_sents = vocab.src.words2indices(src_sents)
    tgt_sents = vocab.tgt.words2indices(tgt_sents)
    sent_padding(src_sents, tgt_sents)
    break

In [246]:
def sent_padding(src_sents, tgt_sents):
    """Zero-pad a batch of index sequences and split targets into decoder input/target.

    Each target sentence ``t`` is split into ``t[:-1]`` (decoder input) and ``t[1:]``
    (prediction target). Padding uses index 0.

    Args:
        src_sents: list of source sentences, each a list of int token ids
        tgt_sents: list of target sentences, each a list of int token ids; must have
            the same number of sentences as ``src_sents``

    Returns:
        A 5-tuple ``(src, src_lens, y_input, y_target, tgt_lens)``:
            src: LongTensor (batch, max source length)
            src_lens: list of source lengths, in batch order
            y_input: LongTensor (batch, max target length); row i holds ``tgt[i][:-1]``
            y_target: LongTensor (batch, max target length); row i holds ``tgt[i][1:]``
            tgt_lens: list of ``len(tgt[i]) - 1`` values, in batch order
        The target tensors keep the full target length as their width, so their last
        column is always padding.

    Note:
        Sentences are not reordered here. A caller that packs the batch by length is
        responsible for any ordering that packing requires.
    """
    batch_size = len(src_sents)
    assert batch_size == len(tgt_sents)

    # Size the buffers by the longest sentence in the batch, not by the first one:
    # the first target is not necessarily the longest, even in a source-sorted batch.
    max_src_len = max((len(sent) for sent in src_sents), default=0)
    max_tgt_len = max((len(sent) for sent in tgt_sents), default=0)

    # Integer buffers so the conversion to LongTensor needs no float round-trip.
    padded_src_sents = np.zeros((batch_size, max_src_len), dtype=np.int64)
    padded_Yinput = np.zeros((batch_size, max_tgt_len), dtype=np.int64)
    padded_Ytarget = np.zeros((batch_size, max_tgt_len), dtype=np.int64)

    src_lens = []
    tgt_lens = []

    for i, (src_sent, tgt_sent) in enumerate(zip(src_sents, tgt_sents)):
        y_input = tgt_sent[:-1]
        y_target = tgt_sent[1:]

        src_len = len(src_sent)
        tgt_len = len(y_input)

        padded_src_sents[i, :src_len] = src_sent
        padded_Yinput[i, :tgt_len] = y_input
        padded_Ytarget[i, :tgt_len] = y_target

        src_lens.append(src_len)
        tgt_lens.append(tgt_len)

    return torch.LongTensor(padded_src_sents), src_lens, \
           torch.LongTensor(padded_Yinput), torch.LongTensor(padded_Ytarget), tgt_lens