In [2]:
SOS_TOKEN = 2  # index of the start-of-sentence token (<s>) used as the first decoder input
EOS_TOKEN = 3  # index of the end-of-sentence token; decoding stops once it has been emitted


class LSTM(t.nn.Module):
    """
    Implementation of `Sequence to Sequence Learning with Neural Networks`
    https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf

    The source sentence is reversed, embedded and run through ``encoder_rnn``. The top-layer
    encoder hidden state is concatenated to every embedded target token before it is fed to
    ``decoder_rnn``, and ``hidden2out`` maps the decoder outputs to target-vocabulary scores.

    NOTE THAT ITS INPUT SHOULD HAVE THE BATCH SIZE FIRST !!!!!
    """

    def __init__(self, params, source_embeddings=None, target_embeddings=None):
        """
        :param params: dict of hyper-parameters. Keys read here (with defaults): `cuda`
            (CUDA_DEFAULT), `hidden_dim` (100), `batch_size` (32), `source_vocab_size`,
            `target_vocab_size`, `embedding_dim`, `num_layers` (1), `dropout` (0.5),
            `embed_dropout` (None), `train_embedding` (False), `beam_size` (3).
        :param source_embeddings: optional pre-trained source embedding matrix (vocab x dim)
        :param target_embeddings: optional pre-trained target embedding matrix (vocab x dim)
        :raises ValueError: if both pre-trained matrices are given with different embedding dims
        :raises AssertionError: if the embedding dim or a vocab size cannot be determined
        """
        super(LSTM, self).__init__()
        print("Initializing LSTM")
        self.cuda_flag = params.get('cuda', CUDA_DEFAULT)
        self.model_str = 'LSTM'
        self.params = params

        # Initialize hyperparams.
        self.hidden_dim = params.get('hidden_dim', 100)
        self.batch_size = params.get('batch_size', 32)
        self.source_vocab_size = params.get('source_vocab_size')
        self.target_vocab_size = params.get('target_vocab_size')
        if source_embeddings is not None and target_embeddings is not None:
            # Both embeddings feed RNNs sized from a single `embedding_dim`, so a mismatch can
            # never work; fail here instead of silently falling back to params['embedding_dim'].
            if source_embeddings.size(1) != target_embeddings.size(1):
                raise ValueError("source and target pre-trained embeddings must have the same embedding dim")
            self.embedding_dim = source_embeddings.size(1)
        else:
            # if you dont provide both pre-trained embeddings, you have to provide the dim
            self.embedding_dim = params.get('embedding_dim')
        # A pre-trained matrix also fixes the vocabulary size when params does not give one.
        if self.source_vocab_size is None and source_embeddings is not None:
            self.source_vocab_size = source_embeddings.size(0)
        if self.target_vocab_size is None and target_embeddings is not None:
            self.target_vocab_size = target_embeddings.size(0)
        assert self.embedding_dim is not None and self.source_vocab_size is not None and self.target_vocab_size is not None
        self.output_size = self.target_vocab_size
        self.num_layers = params.get('num_layers', 1)
        self.dropout = params.get('dropout', 0.5)
        self.embed_dropout = params.get('embed_dropout')
        self.train_embedding = params.get('train_embedding', False)

        # Initialize embeddings. Pre-trained weights stay frozen unless `train_embedding` is set.
        self.source_embeddings = t.nn.Embedding(self.source_vocab_size, self.embedding_dim)
        self.target_embeddings = t.nn.Embedding(self.target_vocab_size, self.embedding_dim)
        if source_embeddings is not None:
            self.source_embeddings.weight = t.nn.Parameter(source_embeddings, requires_grad=self.train_embedding)
        if target_embeddings is not None:
            self.target_embeddings.weight = t.nn.Parameter(target_embeddings, requires_grad=self.train_embedding)

        # Initialize network modules.
        self.encoder_rnn = t.nn.LSTM(self.embedding_dim, self.hidden_dim, dropout=self.dropout, num_layers=self.num_layers, batch_first=True)
        # The decoder input is the target embedding concatenated with the encoder context.
        self.decoder_rnn = t.nn.LSTM(self.embedding_dim + self.hidden_dim, self.hidden_dim, dropout=self.dropout, num_layers=self.num_layers, batch_first=True)
        self.hidden2out = t.nn.Linear(self.hidden_dim, self.output_size)
        self.hidden_enc = self.init_hidden()
        self.hidden_dec = self.init_hidden()
        if self.embed_dropout:
            self.dropout_1s = t.nn.Dropout(self.dropout)
            self.dropout_1t = t.nn.Dropout(self.dropout)
        self.dropout_2 = t.nn.Dropout(self.dropout)

        # Beam Search related.
        self.beam = np.array([])
        self.beam_size = params.get('beam_size', 3)

    def init_hidden(self, batch_size=None):
        """
        Build a zero (h, c) state pair for one of the LSTMs.

        :param batch_size: minibatch size of the state; defaults to `self.batch_size`
        :return: tuple of two zero arrays wrapped by `variable`, each with axes
            (num_layers, minibatch_size, hidden_dim)
        """
        bs = self.batch_size if batch_size is None else batch_size
        return tuple((
            variable(np.zeros((self.num_layers, bs, self.hidden_dim)), cuda=self.cuda_flag),
            variable(np.zeros((self.num_layers, bs, self.hidden_dim)), cuda=self.cuda_flag)
        ))

    def forward(self, x_source, x_target):
        """
        Teacher-forced pass: encode the source, then score every next target token.

        `self.hidden_enc` and `self.hidden_dec` are read as the initial RNN states and are
        overwritten with the states the RNNs return, so they persist across calls.

        :param x_source: the source sentence (batch x source_len token ids)
        :param x_target: the target (translated) sentence (batch x target_len token ids)
        :return: output of `hidden2out` over the decoder outputs, one score vector of size
            `target_vocab_size` per decoder input token (all target tokens but the last)
        """
        # EMBEDDING
        xx_source = self.reverse_source(x_source)
        embedded_x_source = self.source_embeddings(xx_source)
        # The decoder input is the target sentence without its last token. It has to go through
        # the *target* embedding table (as `translate` does), not the source one.
        embedded_x_target = self.target_embeddings(x_target[:, :-1])
        if self.embed_dropout:
            embedded_x_source = self.dropout_1s(embedded_x_source)
            embedded_x_target = self.dropout_1t(embedded_x_target)

        # ENCODING SOURCE SENTENCE INTO FIXED LENGTH VECTOR
        _, self.hidden_enc = self.encoder_rnn(embedded_x_source, self.hidden_enc)

        # DECODING
        embedded_x_target = self.append_hidden_to_target(embedded_x_target)
        rnn_out, self.hidden_dec = self.decoder_rnn(embedded_x_target, self.hidden_dec)
        rnn_out = self.dropout_2(rnn_out)

        # OUTPUT
        out_linear = self.hidden2out(rnn_out)
        return out_linear

    def _encode_for_decoding(self, x_source):
        """Reset both hidden states for the batch in `x_source` and encode it into `self.hidden_enc`."""
        batch_size = x_source.size(0)
        # Size the states from the actual batch, which may be smaller than `self.batch_size`.
        self.hidden_enc = self.init_hidden(batch_size)
        self.hidden_dec = self.init_hidden(batch_size)

        # EMBEDDING
        xx_source = self.reverse_source(x_source)
        embedded_x_source = self.source_embeddings(xx_source)
        if self.embed_dropout:
            embedded_x_source = self.dropout_1s(embedded_x_source)

        # ENCODING SOURCE SENTENCE INTO FIXED LENGTH VECTOR
        _, self.hidden_enc = self.encoder_rnn(embedded_x_source, self.hidden_enc)

    def translate(self, x_source, max_len=50):
        """
        Greedy decoding: repeatedly append the highest-scoring next token.

        Puts the module in eval mode and resets `hidden_enc` / `hidden_dec`.

        :param x_source: the source sentences (batch x source_len token ids)
        :param max_len: maximum number of tokens generated after SOS; guards against a model
            that never emits EOS
        :return: batch x (1 + generated_len) token ids, each row starting with SOS_TOKEN.
            Decoding stops once every row contains EOS_TOKEN or `max_len` tokens were generated.
        """
        # INITIALIZE
        self.eval()
        batch_size = x_source.size(0)
        self._encode_for_decoding(x_source)
        hidden = self.hidden_dec

        x_target = (SOS_TOKEN * t.ones(batch_size, 1)).long()  # `2` is the SOS token (<s>)
        x_target = variable(x_target, to_float=False, cuda=self.cuda_flag)

        finished = np.zeros(batch_size, dtype=bool)  # rows that have already emitted EOS
        steps = 0
        while not finished.all() and steps < max_len:
            # `hidden` already summarises the prefix, so only the newest token is fed;
            # re-feeding the whole prefix would make the decoder see earlier tokens twice.
            embedded_x_target = self.target_embeddings(x_target[:, -1:])
            embedded_x_target = self.append_hidden_to_target(embedded_x_target)
            dec_out, hidden = self.decoder_rnn(embedded_x_target, hidden)
            hidden = hidden[0].detach(), hidden[1].detach()
            dec_out = self.dropout_2(dec_out.detach())

            # OUTPUT
            pred = self.hidden2out(dec_out).detach()
            next_token = pred.max(2)[1]  # argmax over the vocabulary axis
            x_target = t.cat([x_target, next_token], 1).detach()

            # should you stop ? only when every sentence has produced its EOS (`3`)
            finished |= (next_token.data.cpu().numpy().reshape(-1) == EOS_TOKEN)
            steps += 1
        return x_target

    def _beam_step(self, prefix):
        """
        Score the next token for one hypothesis per batch element.

        :param prefix: int64 numpy array (batch x prefix_len) of token ids starting with SOS
        :return: `(scores, tokens)`, two numpy arrays of shape (batch x beam_size) holding the
            log-probabilities and ids of the `beam_size` best next tokens
        """
        x_target = variable(t.from_numpy(prefix), to_float=False, cuda=self.cuda_flag).long()
        embedded_x_target = self.target_embeddings(x_target)
        embedded_x_target = self.append_hidden_to_target(embedded_x_target)
        # Every hypothesis is decoded from the same fresh initial state over its whole prefix,
        # so no hidden state leaks from one hypothesis (or one step) into another.
        dec_out, _ = self.decoder_rnn(embedded_x_target, self.hidden_dec)
        dec_out = self.dropout_2(dec_out[:, -1:, :].detach())

        # OUTPUT
        pred = self.hidden2out(dec_out).detach()
        log_probs = t.nn.functional.log_softmax(pred, dim=2)
        top_scores, top_tokens = t.topk(log_probs, self.beam_size, dim=2)
        n_rows = prefix.shape[0]
        # `.astype` copies, so the arrays can be edited without touching tensor memory.
        scores = top_scores.data.cpu().numpy().reshape(n_rows, self.beam_size).astype(np.float64)
        tokens = top_tokens.data.cpu().numpy().reshape(n_rows, self.beam_size).astype(np.int64)
        return scores, tokens

    def translate_beam(self, x_source, max_len=50):
        """
        Beam-search decoding keeping the `beam_size` best hypotheses per source sentence.

        Hypotheses are ranked by the sum of their token log-probabilities. A hypothesis that
        has emitted EOS is frozen: it is only extended with further EOS tokens at no cost.
        Puts the module in eval mode, resets `hidden_enc` / `hidden_dec` and stores the result
        in `self.beam`.

        :param x_source: the source sentences (batch x source_len token ids)
        :param max_len: maximum number of tokens generated after SOS
        :return: int64 numpy array of shape (batch, beam_size, 1 + generated_len), hypotheses
            sorted best first. Decoding stops once every hypothesis contains EOS_TOKEN or
            `max_len` tokens were generated.
        """
        # INITIALIZE
        self.eval()
        batch_size = x_source.size(0)
        self._encode_for_decoding(x_source)

        # BEAM IS MAINTAINED AS BATCH_SIZE x N_HYPOTHESES x LENGTH (a single SOS hypothesis at first)
        beam = np.full((batch_size, 1, 1), SOS_TOKEN, dtype=np.int64)
        beam_scores = np.zeros((batch_size, 1), dtype=np.float64)
        rows = np.arange(batch_size)[:, None]

        terminate_beam = False
        steps = 0
        while not terminate_beam and steps < max_len:
            collective_children = []
            collective_scores = []
            for b in range(beam.shape[1]):
                prefix = np.ascontiguousarray(beam[:, b, :])
                step_scores, step_tokens = self._beam_step(prefix)

                # Freeze finished hypotheses: their only viable child is "prefix + EOS".
                done = (prefix[:, 1:] == EOS_TOKEN).any(axis=1)
                step_scores[done, :] = -np.inf
                step_scores[done, 0] = 0.0
                step_tokens[done, :] = EOS_TOKEN

                repeated_prefix = np.repeat(prefix[:, None, :], self.beam_size, axis=1)
                collective_children.append(np.concatenate([repeated_prefix, step_tokens[:, :, None]], axis=2))
                collective_scores.append(beam_scores[:, b:b + 1] + step_scores)

            all_children = np.concatenate(collective_children, axis=1)  # batch x candidates x length
            all_scores = np.concatenate(collective_scores, axis=1)      # batch x candidates

            # argsort is ascending, hence the negation; mergesort keeps ties in a stable order.
            best = np.argsort(-all_scores, axis=1, kind='mergesort')[:, :self.beam_size]
            beam = all_children[rows, best]
            beam_scores = all_scores[rows, best]

            # Check if every beam has EOS token and it has happened for all elements in the batch.
            terminate_beam = bool((beam[:, :, 1:] == EOS_TOKEN).any(axis=2).all())
            steps += 1

        self.beam = beam
        return self.beam

    @staticmethod
    def reverse_source(x):
        """
        Reverse the source sentence x. Empirically it was observed to work better in terms of final valid PPL, and especially for long sentences
        `x` is the integer-encoded sentence. It is a batch x sentence_length LongTensor
        """
        # assume that the batch_size is the first dim
        # NOTE(review): unlike the other `variable(...)` calls in this class, no `cuda` flag is
        # passed here - confirm the helper's default matches `cuda_flag`.
        columns = [x.data[:, -1:]] + [x.data[:, -(k + 1):-k] for k in range(1, x.size(1))]
        return variable(t.cat(columns, 1), to_float=False)

    def append_hidden_to_target(self, x):
        """Append the top-layer hidden state of self.hidden_enc to all timesteps of x"""
        # self.hidden_enc[0] this is h. Size num_layers x batch x hdim
        h = self.hidden_enc[0]
        # h[-1:, :, :].permute(1,0,2) this is h for the last layer. Size batch x 1 x hdim
        h_last = h[-1:, :, :].permute(1, 0, 2)
        hidden = t.cat(x.size(1) * [h_last], 1)
        return t.cat([x, hidden], 2)

In [1]:
import torch as t
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import os

os.chdir('../HW3')  # so that the project-local imports below resolve even when HW3 is not already the working directory
from utils import *
from const import *
import argparse
import torch as t
from process_params import check_args, get_params
from const import *
from train_models import train, validate
from data_process import generate_iterators, generate_text
from utils import *
t.manual_seed(1)

import torchtext
from torchtext.vocab import Vectors, GloVe
from utils import variable
from const import *
import numpy as np
from torch.autograd import Variable
import spacy
from torchtext import data
from torchtext import datasets
import pickle
from copy import deepcopy

In [5]:
def generate_iterators(BATCH_SIZE=32, MAX_LEN=20, load_data=False):
    """
    Build the German -> English IWSLT train/validation iterators.

    :param BATCH_SIZE: batch size passed to `data.BucketIterator.splits`
    :param MAX_LEN: examples whose source or target has more than MAX_LEN tokens are dropped
        (only used when the datasets are built, i.e. `load_data=False`)
    :param load_data: if False, tokenize with spaCy, build the datasets with
        `datasets.IWSLT.splits` and build both vocabularies; if True, unpickle the datasets
        and fields from the current working directory instead
    :return: `(train_iter, val_iter, EN, DE)`
    """
    if not load_data:
        spacy_de = spacy.load('de')
        spacy_en = spacy.load('en')

        def tokenize_de(text):
            return [tok.text for tok in spacy_de.tokenizer(text)]

        def tokenize_en(text):
            return [tok.text for tok in spacy_en.tokenizer(text)]

        BOS_WORD = '<s>'
        EOS_WORD = '</s>'
        DE = data.Field(tokenize=tokenize_de)
        EN = data.Field(tokenize=tokenize_en, init_token=BOS_WORD, eos_token=EOS_WORD)  # only target needs BOS/EOS

        # The test split is returned by `splits` but not used here.
        train, val, _ = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN),
                                              filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
                                                                    len(vars(x)['trg']) <= MAX_LEN)
        MIN_FREQ = 5  # words seen fewer than MIN_FREQ times in the training split are left out of the vocab
        DE.build_vocab(train.src, min_freq=MIN_FREQ)
        EN.build_vocab(train.trg, min_freq=MIN_FREQ)
    else:
        # NOTE: the original author marked this branch as "does not work...".
        # SECURITY: pickle.load executes arbitrary code from the file; only load trusted pickles.
        with open('train.pkl', 'rb') as f:
            train = pickle.load(f)
        with open('val.pkl', 'rb') as f:
            val = pickle.load(f)
        with open('DE.torchtext.Field.pkl', 'rb') as f:
            DE = pickle.load(f)
        with open('EN.torchtext.Field.pkl', 'rb') as f:
            EN = pickle.load(f)

    # Both branches honour the caller's BATCH_SIZE (the pickle branch used to force it to 32).
    train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=-1,
                                                      repeat=False, sort_key=lambda x: len(x.src))
    return train_iter, val_iter, EN, DE

In [16]:
# Build the data iterators first: the vocabulary sizes the model needs depend on the text.
train_iter, val_iter, EN, DE = generate_iterators(BATCH_SIZE=32, MAX_LEN=20, load_data=False)
# Source side is German (DE), target side is English (EN).
for key, field in (('source_vocab_size', DE), ('target_vocab_size', EN)):
    model_params[key] = len(field.vocab.itos)
args = dict()

downloading de-en.tgz
.data\iwslt\de-en\IWSLT16.TED.dev2010.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.dev2010.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TED.tst2010.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.tst2010.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TED.tst2011.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.tst2011.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TED.tst2012.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.tst2012.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TED.tst2013.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.tst2013.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TED.tst2014.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TED.tst2014.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TEDX.dev2012.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TEDX.dev2012.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TEDX.tst2013.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TEDX.tst2013.de-en.en.xml
.data\iwslt\de-en\IWSLT16.TEDX.tst2014.de-en.de.xml
.data\iwslt\de-en\IWSLT16.TEDX.tst2014.de-en.en.xml
.data\iwslt\de-en\train.tags.de-en.de
.data\iwslt\de-e

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2213: character maps to <undefined>

NameError: name 'DE' is not defined

In [27]:
# spaCy pipelines whose tokenizers split the raw German / English text.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def tokenize_de(text):
    """Return the token strings produced by the German spaCy tokenizer for `text`."""
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    return tokens


def tokenize_en(text):
    """Return the token strings produced by the English spaCy tokenizer for `text`."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens


def _within_max_len(example):
    """Keep an example only if both its source and target have at most MAX_LEN tokens."""
    fields = vars(example)
    return all(len(fields[side]) <= MAX_LEN for side in ('src', 'trg'))


BOS_WORD, EOS_WORD = '<s>', '</s>'
DE = data.Field(tokenize=tokenize_de)
# only target needs BOS/EOS
EN = data.Field(init_token=BOS_WORD, eos_token=EOS_WORD, tokenize=tokenize_en)
train, val, test = datasets.IWSLT.splits(fields=(DE, EN), exts=('.de', '.en'),
                                         filter_pred=_within_max_len)
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2213: character maps to <undefined>

In [3]:
# Dummy source batch: 32 sentences of 20 token ids, every id equal to 1.
x_source = Variable(t.ones(32, 20)).long()
# Smallest set of hyper-parameters the model needs when no pre-trained embeddings are given.
params = dict(source_vocab_size=10000, target_vocab_size=10000, embedding_dim=50)

In [4]:
# Build the model from `params` alone; no pre-trained embeddings are passed.
model = LSTM(params)
# Switch to evaluation mode. `eval()` is the cell's last expression, so the module repr is displayed.
model.eval()

Initializing LSTM


LSTM (
  (source_embeddings): Embedding(10000, 50)
  (target_embeddings): Embedding(10000, 50)
  (encoder_rnn): LSTM(50, 100, batch_first=True, dropout=0.5)
  (decoder_rnn): LSTM(150, 100, batch_first=True, dropout=0.5)
  (hidden2out): Linear (100 -> 10000)
  (dropout_2): Dropout (p = 0.5)
)

In [6]:
# Run beam-search decoding on the dummy batch and print the shape of the returned array.
final_beam = model.translate_beam(x_source)
print(final_beam.shape)

(32, 3, 51)


In [None]:
# NOTE(review): in the cells shown here `pred` only exists as a local variable inside
# LSTM.translate / LSTM.translate_beam, and this cell has no execution count - it looks like a
# leftover scratch expression (index of the max over axis 2); confirm before running it.
pred.max(2)[1]
