In [1]:
import torch
from torch import Tensor, LongTensor
import torch.nn as nn
from torch.nn import Module
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from data import  CornellVocab, CornellMovie, OpenSubVocab, OpenSub, sort_batch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from utils import parse, length_to_mask, masked_cross_entropy_loss, save_checkpoint
from tensorboardX import SummaryWriter
from seq import Encoder, Decoder, run
# from inference import Beam
import os
import sys
import random
from time import strftime, localtime, time
from torch.nn.functional import softmax, log_softmax

In [23]:
class Beam():
    def __init__(self, beam_size, vocab, alpha, n_best, use_cuda):
        self.beam_size = beam_size
        self.vocab = vocab
        self.alpha = alpha
        self.n_best = n_best

        self.prevs = [] # pointer to sequence in beam
        self.nexts = [torch.zeros(beam_size).fill_(vocab.SOS)]
        if use_cuda: self.nexts = [t.cuda() for t in self.nexts]
        self.attns = []
        self.scores = torch.zeros(beam_size)
        self.all_scores = []
        if use_cuda: self.scores = self.scores.cuda()
        self.finished = [] # list of tuples, (index within beam, output index, score)
        self.stop = False

    def get_last_words(self):
        return self.nexts[-1]
    
    def get_last_root(self):
        return self.prevs[-1]

    def advance(self, logits, attn):
        """
        Args:
        `logits`: log probability of each candidate sequence for generating next word, beam_size x vocab_size
        `attn`: attention vectors of decoder
        """
        if len(self.prevs) == 0:
            beam_scores = logits[0]
        else:
            beam_scores = self.scores.unsqueeze(1).expand_as(logits) + logits
            for i in range(self.nexts[-1].size(0)):
                if self.nexts[-1][i] == self.vocab.EOS:
                    beam_scores[i] = -1e20 
        print(beam_scores)
                
        
        
            #TODO: Block Children of finish sentence
            #TODO: Normalization over length

        flat_beam_scores = beam_scores.view(-1)
        best_scores, best_word_id = flat_beam_scores.topk(self.beam_size, 0, True, True)
        print(best_scores)
        print(best_word_id)
        self.all_scores.append(self.scores)
        self.scores = best_scores
        prev = best_word_id / self.vocab.vocab_size
        prev = prev.data.long()
        self.prevs.append(prev)
        next_idx = (best_word_id % self.vocab.vocab_size).data.long()
        self.nexts.append(next_idx)
        self.attns.append(attn.index_select(0, prev))
        for idx, word_idx in enumerate(self.nexts[-1]):
            if word_idx == self.vocab.EOS:
                self.finished.append((idx, len(self.nexts)-1, self.scores.data[idx]))
        if self.nexts[-1][0] == self.vocab.EOS:
            self.all_scores.append(self.scores)
            self.stop = True

    def topk(self, k):
        """
        If this beam has finished searching, get the top k best sequence. If there are less than k completed sentences,
        add partial sentences.
        """
        self.finished.sort(key=lambda x : x[2]) #TODO: Check why this is inverse
        scores = [s for _, _, s in self.finished]
        idx = [(word_idx, beam_idx) for (word_idx, beam_idx, _) in self.finished]
        makeup = k-len(idx)
        for i, (score, word_idx) in enumerate(zip(self.scores, self.nexts[-1])):
            if i > makeup -1: continue
            scores.append(score)
            idx.append((i, len(self.nexts)-1))
                       
        def get_pred(word_idx, beam_idx):
            pred = []
            attn = []
            for i in range(len(self.prevs[:beam_idx]), -1, -1):
                pred.append(self.nexts[i][word_idx])
                attn.append(self.attns[i-1][word_idx])
                word_idx = self.prevs[i-1][word_idx]
            attn.reverse()
            pred.reverse()
            return pred, torch.stack(attn)
        preds = [get_pred(*x) for x in idx]
        sentences, attns = zip(*preds)
        return sentences, attns

In [3]:
args = {
    "epoch":20,
    "batch_size":1,
    "num_workers":4,
    "train_path":"../data/dev/2020_dev",
    "test_path":"../data/dev/2020_dev",
    "vocab_size":25000,
    "embed_size":1000,
    "hidden_size":1000,
    "num_layers":4,
    "clip_thresh":1,
    "seed":1,
    "lr":0.1,
    "global_max_target_len":20,
    "cuda":torch.cuda.is_available,
    "resume":"checkpoint-29999",
    "dir":"final",
    "reverse": False,
    "vocab_path":"../data/movie_25000",
    "dropout":0.2
}
class AttributeDict(dict):
    def __getattr__(self, attr):
        return self[attr]
    def __setattr__(self, attr, value):
        self[attr] = value
args = AttributeDict(args)

In [4]:
vocab = OpenSubVocab(args.vocab_path)
train_data = OpenSub(args, vocab, args.train_path)
test_data = OpenSub(args, vocab, args.test_path)

In [5]:
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=args.batch_size,
                                           shuffle=True, collate_fn=sort_batch,
                                           num_workers=args.num_workers)
test_loader = torch.utils.data.DataLoader(test_data,
                                          batch_size=args.batch_size,
                                          shuffle=True, collate_fn=sort_batch,
                                          num_workers=args.num_workers)

In [6]:
encoder = Encoder(args, train_data.source_vocab.vocab_size).cuda() if args.cuda else Encoder(args, train_data.source_vocab.vocab_size)
decoder = Decoder(args, train_data.target_vocab.vocab_size).cuda() if args.cuda else Decoder(args, train_data.target_vocab.vocab_size)


encoder_optim = optim.SGD(encoder.parameters(), lr=args.lr)
decoder_optim = optim.SGD(decoder.parameters(), lr=args.lr)

In [7]:
checkpoint = torch.load(os.path.join(args.dir, args.resume))
encoder.load_state_dict(checkpoint['encoder_state'])
decoder.load_state_dict(checkpoint['decoder_state'])


In [8]:
use_cuda = True

In [25]:
source, source_lens, target, target_lens = next(iter(test_loader))
source, target = Variable(source, volatile=True), Variable(target, volatile=True)
if use_cuda: source, target = source.cuda(), target.cuda()
batch_size = source.size()[1]
encoder_outputs, encoder_last_hidden = encoder(source, source_lens, None)


In [66]:
decoder_hidden = encoder_last_hidden

In [67]:
max_tgt_len = 1
beam_size = 1

In [68]:
decoder_hidden = (decoder_hidden[0].repeat(1,beam_size,1), decoder_hidden[1].repeat(1,beam_size,1))
encoder_outputs = encoder_outputs.repeat(1,beam_size,1)
source_lens = torch.LongTensor(source_lens).repeat(1,beam_size,1).view(-1).tolist()

In [69]:
make_beam = lambda : Beam(beam_size, vocab, 0, 2, use_cuda)
beams = [make_beam() for _ in range(batch_size)]

In [78]:
for l in range(max_tgt_len):
    last_words = torch.stack([b.get_last_words() for b in beams])
    last_words = Variable(last_words).t().contiguous().view(1, -1).squeeze(0).long()
    print(last_words)
    if use_cuda: last_words = last_words.cuda()
    logits, decoder_hidden, atten_scores = decoder(last_words,encoder_outputs,source_lens,decoder_hidden)
    logits = log_softmax(logits, 1)
    logits = logits.view(beam_size, batch_size, -1)
    atten_scores = atten_scores.view(beam_size, batch_size, -1)

    for j, b in enumerate(beams):
        b.advance(logits[:, j], atten_scores.data[:, j])
        last_roots = b.get_last_root()
        for d in decoder_hidden:
            layer_size = d.size(0)
            beam_batch = d.size(1)
            hidden_size = d.size(2)
            sent_states = d.view(layer_size, beam_size, beam_batch // beam_size,
                    hidden_size)[:, :, j]
            sent_states.data.copy_(sent_states.data.index_select(1, last_roots))


Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

Variable containing:
-1.0000e+20 -1.0000e+20 -1.0000e+20  ...  -1.0000e+20 -1.0000e+20 -1.0000e+20
[torch.cuda.FloatTensor of size 1x25004 (GPU 0)]

Variable containing:
-1.0000e+20
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]



In [79]:
print(b.nexts)

[
 25002
[torch.cuda.FloatTensor of size 1 (GPU 0)]
, 
 5
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 39
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 37
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 136
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 5
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 46
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 25001
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]
, 
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]
]


In [80]:
b.prevs

[
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)], 
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)]]

In [81]:
preds, _ = b.topk(3)

In [82]:
print(test_data.target_vocab.to_text(source.data[:, 0]))
print(test_data.target_vocab.to_text(target.data[:, 0]))
print(test_data.target_vocab.to_text(preds[0]))
# print(test_data.target_vocab.to_text(preds[1]))
# print(test_data.target_vocab.to_text(preds[2]))

i 'm so glad we met her <end>
how about taking a look at the lion she works with ? <end>
<start>i 'm not sure i know <end>
