In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
use_cuda = torch.cuda.is_available()

In [2]:
def read_vocab(src):
    return {w:i for i,w in enumerate(open(src).read().splitlines())}

In [14]:
en_vocab_src = "./Data/vocab.en.txt"
vi_vocab_src = "./Data/vocab.vi.txt"
train_en_src = "./Data/train.en.txt"
train_vi_src = "./Data/train.vi.txt"
valid_en_src = "./Data/valid.en.txt"
valid_vi_src = "./Data/valid.vi.txt"

source_vocab = read_vocab(en_vocab_src)
target_vocab = read_vocab(vi_vocab_src)
MAX_LEN = 100 # 


In [15]:
# import pdb
# s_data = open(valid_en_src, "rb")
# for l in s_data:
#     print(l.split())
    

In [16]:
def data_iterator(s_src, t_src, s_vocab, t_vocab, max_sent_len=MAX_LEN, batch_size=1):
    s_data = open(s_src, "r")
    t_data = open(t_src, "r")
    f = lambda x: Variable(torch.LongTensor(x).view(1,-1))
    out_source, out_target, len_source, len_target = [], [], [], []
    batch_idx = 0
    for i, (s_line, t_line) in enumerate(zip(s_data, t_data)):
        if i - batch_idx >= batch_size:
            yield out_source, out_target, len_source, len_target
            out_source, out_target, len_source, len_target = [], [], [], []
            batch_idx = i
        a_source = [ s_vocab[w] if w in s_vocab else s_vocab["<unk>"] 
                      for w in s_line.replace("\n", "").split(" ")][:max_sent_len] ## could do reverse the input
        a_target = [ t_vocab[w] if w in t_vocab else t_vocab["<unk>"] 
                      for w in t_line.replace("/n", "</s>").split()]
        a_target.insert(0,t_vocab["<s>"])
        out_source.append(f(a_source))
        out_target.append(f(a_target))
        if (i+1)%batch_size == 0:
            yield (out_source), (out_target), len_source, len_target
            
            

In [17]:
# data = data_iterator(valid_en_src, valid_vi_src,source_vocab, target_vocab)
# for d, s,_,_ in data:
#     print(d)
#     print("source len:", len(d))
#     print("target len:", len(s))

In [52]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bi = bidirectional
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers = num_layers, bidirectional = self.bi)

    def forward(self, input, hidden=None):
        embedded = self.embedding(input).view(1,1,-1)
        output = embedded
#         pdb.set_trace()
        output, hidden = self.gru(output, hidden)
        if self.bi:
            pdb.set_trace()
            output = (output[:,:,:self.hidden_size]+
                      output[:,:,self.hidden_size:]) ##? why sum up
        return output, hidden
    def initHidden(self):
        direction = 2 if self.bi else 1
        result = Variable(torch.zeros(self.num_layers*direction, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [53]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1,1,-1)
        output = F.relu(output)
#         print(output.size())
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        result = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [56]:
def train(encoder, decoder, learning_rate=0.01):
    encoder_opt = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_opt = optim.SGD(decoder.parameters(), lr=learning_rate)
    data = data_iterator(valid_en_src, valid_vi_src,source_vocab, target_vocab)
    total = 0
    for source, target, _, _ in data:
        for s, t in zip(source, target):
            s = s.view(-1)
            t = t.view(-1)
            encoder_hidden = encoder.initHidden()

            encoder_opt.zero_grad()
            decoder_opt.zero_grad()

            source_len = s.size()[0]
            target_len = t.size()[0]
            
            encoder_outputs = Variable(torch.zeros(MAX_LEN, encoder.hidden_size))
            encoder_output = None
            
            loss = 0
            criterion = nn.NLLLoss()
            
            for en_i in range(source_len):
                encoder_output, encoder_hidden = encoder(s[en_i], encoder_hidden)
                encoder_outputs[en_i] = encoder_output[0][0]
            
            decoder_hidden = encoder_hidden
            
            for de_i in range(target_len):
                decoder_output, decoder_hidden = decoder(t[de_i], decoder_hidden)
                pdb.set_trace()
                loss += criterion(decoder_output, t[de_i]) # need 2D of output
            loss.backward()
            
            encoder_opt.step()
            decoder_opt.step()
            total += target_len
    return loss.data[0]/target_len
            
            

In [57]:
### Training
import pdb
hidden_size = 256
encoder = EncoderRNN(len(source_vocab), hidden_size)
decoder = DecoderRNN(hidden_size, len(target_vocab))
print(train(encoder, decoder))



> <ipython-input-56-bc67f37e2ce4>(32)train()
-> loss += criterion(decoder_output, t[de_i])
(Pdb) decoder_output
Variable containing:
-8.9754 -8.7584 -9.0226  ...  -8.8632 -8.9784 -8.9368
[torch.FloatTensor of size 1x7709]

(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(31)train()
-> pdb.set_trace()
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(32)train()
-> loss += criterion(decoder_output, t[de_i])
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(31)train()
-> pdb.set_trace()
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(32)train()
-> loss += criterion(decoder_output, t[de_i])
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(31)train()
-> pdb.set_trace()
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(32)train()
-> loss += criterion(decoder_output, t[de_i])
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(31)train()
-> pdb.set_trace()
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(32)train()
-> loss += criterion(decoder_output, t[de_i])
(Pdb) c
> <ipython-input-56-bc67f37e2ce4>(31)train()
-> pdb.set_trace()
(Pdb) c
> <ipython-in