In [1]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2
# Select matplotlib's interactive "notebook" backend for figures.
%matplotlib notebook

## Numberish

### Grammar
#### Translate each English word into its length (number of characters)
Example: 'I', 'often', 'wonder', 'if', 'it', 'might', 'be', 'X' -> '1', '5', '6', '2', '2', '5', '2', '1'

#### Swap every word at position n (where n % 3 == 0) with the word at position n + 1 (where n % 3 == 1); positions are 0-based
Example: '1', '5', '6', '2', '2', '5', '2', '1' -> '5', '1', '6', '2', '2', '5', '1', '2'

In [2]:
import sys
from my_utils import Dictionary, DataLoader, Trainer

# Vocabularies for the source (English words) and target (Numberish symbols).
# The target dictionary is created with the '<BOS>'/'<EOS>' tokens.
en_dict = Dictionary()
num_dict = Dictionary(['<BOS>', '<EOS>'])

# Each entry is (list of English words, list of single-character Numberish symbols).
dataset = []
with open('./eng-num.txt', 'r') as f:
    # Iterate the file lazily; only the first 1000 lines are needed, so there
    # is no reason to load every line into memory with readlines().
    for i, line in enumerate(f):
        if i == 1000: break
        # Strip only the line terminator. Blindly slicing off the last
        # character (line[:-1]) would drop a real symbol when the final line
        # of the file has no trailing newline.
        eng, num = line.rstrip('\n').split('\t')
        eng_list = eng.split()   # whitespace-separated English words
        num_list = list(num)     # one token per character
        dataset.append((eng_list, num_list))
        for w in eng_list:
            en_dict.add_word(w)
        for n in num_list:
            num_dict.add_word(n)

print('English vocabulary: ', len(en_dict))
print('Numberish vocabulary: ', len(num_dict))

English vocabulary:  2069
Numberish vocabulary:  12


In [3]:
def numericalize(seq, dictionary):
    """Return a list with the result of calling ``dictionary`` on each token of ``seq``, in order."""
    return list(map(dictionary, seq))
# Convert both sides of every sentence pair through their respective dictionaries.
numericalized = [
    (numericalize(src_tokens, en_dict), numericalize(tgt_tokens, num_dict))
    for src_tokens, tgt_tokens in dataset
]

In [6]:
from torch_models.models import MLP, LSTMEncoder
import torch.nn as nn
import torch

class Seq2Seq(nn.Module):
    """Encoder-decoder sequence model trained with teacher forcing.

    The source batch is run through ``self.encoder``; the second value it
    returns is handed to ``self.decoder`` together with the BOS-prefixed
    targets. The decoder outputs are flattened and scored by ``self.out_mlp``
    against the EOS-suffixed targets.

    Args:
        embed_size: size passed to both LSTMEncoder instances and used as the
            input dimension of the output MLP.
        src_vocab_size: vocabulary size given to the encoder.
        tgt_vocab_size: vocabulary size given to the decoder and used as the
            output dimension of the MLP.
        tgt_BOS: index of the begin-of-sequence token in the target vocabulary.
        tgt_EOS: index of the end-of-sequence token in the target vocabulary.
        num_layers: forwarded to both LSTMEncoder instances.
    """

    def __init__(self, embed_size, src_vocab_size, tgt_vocab_size, tgt_BOS, tgt_EOS, num_layers=1):
        super().__init__()
        self.encoder = LSTMEncoder(embed_size, src_vocab_size, bidirectional=False, num_layers=num_layers)
        self.decoder = LSTMEncoder(embed_size, tgt_vocab_size, bidirectional=False, num_layers=num_layers)
        self.out_mlp = MLP(dims=[embed_size, tgt_vocab_size])

        self.tgt_BOS = tgt_BOS
        self.tgt_EOS = tgt_EOS

    def fit(self, inputs, targets, optimizer):
        """Run one training call on a batch and return the value of ``out_mlp.fit``.

        Args:
            inputs: source batch, passed unchanged to the encoder.
            targets: iterable of 1-D target index tensors (one per sequence).
            optimizer: forwarded to ``self.out_mlp.fit``; presumably the
                backward pass and optimizer step happen there - TODO confirm.

        Returns:
            Whatever ``self.out_mlp.fit`` returns (named ``loss`` here).
        """
        # encoding
        _, enc_hiddens = self.encoder.forward(inputs)
        # decoding: the decoder reads <BOS> + target ...
        BOS_targets = self._append_BOS(targets)
        decoded, _ = self.decoder.forward(BOS_targets, enc_hiddens)
        decoded = self._flatten_and_unpad(decoded)
        # predicting: ... and is scored against target + <EOS>.
        # NOTE(review): this assumes the rows of `decoded` come out in the same
        # sequence order as `targets`; verify that LSTMEncoder does not reorder
        # the batch (e.g. by length) without restoring the order.
        targets_EOS = self._append_EOS_flatten(targets)
        loss = self.out_mlp.fit(decoded, targets_EOS, optimizer)
        return loss

    def _append_BOS(self, targets):
        """Return a list with the BOS index prepended to every target tensor."""
        # new_tensor() creates the BOS token with the target's dtype and device,
        # so the concatenation also works for non-int64 or non-CPU targets.
        return [torch.cat((target.new_tensor([self.tgt_BOS]), target)) for target in targets]

    def _append_EOS_flatten(self, targets):
        """Append the EOS index to every target and concatenate all of them into one 1-D tensor."""
        # Same dtype/device handling as in _append_BOS.
        EOS_targets = [torch.cat((target, target.new_tensor([self.tgt_EOS]))) for target in targets]
        return torch.cat(EOS_targets)

    def _flatten_and_unpad(self, decoded):
        """Flatten decoder outputs to 2-D and drop every row that contains ``-inf``.

        Presumably the decoder marks padded time steps with ``-inf`` - confirm
        against LSTMEncoder. If every row is dropped, an empty
        ``(0, embed_size)`` tensor is returned.
        """
        # reshape() also accepts non-contiguous inputs, unlike view().
        flat = decoded.reshape(-1, self.decoder.embed_size)  # (batch * seq_len, embed_size)
        # Vectorised row filter: keep rows in which no element equals -inf.
        keep = ~(flat == float('-inf')).any(dim=1)
        return flat[keep]

In [11]:
from torch_models.models import LSTMEncoder, DotAttn
from torch_models.utils import seq2seq, get_device

# NOTE(review): `device` is not referenced again in the visible cells, and the
# model is not moved to it - confirm whether that is intended.
device = get_device()

# Batches of 4 numericalized pairs, transformed by the project's seq2seq helper.
train_loader = DataLoader(
    numericalized,
    batch_size=4,
    trans_func=seq2seq,
)

# Embedding size 50; vocabulary sizes and special-token indices come from the
# dictionaries built while loading the data.
model = Seq2Seq(
    embed_size=50,
    src_vocab_size=len(en_dict),
    tgt_vocab_size=len(num_dict),
    tgt_BOS=num_dict('<BOS>'),
    tgt_EOS=num_dict('<EOS>'),
)

===== Device =====
cpu


In [13]:
from my_utils import Trainer, EvaluatorC
from torch.optim import Adam

# Adam with its default settings over every parameter registered on the model.
optimizer = Adam(model.parameters())

# No evaluator is wired in: an EvaluatorC(model, test_loader) was sketched here
# but is disabled, so training runs without validation or score monitoring.
trainer = Trainer(model, train_loader)

# NOTE(review): the saved output of this cell lists epochs 0-9, which does not
# appear to match max_epoch=1 - presumably the output is stale; re-run to confirm.
trainer.train(
    optimizer,
    max_epoch=1,
    evaluator=None,
    score_monitor=None,
    show_log=True,
    hook_func=None,
)

epoch 0  	loss: 2.084116714000702	
epoch 1  	loss: 2.0596342821121216	
epoch 2  	loss: 2.0508489060401915	
epoch 3  	loss: 2.034707717895508	
epoch 4  	loss: 2.028004758358002	
epoch 5  	loss: 2.0195835165977476	
epoch 6  	loss: 2.025053680419922	
epoch 7  	loss: 2.031168773174286	
epoch 8  	loss: 2.0377994151115417	
epoch 9  	loss: 2.0425978140830994	
