In [1]:
import math
import time
import takepod
from takepod.datasets import BucketIterator, Iterator, BasicSupervisedImdbDataset
from takepod.storage import Field, Vocab
from takepod.storage.vectorizers.impl import GloVe
from takepod.models import Experiment, AbstractSupervisedModel
from takepod.models.trainer import AbstractTrainer

def lowercase(raw, data):
    """Post-tokenization hook that lower-cases every token.

    Args:
        raw: The raw value of the example; passed through untouched.
        data: Iterable of tokens, each exposing a ``lower()`` method.

    Returns:
        A ``(raw, tokens)`` tuple where ``tokens`` is a new list holding the
        lower-cased form of each input token, in the original order.
    """
    lowered = []
    for token in data:
        lowered.append(token.lower())
    return raw, lowered

def max_length(raw, data, length=200):
    """Post-tokenization hook that keeps only the leading ``length`` tokens.

    Args:
        raw: The raw value of the example; passed through untouched.
        data: Sliceable sequence of tokens.
        length: Upper bound of the slice applied to ``data`` (default 200).

    Returns:
        A ``(raw, tokens)`` tuple where ``tokens`` is ``data[:length]``.
    """
    truncated = data[:length]
    return raw, truncated

In [2]:
# Fetch the dataset class's default field definitions.
# NOTE(review): `fields` is rebound by `fields = create_fields()` in a later
# cell before anything reads it, so this value is never consumed in the
# visible notebook.
fields = BasicSupervisedImdbDataset.get_default_fields()

In [3]:
def create_fields():
    """Build the text and label fields used to load the dataset.

    The text field gets a vocabulary limited by ``max_size=10000`` and
    ``min_freq=5`` and two post-tokenization hooks, registered in this order:
    ``lowercase`` and then ``max_length``. The label field is marked as the
    target and uses a vocabulary without special symbols.

    Returns:
        dict mapping each field's ``name`` to the field object
        (keys ``'text'`` and ``'label'``).
    """
    text_vocab = Vocab(max_size=10000, min_freq=5)
    text = Field(name='text', vocab=text_vocab, tokenizer='spacy', store_as_raw=False)

    # Preprocessing hooks run after tokenization, in registration order.
    for hook in (lowercase, max_length):
        text.add_posttokenize_hook(hook)

    # TODO: a dedicated label field type would improve readability here.
    label = Field(name='label', vocab=Vocab(specials=()), is_target=True, tokenize=False)

    fields_by_name = {}
    for field in (text, label):
        fields_by_name[field.name] = field
    return fields_by_name

In [4]:
# Build the custom text/label fields and use them to load the IMDB
# train and test splits.
fields = create_fields()
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_train_test_dataset(fields)

In [5]:
# Construct the vectorizer based on the text field's vocabulary and ask it
# for the embedding matrix of that vocabulary.
# NOTE(review): `embeddings` is not passed to the model anywhere in the
# visible cells (AttentionRNN builds its nn.Embedding from sizes only) —
# confirm whether the pretrained vectors are meant to be loaded.
vocab = fields['text'].vocab
vectorizer = GloVe()
vectorizer.load_vocab(vocab)
embeddings = vectorizer.get_embedding_matrix(vocab)

In [6]:
# One-step variant: the return value of `load_vocab` is used directly as the
# embedding matrix. Per the original author this works on the
# "simplify vectorizer" branch. This rebinds `embeddings` from the previous cell.
embeddings = GloVe().load_vocab(vocab)
print(embeddings)

[[ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.04656    0.21318   -0.0074364 ...  0.0090611 -0.20989    0.053913 ]
 ...
 [-0.24734    0.019346   0.13974   ...  0.34035    0.0824     0.38554  ]
 [ 0.67287   -0.43249    0.1106    ... -0.16644    0.21169    0.45995  ]
 [ 0.034368   0.22004    0.14626   ... -0.18641   -0.032439   0.24544  ]]


In [7]:
# Batch iterators for training and validation (32 examples per batch).
# NOTE(review): both iterators are built over `imdb_train`, so "validation"
# here re-reads the training split; `imdb_test` is not used. Confirm whether
# a held-out split was intended.
train_iterator = Iterator(dataset=imdb_train, batch_size=32)
valid_iterator = Iterator(dataset=imdb_train, batch_size=32)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Names of the torch.nn recurrent layers that Encoder accepts.
RNNS = ['LSTM', 'GRU']


class Encoder(nn.Module):
    """Thin wrapper around a ``torch.nn`` recurrent layer chosen by name.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Size of the recurrent hidden state.
        nlayers: Number of stacked recurrent layers.
        dropout: Dropout value forwarded to the recurrent layer.
        bidirectional: Whether the recurrent layer runs in both directions.
        rnn_type: Name of the layer class; must be one of ``RNNS``.

    Raises:
        AssertionError: If ``rnn_type`` is not listed in ``RNNS``.
    """

    def __init__(self, embedding_dim, hidden_dim, nlayers=1, dropout=0.,
                 bidirectional=True, rnn_type='GRU'):
        super().__init__()
        self.bidirectional = bidirectional

        assert rnn_type in RNNS, 'Use one of the following: {}'.format(str(RNNS))

        # Look the constructor up on torch.nn by name instead of branching.
        rnn_constructor = getattr(nn, rnn_type)
        self.rnn = rnn_constructor(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=nlayers,
            dropout=dropout,
            bidirectional=bidirectional,
        )

    def forward(self, input, hidden=None):
        """Run the recurrent layer and return its ``(outputs, state)`` pair."""
        outputs, state = self.rnn(input, hidden)
        return outputs, state


class Attention(nn.Module):
    """Scaled dot-product attention of one query per batch item over a sequence.

    ``key_dim`` and ``value_dim`` are accepted but not used; only
    ``query_dim`` determines the scaling factor ``1 / sqrt(query_dim)``.
    The dot product requires the query and key feature sizes to match.
    """

    def __init__(self, query_dim, key_dim, value_dim):
        super().__init__()
        self.scale = 1. / math.sqrt(query_dim)

    def forward(self, query, keys, values):
        """Attend over ``keys``/``values`` with ``query``.

        Args:
            query: Tensor of shape ``[B, Q]``.
            keys: Tensor of shape ``[T, B, K]`` with ``K == Q``.
            values: Tensor of shape ``[T, B, V]``.

        Returns:
            Tuple ``(weights, context)`` where ``weights`` has shape
            ``[B, 1, T]`` (softmax over the ``T`` axis) and ``context`` has
            shape ``[B, V]`` (the weighted sum of ``values``).
        """
        query_b1q = query.unsqueeze(1)           # [B, Q]    -> [B, 1, Q]
        keys_bkt = keys.permute(1, 2, 0)         # [T, B, K] -> [B, K, T]

        # [B, 1, Q] x [B, K, T] -> [B, 1, T], scaled before normalisation.
        scores = torch.bmm(query_b1q, keys_bkt) * self.scale
        weights = F.softmax(scores, dim=2)

        values_btv = values.transpose(0, 1)      # [T, B, V] -> [B, T, V]
        # [B, 1, T] x [B, T, V] -> [B, 1, V] -> [B, V]
        context = torch.bmm(weights, values_btv).squeeze(1)
        return weights, context

class AttentionRNN(nn.Module):
    """Embedding -> recurrent encoder -> attention -> linear classifier.

    Args:
        cfg: Configuration object exposing ``vocab_size``, ``embed_dim``,
            ``hidden_dim``, ``nlayers``, ``dropout``, ``bidirectional``,
            ``rnn_type`` and ``num_classes`` as attributes.
    """

    def __init__(self, cfg):
        super().__init__()
        self.config = cfg

        self.embedding = nn.Embedding(cfg.vocab_size, cfg.embed_dim)
        self.encoder = Encoder(cfg.embed_dim, cfg.hidden_dim, cfg.nlayers,
                               cfg.dropout, cfg.bidirectional, cfg.rnn_type)

        # A bidirectional encoder doubles the feature size seen by attention.
        attention_dim = 2 * cfg.hidden_dim if cfg.bidirectional else cfg.hidden_dim
        self.attention = Attention(attention_dim, attention_dim, attention_dim)
        self.decoder = nn.Linear(attention_dim, cfg.num_classes)

        total_params = sum(param.nelement() for param in self.parameters())
        print('Total param size: {}'.format(total_params))

    def forward(self, input):
        """Classify a batch of token-index sequences.

        Args:
            input: Index tensor fed to the embedding layer.
                # assumes time-major layout [T, B], as the encoder is not
                # built with batch_first — TODO confirm against the caller.

        Returns:
            dict with keys ``'pred'`` (output of the linear decoder) and
            ``'attention_weights'`` (weights returned by the attention module).
        """
        embedded = self.embedding(input)
        outputs, state = self.encoder(embedded)

        if isinstance(state, tuple):
            # LSTM returns a (hidden, cell) pair; the cell state is used here.
            state = state[1]

        if self.encoder.bidirectional:
            # Join the last two entries of the state along the feature axis.
            query = torch.cat([state[-1], state[-2]], dim=1)
        else:
            query = state[-1]

        weights, context = self.attention(query, outputs, outputs)
        logits = self.decoder(context)
        return {
            'pred': logits,
            'attention_weights': weights,
        }

In [9]:
class MyTorchModel(AbstractSupervisedModel):
    """Adapter exposing a torch module through the supervised-model interface.

    Args:
        model_class: Callable that builds the torch module from ``config``.
        config: Configuration object; ``lr``, ``clip`` and ``num_classes``
            are read from it by this class.
        criterion: Loss callable invoked as ``criterion(logits, targets)``.
        optimizer: Optimizer factory invoked as
            ``optimizer(parameters, learning_rate)``.
    """

    def __init__(self, model_class, config, criterion, optimizer):
        self.model_class = model_class
        self.config = config
        self.criterion = criterion
        # Keep the factory (not just the instance) so reset() can build a
        # fresh optimizer bound to the fresh model's parameters.
        self._optimizer_factory = optimizer
        self._model = model_class(config)
        self.optimizer = optimizer(self.model.parameters(), config.lr)

    @property
    def model(self):
        """The wrapped torch module."""
        return self._model

    def __call__(self, x):
        """Run the wrapped module's forward pass on ``x``."""
        return self._model(x)

    def _compute_loss(self, logits, y):
        """Apply the criterion to flattened logits and flattened targets."""
        # reshape(-1) rather than squeeze(): squeeze() turns a batch of one
        # into a 0-d tensor, which no longer lines up with [1, C] logits.
        return self.criterion(logits.view(-1, self.config.num_classes), y.reshape(-1))

    def fit(self, X, y, **kwargs):
        """Perform one optimisation step on a single batch.

        Puts the module in training mode, computes the loss, back-propagates,
        clips the gradient norm to ``config.clip`` and steps the optimizer.

        Returns:
            The module's output dict with an added ``'loss'`` entry.
        """
        self.model.train()
        self.model.zero_grad()

        return_dict = self(X)
        return_dict['loss'] = self._compute_loss(return_dict['pred'], y)

        return_dict['loss'].backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.clip)
        self.optimizer.step()
        return return_dict

    def predict(self, X, **kwargs):
        """Run the module in eval mode without gradients; return its output dict."""
        self.model.eval()
        with torch.no_grad():
            return self(X)

    def evaluate(self, X, y, **kwargs):
        """Like ``predict`` but also stores the batch loss under ``'loss'``."""
        self.model.eval()
        with torch.no_grad():
            return_dict = self(X)
            return_dict['loss'] = self._compute_loss(return_dict['pred'], y)
            return return_dict

    def reset(self, **kwargs):
        """Discard the trained module and start over from a fresh one.

        The optimizer is rebuilt as well: the previous instance holds
        references to the old module's parameters and would never update
        the new ones.
        """
        self._model = self.model_class(self.config)
        self.optimizer = self._optimizer_factory(self._model.parameters(), self.config.lr)

In [10]:
class TorchTrainer(AbstractTrainer):
    """Trainer that feeds iterator batches to a torch-backed model.

    Args:
        num_epochs: Number of passes over the training iterator.
        valid_iterator: Optional iterator of ``(batch_x, batch_y)`` pairs
            evaluated after every epoch; validation is skipped when ``None``.
    """

    def __init__(self, num_epochs, valid_iterator=None):
        self.epochs = num_epochs
        self.valid_iterator = valid_iterator

    @staticmethod
    def _to_tensors(batch_x, batch_y, feature_transformer, label_transform_fun):
        """Convert one raw batch into the ``(X, y)`` tensors the model expects."""
        X = torch.from_numpy(
            feature_transformer.transform(batch_x).swapaxes(0, 1)  # swap batch_size and T
        )
        y = torch.from_numpy(label_transform_fun(batch_y))
        return X, y

    def _validate(self, model, feature_transformer, label_transform_fun):
        """Return the mean loss over the validation iterator, or ``None``.

        ``None`` is returned when no validation iterator was supplied or
        when it yields no batches.
        """
        if self.valid_iterator is None:
            return None

        total_loss = 0.0
        num_batches = 0
        for batch_x, batch_y in self.valid_iterator:
            # Same preprocessing as in training so the model sees identically
            # laid-out tensors.
            X, y = self._to_tensors(batch_x, batch_y,
                                    feature_transformer, label_transform_fun)
            total_loss += float(model.evaluate(X, y)['loss'])
            num_batches += 1

        if num_batches == 0:
            return None
        return total_loss / num_batches

    def train(self,
              model: AbstractSupervisedModel,
              iterator: Iterator,
              feature_transformer,
              label_transform_fun,
              **kwargs):
        """Train ``model`` for ``self.epochs`` epochs.

        Each epoch calls ``model.fit`` on every batch from ``iterator`` while
        printing per-batch progress, then evaluates on the validation
        iterator (if any) and prints the mean validation loss.

        Args:
            model: Model whose ``fit``/``evaluate`` methods are called per batch.
            iterator: Training iterator yielding ``(batch_x, batch_y)`` pairs.
            feature_transformer: Object whose ``transform(batch_x)`` returns a
                numpy array.
            label_transform_fun: Callable mapping ``batch_y`` to a numpy array.
            **kwargs: Accepted for interface compatibility; unused.
        """
        for epoch in range(self.epochs):
            for batch_num, (batch_x, batch_y) in enumerate(iterator):
                t = time.time()
                X, y = self._to_tensors(batch_x, batch_y,
                                        feature_transformer, label_transform_fun)
                return_dict = model.fit(X, y)

                print("[Batch]: {}/{} in {:.5f} seconds, loss={:.5f}".format(
                       batch_num, len(iterator), time.time() - t, return_dict['loss']),
                       end='\r', flush=True)

            valid_loss = self._validate(model, feature_transformer, label_transform_fun)
            if valid_loss is not None:
                # Leading newline: the batch progress line ends with '\r'.
                print("\n[Epoch]: {}/{} validation loss={:.5f}".format(
                    epoch + 1, self.epochs, valid_loss))

In [11]:
class _MissingConfigKey(AttributeError, KeyError):
    """Raised when a Config attribute lookup finds no matching key.

    It is an ``AttributeError`` so that ``hasattr``, ``getattr`` with a
    default, ``copy`` and ``pickle`` behave normally, and a ``KeyError`` so
    that existing ``except KeyError`` handlers keep working.
    """


class Config(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``cfg.name`` is equivalent to ``cfg['name']`` and ``cfg.name = value``
    to ``cfg['name'] = value``.
    """

    def __getattr__(self, key):
        # Only called when normal attribute lookup fails, so dict methods
        # and other real attributes still take precedence.
        try:
            return self[key]
        except KeyError:
            raise _MissingConfigKey(key) from None

    def __setattr__(self, key, value):
        self[key] = value

In [12]:
# Loss function and label vocabulary (its size gives the number of classes).
criterion = nn.CrossEntropyLoss()
label_vocab = fields['label'].vocab
# Hyper-parameters gathered in a plain dict (ugly, but enough for a check).
# NOTE(review): 'cuda' and 'batch_size' are not read by any code in the
# visible cells; the iterators hard-code batch_size=32 — confirm intent.
config_dict = {
    'rnn_type': 'LSTM',
    'embed_dim': 300,
    'hidden_dim': 300,
    'nlayers': 1,
    'lr': 1e-3,
    'clip': 5,
    'epochs': 5,
    'batch_size': 32,
    'dropout': 0.,
    'bidirectional': True,
    'cuda': False,
    'vocab_size': len(vocab),
    'num_classes': len(label_vocab)

}

# Wrap the dict so values can be read as attributes (config.lr, ...).
config = Config(config_dict)
# Disabled leftovers from manual model/optimizer construction; the model is
# now built by the Experiment from `model_kwargs`.
#model = TorchModel(AttentionRNN, config, criterion, optimizer=torch.optim.Adam)

#optimizer = optimizer = torch.optim.Adam(model.parameters(), config.lr, amsgrad=True)
# The trainer keeps a reference to the `valid_iterator` object bound at this point.
trainer = TorchTrainer(config.epochs, valid_iterator)


In [15]:
from functools import partial
# The experiment receives a callable that builds the training iterator, so
# only the iterator options are bound here.
train_iterator = partial(Iterator, batch_size=32, shuffle=True)
# NOTE(review): this rebinds the name `valid_iterator` only; `trainer` was
# constructed in an earlier cell and still holds the iterator created before.
# This one is also built over `imdb_train` — confirm the intended split.
valid_iterator = Iterator(dataset=imdb_train, batch_size=32, shuffle=True)

# The model is constructed from MyTorchModel using `model_kwargs`.
experiment = Experiment(MyTorchModel, trainer=trainer, 
                        training_iterator_callable=train_iterator)
experiment.fit(
    imdb_train,
    model_kwargs={
        'model_class': AttentionRNN, 
        'config': config, 
        'criterion': criterion,
        'optimizer': torch.optim.Adam
    },
)

Total param size: 4446002
[Batch]: 39/782 in 0.75744 seconds, loss=0.63336

KeyboardInterrupt: 