In [6]:
import math
import time
import takepod
import pickle
from takepod.datasets import BucketIterator, Iterator, BasicSupervisedImdbDataset
from takepod.storage import LabelField, Field, Vocab
from takepod.storage.vectorizers.impl import GloVe
from takepod.models import Experiment, AbstractSupervisedModel
from takepod.pipeline import Pipeline
from takepod.models import AbstractTrainer


In [26]:
def lowercase(raw, data):
    """Post-tokenization hook that lowercases every token.

    Args:
        raw: the raw value, passed through unchanged.
        data: iterable of tokens; each must provide ``.lower()``.

    Returns:
        Tuple ``(raw, tokens)`` where ``tokens`` is a new list of lowercased tokens.
    """
    lowered_tokens = []
    for token in data:
        lowered_tokens.append(token.lower())
    return raw, lowered_tokens

def max_length(raw, data, length=200):
    """Post-tokenization hook that keeps only the first ``length`` tokens.

    Args:
        raw: the raw value, passed through unchanged.
        data: sliceable sequence of tokens.
        length: maximum number of leading tokens to keep (default 200).

    Returns:
        Tuple ``(raw, truncated)`` where ``truncated`` is ``data[:length]``.
    """
    truncated = data[:length]
    return raw, truncated

def create_fields():
    """Create the text and label fields used to load the dataset.

    Returns:
        dict mapping each field's ``name`` ('text', 'label') to the field object.
    """
    # Vocabulary for the text field (max_size=10000, min_freq=5)
    text_vocab = Vocab(max_size=10000, min_freq=5)
    text = Field(name='text', vocab=text_vocab, tokenizer='spacy', store_as_raw=False)

    # Preprocessing hooks on the text field, registered in this order:
    # 1. lowercase every token, 2. truncate to the default maximum length
    for hook in (lowercase, max_length):
        text.add_posttokenize_hook(hook)

    # The label gets its own vocabulary, created without special tokens
    label = LabelField(name='label', vocab=Vocab(specials=()))

    return {field.name: field for field in (text, label)}

In [27]:
# Build the field definitions and load the IMDB data with them; the call
# returns two datasets that are unpacked as the train and test splits.
fields = create_fields()
# TODO: get_dataset_splits; remove Dataset, Basic, Supervised from IMDB name
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_train_test_dataset(fields)

In [28]:
# Load GloVe embeddings for the vocabulary of the text field.
# `vocab` and `embeddings` are reused by later cells: `len(vocab)` becomes the
# model's 'vocab_size' and `embeddings` is passed as 'pretrained_embedding'.
# NOTE(review): the first printed rows are all zeros — presumably the special
# tokens, which have no GloVe vector; confirm against the vectorizer.
vocab = fields['text'].vocab
embeddings = GloVe().load_vocab(vocab)
print(embeddings)

[[ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.04656    0.21318   -0.0074364 ...  0.0090611 -0.20989    0.053913 ]
 ...
 [-0.24734    0.019346   0.13974   ...  0.34035    0.0824     0.38554  ]
 [ 0.67287   -0.43249    0.1106    ... -0.16644    0.21169    0.45995  ]
 [ 0.034368   0.22004    0.14626   ... -0.18641   -0.032439   0.24544  ]]


In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Names of the recurrent layer types Encoder accepts; each name is looked up
# as an attribute of torch.nn to obtain the layer constructor.
RNNS = ['LSTM', 'GRU']


class Encoder(nn.Module):
    """Recurrent encoder: a thin wrapper around a torch.nn LSTM or GRU layer."""

    def __init__(self, embedding_dim, hidden_dim, nlayers=1, dropout=0.,
                 bidirectional=True, rnn_type='GRU'):
        """Build the recurrent layer.

        Args:
            embedding_dim: size of each input vector.
            hidden_dim: hidden size of the recurrent layer (per direction).
            nlayers: number of stacked recurrent layers.
            dropout: dropout value forwarded to the recurrent layer.
            bidirectional: whether the recurrent layer runs in both directions.
            rnn_type: one of the names listed in RNNS.

        Raises:
            AssertionError: if ``rnn_type`` is not listed in RNNS.
        """
        super().__init__()

        # Stored because the owning model checks it when reading the hidden state
        self.bidirectional = bidirectional
        assert rnn_type in RNNS, 'Use one of the following: {}'.format(str(RNNS))

        # Resolve the layer constructor by name from torch.nn
        rnn_constructor = getattr(nn, rnn_type)
        self.rnn = rnn_constructor(
            embedding_dim,
            hidden_dim,
            nlayers,
            dropout=dropout,
            bidirectional=bidirectional,
        )

    def forward(self, input, hidden=None):
        """Run the recurrent layer and return its result unchanged."""
        return self.rnn(input, hidden)


class Attention(nn.Module):
    """Scaled dot-product attention of one query per batch element over a sequence."""

    def __init__(self, query_dim, key_dim, value_dim):
        """Store the scaling factor.

        Only ``query_dim`` is used (scale = 1 / sqrt(query_dim)); ``key_dim`` and
        ``value_dim`` are accepted but not read.
        """
        super().__init__()
        self.scale = 1. / math.sqrt(query_dim)

    def forward(self, query, keys, values):
        """Attend over the time dimension of ``keys`` / ``values``.

        Shapes (B = batch size, T = number of time steps):
            query:  [B x Q]
            keys:   [T x B x K]; plain dot product, so K must equal Q
            values: [T x B x V]

        Returns:
            Tuple ``(weights, context)``:
            weights — [B x 1 x T], softmax-normalised over the T dimension;
            context — [B x V], the weighted sum of ``values``.
        """
        batched_query = query.unsqueeze(1)      # [B x Q] -> [B x 1 x Q]
        batched_keys = keys.permute(1, 2, 0)    # [T x B x K] -> [B x K x T]

        # [B x 1 x Q] x [B x K x T] -> [B x 1 x T], then scale and normalise
        scores = torch.bmm(batched_query, batched_keys) * self.scale
        weights = F.softmax(scores, dim=2)

        batched_values = values.transpose(0, 1)  # [T x B x V] -> [B x T x V]
        # [B x 1 x T] x [B x T x V] -> [B x 1 x V] -> [B x V]
        context = torch.bmm(weights, batched_values).squeeze(1)
        return weights, context

class AttentionRNN(nn.Module):
    """Text classifier: embedding -> recurrent encoder -> dot-product attention -> linear layer.

    The final hidden state of the encoder is used as the attention query over
    all encoder outputs; the attended vector is projected to class logits.
    """

    def __init__(self, pretrained_embedding=None, **config):
        """Build the network.

        Args:
            pretrained_embedding: optional numpy array copied into the embedding
                weights; presumably shaped [vocab_size x embed_dim] — it must be
                compatible with the embedding weight for ``copy_`` to succeed.
            **config: must contain 'vocab_size', 'embed_dim', 'hidden_dim',
                'nlayers', 'dropout', 'bidirectional', 'rnn_type' and
                'num_classes'. Extra keys are stored in ``self.config`` and ignored.
        """
        super(AttentionRNN, self).__init__()
        self.config = config
        self.embedding = nn.Embedding(config['vocab_size'], config['embed_dim'])

        # Copy the pretrained embeddings if they exist
        if pretrained_embedding is not None:
            with torch.no_grad():
                self.embedding.weight.copy_(torch.from_numpy(pretrained_embedding))

        self.encoder = Encoder(config['embed_dim'], config['hidden_dim'], config['nlayers'],
                               config['dropout'], config['bidirectional'], config['rnn_type'])
        # A bidirectional encoder emits forward and backward states side by side
        attention_dim = config['hidden_dim'] if not config['bidirectional'] else 2 * config['hidden_dim']
        self.attention = Attention(attention_dim, attention_dim, attention_dim)
        self.decoder = nn.Linear(attention_dim, config['num_classes'])

        size = sum(p.nelement() for p in self.parameters())
        print('Total parameter size: {}'.format(size))

    def forward(self, input):
        """Classify a batch of token-index sequences.

        Args:
            input: integer tensor of token indices, expected as [T x B] because
                the recurrent layer is created with PyTorch's default
                (time-major) layout.

        Returns:
            dict with 'pred' (the logits produced by the linear layer) and
            'attention_weights' (the weights returned by the attention module).
        """
        outputs, hidden = self.encoder(self.embedding(input))
        if isinstance(hidden, tuple):  # LSTM returns (h_n, c_n)
            hidden = hidden[1]  # take the cell state

        if self.encoder.bidirectional:
            # For the last layer, hidden[-2] is the forward direction and
            # hidden[-1] the backward one. `outputs` is laid out as
            # [forward | backward], so the query is concatenated in the same
            # order to keep the dot product aligned half by half.
            hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            hidden = hidden[-1]

        energy, linear_combination = self.attention(hidden, outputs, outputs)
        logits = self.decoder(linear_combination)
        return_dict = {
            'pred': logits,
            'attention_weights': energy
        }

        return return_dict

In [46]:
class MyTorchModel(AbstractSupervisedModel):
    """Adapter that exposes a torch module through the supervised-model interface.

    It owns the torch module, its optimizer and the loss criterion, and
    implements fit / predict / evaluate / reset plus pickling support.
    """

    def __init__(self, model_class, criterion, optimizer,
                 device=torch.device('cpu'), **model_config):
        """Instantiate the wrapped module and its optimizer.

        Args:
            model_class: callable building the torch module from ``**model_config``.
            criterion: loss module called as ``criterion(logits, targets)``.
            optimizer: optimizer class, called as ``optimizer(parameters, lr)``.
            device: torch device the module is moved to.
            **model_config: configuration forwarded to ``model_class``; this
                class itself reads 'lr', 'num_classes' and 'clip' from it.
        """
        self.model_class = model_class
        self.model_config = model_config
        self.device = device
        self.optimizer_class = optimizer

        self._model = model_class(**model_config).to(self.device)
        self.optimizer = optimizer(self.model.parameters(), model_config['lr'])

        self.criterion = criterion

    @property
    def model(self):
        """The wrapped torch module."""
        return self._model

    def __call__(self, X):
        """Call the forward method of the internalized model
        """
        return self.model(X)

    def _compute_loss(self, return_dict, y):
        """Compute the criterion on the 'pred' entry of ``return_dict`` against ``y``."""
        logits = return_dict['pred'].view(-1, self.model_config['num_classes'])
        targets = y.squeeze()
        if targets.dim() == 0:
            # A batch holding a single example squeezes down to a scalar;
            # restore the batch dimension so it matches the [1 x C] logits.
            targets = targets.unsqueeze(0)
        return self.criterion(logits, targets)

    def fit(self, X, y, **kwargs):
        """Run one optimisation step on the batch (X, y).

        Puts the model in training mode, computes the loss, back-propagates,
        clips the gradient norm to ``model_config['clip']`` and steps the
        optimizer.

        Returns:
            The model's output dict with an added 'loss' entry.
        """
        # Train-specific boilerplate code
        self.model.train()
        self.model.zero_grad()

        return_dict = self(X)
        loss = self._compute_loss(return_dict, y)
        return_dict['loss'] = loss

        # Optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.model_config['clip'])
        self.optimizer.step()
        return return_dict

    def predict(self, X, return_as_numpy=True, **kwargs):
        """Return the outputs of the model for inputs X.

        Puts the model in evaluation mode and runs it without gradient
        tracking. When ``return_as_numpy`` is true, the 'pred' entry of the
        returned dict is converted to a numpy array; other entries are left
        as they are.
        """
        self.model.eval()
        with torch.no_grad():
            return_dict = self(X)

            if return_as_numpy:
                # Cast the predictions to numpy;
                # .cpu() is a no-op if the model is already on cpu
                return_dict['pred'] = return_dict['pred'].cpu().numpy()

            return return_dict

    def evaluate(self, X, y, **kwargs):
        """Compute the loss on (X, y) without updating the model.

        Puts the model in evaluation mode and disables gradient tracking.

        Returns:
            The model's output dict with an added 'loss' entry.
        """
        self.model.eval()
        with torch.no_grad():
            return_dict = self(X)
            return_dict['loss'] = self._compute_loss(return_dict, y)
            return return_dict

    def reset(self, **kwargs):
        """Reset (re-initialize) the model.
        Also resets the internal state of the optimizer.
        """
        # Same construction as in __init__: the configuration is expanded into
        # keyword arguments and the device comes from self.device.
        self._model = self.model_class(**self.model_config).to(self.device)
        self.optimizer = self.optimizer_class(self.model.parameters(), self.model_config['lr'])

    def __setstate__(self, state):
        """Rebuild model, optimizer and criterion from a ``__getstate__`` dict."""
        self.model_class = state['model_class']
        # __getstate__ stores the configuration under the 'config' key
        self.model_config = state['config']
        self.device = state['device']

        # Deserialize model
        model = self.model_class(**self.model_config)
        model.load_state_dict(state['model_state'])
        self._model = model.to(self.device)

        # Deserialize optimizer (created after the model is on its device)
        self.optimizer_class = state['optimizer_class']
        self.optimizer = self.optimizer_class(self.model.parameters(), self.model_config['lr'])
        self.optimizer.load_state_dict(state['optimizer_state'])

        # Deserialize loss; the criterion is re-created with its default arguments
        loss_class = state['loss_class']
        self.criterion = loss_class()
        self.criterion.load_state_dict(state['loss_state'])

    def __getstate__(self):
        """Return a picklable dict: classes, configuration, device and state dicts."""
        state = {
            'model_class': self.model_class,
            'config': self.model_config,
            'model_state': self.model.state_dict(),
            'optimizer_class': self.optimizer_class,
            'optimizer_state': self.optimizer.state_dict(),
            'loss_class': self.criterion.__class__,
            'loss_state': self.criterion.state_dict(),
            'device': self.device
        }
        return state

In [31]:
class TorchTrainer(AbstractTrainer):
    """Trainer that feeds batches to a model as torch tensors, epoch by epoch."""

    def __init__(self, num_epochs, device, iterator, valid_data=None):
        """Store the training setup.

        Args:
            num_epochs: number of passes over the training dataset.
            device: torch device the batch tensors are moved to.
            iterator: callable that, given a dataset, yields (batch_x, batch_y) pairs.
            valid_data: optional dataset evaluated after every training epoch.
        """
        self.epochs = num_epochs
        self.valid_data = valid_data
        self.device = device
        self.iterator = iterator

    def _to_tensors(self, batch_x, batch_y, feature_transformer, label_transform_fun):
        """Convert one batch to (X, y) tensors on the trainer's device."""
        X = torch.from_numpy(
            feature_transformer.transform(batch_x).swapaxes(0, 1)  # swap batch_size and T
        ).to(self.device)
        y = torch.from_numpy(
            label_transform_fun(batch_y)
        ).to(self.device)
        return X, y

    def _run_pass(self, step, dataset, tag, feature_transformer, label_transform_fun):
        """Apply ``step`` to every batch of ``dataset`` and print per-batch progress."""
        for batch_num, (batch_x, batch_y) in enumerate(self.iterator(dataset)):
            t = time.time()
            X, y = self._to_tensors(batch_x, batch_y, feature_transformer, label_transform_fun)

            return_dict = step(X, y)

            # '\r' keeps the progress on a single, continuously overwritten line
            print("[{}]: {} in {:.5f} seconds, loss={:.5f}".format(
                  tag, batch_num, time.time() - t, return_dict['loss']),
                  end='\r', flush=True)

    def train(self,
              model,
              dataset,
              feature_transformer,
              label_transform_fun,
              **kwargs):
        """Train ``model`` on ``dataset`` for the configured number of epochs.

        Every epoch calls ``model.fit`` on each training batch and, when
        validation data was supplied, ``model.evaluate`` on each validation batch.

        Args:
            model: object exposing ``fit(X, y)`` and ``evaluate(X, y)``, each
                returning a dict with a 'loss' entry.
            dataset: training dataset handed to the iterator.
            feature_transformer: object whose ``transform(batch_x)`` returns an
                array with the batch dimension first.
            label_transform_fun: callable turning ``batch_y`` into an array.
        """
        for _ in range(self.epochs):
            total_time = time.time()
            self._run_pass(model.fit, dataset, 'Batch',
                           feature_transformer, label_transform_fun)
            print(f"\nTotal time for train epoch: {time.time() - total_time}")

            # valid_data is optional; skip the validation pass when it is absent
            if self.valid_data is None:
                continue

            total_time = time.time()
            self._run_pass(model.evaluate, self.valid_data, 'Valid',
                           feature_transformer, label_transform_fun)
            print(f"\nTotal time for valid epoch: {time.time() - total_time}")

In [35]:
# Hyper-parameters of the model and of its optimisation.
# NOTE(review): 'gpu' is not read by any code visible in this notebook.
model_config = dict(
    rnn_type='LSTM',
    embed_dim=300,
    hidden_dim=150,
    nlayers=1,
    lr=1e-3,
    clip=5,
    epochs=1,
    batch_size=32,
    dropout=0.,
    bidirectional=True,
    gpu=-1,
)

# Values that depend on the loaded data: vocabulary sizes and the embedding matrix
label_vocab = fields['label'].vocab
model_config.update(
    vocab_size=len(vocab),
    num_classes=len(label_vocab),
    pretrained_embedding=embeddings,
)

device = torch.device('cpu:0')

Vocab[finalized: True, size: 2]


In [44]:
# Batch size comes from the shared configuration instead of a second literal
data_iterator = Iterator(batch_size=model_config['batch_size'])

# NOTE(review): the test split is passed as the trainer's validation data, so
# the "[Valid]" loss printed during training is measured on the test set.
trainer = TorchTrainer(model_config['epochs'], device, data_iterator, imdb_test)
criterion = nn.CrossEntropyLoss()

# Every entry of model_kwargs is handed to MyTorchModel; the entries it does
# not name itself end up in its **model_config.
experiment = Experiment(MyTorchModel, trainer=trainer)
model = experiment.fit(
    imdb_train,
    model_kwargs={
        'model_class': AttentionRNN,
        'criterion': criterion,
        'optimizer': torch.optim.Adam,
        'device': device,
        **model_config
    },
)

Total parameter size: 3543002
[Batch]: 781 in 0.34659 seconds, loss=0.53425
Total time for train epoch: 807.2142312526703
[Valid]: 781 in 0.03590 seconds, loss=0.71830
Total time for valid epoch: 79.69959831237793


In [47]:
# Check serialization for _model_ only (should be for experiment as well)
# NOTE(review): the recorded output of this cell is a PicklingError ("it's not
# the same object as __main__.MyTorchModel"). The execution counts show the
# MyTorchModel cell (In [46]) was run after the training cell (In [44]), so the
# fitted model is presumably an instance of an older class object — re-run
# training after (re)defining the class and confirm the error disappears.
# `pickle` is already imported in the first cell; this import is redundant.
import pickle
fitted_model = experiment.model

# Round trip: dump the fitted model to disk, then load it back
model_save_file = 'model.pt'
with open(model_save_file, 'wb') as dump_file:
    pickle.dump(fitted_model, dump_file)

# Only unpickle files you trust: pickle.load can execute arbitrary code
with open(model_save_file, 'rb') as load_file:
    loaded_model = pickle.load(load_file)

PicklingError: Can't pickle <class '__main__.MyTorchModel'>: it's not the same object as __main__.MyTorchModel

In [48]:
ft = experiment.feature_transformer


def cast_to_torch_transformer(t):
    """Transform the input with the experiment's feature transformer and return
    it as a torch tensor on `device`, with the first two axes swapped."""
    features = ft.transform(t).swapaxes(0, 1)
    return torch.from_numpy(features).to(device)


# End-to-end pipeline: raw example -> fields -> features -> fitted model
pipe = Pipeline(
    fields=list(fields.values()),
    example_format='list',
    feature_transformer=cast_to_torch_transformer,
    model=fitted_model,
)

instances = [
    ['This movie is horrible'],
    ['This movie is great!'],
]

# Make IMDB labels "positive" and "negative"
for instance in instances:
    prediction = pipe.predict_raw(instance)
    print(f"For instance: {instance}, the prediction is: {fields['label'].vocab.itos[prediction.argmax()]}, with logits: {prediction}")


For instance: ['This movie is horrible'], the prediction is: 0, with logits: [-5.358241   5.3401985]
For instance: ['This movie is great!'], the prediction is: 1, with logits: [ 3.6698027 -3.4615505]
