In [2]:
import math
import time
import takepod
import pickle

import torch
import torch.nn as nn
import numpy as np

from takepod.datasets import BasicSupervisedImdbDataset
from takepod.storage import LabelField, Field, Vocab

## Dataset loading & preprocessing

When using `podium` you have three options for data loading:
1. Use one of our built-in datasets
2. Use a flexible data loader from a predefined format (`TabularDataset` loads from `.csv`, `.tsv` files)
3. Write your own data loader for a dataset in a custom format

### IMDB sentiment classification

![Imdb logo](img/imdb_logo_small.png)

For this walkthough, we will use the [IMDB sentiment classification dataset](https://ai.stanford.edu/~amaas/data/sentiment/). This dataset is built-in, so let's check what exactly does that mean.

- Each built-in dataset has a static method `get_dataset_splits` which downloads and caches the splits for that model and returns them as a tuple (train, valid?, test).
  - Note: the IMDB dataset only has a train and test split
- We will first load the IMDB dataset with default `Fields` (preprocessing pipelines) and check whether we might want to modify something.
- You can inspect the default fields by calling the `get_default_fields` static method of the dataset

In [3]:
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_dataset_splits()

In [4]:
first_instance = imdb_train[0]
text, label = first_instance.text, first_instance.label


# Note that the text is cased
print("Data in a single dataset instance:")
print("="*50)
print(text)
print(label)
print("="*50)

def get_text_statistics(dataset):
    instance_lengths = [len(ex.text[1]) for ex in dataset]
    print(f"Input text length interval [{min(instance_lengths)}, {max(instance_lengths)}] \n" 
                             f"Average length {np.mean(instance_lengths)} +- {np.std(instance_lengths)}")
get_text_statistics(imdb_train)

Data in a single dataset instance:
(None, ['I', 'am', 'and', 'was', 'very', 'entertained', 'by', 'the', 'movie', '.', 'It', 'was', 'my', 'all', 'time', 'favorite', 'movie', 'of', '1976', '.', 'Being', 'raised', 'in', 'the', '70', "'s", ',', 'I', 'was', 'so', 'in', 'love', 'with', 'Kris', 'Kristoffersons', 'look', 'and', 'demeanor', ',', 'of', 'course', 'I', 'am', 'no', 'movie', 'critic', ',', 'but', 'for', 'the', 'time', 'era', ',', 'I', 'think', 'it', 'was', 'very', 'good', '.', 'I', 'very', 'much', 'like', 'the', 'combo', 'of', 'Streisand', 'and', 'Kristofferson', '.', 'I', 'thought', 'they', 'worked', 'very', 'well', 'together', '.', 'I', 'have', 'seen', 'the', 'movie', 'many', 'times', 'and', 'still', 'love', 'the', 'two', 'of', 'them', 'as', 'Esther', 'and', 'John', 'Norman', '.', 'I', 'am', 'a', 'very', 'huge', 'fan', 'of', 'Kris', 'and', 'see', 'him', 'in', 'concert', 'when', 'I', 'can', '.', 'What', 'a', 'talented', 'singer', 'song', 'writer', ',', 'not', 'to', 'mention', ',', 

### Using hooks during data preprocessing
The average length of instances in the dataset is large, while the longest instance has 2789 tokens. 
Instances of this length are likely to cause memory issues when batched and transferred to GPU, so we would like to limit this. We might also want only lowercase data in our instances.

TODO: add data processing graph (cf prez)

We can implement this ourselves easily by adding `hooks` to our model. Hooks are methods with a standardized signature which view and modify the data flowing through the preprocessing pipeline at **two** points.
1. **Pre-tokenization hooks**:
  - pre-tokenization hooks work on raw data (the loaded input string). You might want to lowercase data during pre-tokenization, but keep in mind that most tokenizers (such as `spacy`) are sensitive to casing and might produce bad results. Since we use `spacy` as the `IMDB` tokenizer, this is not a good choice and we might want to delegate lowercasing to post-tokenization.
2. **Post-tokenization hooks**:
  - post-tokenization hooks work on raw **and** tokenized data. Here you might want to limit the length of your instances to a fixed amount or filter out stop-words.

In [5]:
## The signature of post-tokenization hooks has *two* arguments: raw and tokenized data
def lowercase(raw, tokenized):
    """Applies lowercasing as a post-tokenization hook
    
    Parameters
    ----------
    Raw : str
        the untokenized input data
    Tokenized: list(str)
        list of tokens.
    Returns
    -------
    Raw: str 
        unmodified input
    Tokenized: list(str) 
        lowercased tokenized data
    """
    return raw, [token.lower() for token in tokenized]

def max_length(raw, data, length=200):
    """Applies lowercasing as a post-tokenization hook
    
    Parameters
    ----------
    Raw : str
        the untokenized input data
    Tokenized: list(str)
        list of tokens.
    Length: int
        maximum length for each instance 
    Returns
    -------
    Raw: str 
        unmodified input
    Tokenized: list(str) 
        tokenized data truncated to `length`
    """
    return raw, data[:length]


def create_fields():
    # Define the vocabulary
    max_vocab_size = 10000
    min_frequency = 5
    vocab = Vocab(max_size=max_vocab_size, min_freq=min_frequency)

    text = Field(name='text', vocab=vocab, tokenizer='spacy', store_as_raw=False)
    # Add preprpocessing hooks to model
    # 1. Lowercase
    text.add_posttokenize_hook(lowercase)
    # 2. Truncate to length
    text.add_posttokenize_hook(max_length)

    label = LabelField(name='label', vocab = Vocab(specials=()))
    return {text.name : text, label.name: label}

Now let's create our modified fields and load the dataset

In [6]:
fields = create_fields()
# TODO: remove Dataset, Basic, Supervised from IMDB name
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_dataset_splits(fields)

In [7]:
# Check whether the preprocessing worked
get_text_statistics(imdb_train)

Input text length interval [11, 200] 
Average length 170.3254 +- 41.14426515129417


## Load pretrained embeddings

In most use-cases, we want to use pre-trained word embeddings. With `podium`, this process is incredibly simple. If your field uses a vocabulary, it has already built an inventory of tokens for your dataset.

`Podium` offers a number of implemented `vectorizers` and a class ([BasicVectorStorage](https://github.com/mttk/takepod/blob/master/takepod/storage/vectorizers/vectorizer.py#L218)) which is able to load the standardized word2vec-style format of word embeddings from disk.

For example, we will use the [GloVe](https://nlp.stanford.edu/projects/glove/) vectors. The procedure to load these vectors has two steps:
1. Initialize the vector class, which sets all the required paths
  - Right now, the vectors are not yet loaded from disk as you usually don't want to load the full file
2. Get the vectors for a pre-defined list of words by calling `load_vocab`
  - The argument can be a `Vocab` object (which is itself an `iterable` of strings), or any sequence of strings
  
The output of the function call is a numpy matrix of word embeddings which you can then pass to your model to initialize the embedding matrix or to be used otherwise.

In [9]:
# Load GloVe embeddings
from takepod.storage.vectorizers.impl import GloVe
vocab = fields['text'].vocab
embeddings = GloVe().load_vocab(vocab)
print(f"For vocabulary of size: {len(vocab)} loaded embedding matrix of shape: {embeddings.shape}")

For vocabulary of size: 10000 loaded embedding matrix of shape: (10000, 300)


## Define & train a model

Now we need to train a concrete model on our data!

We will use a pre-defined RNN classifier with self-attention as our model.
The model, which is implemented in pytorch, needs to be wrapped in the `podium.model` interface so other convenience classes can be used. In this case, the classes you need to use are:

- `podium.model` subclass: 
  - Exposes abstract methods required to train or evaluate the model, or predict on raw data
- `podium.trainer` subclass:
  - Handles the data <-> model communication (e.g. batching, early stopping). The user only implements this class but does not explicitly use its methods.
- `podium.experiment` instance:
  - Wraps the model and its parameters to simplify multiple restarts with different choices of hyperparameters (in order to use grid search)

In [13]:
# First, we will define the hyperparameters for our model. 
# These are only used when a concrete model is trained, and can be changed between calls.
model_config = {
    'rnn_type': 'LSTM',
    'embed_dim': 300,
    'hidden_dim': 150,
    'nlayers': 1,
    'lr': 1e-3,
    'clip': 5,
    'epochs': 1,
    'batch_size': 32,
    'dropout': 0.,
    'bidirectional': True,
    'gpu': -1
}

# Task-specific metadata
label_vocab = fields['label'].vocab
model_config['num_classes'] = len(label_vocab)
model_config['vocab_size'] = len(vocab)
model_config['pretrained_embedding'] = embeddings
# Run on CPU since we don't have a GPU on this machine
device = torch.device('cpu:0')
# Define the model criterion
criterion = nn.CrossEntropyLoss()

print(model_config)

{'rnn_type': 'LSTM', 'embed_dim': 300, 'hidden_dim': 150, 'nlayers': 1, 'lr': 0.001, 'clip': 5, 'epochs': 1, 'batch_size': 32, 'dropout': 0.0, 'bidirectional': True, 'gpu': -1, 'num_classes': 2, 'vocab_size': 10000, 'pretrained_embedding': array([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.04656  ,  0.21318  , -0.0074364, ...,  0.0090611, -0.20989  ,
         0.053913 ],
       ...,
       [-0.24734  ,  0.019346 ,  0.13974  , ...,  0.34035  ,  0.0824   ,
         0.38554  ],
       [ 0.67287  , -0.43249  ,  0.1106   , ..., -0.16644  ,  0.21169  ,
         0.45995  ],
       [ 0.034368 ,  0.22004  ,  0.14626  , ..., -0.18641  , -0.032439 ,
         0.24544  ]])}


In [11]:
from takepod.datasets import Iterator
from takepod.models import Experiment

from takepod.models.impl.pytorch import TorchTrainer, TorchModel, AttentionRNN

data_iterator = Iterator(batch_size=32)

trainer = TorchTrainer(model_config['epochs'], device, data_iterator, imdb_test)
experiment = Experiment(TorchModel, trainer=trainer)

model = experiment.fit(
    imdb_train, # Data on which to fit the model
    model_kwargs={ # Arguments passed to the model constructor
        'model_class': AttentionRNN, # The wrapped concrete model
        'criterion': criterion, # The loss for the concrete model
        'optimizer': torch.optim.Adam, # Optimizer _class_
        'device': device, # The device to store the data on
        **model_config # Delegated to the concrete model
    },
)

Total parameter size: 3543002
[Batch]: 781 in 0.32565 seconds, loss=0.55139
Total time for train epoch: 397.8200333118439
[Valid]: 781 in 0.03639 seconds, loss=0.67276
Total time for valid epoch: 76.19551157951355


In [15]:
# Check serialization for _model_ only (should be for experiment as well)
import pickle
fitted_model = experiment.model

model_save_file = 'model.pt'
with open(model_save_file, 'wb') as dump_file:
    pickle.dump(fitted_model, dump_file)

with open(model_save_file, 'rb') as load_file:
    loaded_model = pickle.load(load_file)

KeyError: 'vocab_size'

## Pipeline: enable your model to process raw data

So far, we have been dealing with data wrapped in podium `Dataset` instances. This might not be the case in real-world scenarios, where you want to use a trained model to process raw data.

To simplify this, we provide a `Pipeline` class, designed to streamline raw data processing. Pipeline extends your `Experiment` class with the following functionality:
1. Obtain predictions from raw data
2. Fine-tune your model on raw data
3. Retrain your model on raw data

In [11]:
from takepod.pipeline import Pipeline

ft = experiment.feature_transformer
cast_to_torch_transformer = lambda t: torch.from_numpy(ft.transform(t).swapaxes(0,1)).to(device)

pipe = Pipeline(
  fields = list(fields.values()),
  example_format = 'list',
  feature_transformer = cast_to_torch_transformer,
  model = fitted_model
  )

instances = [
        ['This movie is horrible'], 
        ['This movie is great!']
]

for instance in instances:
    prediction = pipe.predict_raw(instance)
    print(f"For instance: {instance}, the prediction is: {fields['label'].vocab.itos[prediction.argmax()]}, with logits: {prediction}")


For instance: ['This movie is horrible'], the prediction is: negative, with logits: [-4.63021    4.4030523]
For instance: ['This movie is great!'], the prediction is: positive, with logits: [ 4.0780497 -4.1491404]
