In [1]:
import math
import time
import takepod
import pickle

import torch
import torch.nn as nn
import numpy as np

from takepod.datasets import Iterator, BasicSupervisedImdbDataset
from takepod.storage import LabelField, Field, Vocab
from takepod.storage.vectorizers.impl import GloVe
from takepod.models import Experiment
from takepod.pipeline import Pipeline

from takepod.models.impl.pytorch import TorchTrainer, TorchModel, AttentionRNN

## Dataset loading & preprocessing

When using `podium` you have three options for data loading:
1. Use one of our built-in datasets
2. Use a flexible data loader from a predefined format (`TabularDataset` loads from `.csv`, `.tsv` files)
3. Write your own data loader for a dataset in a custom format

### IMDB sentiment classification

![Imdb logo](img/imdb_logo_small.png)

For this walkthrough, we will use the [IMDB sentiment classification dataset](https://ai.stanford.edu/~amaas/data/sentiment/). This dataset is built-in, so let's check what exactly that means.

- Each built-in dataset has a static method `get_dataset_splits` which downloads and caches the splits for that dataset and returns them as a tuple (train, valid?, test).
  - Note: the IMDB dataset only has a train and test split
- We will first load the IMDB dataset with default `Fields` (preprocessing pipelines) and check whether we might want to modify something.
- You can inspect the default fields by calling the `get_default_fields` static method of the dataset

In [2]:
# Load the built-in IMDB dataset with its default fields; IMDB only ships a train and a test split.
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_dataset_splits()

In [9]:
# Pull out the first training example and unpack its two fields.
first_instance = imdb_train[0]
text = first_instance.text
label = first_instance.label

# Note that the text is cased
# A single print call with a newline separator emits the same lines as one print per item.
print(
    "Data in a single dataset instance:",
    "="*50,
    text,
    label,
    "="*50,
    sep="\n",
)

def get_text_statistics(dataset):
    """Print length statistics for the text of every example in `dataset`.

    The length of an example is `len(example.text[1])`, i.e. the size of the
    second element of its `text` attribute (the token list in the instance
    printed by this notebook).

    Parameters
    ----------
    dataset : iterable
        examples exposing a `text` attribute that can be indexed at position 1

    Returns
    -------
    None
        the statistics (min, max, mean, standard deviation) are only printed
    """
    lengths = []
    for example in dataset:
        lengths.append(len(example.text[1]))

    shortest, longest = min(lengths), max(lengths)
    mean_length, std_length = np.mean(lengths), np.std(lengths)
    print(f"Input text length interval [{shortest}, {longest}] \n"
          f"Average length {mean_length} +- {std_length}")
# Print the text length statistics of the training split.
get_text_statistics(imdb_train)

Data in a single dataset instance:
(None, ['i', 'am', 'and', 'was', 'very', 'entertained', 'by', 'the', 'movie', '.', 'it', 'was', 'my', 'all', 'time', 'favorite', 'movie', 'of', '1976', '.', 'being', 'raised', 'in', 'the', '70', "'s", ',', 'i', 'was', 'so', 'in', 'love', 'with', 'kris', 'kristoffersons', 'look', 'and', 'demeanor', ',', 'of', 'course', 'i', 'am', 'no', 'movie', 'critic', ',', 'but', 'for', 'the', 'time', 'era', ',', 'i', 'think', 'it', 'was', 'very', 'good', '.', 'i', 'very', 'much', 'like', 'the', 'combo', 'of', 'streisand', 'and', 'kristofferson', '.', 'i', 'thought', 'they', 'worked', 'very', 'well', 'together', '.', 'i', 'have', 'seen', 'the', 'movie', 'many', 'times', 'and', 'still', 'love', 'the', 'two', 'of', 'them', 'as', 'esther', 'and', 'john', 'norman', '.', 'i', 'am', 'a', 'very', 'huge', 'fan', 'of', 'kris', 'and', 'see', 'him', 'in', 'concert', 'when', 'i', 'can', '.', 'what', 'a', 'talented', 'singer', 'song', 'writer', ',', 'not', 'to', 'mention', ',', 

### Using hooks during data preprocessing
The average length of instances in the dataset is large, while the longest instance has 2789 tokens. 
Instances of this length are likely to cause memory issues when batched and transferred to GPU, so we would like to limit this. We might also want only lowercase data in our instances.

We can implement this ourselves easily by adding `hooks` to our fields. Hooks are methods with a standardized signature which view and modify the data flowing through the preprocessing pipeline at **two** points.
1. **Pre-tokenization hooks**:
  - pre-tokenization hooks work on raw data (the loaded input string). You might want to lowercase data during pre-tokenization, but keep in mind that most tokenizers (such as `spacy`) are sensitive to casing and might produce bad results. Since we use `spacy` as the `IMDB` tokenizer, this is not a good choice and we might want to delegate lowercasing to post-tokenization.
2. **Post-tokenization hooks**:
  - post-tokenization hooks work on raw **and** tokenized data. Here you might want to limit the length of your instances to a fixed amount or filter out stop-words.

In [7]:
## The signature of post-tokenization hooks has *two* arguments: raw and tokenized data

def lowercase(raw, tokenized):
    """Post-tokenization hook that lowercases every token.

    Parameters
    ----------
    raw : str
        the untokenized input data
    tokenized : list(str)
        list of tokens.
    Returns
    -------
    raw : str
        the input, passed through untouched
    tokenized : list(str)
        a new list holding the lowercased tokens
    """
    lowered = []
    for token in tokenized:
        lowered.append(token.lower())
    return raw, lowered

def max_length(raw, data, length=200):
    """Truncates tokenized data to at most `length` tokens, as a post-tokenization hook

    Parameters
    ----------
    raw : str
        the untokenized input data
    data : list(str)
        list of tokens.
    length : int
        maximum number of tokens kept for each instance; must not be
        negative. `None` keeps every token.
    Returns
    -------
    raw : str
        unmodified input
    data : list(str)
        tokenized data truncated to `length`
    Raises
    ------
    ValueError
        if `length` is negative
    """
    # A negative bound would make the slice drop tokens from the *end* of the
    # instance instead of capping its length, so reject it explicitly.
    if length is not None and length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    return raw, data[:length]


def create_fields():
    """Build the `text` and `label` fields used to load the dataset.

    The text field is tokenized with spacy, gets a vocabulary capped at
    10000 entries with a minimum frequency of 5, and runs two
    post-tokenization hooks in order: `lowercase`, then `max_length`.
    The label field gets a vocabulary without special tokens.

    Returns
    -------
    dict
        maps each field's name to the field itself
    """
    # Vocabulary for the text field: at most 10000 entries, minimum frequency 5
    text_vocab = Vocab(max_size=10000, min_freq=5)
    text_field = Field(name='text', vocab=text_vocab, tokenizer='spacy', store_as_raw=False)

    # Register the post-tokenization hooks on the text field:
    # lowercase first, then truncate to the maximum length.
    for hook in (lowercase, max_length):
        text_field.add_posttokenize_hook(hook)

    label_field = LabelField(name='label', vocab=Vocab(specials=()))
    return {field.name: field for field in (text_field, label_field)}

In [8]:
# Build the customised fields (text field with the lowercasing and truncation hooks).
fields = create_fields()
# TODO: remove Dataset, Basic, Supervised from IMDB name
# Reload the splits with these fields; this rebinds imdb_train / imdb_test from the earlier cell.
imdb_train, imdb_test = BasicSupervisedImdbDataset.get_dataset_splits(fields)

In [12]:
# Re-run the statistics on the reloaded training split to see the effect of the truncation hook.
get_text_statistics(imdb_train)

Input text length interval [11, 200] 
Average length 170.3254 +- 41.14426515129417


## Load pretrained embeddings for the dataset

In [4]:
# Load GloVe embeddings
# `vocab` is the text field's vocabulary; it is reused later to set the model's vocab_size.
vocab = fields['text'].vocab
# Load the GloVe vectors for this vocabulary (presumably one row per vocabulary entry — TODO confirm).
embeddings = GloVe().load_vocab(vocab)
print(embeddings)

[[ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.         0.         0.        ...  0.         0.         0.       ]
 [ 0.04656    0.21318   -0.0074364 ...  0.0090611 -0.20989    0.053913 ]
 ...
 [-0.24734    0.019346   0.13974   ...  0.34035    0.0824     0.38554  ]
 [ 0.67287   -0.43249    0.1106    ... -0.16644    0.21169    0.45995  ]
 [ 0.034368   0.22004    0.14626   ... -0.18641   -0.032439   0.24544  ]]


## Define & train a model

In [5]:
# Vocabulary of the label field; its size gives the number of classes.
label_vocab = fields['label'].vocab

model_config = {
    # Model-specific configuration
    'rnn_type': 'LSTM',
    'embed_dim': 300,
    'hidden_dim': 150,
    'nlayers': 1,
    'lr': 1e-3,
    'clip': 5,
    'epochs': 1,
    'batch_size': 32,
    'dropout': 0.,
    'bidirectional': True,
    'gpu': -1,
    # Task-specific configuration, derived from the loaded data
    'vocab_size': len(vocab),
    'num_classes': len(label_vocab),
    'pretrained_embedding': embeddings,
}

# Device used by the trainer and the model.
device = torch.device('cpu:0')

In [6]:
# Take the batch size from model_config so the iterator and the configuration
# cannot drift apart (it was previously hard-coded here as well).
data_iterator = Iterator(batch_size=model_config['batch_size'])

# NOTE(review): imdb_test appears to serve as validation data here (the training
# log prints a [Valid] pass) — confirm this is intended, since IMDB has no
# separate validation split.
trainer = TorchTrainer(model_config['epochs'], device, data_iterator, imdb_test)
criterion = nn.CrossEntropyLoss()

# Fit the model on the training split; every model_config entry is forwarded
# to the model as a keyword argument.
experiment = Experiment(TorchModel, trainer=trainer)
model = experiment.fit(
    imdb_train,
    model_kwargs={
        'model_class': AttentionRNN,
        'criterion': criterion,
        'optimizer': torch.optim.Adam,
        'device': device,
        **model_config
    },
)

Total parameter size: 3543002
[Batch]: 781 in 0.34126 seconds, loss=0.57051
Total time for train epoch: 464.8692126274109
[Valid]: 781 in 0.05970 seconds, loss=0.66946
Total time for valid epoch: 75.99735260009766


In [7]:
# Check serialization for _model_ only (should be for experiment as well)
# `pickle` is already imported in the notebook's first cell, so it is not re-imported here.
# NOTE(review): the recorded output of this cell is `KeyError: 'model_config'`,
# i.e. the round trip currently fails — TODO fix model (de)serialization.
fitted_model = experiment.model

model_save_file = 'model.pt'
with open(model_save_file, 'wb') as dump_file:
    pickle.dump(fitted_model, dump_file)

# Only unpickle files you trust; this one was written by the lines just above.
with open(model_save_file, 'rb') as load_file:
    loaded_model = pickle.load(load_file)

KeyError: 'model_config'

In [8]:
ft = experiment.feature_transformer


def cast_to_torch_transformer(t):
    """Transform `t` with the experiment's feature transformer and return a torch tensor.

    The transformed array has its axes 0 and 1 swapped before it is wrapped
    as a tensor and moved to `device`.
    NOTE(review): the swap presumably converts between batch-first and
    sequence-first layouts — confirm against the model's expected input.
    """
    return torch.from_numpy(ft.transform(t).swapaxes(0, 1)).to(device)


# Inference pipeline: raw instances -> fields -> feature transformer -> fitted model.
pipe = Pipeline(
    fields=list(fields.values()),
    example_format='list',
    feature_transformer=cast_to_torch_transformer,
    model=fitted_model,
)

instances = [
    ['This movie is horrible'],
    ['This movie is great!'],
]

# TODO: make IMDB labels "positive" and "negative"
for instance in instances:
    prediction = pipe.predict_raw(instance)
    # The argmax over the logits is mapped back to a label through the label vocabulary.
    print(f"For instance: {instance}, the prediction is: {fields['label'].vocab.itos[prediction.argmax()]}, with logits: {prediction}")


For instance: ['This movie is horrible'], the prediction is: 0, with logits: [-6.198414  6.626996]
For instance: ['This movie is great!'], the prediction is: 1, with logits: [ 3.8818069 -3.9210994]
