In this example, we'll use PyTorch to implement a LSTM-based sentiment analysis over the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/). The model will take as input a text sequence, which represents a movie review and will output a binary result of whether the review is positive or negative.

_This example is partially based on_ [https://github.com/bentrevett/pytorch-sentiment-analysis](https://github.com/bentrevett/pytorch-sentiment-analysis)


In [1]:
!pip install torch
!pip install torchtext
!pip install portalocker

Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0


Define the device (GPU by default with a fallback on CPU):

In [2]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Start the training and testing dataset pipeline. First, define the
basic_english tokenizer, which splits the text on spaces (that is, word
tokenization):

In [3]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

 Use the tokenizer to build the token vocabulary:

In [4]:
from torchtext.datasets import IMDB

from torchtext.vocab import build_vocab_from_iterator


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocabulary = build_vocab_from_iterator(
    yield_tokens(IMDB(split='train')),
    specials=["<unk>"])
vocabulary.set_default_index(vocabulary["<unk>"])

 Define the `collate_batch` function, which takes a batch of tokenized samples with varying lengths, and concatenates them in a single long sequence of tokens:

In [5]:
def collate_batch(batch):
    labels, samples, offsets = [], [], [0]
    for (_label, _sample) in batch:
        labels.append(int(_label) - 1)
        processed_text = torch.tensor(
            vocabulary(tokenizer(_sample)),
            dtype=torch.int64)
        samples.append(processed_text)
        offsets.append(processed_text.size(0))
    labels = torch.tensor(
        labels,
        dtype=torch.int64)
    offsets = torch.tensor(
        offsets[:-1]).cumsum(dim=0)
    samples = torch.cat(samples)

    return labels, samples, offsets

Define the LSTM model:

In [6]:
class LSTMModel(torch.nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 num_classes):
        super().__init__()

        # Embedding field
        self.embedding = torch.nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size)

        # LSTM cell
        self.rnn = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size)

        # Fully connected output
        self.fc = torch.nn.Linear(
            hidden_size, num_classes)

    def forward(self, text_sequence, offsets):
        # Extract embedding vectors
        embeddings = self.embedding(
            text_sequence, offsets)

        h_t, c_t = self.rnn(embeddings)

        return self.fc(h_t)

Define `train_model(model, cost_function, optimizer,
data_loader)` and `test_model(model, cost_function,
data_loader)` functions:

In [7]:
def train_model(model, cost_function, optimizer, data_loader):
    # send the model to the GPU
    model.to(device)

    # set model to training mode
    model.train()

    current_loss = 0.0
    current_acc = 0

    # iterate over the training data
    for i, (labels, inputs, offsets) in enumerate(data_loader):
        # send the input/labels to the GPU
        inputs = inputs.to(device)
        labels = labels.to(device)
        offsets = offsets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            # forward
            outputs = model(inputs, offsets)
            _, predictions = torch.max(outputs, 1)
            loss = cost_function(outputs, labels)

            # backward
            loss.backward()
            optimizer.step()

        # statistics
        current_loss += loss.item() * labels.size(0)
        current_acc += torch.sum(predictions == labels.data)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Train Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))


def test_model(model, cost_function, data_loader):
    # send the model to the GPU
    model.to(device)

    # set model in evaluation mode
    model.eval()

    current_loss = 0.0
    current_acc = 0

    # iterate over  the validation data
    for i, (labels, inputs, offsets) in enumerate(data_loader):
        # send the input/labels to the GPU
        inputs = inputs.to(device)
        labels = labels.to(device)
        offsets = offsets.to(device)

        # forward
        with torch.set_grad_enabled(False):
            outputs = model(inputs, offsets)
            _, predictions = torch.max(outputs, 1)
            loss = cost_function(outputs, labels)

        # statistics
        current_loss += loss.item() * labels.size(0)
        current_acc += torch.sum(predictions == labels.data)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Test Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))

    return total_loss, total_acc

Proceed with the experiment. Instantiate the LSTM model, the cross-entropy cost function, and the Adam optimizer:

In [8]:
model = LSTMModel(
    vocab_size=len(vocabulary),
    embedding_size=64,
    hidden_size=64,
    num_classes=2)

cost_fn = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters())

Define `train_dataloader`, `test_dataloader`, and their respective
datasets (use mini-batch size of 64):

In [9]:
from torchtext.data.functional import to_map_style_dataset

train_iter, test_iter = IMDB()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset, batch_size=64,
    shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(
    test_dataset, batch_size=64,
    shuffle=True, collate_fn=collate_batch)

Run the training for 5 epochs:

In [10]:
for epoch in range(5):
    print(f'Epoch: {epoch + 1}')
    train_model(model, cost_fn, optim, train_dataloader)
    test_model(model, cost_fn, test_dataloader)

Epoch: 1
Train Loss: 0.5504; Accuracy: 0.7045
Test Loss: 0.4375; Accuracy: 0.7950
Epoch: 2
Train Loss: 0.3090; Accuracy: 0.8740
Test Loss: 0.3390; Accuracy: 0.8559
Epoch: 3
Train Loss: 0.2240; Accuracy: 0.9135
Test Loss: 0.3137; Accuracy: 0.8717
Epoch: 4
Train Loss: 0.1692; Accuracy: 0.9393
Test Loss: 0.3277; Accuracy: 0.8699
Epoch: 5
Train Loss: 0.1245; Accuracy: 0.9586
Test Loss: 0.3427; Accuracy: 0.8717
