# Sentiment analysis using PyTorch

In this example, we'll use PyTorch to implement an LSTM-based sentiment analysis model over the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/). The model takes as input a text sequence that represents a movie review and outputs a binary prediction of whether the review is positive or negative.

_This example is partially based on_ [https://github.com/bentrevett/pytorch-sentiment-analysis](https://github.com/bentrevett/pytorch-sentiment-analysis)

Let's start with the imports and the configuration. We'll use the `torchtext` package to load and tokenize the dataset:

In [1]:
import torch
import torchtext

# Size of each word-embedding vector; it must match the 100-dimensional
# GloVe vectors that are later copied into the model's embedding layer.
EMBEDDING_SIZE = 100
# Number of features in the LSTM hidden state.
HIDDEN_SIZE = 256

Next, let's load the dataset, which is embedded in `torchtext`:

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Review text: SpaCy-tokenized, lower-cased, and batched together with the
# length of every review
TEXT = torchtext.data.Field(tokenize='spacy', lower=True, include_lengths=True)

# Review label, stored as a float
LABEL = torchtext.data.LabelField(dtype=torch.float)

# IMDB train/test splits
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# Both vocabularies are built from the training split only; the text
# vocabulary additionally carries the 100-dimensional GloVe 6B vectors
glove_vectors = torchtext.vocab.GloVe(name='6B', dim=100)
TEXT.build_vocab(train, vectors=glove_vectors)
LABEL.build_vocab(train)

# Batch iterators over both splits (64 examples per batch, sorted within
# each batch, tensors placed on `device`)
train_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, test),
    batch_size=64,
    sort_within_batch=True,
    device=device,
)

Next, we'll implement the `LSTMModel` class, which uses `torch.nn.LSTM` at its core. `LSTMModel` combines the LSTM with an embedding input layer and a fully connected output layer:

In [3]:
class LSTMModel(torch.nn.Module):
    """Embedding -> LSTM -> linear classifier over variable-length token sequences.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_size: size of each embedding vector (the LSTM input size).
        hidden_size: number of features in the LSTM hidden state.
        output_size: number of outputs of the final linear layer.
        pad_idx: index of the padding token, passed to the embedding layer
            as `padding_idx`.
    """

    def __init__(self, vocab_size: int, embedding_size: int, hidden_size: int,
                 output_size: int, pad_idx: int):
        super().__init__()

        # Embedding lookup table
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embedding_size,
                                            padding_idx=pad_idx)

        # Single-layer, unidirectional LSTM (sequence-first layout)
        self.rnn = torch.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size)

        # Fully connected output
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, text_sequence: torch.Tensor, text_lengths) -> torch.Tensor:
        """Map a padded batch of token indices to one output vector per sequence.

        Args:
            text_sequence: integer tensor of token indices laid out as
                (sequence length, batch), padded up to the longest sequence.
            text_lengths: the true (unpadded) length of every sequence in the
                batch, as a 1-D tensor or a list. The batch does not have to
                be sorted by length.

        Returns:
            Tensor of shape (1, batch, output_size): the linear layer applied
            to the final LSTM hidden state of every sequence.
        """
        # Extract embedding vectors
        embeddings = self.embedding(text_sequence)

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # batch itself lives on the GPU
        lengths = torch.as_tensor(text_lengths).cpu()

        # Pack the padded batch so that the LSTM skips the padding positions.
        # enforce_sorted=False lifts the requirement that the batch is sorted
        # by decreasing length; the hidden state is returned in batch order.
        packed_sequence = torch.nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, enforce_sorted=False)

        # Only the final hidden state is used for the classification
        _, (hidden, _) = self.rnn(packed_sequence)

        return self.fc(hidden)

Next, we'll instantiate the model and initialize its embedding weights with the pretrained GloVe vectors:

In [4]:
# Vocabulary indices of the two special tokens
pad_index = TEXT.vocab.stoi[TEXT.pad_token]
unk_index = TEXT.vocab.stoi[TEXT.unk_token]

# Classifier with a single output over the whole text vocabulary
model = LSTMModel(
    vocab_size=len(TEXT.vocab),
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    output_size=1,
    pad_idx=pad_index,
)

# Start from the pretrained vectors attached to the vocabulary
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(TEXT.vocab.vectors)

# The unknown and the padding token both start from an all-zero vector
for special_index in (unk_index, pad_index):
    embedding_weights[special_index] = torch.zeros(EMBEDDING_SIZE)

Then, we'll implement the training procedure, which is generic and works for feed-forward networks as well:

In [5]:
def train_model(model, loss_function, optimizer, data_loader):
    """Train `model` for one pass over `data_loader` and print the results.

    Args:
        model: module called as `model(text, text_lengths)`; its output must
            hold one value per example of the batch.
        loss_function: callable `loss_function(outputs, labels)` returning a
            scalar tensor. The aggregate below weights it by the batch size,
            i.e. it assumes a per-batch mean -- confirm for custom losses.
        optimizer: optimizer over the parameters of `model`.
        data_loader: iterable of batches exposing `.text` (a
            `(tokens, lengths)` pair) and `.label`, with a sized `.dataset`
            attribute.

    Returns:
        Tuple `(total_loss, total_acc)`: the loss averaged over the dataset
        (float) and the accuracy over the dataset (0-dim tensor), matching
        what `test_model` returns.
    """
    # set model to training mode
    model.train()

    current_loss = 0.0
    current_acc = 0

    # iterate over the training data
    for batch in data_loader:
        # zero the parameter gradients
        optimizer.zero_grad()

        text, text_lengths = batch.text

        with torch.set_grad_enabled(True):
            # forward; reshape to the label shape instead of squeeze() so
            # that a single-example batch keeps its batch dimension
            outputs = model(text, text_lengths).reshape(batch.label.shape)
            loss = loss_function(outputs, batch.label)

            # backward
            loss.backward()
            optimizer.step()

        # statistics
        current_loss += loss.item() * text_lengths.shape[0]
        predictions = torch.round(torch.sigmoid(outputs))
        current_acc += torch.sum(predictions == batch.label)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Train Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))

    return total_loss, total_acc

We'll continue with the testing procedure, which is also generic:

In [6]:
def test_model(model, loss_function, data_loader):
    # set model in evaluation mode
    model.eval()

    current_loss = 0.0
    current_acc = 0

    # iterate over  the validation data
    for i, batch in enumerate(data_loader):
        text, text_lengths = batch.text

        # forward
        with torch.set_grad_enabled(False):
            outputs = model(text, text_lengths).squeeze()
            loss = loss_function(outputs, batch.label)

        # statistics
        current_loss += loss.item() * text_lengths.shape[0]
        current_acc += torch.sum(torch.round(torch.sigmoid(outputs)).round() == batch.label.data)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Test Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))

    return total_loss, total_acc

Finally, we'll instantiate the training components and train the model for 5 epochs:

In [7]:
# Number of passes over the training data
NUM_EPOCHS = 5

# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends, so that the optimizer is constructed
# over the parameters in their final location
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.BCEWithLogitsLoss().to(device)

# Every epoch trains on the training split, then evaluates on the test split
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    train_model(model, loss_function, optimizer, train_iter)
    test_model(model, loss_function, test_iter)

Epoch 1/5
Train Loss: 0.6390; Accuracy: 0.6390
Test Loss: 0.6100; Accuracy: 0.6719
Epoch 2/5
Train Loss: 0.4262; Accuracy: 0.8076
Test Loss: 0.3940; Accuracy: 0.8248
Epoch 3/5
Train Loss: 0.2123; Accuracy: 0.9215
Test Loss: 0.3123; Accuracy: 0.8758
Epoch 4/5
Train Loss: 0.0855; Accuracy: 0.9736
Test Loss: 0.4015; Accuracy: 0.8571
Epoch 5/5
Train Loss: 0.0334; Accuracy: 0.9916
Test Loss: 0.5525; Accuracy: 0.8531
