# Project one – predicting the sentiment of IMDb movie reviews

## Preparing the movie review data

In [1]:
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

In [2]:
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')


In [3]:
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

In [4]:
train_dataset[0]

('pos',
 'An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.')

In [5]:
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized    

In [6]:
tokens_count = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    tokens_count.update(tokens)

print('vocab-size:', len(tokens_count))

vocab-size: 69023


In [9]:
tokens[0:5]

['this', 'was', 'talked', 'about', 'to']

In [17]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(tokens_count.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)

In [18]:
# convert an example input text into a list of integer values
print([vocab[token] for token in ['this', 'is', 'an', 'example']])


[11, 7, 35, 457]


In [19]:
## Step 3-A: define the functions for transformation
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x=='pos' else 0.

In [20]:
## Step 3-B: wrap the encode and transformation function

def collate_batch(batch):
    lbl_lst, txt_lst, lengths = [], [], []
    for _label, _text in batch:
        lbl_lst.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        txt_lst.append(processed_text)
        lengths.append(processed_text.size(0))
    
    lbl_lst = torch.tensor(lbl_lst)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(txt_lst, batch_first=True)
    return padded_text_list, lbl_lst, lengths

In [21]:
## Take a small batch
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [22]:
## divide all three datasets into data loaders with a batch size of 32
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [26]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
# a batch of 2 samples of 4 indices each
txt_encoded_input = torch.LongTensor([
    [1, 2, 4, 5],
    [4, 3, 2, 0]
])
print(embedding(txt_encoded_input))

tensor([[[-1.2902,  0.2566, -0.2339],
         [ 0.1644,  0.9498, -0.0640],
         [ 1.1612,  0.8091, -0.2493],
         [ 1.9732, -1.6155,  0.7387]],

        [[ 1.1612,  0.8091, -0.2493],
         [ 0.1517,  0.0105, -0.3540],
         [ 0.1644,  0.9498, -0.0640],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


## Building an RNN model

we will create an RNN model with two recurrent layers of type RNN. Finally, we will add a non-recurrent fully connected layer as the output layer, which will return a single output value as the prediction.

In [27]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        # self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        _, hidden = self.rnn(x)
        # we use the final hidden state from the last hidden layer as the input to the fully connected layer
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out
    

In [28]:
model = RNN(64, 32)
print(model)

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [29]:
model(torch.randn(5, 3, 64)) #batch size = 5, sequence length = 3, each sequence has 64 input features 

tensor([[ 0.2987],
        [-0.1265],
        [-0.0386],
        [ 0.3542],
        [-0.1470]], grad_fn=<AddmmBackward0>)

## Building an RNN model for the sentiment analysis task

In [30]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


In [32]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)


RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [34]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for txt_batch, lbl_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(txt_batch, lengths)[:, 0]
        loss = loss_fn(pred, lbl_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == lbl_batch).float().sum().item()
        total_loss += loss.item() * lbl_batch.size(0)
    
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [35]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for txt_batch, lbl_batch, lengths in dataloader:
            pred = model(txt_batch, lengths)[:, 0]
            loss = loss_fn(pred, lbl_batch)
            total_acc += ((pred >= 0.5).float() == lbl_batch).float().sum().item()
            total_loss += loss.item() * lbl_batch.size(0)
    
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [36]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [39]:
model.to('cpu')
num_epochs = 3
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f}' f' val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.7609 val_accuracy: 0.7542


KeyboardInterrupt: 

In [None]:
acc_test, loss_test = evaluate(test_dl)
print(f'accuracy: {acc_test:.4f}' f' loss: {loss_test:.4f}')

Bidirectional LSTM passes through the input sequences from start to end and in reverse direction

In [None]:
class Bi_RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embed_dim=embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu1 = nn.Relu()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )

        _, (hidden, cell) = self.rnn(out)
        out = torch.cat(
            (hidden[-2, :, :], hidden[-1, :, :]),
            dim=1
        )
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

torch.manual_seed(1)
bi_model = Bi_RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
bi_model

In [None]:
def bi_train(dataloader):
    bi_model.train()
    total_acc, total_loss = 0, 0
    for txt_batch, lbl_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = bi_model(txt_batch, lengths)[:, 0]
        loss = loss_fn(pred, lbl_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == lbl_batch).float().sum().item()
        total_loss += loss.item() * lbl_batch.size(0)
    
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [None]:
def evaluate(dataloader):
    bi_model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for txt_batch, lbl_batch, lengths in dataloader:
            pred = bi_model(txt_batch, lengths)[:, 0]
            loss = loss_fn(pred, lbl_batch)
            total_acc += ((pred >= 0.5).float() == lbl_batch).float().sum().item()
            total_loss += loss.item() * lbl_batch.size(0)
    
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [None]:
num_epochs = 3
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f}' f' val_accuracy: {acc_valid:.4f}')