In [1]:
import pandas as pd


IMDB_df = pd.read_csv('../Sentiment analysis/movie_data.csv', encoding='utf-8')

In [2]:
from torch.utils.data.dataset import random_split
training_dataset, test_dataset = random_split(IMDB_df.values, [25000,25000])
train_dataset, valid_dataset = random_split(training_dataset,[20000,5000])

In [3]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text = (
        re.sub('[\W]+',' ',text.lower()) + # removed all non-word characters from the text
        ' '.join(emoticons) # added found emoticons at the end of the screen
            .replace('-','') # removed the nose from faces for consistency
    )
    tokenized = text.split()
    return tokenized

token_counts = Counter()
for line, label in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
print('Vocab-size:',len(token_counts))

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
  re.sub('[\W]+',' ',text.lower()) + # removed all non-word characters from the text


Vocab-size: 70831


In [4]:
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(),key=lambda x: x[1],reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>",0)
vocab.insert_token("<unk>",1) # Unknown token, used to map not seen tokens
vocab.set_default_index(1)



In [5]:
print([vocab[token] for token in ['this','is','an','example']])

[11, 7, 35, 468]


In [6]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

In [7]:
import torch
from torch import nn

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text),dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list,batch_first=True) # this is done so that the batch has the
    # same width on every phrase, using "<pad>" tokens to pad (that were previously inseted in the vocab object, and
    # associated with the 0 value).
    return padded_text_list, label_list, lengths

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset,batch_size=4,shuffle=False,collate_fn=collate_batch)

text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)

tensor([[ 6878,  6417, 16755,  ...,   456,     6,   132],
        [   15,    18,    31,  ...,     0,     0,     0],
        [   52,     7,   243,  ...,     0,     0,     0],
        [   11,     7,    29,  ...,     0,     0,     0]])


In [8]:
BATCH_SIZE = 32
train_dl = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=collate_batch)
test_dl = DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=collate_batch)

Now the dataset is fully preprocessed for a RNN to be trained on it.

## Embedding layers for sentence encoding

The elements of the sequences are integer numbers that correponded to the indices of unique words. These word indices
can be converted into input features in several different ways. One naive way is to apply one-hot encoding to convert 
the indicesinto vectors of zeros and ones. Then, each word will be mapped to avector whose size is the number of unique
words in the entire dataset.

The number of words is in the order of 1k to 10k, which will also be the number of our input features. A model trained
on such features may suffer from the curse of dimensionality (furthermore the features are very sparse).

A more elegant approach is to map each word to a vector of fixed size with real-valued elements. In contrast to the
one-hot encoded vectors, we can use finite-sized vectors to represent an infinite number of real numbers. That's the
idea behind embedding.

Given the number of unique words, $n_{words}$, we can select the size of the embedding vectors, to be much smaller than
the number of unique words to represent the entire vocabulary as input features.

We'll obtain:
- A reduction in the dimensionality of the feature space to decrease the effect of the curse of dimensionality;
- The extraction of salient features since the embedding layer in an NN can be optimized (learned).

Given a set of tokens of size $n+2$, an embedding matrix of size $(n+2)\times embedding_dim$ will be created, where each
row in this matrix represents numeric features associated with a token. Therefore, when an integer index, $i$ is given
as input to the embedding, it will look up the corresponding row of the meatrix at index $i$ and return the numeric
features.

Notice that the padding token, is mapped by the embedding layer to 0s, since it doesn't have to participate in the
update of the gradient.

In [9]:
embedding = nn.Embedding(
    num_embeddings=10,
    embedding_dim=3,
    padding_idx=0
)

text_encoded_input = torch.LongTensor([[1,2,3,4],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.7001,  1.4577,  1.8792],
         [ 1.9282, -2.3362, -0.5040],
         [-0.6216,  1.4128,  0.8940],
         [-0.0623,  0.3325,  0.2359]],

        [[-0.0623,  0.3325,  0.2359],
         [-0.6216,  1.4128,  0.8940],
         [ 1.9282, -2.3362, -0.5040],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


# Building an RNN model for the sentiment analysis task

The `torch.nn` module provides many RNN implementations like: `nn.RNN`,`nn.GRU` and `nn.LSTM`.

Now let's build a model:

In [10]:
class RNN(nn.Module):
    def __init__(self,vocab_size, embed_dim, rnn_hidden_size,fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim,padding_idx=0,)
        self.rnn = nn.LSTM(embed_dim,rnn_hidden_size,batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size,fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,text,lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out,lengths.to("cpu"),enforce_sorted=False,batch_first=True)
        out, (hidden,cell) = self.rnn(out)
        out = hidden[-1,:,:]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [11]:
VOCAB_SIZE = len(vocab)
EMBDED_DIM = 20
RNN_HIDDEN_SIZE = 64
FC_HIDDEN_SIZE = 64
torch.manual_seed(1)
model = RNN(VOCAB_SIZE,EMBDED_DIM,RNN_HIDDEN_SIZE,FC_HIDDEN_SIZE).to("mps")
optimizer = torch.optim.Adam(model.parameters(), 0.001)
loss_fn = nn.BCELoss()

In [12]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0,0
    for text_batch, label_batch, lengths in dataloader:
        text_batch, label_batch, lengths = text_batch.to("mps"), label_batch.to("mps"), lengths.to("mps")
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:,0]
        loss = loss_fn(pred,label_batch.float())
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [13]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0,0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch, label_batch, lengths = text_batch.to("mps"), label_batch.to("mps"), lengths.to("mps")
            pred = model(text_batch, lengths)[:,0]
            loss = loss_fn(pred,label_batch.float())
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [14]:
num_epochs = 10
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6106 val_accuracy: 0.6546
Epoch 1 accuracy: 0.7231 val_accuracy: 0.7592
Epoch 2 accuracy: 0.8064 val_accuracy: 0.8100
Epoch 3 accuracy: 0.8511 val_accuracy: 0.8358
Epoch 4 accuracy: 0.8728 val_accuracy: 0.8266
Epoch 5 accuracy: 0.9016 val_accuracy: 0.8538
Epoch 6 accuracy: 0.9204 val_accuracy: 0.8528
Epoch 7 accuracy: 0.9375 val_accuracy: 0.8540
Epoch 8 accuracy: 0.9375 val_accuracy: 0.8598
Epoch 9 accuracy: 0.9572 val_accuracy: 0.8592


In [15]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8458


There exist even bidirectional RNNs, that do a farward pass and a backward pass trough the input. The hidden states of
the two passes are then concatenated or merged with a product, sum or average.