# Binary sentiment analysis starts here

Data : http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
!pip install -q torchtext
# !pip install -q livelossplot 

import os
from glob import glob
import random
import re
from collections import Counter, OrderedDict
# from livelossplot import PlotLosses

import torch
import torchtext
import torch.nn as nn
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import vocab

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch. __version__ , torchtext. __version__ 

[?25l[K     |▏                               | 10 kB 23.1 MB/s eta 0:00:01[K     |▍                               | 20 kB 30.5 MB/s eta 0:00:01[K     |▋                               | 30 kB 36.8 MB/s eta 0:00:01[K     |▉                               | 40 kB 40.2 MB/s eta 0:00:01[K     |█                               | 51 kB 42.3 MB/s eta 0:00:01[K     |█▎                              | 61 kB 46.3 MB/s eta 0:00:01[K     |█▌                              | 71 kB 48.2 MB/s eta 0:00:01[K     |█▊                              | 81 kB 48.0 MB/s eta 0:00:01[K     |█▉                              | 92 kB 49.8 MB/s eta 0:00:01[K     |██                              | 102 kB 51.2 MB/s eta 0:00:01[K     |██▎                             | 112 kB 51.2 MB/s eta 0:00:01[K     |██▌                             | 122 kB 51.2 MB/s eta 0:00:01[K     |██▊                             | 133 kB 51.2 MB/s eta 0:00:01[K     |███                             | 143 kB 51.2 MB/s eta 0:

('1.12.1+cu113', '0.13.1')

In [2]:
!wget -O aclImdb_v1.tar.gz -q http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
#!rm aclImdb_v1.tar.gz

In [3]:
# convert raw data into dataset and split into train, valid and test sets

def gettext(fname):
  with open(fname) as f:
    lines = f.readlines()
  return lines  


label = lambda fname: fname.split("/")[-2]

classes = ['pos', 'neg']

def unify_data(classes=classes):
  trainlist = []
  testlist = []

  for cls in classes:
    train_cls = [(label(i), *gettext(i)) for _, i in enumerate(glob(f'./aclImdb/train/{cls}/*.txt'))] #I used enumerate for testing and it is not necessary here
    test_cls = [(label(i), *gettext(i)) for _, i in enumerate(glob(f'./aclImdb/test/{cls}/*.txt'))]
    trainlist.extend(train_cls)  # we need extend not append
    testlist.extend(test_cls)
    random.shuffle(trainlist)
    random.shuffle(testlist)

  return trainlist, testlist

trainset, testset = unify_data(classes)


class SentimentDataset(Dataset):
    def __init__(self, items):
        super().__init__()
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        item = self.items[idx]     # this is a tupple label and corresponding text
        return item[0], item[1]    # this returns label and text


train_dataset = SentimentDataset(trainset)
test_dataset = SentimentDataset(testset)

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

In [4]:
# create tokenizer and find unique words (tokens)

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 69527


In [5]:
# encode each unique token into integers

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 468]


In [None]:
# define the functions for transformation

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.


# collate the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

# create dataloaders for batching of the datasets

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)



# test how dataloader works
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))

# test a 2 samples with 4==embeding size
embedding = nn.Embedding(num_embeddings=10, 
                         embedding_dim=3, 
                         padding_idx=0)
text_encoded_input = torch.LongTensor([[5,4,3,2],[0,2,4,7]])

#print all tests
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

print(embedding(text_encoded_input))

In [7]:
#@title model

# ## An example of building a RNN model
# ## with simple RNN layer

# # Fully connected neural network with one hidden layer
# class RNN(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super().__init__()
#         self.rnn = nn.RNN(input_size, 
#                           hidden_size, 
#                           num_layers=2, 
#                           batch_first=True)
#         #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
#         #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, 1)
        
#     def forward(self, x):
#         _, hidden = self.rnn(x)
#         out = hidden[-1, :, :]
#         out = self.fc(out)
#         return out

# model = RNN(64, 32) 

# print(model) 
 
# model(torch.randn(5, 3, 64)) 

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [None]:
#@title training

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
beta = float('inf')
num_epochs = 10 

# liveloss = PlotLosses()
torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    if loss_valid < beta:
      beta=acc_valid
      torch.save(model.state_dict(), './model.pth')

    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
    # accuracies = {'loss': acc_train,  # here name should be 'loss' otherwise it plots separate chart
    #       'val_loss': acc_valid}     # here name should be 'val_loss' otherwise it draws separate chart
    # liveloss.update(accuracies)
    # liveloss.send()
 

In [None]:
# load the the best saved model parameters if needed

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
model.load_state_dict(torch.load('./model.pth'))
model.eval()

In [None]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

#### More on the bidirectional RNN

 * **Trying bidirectional recurrent layer**

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [None]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

In [None]:
test_dataset = IMDB(split='test')
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [None]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8566
