Google Colab Development Environment.

0. Install neccesary libraries 

In [4]:
!pip uninstall -y torch torchdata torchvision torchtext torchaudio fastai
!pip install portalocker
!pip install --pre torch torchdata -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html

Found existing installation: torch 1.13.1+cu116
Uninstalling torch-1.13.1+cu116:
  Successfully uninstalled torch-1.13.1+cu116
[0mFound existing installation: torchvision 0.14.1+cu116
Uninstalling torchvision-0.14.1+cu116:
  Successfully uninstalled torchvision-0.14.1+cu116
Found existing installation: torchtext 0.14.1
Uninstalling torchtext-0.14.1:
  Successfully uninstalled torchtext-0.14.1
Found existing installation: torchaudio 0.13.1+cu116
Uninstalling torchaudio-0.13.1+cu116:
  Successfully uninstalled torchaudio-0.13.1+cu116
Found existing installation: fastai 2.7.11
Uninstalling fastai-2.7.11:
  Successfully uninstalled fastai-2.7.11
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheel

In [1]:
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext
  Downloading torchtext-0.15.1-cp39-cp39-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.0.0
  Downloading torch-2.0.0-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchdata==0.6.0
  Downloading torchdata-0.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m70

1. Import the libraries

In [2]:
# Import libraries
import torch
import torch.nn as nn

In [3]:
from torchtext.datasets import IMDB

2a. Create the datasets: split the training dataset into seperate training and validation partitions

In [4]:
from torch.utils.data.dataset import random_split
train_dataset = IMDB(split = 'train')
test_dataset = IMDB(split = 'test')
torch.manual_seed(1)
# The original train dataset is 25000. This is being split randomly to 20000 and 5000 for training and validation
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])
# Get the 25000 for testing into a definite length
test_dataset, _ = random_split(list(test_dataset), [25000, 0])

2b. Find unique tokens - as a preparation for encoding data for NN.

In [5]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 69033


3. Encode each unique token into integers - map each unique word (cleansed, preprocessed and tokenized by the `tokenizer` function) to a unique integer using the `vocab` method of `torchtext`.  

In [6]:
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse = True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0) # Placeholder - Padding for adjusting sequence length
vocab.insert_token("<unk>", 1) # Placehlder - Unknown words

vocab.set_default_index(1)

In [7]:
#a. Define the function for transformation

device = torch.device("cuda:0")

# Transform each text in the dataset - using the defined tokenizer function
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# Convert label to 1 and 0
label_pipeline = lambda x: 1. if x == 'pos' else 0.

In [9]:
#b. Wrap the encode and transformation function into this function

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype = torch.int64)
        
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
        
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    # automatically pad consecutive elements for all sequences to have the same shape
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first = True)
    
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [10]:
# Take a small batch - to illustrate how padding works
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = False, collate_fn = collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))

In [9]:
print(text_batch)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2459,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34417,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42948,     9,  4991,     3,    14, 10295,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2479, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10296,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [11]:
print(label_batch)

tensor([0., 0., 0., 0.], device='cuda:0')


In [12]:
print(length_batch)

tensor([165,  86, 218, 145], device='cuda:0')


In [13]:
print(text_batch.shape)

torch.Size([4, 218])


In [16]:
# Divide the datasets into data loaders with a batch size of 32 - to have it in a suitable format for RNN
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate_batch)

valid_dl = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch)

test_dl = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch) #drop_last=True

#### Embedding Layers for sentence encoding

In [17]:
# To reduce the dimensionality of the word vector - dimensionality of the output
embedding = nn.Embedding(num_embeddings = 10, embedding_dim = 3, padding_idx = 0)

# a batch of 2 (rank of 2) samples of 4 indices each - input length of 4
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[ 1.3254, -0.9565, -1.8320],
         [-0.0948, -0.9818,  0.3048],
         [-0.5776, -0.2964,  0.2085],
         [-0.5452, -2.8568,  1.1356]],

        [[-0.5776, -0.2964,  0.2085],
         [ 0.8829,  0.3068,  0.4394],
         [-0.0948, -0.9818,  0.3048],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


#### Building RNN for sentiment analysis

1. A recurrent layer of LSTM: for long sequences, LSTM layer supports long ranges

In [18]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0) # Embedding layer
        
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first = True) # Recurrent layer of type LSTM
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size) # Fully connected layer as hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1) # Fully connected layer as an output layer
        self.sigmoid = nn.Sigmoid() # Logistic sigmoid activation to produce single-class membership probability 
        
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted = False, batch_first = True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20 # feature size
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)

2. The `train` function to train the model on the given dataset for one epoch and return the classification accuracy and loss.

In [19]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss  += loss.item()*label_batch.size(0)
          
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

3. The `evaluate` function to measure the model's performance on a given dataset

In [20]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss  += loss.item()*label_batch.size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

4. The `loss` function and `optimizer` (Adam Optimizer)

In [21]:
loss_fn = nn.BCELoss() # for binary classification and a single-class membership probability output
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

5. Train model for 10 epochs and display the training and validation performances

In [22]:
num_epochs = 10
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train: .4f}' f' val_accuracy: {acc_valid: .4f}')

Epoch 0 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 1 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 2 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 3 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 4 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 5 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 6 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 7 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 8 accuracy:  1.0000 val_accuracy:  1.0000
Epoch 9 accuracy:  1.0000 val_accuracy:  1.0000


6. Evaluate on the test data

In [23]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test: .4f}')

test_accuracy:  1.0000
