#### Preparing the movie review dataset

1. Get the dataset

In [None]:
# Install the torchtext library to get the IMDB dataset
!pip install torchtext

In [None]:
from torchtext.datasets import IMDB

train_dataset = IMDB(split = 'train')
test_dataset = IMDB(split = 'test')

2. Create the datasets

In [None]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset), [2000, 500])

3. Find unique tokens

In [None]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?::-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

4. Encode each unique token into integers

In [None]:
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse = True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)

vocab.set_default(1)

In [None]:
# Define the function for transformation

text_pipeline = lambda x: [vocab[token] for token in tokenixer(x)]

label_pipeline = lambda x: 1. if x == 'pos' else 0.

In [None]:
# Wrap the encode and transformation function

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype = torch.int=64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
        
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(label_list)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first = True)
    
    return padded_text_list, lengths,label_list

# Take a small batch
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = False, collate_fn = collate_batch)

text_batch, label_batch, length_batch = next(iter(dataloader))

In [None]:
print(text_batch)

In [None]:
print(label_batch)

In [None]:
print(length_batch)

In [None]:
print(text_batch.shape)

In [None]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch)
test_dl = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, collate_fn = collate_batch)

#### Embedding Layers for sentence encoding

In [None]:
embedding = nn.Embedding(num_embeddings = 10, embedding_dim = 3, padding_idx = 0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

#### Building an RNN model

In [1]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers = 2, batch_first = True)
        # self.rnn = nn.GRU(input_size, hidden_size, num_layers = 2, batch_first = True)
        # self.rnn = nn.LSTM(input_size, hidden_size, num_layers = 2, batch_first = True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :] # we use the final hidden state from the last hidden layer as input to the fully connected layer
        out = self.fc(out)
        return out
    
model = RNN(64, 32)
print(model)
model(torch.randn(5,3,64))

NameError: name 'nn' is not defined

#### Building RNN for sentiment analysis

1. A recurrent layer of LSTM

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0)
        
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first = True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lenghts.cpu().numpy(), enforce_sorted = False, batch_first = True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        
        return out

vocab_size = len(vocab)
embed_dim = 20
rn_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

2. The `train` function to train the model on the given dataset for one epoch and return the classification accuracy and loss.

In [None]:
def train(dataloader):
    model.train()
    trotal_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss  += loss.item()*label_batch.size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

3. The `evaluate` function to measure the model's performance on a given dataset

In [None]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss  += loss.item()*label_batch.size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

4. The `loss` function and `optimizer` (Adam Optimizer)

In [None]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)