In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
import re
import os
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchtext.vocab import GloVe

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
# Build the vocabulary from pre-trained GloVe '6B' vectors: 100 dimensions,
# capped at 25,000 vectors, cached under ./vectors_cache.
glove = GloVe(
    name='6B',
    dim=100,
    max_vectors=25000,
    cache="./vectors_cache",
)

### Utility functions

In [4]:
def load_data(csv_path, glove, seq_len=500, batch_size=100):
  """Read the review CSV and build train / validation / test DataLoaders.

  The CSV is expected to provide a 'review' column (raw text) and a
  'sentiment' column ('positive' / 'negative'). Each review is lower-cased,
  split on whitespace, stripped of punctuation and digits, and every token
  found in `glove.stoi` is replaced by its index (unknown tokens are dropped).
  Sequences are left-padded with zeros, or truncated to their first
  `seq_len` tokens.

  NOTE(review): index 0 doubles as the padding value; if `glove.stoi` maps a
  real token to 0, padding is indistinguishable from that token - confirm
  whether this matters for the model.

  Args:
    csv_path: path of the CSV file to read.
    glove: object exposing `stoi`, a mapping from token to integer index.
    seq_len: fixed length of every encoded review (default 500).
    batch_size: batch size shared by the three loaders (default 100).

  Returns:
    (train_loader, valid_loader, test_loader). Only the training loader is
    shuffled; validation and test batches come in a fixed order.

  Raises:
    ValueError: if a label other than 'positive' / 'negative' cannot be
      interpreted as a number.
  """

  def preprocess_string(s):
    # Drop everything that is neither a word character nor whitespace
    s = re.sub(r"[^\w\s]", '', s)
    # Drop any run of whitespace
    s = re.sub(r"\s+", '', s)
    # Drop digits
    s = re.sub(r"\d", '', s)

    return s

  def encode(sentences):
    # Clean each word exactly once, then keep only in-vocabulary tokens
    encoded = []
    for sent in sentences:
      tokens = (preprocess_string(word) for word in sent.lower().split())
      encoded.append([glove.stoi[tok] for tok in tokens if tok in glove.stoi])
    return encoded

  def padding_(sentences, length):
    features = np.zeros((len(sentences), length), dtype=int)
    for ii, review in enumerate(sentences):
      kept = review[:length]  # long reviews keep their first `length` tokens
      if kept:
        # right-align so that the padding zeros come first
        features[ii, -len(kept):] = kept
    return features

  def make_loader(encoded, labels, shuffle):
    dataset = TensorDataset(torch.from_numpy(padding_(encoded, seq_len)),
                            torch.from_numpy(labels))
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

  # Reading data from disk
  df = pd.read_csv(csv_path)

  # Replacing string labels with numbers. This builds a new Series instead of
  # `df[col].replace(..., inplace=True)`: that chained in-place form is
  # deprecated and does not update `df` under pandas copy-on-write.
  label_ids = {'positive': 1, 'negative': 0}
  labels = df['sentiment'].map(lambda value: label_ids.get(value, value))

  # dividing the dataset into train, validation, and test portions
  X, y = df['review'].values, pd.to_numeric(labels).to_numpy()
  x_train1, x_test1, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=1)
  x_train1, x_val1, y_train, y_val = train_test_split(x_train1, y_train, train_size=0.823, random_state=1)

  print(f'Number of training examples: {len(x_train1)}')
  print(f'Number of validation examples: {len(x_val1)}')
  print(f'Number of test examples: {len(x_test1)}')

  # Shuffling only matters for optimisation; evaluation loaders stay ordered
  # so that repeated evaluations visit identical batches.
  train_loader = make_loader(encode(x_train1), y_train, shuffle=True)
  valid_loader = make_loader(encode(x_val1), y_val, shuffle=False)
  test_loader = make_loader(encode(x_test1), y_test, shuffle=False)

  return train_loader, valid_loader, test_loader

def binary_accuracy(preds, y):
    """Return the fraction of predictions that equal the targets `y`.

    `preds` are raw scores: they go through a sigmoid and are rounded to
    0/1 before the element-wise comparison. The result is a tensor holding
    the number of matches divided by the length of the first dimension.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds), both truncated to integers.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

def train_epoch(model, data_loader, optimizer, criterion):
  """Run one optimisation pass over `data_loader`.

  The model is put in training mode; for every batch the inputs and the
  labels (cast to float) are moved to the notebook-level `device`, the loss
  is back-propagated and the optimizer takes one step.

  Returns:
    (mean batch loss, mean batch accuracy), both averaged over the number
    of batches in `data_loader`.
  """
  model.train()
  running_loss = 0
  running_acc = 0

  for batch_x, batch_y in tqdm(data_loader):
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device, dtype=torch.float)

    optimizer.zero_grad()
    logits = model(batch_x).squeeze(1)  # drop the trailing size-1 dimension
    loss = criterion(logits, batch_y)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    running_acc += binary_accuracy(logits, batch_y).item()

  n_batches = len(data_loader)
  return running_loss / n_batches, running_acc / n_batches

def evaluate_epoch(model, data_loader, criterion):
    """Score `model` on `data_loader` without updating it.

    The model is switched to eval mode and gradients are disabled; inputs
    and labels (cast to float) are moved to the notebook-level `device`.

    Returns:
        (mean batch loss, mean batch accuracy), both averaged over the
        number of batches in `data_loader`.
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch_x, batch_y in tqdm(data_loader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device, dtype=torch.float)

            logits = model(batch_x).squeeze(1)  # drop the trailing size-1 dimension
            running_loss += criterion(logits, batch_y).item()
            running_acc += binary_accuracy(logits, batch_y).item()

    n_batches = len(data_loader)
    return running_loss / n_batches, running_acc / n_batches

def count_recurrent_layer_parameters(model):
    """Count the trainable parameters of `model` outside its 'fc' layer.

    A parameter is skipped when it does not require gradients or when its
    name (as reported by `named_parameters`) contains 'fc'.
    """
    total = 0
    for name, param in model.named_parameters():
        if 'fc' in name or not param.requires_grad:
            continue
        total += param.numel()
    return total

### Loading the data

In [5]:
# Encode the IMDB reviews with the GloVe vocabulary and wrap each split in a DataLoader.
trainloader, validloader, testloader = load_data(
    csv_path="IMDB Dataset.csv",
    glove=glove,
)

Number of training examples: 34977
Number of validation examples: 7523
Number of test examples: 7500


# Q5 - RNN

In [5]:
class RNN(nn.Module):
  """Single-layer, unidirectional RNN classifier producing one logit per sequence.

  Token indices are embedded with the notebook-level `glove` vectors (kept
  frozen), fed through an `nn.RNN`, and the final hidden state of that layer
  is projected to a single score by a linear layer.

  Args:
    hidden_dim: size of the recurrent hidden state.
    embed_dim: input size of the recurrent layer.
      NOTE(review): presumably must equal the width of `glove.vectors` - confirm.
  """

  def __init__(self, hidden_dim, embed_dim):
    super().__init__()

    self.hidden_dim, self.embed_dim = hidden_dim, embed_dim

    # Pre-trained embedding table; freeze=True keeps it out of training
    self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)

    self.rnn = nn.RNN(
      embed_dim,
      hidden_dim,
      num_layers=1,
      bidirectional=False,
      batch_first=True,
    )

    self.fc = nn.Linear(hidden_dim, 1)

  def forward(self, x):
    """Map a batch of index sequences `x` to one logit per sequence."""
    _, hidden = self.rnn(self.embedding(x))
    # hidden[0] is the final hidden state of the only layer/direction
    return self.fc(hidden[0])

In [7]:
# Instantiate the classifier (100-d embeddings, 200 hidden units) on the selected device
# and display its layer summary.
model = RNN(hidden_dim=200, embed_dim=100).to(device)
model

RNN(
  (embedding): Embedding(25000, 100)
  (rnn): RNN(100, 200, batch_first=True)
  (fc): Linear(in_features=200, out_features=1, bias=True)
)

In [8]:
# Trainable parameters outside the final 'fc' layer
rnn_param_count = count_recurrent_layer_parameters(model)
print(f'The RNN model has {rnn_param_count:,} parameters in the RNN layer')

The RNN model has 60,400 parameters in the RNN layer


#### Training the model and evaluating it on the validation data after every epoch

In [10]:
# Number of full passes over the training data
N_EPOCHS = 5

# Binary cross-entropy on raw logits, optimised with Adam at its default settings
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = Adam(model.parameters())

# Lowest validation loss seen so far; decides when a checkpoint is written
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_epoch(model, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_epoch(model, validloader, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        os.makedirs('rnn', exist_ok=True)
        torch.save(model.state_dict(), 'rnn/best-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.5f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|██████████| 350/350 [00:05<00:00, 61.40it/s]
100%|██████████| 76/76 [00:00<00:00, 173.18it/s]


Epoch: 01 | Epoch Time: 0m 6s
	Train Loss: 0.67321 | Train Acc: 58.29%
	 Val. Loss: 0.67502 |  Val. Acc: 57.52%


100%|██████████| 350/350 [00:05<00:00, 64.08it/s]
100%|██████████| 76/76 [00:00<00:00, 169.38it/s]


Epoch: 02 | Epoch Time: 0m 5s
	Train Loss: 0.67459 | Train Acc: 58.39%
	 Val. Loss: 0.69751 |  Val. Acc: 51.51%


100%|██████████| 350/350 [00:05<00:00, 64.37it/s]
100%|██████████| 76/76 [00:00<00:00, 173.45it/s]


Epoch: 03 | Epoch Time: 0m 5s
	Train Loss: 0.66053 | Train Acc: 59.93%
	 Val. Loss: 0.67116 |  Val. Acc: 58.33%


100%|██████████| 350/350 [00:05<00:00, 62.30it/s]
100%|██████████| 76/76 [00:00<00:00, 135.27it/s]


Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 0.66147 | Train Acc: 59.78%
	 Val. Loss: 0.66976 |  Val. Acc: 57.87%


100%|██████████| 350/350 [00:06<00:00, 58.10it/s]
100%|██████████| 76/76 [00:00<00:00, 144.52it/s]

Epoch: 05 | Epoch Time: 0m 6s
	Train Loss: 0.65225 | Train Acc: 61.32%
	 Val. Loss: 0.58155 |  Val. Acc: 70.71%





#### Evaluate the model on the test data

In [11]:
# Restore the checkpoint with the lowest validation loss (written by the
# training loop) before scoring: otherwise the test metric describes the
# weights of the final epoch, which need not be the selected model.
model.load_state_dict(torch.load('rnn/best-model.pt', map_location=device))

test_loss, test_acc = evaluate_epoch(model, testloader, criterion)
print(f'Test accuracy of the RNN is {test_acc}')

100%|██████████| 75/75 [00:00<00:00, 153.78it/s]


Test accuracy of the RNN is 0.7118666497866313
