In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
import re
import os
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchtext.vocab import GloVe

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by train_epoch / evaluate_epoch below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare trailing expression so the notebook displays the selected device.
device

device(type='cuda')

In [3]:
# Building the vocabulary: GloVe '6B' vectors with dim=100, capped at
# max_vectors=25000 entries; files are cached under ./vectors_cache.
# `glove.stoi` is used for token lookup in load_data and `glove.vectors`
# initialises the embedding layer of the model.
glove = GloVe(name='6B', dim=100, max_vectors=25000, cache = "./vectors_cache")

### Util functions

In [4]:
def load_data(csv_path, glove, seq_len=500, batch_size=100):
  """Read the review CSV and build train / validation / test DataLoaders.

  The CSV must contain a ``review`` text column and a ``sentiment`` column.
  Sentiment strings ``'positive'`` / ``'negative'`` are mapped to 1 / 0; any
  other value is passed through unchanged. Each review is lower-cased, split
  on whitespace, cleaned token by token, and encoded with ``glove.stoi``;
  tokens missing from the vocabulary are dropped. Encoded reviews are then
  left-padded with zeros (or truncated to their first ``seq_len`` tokens).

  NOTE(review): the pad value 0 is also a valid vocabulary index if
  ``glove.stoi`` assigns index 0 to a real token, in which case padding is
  indistinguishable from that token -- confirm this is acceptable.

  Args:
    csv_path: path of the CSV file to read.
    glove: vocabulary object exposing a ``stoi`` token -> index mapping.
    seq_len: fixed length of every encoded review (default 500).
    batch_size: batch size used by all three loaders (default 100).

  Returns:
    Tuple ``(train_loader, valid_loader, test_loader)``.
  """

  def preprocess_string(s):
    # Drop every character that is neither a word character nor whitespace
    # (note: ``\w`` also keeps underscores).
    s = re.sub(r"[^\w\s]", '', s)
    # Remove all whitespace
    s = re.sub(r"\s+", '', s)
    # Remove digits
    s = re.sub(r"\d", '', s)

    return s

  def padding_(sentences, seq_len):
    # int64 so the resulting tensors are valid embedding indices everywhere.
    features = np.zeros((len(sentences), seq_len), dtype=np.int64)
    for ii, review in enumerate(sentences):
      kept = review[:seq_len]  # truncate long reviews to their first tokens
      if kept:
        # Right-align the tokens; the leading positions stay 0 (left padding).
        features[ii, -len(kept):] = kept
    return features

  def encode(sentences):
    # Shared by all three splits; each word is cleaned exactly once.
    stoi = glove.stoi
    encoded = []
    for sent in sentences:
      cleaned = (preprocess_string(word) for word in sent.lower().split())
      encoded.append([stoi[tok] for tok in cleaned if tok in stoi])
    return encoded

  # Reading data from disk
  df = pd.read_csv(csv_path)
  # Replacing string labels with numbers. Assigning the mapped column back
  # (instead of a chained ``inplace=True`` replace) guarantees that ``df`` is
  # really updated, including under pandas Copy-on-Write.
  label_map = {'positive': 1, 'negative': 0}
  df['sentiment'] = df['sentiment'].map(lambda label: label_map.get(label, label))

  # dividing the dataset into train, validation, and test portions
  X, y = df['review'].values, df['sentiment'].values
  x_train1, x_test1, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=1)
  x_train1, x_val1, y_train, y_val = train_test_split(x_train1, y_train, train_size=0.823, random_state=1)

  print(f'Number of training examples: {len(x_train1)}')
  print(f'Number of validation examples: {len(x_val1)}')
  print(f'Number of test examples: {len(x_test1)}')

  # Encoding and padding the data
  x_train_pad = padding_(encode(x_train1), seq_len)
  x_val_pad = padding_(encode(x_val1), seq_len)
  x_test_pad = padding_(encode(x_test1), seq_len)

  # creating Tensor Datasets and DataLoaders
  train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
  valid_data = TensorDataset(torch.from_numpy(x_val_pad), torch.from_numpy(y_val))
  test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

  # Only the training set is shuffled; a fixed order for validation/test keeps
  # the per-batch-averaged evaluation metrics deterministic between runs.
  train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
  valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
  test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

  return train_loader, valid_loader, test_loader

def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    ``preds`` are raw logits: they are passed through a sigmoid and rounded
    to 0/1 before being compared element-wise with ``y``. The result is a
    tensor holding (number of matches) / (length of the first dimension).
    """
    hits = torch.sigmoid(preds).round().eq(y).float()
    return hits.sum() / len(hits)

def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole
    minutes and the remaining whole seconds, both truncated with ``int``.

    Returns:
        Tuple ``(minutes, seconds)``.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

def train_epoch(model, data_loader, optimizer, criterion):
  """Run one training pass over ``data_loader`` and update ``model``.

  For every batch: gradients are reset, inputs/labels are moved to the
  global ``device`` (labels cast to float), the model output is squeezed on
  dim 1, the loss is back-propagated and the optimizer takes a step.

  Returns:
    Tuple ``(mean_loss, mean_accuracy)`` where both values are the plain
    average of the per-batch figures (sum divided by ``len(data_loader)``).
  """
  model.train()
  running_loss, running_acc = 0, 0

  for batch_inputs, batch_labels in tqdm(data_loader):
    optimizer.zero_grad()

    batch_inputs = batch_inputs.to(device)
    batch_labels = batch_labels.to(device, dtype=torch.float)

    logits = model(batch_inputs).squeeze(1)
    batch_loss = criterion(logits, batch_labels)
    batch_loss.backward()
    optimizer.step()

    running_loss += batch_loss.item()
    running_acc += binary_accuracy(logits, batch_labels).item()

  num_batches = len(data_loader)
  return running_loss / num_batches, running_acc / num_batches

def evaluate_epoch(model, data_loader, criterion):
    """Score ``model`` on ``data_loader`` without updating its weights.

    The model is switched to eval mode and all batches are processed under
    ``torch.no_grad()``. Inputs/labels are moved to the global ``device``
    (labels cast to float) and the model output is squeezed on dim 1.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``: per-batch loss and accuracy
        summed and divided by ``len(data_loader)``.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(data_loader):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device, dtype=torch.float)

            logits = model(batch_inputs).squeeze(1)

            running_loss += criterion(logits, batch_labels).item()
            running_acc += binary_accuracy(logits, batch_labels).item()

    num_batches = len(data_loader)
    return running_loss / num_batches, running_acc / num_batches

def count_recurrent_layer_parameters(model):
    """Count the trainable parameters of ``model`` outside the ``fc`` head.

    A parameter contributes its element count when ``requires_grad`` is set
    and the substring ``'fc'`` does not occur in its qualified name.
    """
    total = 0
    for name, param in model.named_parameters():
        if 'fc' in name or not param.requires_grad:
            continue
        total += param.numel()
    return total

### Loading the data

In [5]:
# Build the train / validation / test DataLoaders from the IMDB CSV, encoding
# the reviews with the GloVe vocabulary; load_data prints the split sizes.
trainloader, validloader, testloader = load_data("IMDB Dataset.csv", glove)

Number of training examples: 34977
Number of validation examples: 7523
Number of test examples: 7500


# Q4 - LSTM

In [14]:
class LSTM(nn.Module):
  """Sentiment classifier: frozen GloVe embedding -> one-layer LSTM -> linear head.

  The embedding table is built from the global ``glove.vectors`` and is not
  trained (``freeze=True``). The LSTM is unidirectional, single-layer and
  batch-first. The linear head maps the final hidden state to one logit.

  Args:
    hidden_dim: size of the LSTM hidden state.
    embed_dim: LSTM input size; presumably must equal the width of
      ``glove.vectors`` -- verify against the loaded vectors.
  """

  def __init__(self, hidden_dim, embed_dim):
    super().__init__()

    self.hidden_dim, self.embed_dim = hidden_dim, embed_dim

    # Pretrained lookup table, kept fixed during training.
    self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)

    lstm_config = dict(
      input_size=embed_dim,
      hidden_size=hidden_dim,
      num_layers=1,
      bidirectional=False,
      batch_first=True,
    )
    self.lstm = nn.LSTM(**lstm_config)

    # Single output unit: one logit per example.
    self.fc = nn.Linear(hidden_dim, 1)

  def forward(self, x):
    """Return the logit computed from the LSTM's final hidden state.

    ``x`` holds token indices for the embedding layer (assumed to be
    shaped (batch, seq_len) given ``batch_first=True`` -- TODO confirm).
    """
    _, (final_hidden, _) = self.lstm(self.embedding(x))
    # Index 0 selects the hidden state of the only layer/direction.
    return self.fc(final_hidden[0])

In [15]:
# Instantiate the classifier; embed_dim=100 matches the dim=100 GloVe vectors
# loaded earlier in the notebook.
model = LSTM(embed_dim=100, hidden_dim=200)
# Move the parameters to `device`; as the last expression of the cell this
# also displays the module summary.
model.to(device)

LSTM(
  (embedding): Embedding(25000, 100)
  (lstm): LSTM(100, 200, batch_first=True)
  (fc): Linear(in_features=200, out_features=1, bias=True)
)

In [16]:
# Report the trainable parameters outside the `fc` head. The embedding is
# frozen, so this is the parameter count of the single unidirectional LSTM layer.
print(f'The single-layer unidirectional LSTM model has {count_recurrent_layer_parameters(model):,} parameters in the LSTM layer')

The 2 Level Bidirectional RNN model has 241,600 parameters in the two bi-RNN layers


#### Training the model and evaluating the validation data per epoch

In [17]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Adam with its default hyper-parameters; the loss takes raw logits
# (sigmoid is applied inside BCEWithLogitsLoss).
optimizer = Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

# Lowest validation loss seen so far; inf guarantees the first finite
# validation loss triggers a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # The measured epoch time covers both the training and the validation pass.
    start_time = time.time()
    
    train_loss, train_acc = train_epoch(model, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_epoch(model, validloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves, so that
    # 'lstm/best-model.pt' always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        os.makedirs('lstm', exist_ok=True)
        torch.save(model.state_dict(), 'lstm/best-model.pt')
    
    # Per-epoch report; accuracies are fractions, printed as percentages.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.5f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|██████████| 350/350 [00:19<00:00, 17.90it/s]
100%|██████████| 76/76 [00:01<00:00, 46.74it/s]


Epoch: 01 | Epoch Time: 0m 21s
	Train Loss: 0.62714 | Train Acc: 65.21%
	 Val. Loss: 0.54435 |  Val. Acc: 72.86%


100%|██████████| 350/350 [00:20<00:00, 17.20it/s]
100%|██████████| 76/76 [00:01<00:00, 45.44it/s]


Epoch: 02 | Epoch Time: 0m 22s
	Train Loss: 0.51800 | Train Acc: 75.26%
	 Val. Loss: 0.50160 |  Val. Acc: 76.52%


100%|██████████| 350/350 [00:19<00:00, 17.52it/s]
100%|██████████| 76/76 [00:01<00:00, 46.03it/s]


Epoch: 03 | Epoch Time: 0m 21s
	Train Loss: 0.50742 | Train Acc: 75.67%
	 Val. Loss: 0.37680 |  Val. Acc: 83.75%


100%|██████████| 350/350 [00:19<00:00, 17.61it/s]
100%|██████████| 76/76 [00:01<00:00, 44.26it/s]


Epoch: 04 | Epoch Time: 0m 21s
	Train Loss: 0.36671 | Train Acc: 83.86%
	 Val. Loss: 0.35908 |  Val. Acc: 85.00%


100%|██████████| 350/350 [00:20<00:00, 17.50it/s]
100%|██████████| 76/76 [00:01<00:00, 45.20it/s]

Epoch: 05 | Epoch Time: 0m 21s
	Train Loss: 0.33073 | Train Acc: 85.97%
	 Val. Loss: 0.32774 |  Val. Acc: 85.64%





#### Evaluating the model on the test data

In [18]:
# Restore the checkpoint with the lowest validation loss (written by the
# training loop to 'lstm/best-model.pt') so the reported test metric belongs
# to the selected model rather than to whatever weights the last epoch left.
model.load_state_dict(torch.load('lstm/best-model.pt', map_location=device))
test_loss, test_acc = evaluate_epoch(model, testloader, criterion)
print(f'Test accuracy of the LSTM is {test_acc}')

100%|██████████| 75/75 [00:01<00:00, 45.66it/s]

Test accuracy of the LSTM is 0.8557333127657573



