In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount=True requests a fresh mount.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [2]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/SC201L17'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/SC201L17


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [4]:
# Fix the torch RNG seed and pin cuDNN to deterministic behaviour for repeatable output
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [6]:
# Load the review CSV (expected in the current working directory) into a DataFrame
csv_path = 'IMDBDataset.csv'
raw_data = pd.read_csv(csv_path)

In [7]:
# Pull out the review text column and the sentiment label column
reviews = raw_data['review']
labels = raw_data.sentiment

In [8]:
# Encode sentiment as integers: 'positive' -> 1, 'negative' -> 0.
# Rebuilt from raw_data with .map so re-running the cell gives the same result,
# raw_data is never mutated in place, and the result does not depend on
# replace() silently downcasting an object column.
# NOTE: any sentiment value other than these two becomes NaN.
labels = raw_data['sentiment'].map({'positive': 1, 'negative': 0})

In [9]:
# (pattern, replacement) pairs, applied in this order by preprocessing():
# the HTML line break and most punctuation become a space, quote characters are removed.
substitutions = [
  ('<br />', ' '), ('--', ' '), ('.', ' '), (',', ' '), ('!', ' '),
  ('?', ' '), (')', ' '), ('(', ' '), (';', ' '), (':', ' '),
  ('*', ' '), ('~', ' '), ('_', ' '), ("'", ''), ('"', ''),
]
patterns = [old for old, _ in substitutions]
replacements = [new for _, new in substitutions]

In [10]:
def preprocessing(reviews, patterns, replacements):
  """Lower-case every review and apply the literal substitutions in order.

  Args:
    reviews: iterable of review strings (list, pandas Series, ...).
    patterns: substrings to look for; matched literally, not as regexes.
    replacements: replacement strings, paired positionally with `patterns`
      (extra entries in the longer of the two are ignored).

  Returns:
    A new list with one cleaned string per input review, in input order.
  """
  # Materialize the pairs once; they are the same for every review.
  substitutions = list(zip(patterns, replacements))
  cleaned = []
  # Iterate over the values instead of reviews[i]: on a pandas Series that is
  # a label lookup and fails when the index is not 0..n-1.
  for review in reviews:
    review = review.lower()
    for pattern, replacement in substitutions:
      review = review.replace(pattern, replacement)
    cleaned.append(review)
  return cleaned

In [11]:
# Clean every review; `reviews` becomes a plain list of lower-cased strings
reviews = preprocessing(reviews=reviews, patterns=patterns, replacements=replacements)

In [12]:
# Split sizes (training rows first, validation rows after) and the per-review token cap
num_train, num_val = 35000, 15000
longest_num_tokens = 250

In [13]:
def indexing_tokens():
  """Build the token -> integer id vocabulary from the training reviews.

  Reads the notebook-level `reviews` and `num_train`. Ids 0-3 are reserved for
  the special tokens; every other whitespace-separated token receives the next
  free id in order of first appearance within the first `num_train` reviews.

  Returns:
    dict mapping token string to integer id.
  """
  vocab = {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}
  for position in range(num_train):
    for word in reviews[position].split():
      # Ids are dense, so the next free id is always the current vocabulary size.
      vocab.setdefault(word, len(vocab))
  return vocab

In [14]:
def _encode_review(review, indices, longest_line_tokens):
  """Encode one review string as a list of longest_line_tokens + 2 token ids."""
  limit = max(longest_line_tokens, 0)
  unknown = indices['<UNK>']
  # Tokens missing from the vocabulary map to <UNK> instead of raising KeyError.
  encoded = [indices.get(token, unknown) for token in review.split()[:limit]]
  encoded.extend([indices['<PAD>']] * (limit - len(encoded)))
  # Layout: <SOS>, tokens, <PAD>..., <EOS> -- <EOS> always sits in the last position.
  return [indices['<SOS>']] + encoded + [indices['<EOS>']]


def get_data(indices, longest_line_tokens, mode='train'):
  """Encode one split of the dataset as fixed-length id sequences plus labels.

  Reads the notebook-level `reviews`, `labels`, `num_train` and `num_val`.

  Args:
    indices: dict mapping token -> id; must contain '<SOS>', '<EOS>', '<PAD>'
      and '<UNK>'.
    longest_line_tokens: number of review tokens kept per sequence; longer
      reviews are truncated and shorter ones padded with '<PAD>'.
    mode: 'train' selects rows [0, num_train); any other value selects the
      validation rows [num_train, num_train + num_val).

  Returns:
    (data, Y): `data` is a list of id lists, each of length
    longest_line_tokens + 2; `Y` is the list of matching labels.
  """
  if mode == 'train':
    rows = range(num_train)
  else:
    rows = range(num_train, num_train + num_val)
  data = [_encode_review(reviews[i], indices, longest_line_tokens) for i in rows]
  Y = [labels[i] for i in rows]
  return data, Y

In [15]:
# Build the vocabulary from the training rows, then encode both splits with it
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens, mode='train')
val_data, val_labels = get_data(indices, longest_num_tokens, 'val')

In [16]:
# Report the split sizes and the vocabulary size
for caption, collection in (
  ('Number of training:', training_data),
  ('Number of validation:', val_data),
  ('Length of corpus:', indices),
):
  print(caption, len(collection))

Number of training: 35000
Number of validation: 15000
Length of corpus: 122545


In [17]:
# Create tensors of train & val (encoded sequences and their labels)
train_tensor = torch.tensor(np.array(training_data))
train_labels_tensor = torch.tensor(np.array(training_labels))
val_tensor = torch.tensor(np.array(val_data))
val_labels_tensor = torch.tensor(np.array(val_labels))

In [18]:
# Confirm both input tensors have the expected (rows, sequence length) shape
for caption, tensor in (('Train Tensor:', train_tensor), ('Val Tensor:', val_tensor)):
  print(caption, tensor.shape)

Train Tensor: torch.Size([35000, 252])
Val Tensor: torch.Size([15000, 252])


In [19]:
# Model / training hyperparameters
# Derived from the data so the embedding table always covers every token id,
# even if the preprocessing or the split sizes change.
vocab_size = len(indices)
embedding_dim = 300
hidden_dim = 256
# Each encoded sequence is the token cap plus the <SOS> and <EOS> markers.
sequence_len = longest_num_tokens + 2
output_dim = 2
print_every = 400
batch_size = 32

In [20]:
class MyModel(nn.Module):
  """Embedding -> single-layer LSTM -> linear classifier on the final hidden state."""

  def __init__(self, corpus_len, embedding_dim, hidden_dim, output_dim):
    """
    Args:
      corpus_len: number of rows in the embedding table (vocabulary size).
      embedding_dim: size of each token embedding.
      hidden_dim: size of the LSTM hidden state.
      output_dim: number of output scores per sequence.
    """
    super().__init__()
    self.embedding = nn.Embedding(corpus_len, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Map token ids of shape (batch, seq_len) to scores of shape (batch, output_dim)."""
    embedded = self.embedding(x)
    _, (h_n, _) = self.lstm(embedded)
    # h_n is (num_layers, batch, hidden_dim). Select the last layer explicitly:
    # a bare squeeze() would also drop the batch axis when the batch has one
    # sample, which breaks the loss and the per-row argmax downstream.
    return self.fc(h_n[-1])

In [21]:
# Build the model and move it to the selected device.
# .to(device) honours the CPU fallback chosen when `device` was created,
# whereas .cuda() raises on machines without a GPU.
model = MyModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model = model.to(device)

In [22]:
# Inputs and labels are batched by separate loaders that the training and
# evaluation loops zip together. Both loaders of a pair must therefore use the
# same batch_size and iterate in the same order, so shuffling stays off.
# The label loaders use the prebuilt label tensors, matching the input loaders.
mini_trains = DataLoader(train_tensor, batch_size=batch_size, shuffle=False)
mini_train_labels = DataLoader(train_labels_tensor, batch_size=batch_size, shuffle=False)

mini_vals = DataLoader(val_tensor, batch_size=batch_size, shuffle=False)
mini_val_labels = DataLoader(val_labels_tensor, batch_size=batch_size, shuffle=False)

In [23]:
# Peek at the first batch of each training loader to confirm the batch shapes
for loader in (mini_trains, mini_train_labels):
  iterator = iter(loader)
  print(next(iterator).shape)

torch.Size([32, 252])
torch.Size([32])


In [24]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  """Run `num_epoch` passes over the training batches, validating periodically.

  Input batches from `mini_trains` are paired positionally with label batches
  from `mini_train_labels`. After every optimizer step whose zero-based index
  within the epoch is a multiple of the notebook-level `print_every`, the
  model is evaluated via `evaluate_predictor` on the validation loaders.
  """
  for epoch in range(num_epoch):
    paired_batches = zip(mini_trains, mini_train_labels)
    for step, (inputs, targets) in enumerate(paired_batches):
      # Evaluation flips the model into eval mode, so re-enable training mode every step.
      model.train()
      inputs, targets = inputs.to(device), targets.to(device)
      loss = loss_function(model(inputs), targets)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if step % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)

In [25]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  """Print and return the model's accuracy over the given batches.

  Args:
    model: module returning per-class scores of shape (batch, classes).
    epoch: zero-based epoch number; printed as epoch + 1.
    mini_vals: iterable of input batches.
    mini_val_labels: iterable of label batches, paired positionally with `mini_vals`.
    device: device the batches are moved to before the forward pass.

  Returns:
    Fraction of correctly classified samples, or NaN when no samples were seen.
  """
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for x, y in zip(mini_vals, mini_val_labels):
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      predictions = scores.max(1)[1]
      correct += predictions.eq(y).sum().item()
      # Count the samples actually evaluated rather than relying on a global
      # dataset size, so the result is right for whatever loaders are passed in.
      total += y.size(0)
  accuracy = correct / total if total else float('nan')
  print(f'Epoch[{epoch+1}] Acc: {accuracy}')
  return accuracy

In [26]:
# Cross-entropy loss over the class scores, optimized with Adam
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Start training: 5 epochs over the training loaders, validating along the way
train(
  num_epoch=5,
  model=model,
  mini_trains=mini_trains,
  mini_train_labels=mini_train_labels,
  mini_vals=mini_vals,
  mini_val_labels=mini_val_labels,
  device=device,
  loss_function=loss_function,
  optimizer=optimizer,
)

Epoch[1] Acc: 0.5026
