# RNN for sentiment prediction.
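This notebook trains an LSTM-based recurrent neural network to classify movie reviews as positive or negative. <br>
The network architecture is shown in the diagram below.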
![RNN](assets/sentiment_rnn.png "Recurrent Neural Network")

## 0. Imports

In [1]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from string import punctuation

## 1. Data Preprocessing and Batches

In [2]:
PATH = 'data/'


def _read_text(filename):
  """Return the complete contents of the text file `filename` stored under PATH."""
  with open(PATH + filename, 'r') as handle:
    return handle.read()


# Raw review text and raw label text, each kept as one big string.
reviews = _read_text('reviews.txt')
labels = _read_text('labels.txt')

# Quick look at the beginning of both files.
print(reviews[:10], labels[:50])

bromwell h positive
negative
positive
negative
positive
negat


In [3]:
def tokenize_review(reviews):
  """Normalize raw review text and split it into words and individual reviews.

  The text is lower-cased and every character listed in `string.punctuation`
  is removed. Newlines are kept so the reviews can still be told apart.

  Returns:
    A tuple `(all_text, words, reviews_split)` where
      all_text      -- the cleaned text as a single string,
      words         -- every whitespace-separated token of `all_text`,
      reviews_split -- `all_text` split on newline characters.
  """
  strip_punctuation = str.maketrans('', '', punctuation)
  cleaned = reviews.lower().translate(strip_punctuation)
  return cleaned, cleaned.split(), cleaned.split('\n')

all_text, words, reviews_split = tokenize_review(reviews)

# Show a short sample of each of the three views of the corpus.
preview = (
  'all_text:\t', all_text[:50],
  '\nwords:\t', words[:10],
  '\nreview_split:\t', reviews_split[:2],
)
print(*preview)


all_text:	 bromwell high is a cartoon comedy  it ran at the s 
words:	 ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the'] 
review_split:	 ['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ', 'story of a man who has 

## Encoding datasets

In [5]:
## Define these under network

# _, words, reviews_split = tokenize_review(reviews)
# Word frequencies over the whole corpus.
c = Counter(words)
# Vocabulary ordered from most to least frequent word.
sorted_words = [word for word, _count in c.most_common()]
# Ids start at 1, so the most frequent word gets id 1 and 0 is never assigned.
vocab_to_int = {word: idx for idx, word in enumerate(sorted_words, start=1)}

def get_encoded(reviews, vocab_to_int = vocab_to_int):
  """Turn raw review text into one list of integer token ids per review.

  The text is cleaned with `tokenize_review`; each newline-separated review
  is then mapped word by word through `vocab_to_int`. Words that are not in
  `vocab_to_int` are dropped, so a review may encode to an empty list.
  """
  cleaned_reviews = tokenize_review(reviews)[2]
  return [
    [vocab_to_int[token] for token in line.split() if token in vocab_to_int]
    for line in cleaned_reviews
  ]

# Every review as a list of integer token ids
reviews_ints = get_encoded(reviews)

# Labels: 'positive' -> 1, anything else -> 0
encoded_labels = np.array([int(label == 'positive') for label in labels.split()])

# Sanity check on the vocabulary size
print('Unique words: ', len(vocab_to_int))  # should ~ 74000+
print()

# Show the first review and the first labels before and after encoding
print(
  '\n\nbromwell:', vocab_to_int['bromwell'],
  '\n\nOriginal:\n', reviews_split[0],
  '\n\nEncoded Tokenized reviews: \n', reviews_ints[:1],
  '\n\nOriginal Labels:\n', labels.split()[:3],
  '\n\nEncoded Labels:\n', encoded_labels[:3],
)




Unique words:  74072



bromwell: 21025 

Original:
 bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t    

Encoded Tokenized reviews: 
 [[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154

## Removing Outliers and padding

In [6]:
# Histogram of review lengths: length -> number of reviews with that length
review_lens = Counter(len(review) for review in reviews_ints)
print('Zero Length review count: {}\nMax Length of review: {}\n\n'.format(review_lens[0], max(review_lens)))

print('Reviews before removing outliers:', len(reviews_ints))

# Keep only the reviews (and their labels) that contain at least one token
non_zero_idx = [idx for idx, rev in enumerate(reviews_ints) if rev]
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
encoded_labels = [encoded_labels[idx] for idx in non_zero_idx]

print('After removing outliers: ', len(reviews_ints))

def pad_features(arr, seq_length = 200):
  """Left-pad with zeros, or truncate, every review to exactly `seq_length` ids.

  Args:
    arr: sequence of reviews, each a sequence of integer token ids.
    seq_length: number of columns of the returned matrix.

  Returns:
    An integer array of shape `(len(arr), seq_length)`. Reviews shorter than
    `seq_length` are right-aligned and padded with 0 on the left; longer
    reviews keep only their first `seq_length` ids. An empty review yields an
    all-zero row.
  """
  features = np.zeros((len(arr), seq_length), dtype = int)
  for i, row in enumerate(arr):
    kept = row[:seq_length]
    if len(kept) == 0:
      # `features[i, -0:]` would address the WHOLE row and make the
      # assignment fail, so an empty review simply stays all zeros.
      continue
    features[i, -len(kept):] = kept
  return features

# Fixed number of token ids kept per review
seq_length = 200

# One row per review, each padded/truncated to seq_length columns
features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print the first 10 values of the first 30 reviews (rows), then the full shape
print(features[:30,:10])
print(features.shape)


Zero Length review count: 1
Max Length of review: 2514


Reviews before removing outliers: 25001
After removing outliers:  25000
[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    1   330   578    34     3   162   748  2731   

## Training, validation, test and Dataloaders 

In [7]:
split_frac = 0.8
encoded_labels = np.array(encoded_labels)

## split data into training, validation, and test data (features and labels, x and y)
# First hold out (1 - split_frac) of the samples, then cut that hold-out set
# in half: one half for validation, the other half for testing.
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=1-split_frac, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


## print out the shapes of your resultant feature data
print('\t\t\tFeature Shapes')
print('Train set:\t\t{}'.format(X_train.shape),
     '\nValidation set:\t\t{}'.format(X_val.shape),
     '\nTest set:\t\t{}\n'.format(X_test.shape))

## Dataloaders in pytorch for Batches instead of using generators

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 50

# Shuffle Data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# Obtain one batch of training data.
# Use the built-in next(): DataLoader iterators no longer expose a `.next()`
# method in current PyTorch releases, while next(...) works on every version.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

			Feature Shapes
Train set:		(20000, 200) 
Validation set:		(2500, 200) 
Test set:		(2500, 200)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,   10,   14, 1081],
        [   0,    0,    0,  ...,   32,    8,  286],
        [   0,    0,    0,  ...,  720,   85,  207],
        ...,
        [   0,    0,    0,  ...,    3,  223,  342],
        [   0,    0,    0,  ...,    5,    1,  112],
        [   0,    0,    0,  ...,  121,    4,  259]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
        0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1])


## 2. Defining Model Architecture

In [8]:
# Use the GPU whenever CUDA is available, and report which device was picked
train_on_gpu = torch.cuda.is_available()

print('Training on GPU' if train_on_gpu else 'Training on CPU')

Training on GPU


In [0]:
class SentimentRNN(nn.Module):
  """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

  `forward` returns one sigmoid value per sequence of the batch, taken from
  the last time step, together with the updated LSTM state.
  """

  def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
    """Build the layers.

    Args:
      vocab_size: number of rows of the embedding table.
      output_size: number of output units of the final linear layer.
      embedding_dim: size of each embedding vector (the LSTM input size).
      hidden_dim: size of the LSTM hidden state.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout probability applied between the LSTM layers.
    """
    super().__init__()
    self.output_size = output_size
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = drop_prob, batch_first = True)
    # dropout and Linear
    self.dropout = nn.Dropout(0.3)
    self.fc = nn.Linear(hidden_dim, output_size)
    self.sigmoid = nn.Sigmoid()
    
  def forward(self, x, hidden):
    """Run a batch of token-id sequences through the network.

    Args:
      x: integer tensor of token ids, batch first (batch_size, seq_length).
      hidden: `(h, c)` LSTM state, e.g. as returned by `init_hidden`.

    Returns:
      `(sig_out, hidden)`: `sig_out` has shape `(batch_size,)`; `hidden` is
      the LSTM state after processing the batch.
    """
    batch_size = x.size(0)
    
    # Embeddings and LSTM output
    x = x.long()
    embds = self.embedding(x)
    r_out, hidden = self.lstm(embds, hidden)
    
    # Flatten (batch, time) so the linear layer is applied to every time step
    r_out = r_out.contiguous().view(-1, self.hidden_dim)
    out = self.fc(self.dropout(r_out))
    
    # Sigmoid output
    sig_out = self.sigmoid(out)
    
    # Back to one row per sequence, then keep only the final value of each
    # row (for output_size == 1 this is the prediction at the last time step).
    sig_out = sig_out.view(batch_size, -1)
    sig_out = sig_out[:,-1]
    
    return sig_out, hidden
  
  def init_hidden(self, batch_size):
    ''' Initializes hidden state

    Returns a tuple `(h, c)` of zero tensors of shape
    `(n_layers, batch_size, hidden_dim)` for the LSTM hidden and cell state.
    '''
    # Allocate on the same device and with the same dtype as the model's own
    # parameters, so the state always matches the model wherever it lives
    # (no dependency on a notebook-level GPU flag).
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.hidden_dim)
    hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

    return hidden

    
    
    
    

## Instantiate network

In [10]:
# Hyperparameters of the network
vocab_size = len(vocab_to_int) + 1  # one extra embedding row on top of the vocabulary
output_size = 1
embedding_dim = 100
hidden_dim = 128
n_layers = 2

net = SentimentRNN(
  vocab_size=vocab_size,
  output_size=output_size,
  embedding_dim=embedding_dim,
  hidden_dim=hidden_dim,
  n_layers=n_layers,
)

print(net)

SentimentRNN(
  (embedding): Embedding(74073, 100)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


## 3. Train Network

In [0]:
# Binary cross-entropy on the sigmoid output, optimized with Adam
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)


In [12]:
%%time
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.650502... Val Loss: 0.707528
Epoch: 1/4... Step: 200... Loss: 0.657278... Val Loss: 0.617991
Epoch: 1/4... Step: 300... Loss: 0.560070... Val Loss: 0.618151
Epoch: 1/4... Step: 400... Loss: 0.648672... Val Loss: 0.720080
Epoch: 2/4... Step: 500... Loss: 0.522536... Val Loss: 0.591327
Epoch: 2/4... Step: 600... Loss: 0.474078... Val Loss: 0.556692
Epoch: 2/4... Step: 700... Loss: 0.468428... Val Loss: 0.518247
Epoch: 2/4... Step: 800... Loss: 0.483454... Val Loss: 0.588294
Epoch: 3/4... Step: 900... Loss: 0.474491... Val Loss: 0.520276
Epoch: 3/4... Step: 1000... Loss: 0.452966... Val Loss: 0.542110
Epoch: 3/4... Step: 1100... Loss: 0.399037... Val Loss: 0.508460
Epoch: 3/4... Step: 1200... Loss: 0.411399... Val Loss: 0.468619
Epoch: 4/4... Step: 1300... Loss: 0.337804... Val Loss: 0.463071
Epoch: 4/4... Step: 1400... Loss: 0.462908... Val Loss: 0.579786
Epoch: 4/4... Step: 1500... Loss: 0.397692... Val Loss: 0.454176
Epoch: 4/4... Step: 1600... Loss: 

## 4. Testing

In [13]:

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# Evaluation only: no gradients are needed, so no autograd graph is built and
# the hidden state does not have to be detached between batches.
with torch.no_grad():
  # iterate over test data
  # NOTE: the batch labels are called `targets` on purpose, so that the
  # notebook-level `labels` (raw label text) is not overwritten by a tensor.
  for inputs, targets in test_loader:

    if(train_on_gpu):
        inputs, targets = inputs.cuda(), targets.cuda()

    # get predicted outputs
    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), targets.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label and count the matches; .item()
    # returns a Python number whether the tensor lives on CPU or GPU
    correct_tensor = pred.eq(targets.float().view_as(pred))
    num_correct += correct_tensor.sum().item()


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.450
Test accuracy: 0.814


In [0]:
# Negative example review; it deliberately ends with some made-up words
test_review_neg = (
  'The worst movie I have seen; acting was terrible and I want my money back. '
  'This movie had bad acting and the dialogue was slow. '
  'asdfasfdsadfasdf asdf hey'
)


Test String:
 The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow. asdfasfdsadfasdf asdf hey


In [0]:
def predict(net, review, sequence_length = 200):
  """Print the sentiment that `net` predicts for a single review.

  Args:
    net: trained model exposing `init_hidden(batch_size)` and
      `net(features, hidden) -> (output, hidden)`.
    review: raw review text; it is always scored as ONE review, even if it
      contains newline characters.
    sequence_length: length the encoded review is padded/truncated to.

  Raises:
    ValueError: if no word of `review` is part of the vocabulary, because
      there is then nothing to feed to the network.
  """
  # get_encoded() produces one id list per line of text; merge them so a
  # multi-line review still results in exactly one input row.
  tokens = [token for line in get_encoded(review) for token in line]
  if not tokens:
    raise ValueError('review contains no words from the training vocabulary')

  # Convert review to appropriate format (honouring `sequence_length`).
  features = pad_features([tokens], seq_length=sequence_length)
  features_tensor = torch.from_numpy(features)

  # Set the model to eval mode
  net.eval()
  batch_size = features_tensor.size(0)

  # Initialize hidden state
  h = net.init_hidden(batch_size)

  if train_on_gpu:
    features_tensor = features_tensor.cuda()

  # Inference only: no gradients required
  with torch.no_grad():
    output, h = net(features_tensor, h)

  # Get predictions
  pred = torch.round(output.squeeze())

  # Print Prediction
  print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

  if pred.item() == 1:
    print('POSITIVE')
  else:
    print('NEGATIVE')
  
  
  
  

In [30]:
# Score the negative example review
predict(net=net, review=test_review_neg)

Prediction value, pre-rounding: 0.009870
NEGATIVE


In [31]:
# Positive example review
test_review_pos = (
  'This movie had the best acting and the dialogue was so good. '
  'I loved it.'
)
predict(net=net, review=test_review_pos)

Prediction value, pre-rounding: 0.943422
POSITIVE
