# Sentiment Analysis Using RNN

This model classifies movie reviews into positive and negative.

In [1]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [2]:
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r') as f:
    labels = f.read()

In [3]:
# This function will recieve the imported reviews (ch by ch) and return 
def clean_text(text):
    ''' This Function recieves reviews (ch by ch) and returns a list of
    reviews without punctuation'''
    # remove punctuation
    s = ''.join(ch.lower() for ch in text if ch not in punctuation)
    
    # separate each review and add to a list so that I have a list of reviews
    separated_reviews = []

    for review in s.split('\n'):
        review = ''.join(review)
        separated_reviews.append(review)
        
    return separated_reviews

In [4]:
reviews = clean_text(reviews)

In [5]:
labels = clean_text(labels)

In [6]:
class ReviewEncoder:

    def __init__(self):
        self.__words_dict = {}
        self.__indexer = 1
        
    def word_dict(self):
        '''This method returns the created dictionary'''
        return self.__words_dict
    
    def encode(self, text):
        '''Encodes the reviews'''
        encoded_review = []
        words = text.split()
        #print(words)
        for word in words:
            if word in self. __words_dict:
                encoded_review.append(self.__words_dict[word])
            else:
                self.__words_dict[word] = self.__indexer
                self.__indexer += 1
                encoded_review.append(self.__words_dict[word])
        return encoded_review
    
    def len_dict(self):
        '''Returns the length of the review'''
        return len(self.__words_dict)

In [7]:
encoder = ReviewEncoder()

In [8]:
# encoding the reviews
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(encoder.encode(review))

In [9]:
# changing positive to be 1 and negative zero
encoded_labels = []
for label in labels:
    if label == 'positive':
        encoded_labels.append(1)
    else:
        encoded_labels.append(0)

In [10]:
vocab_size = encoder.len_dict()

In [11]:
def drop_empty_reviews(list_of_reviews):
    '''This function checks for empty reviews, returns the index
    and drops them'''
    full_reviews = []
    index_to_remove = []
    
    for index, review in enumerate(list_of_reviews):
        if len(review) != 0:
            full_reviews.append(review)
        else:
            index_to_remove.append(index)
    print('Indexes to remove from the labels: ', index_to_remove)
    
    return full_reviews

In [12]:
reviews = drop_empty_reviews(encoded_reviews)

Indexes to remove from the labels:  [25000]


In [13]:
# remove label with index 25000
del encoded_labels[25000]

In [14]:
def padding_truncation(encoded_review_list, seq_length):
    '''This function addes zeroes to the left of a review if
    it is shorter than the seq_length and truncates reviews
    longer than the seq_length
    This step is important because the dataloader expects all
    of the reviews to be of the same size'''
    
    padded_review = []
    for review in encoded_review_list:
        if len(review) < seq_length:
            padding = seq_length - len(review)
            review = ([0]*padding + review)
            padded_review.append(review)
        elif len(review) > seq_length:
            review = review[:seq_length]
            padded_review.append(review)
        else:
            padded_review.append(review)
            
    return padded_review

In [15]:
padded_reviews = padding_truncation(reviews, 200)

In [17]:
# changing the type of the padded reviews from list to an array of type int
padded_reviews = np.asarray(padded_reviews, dtype=int)
encoded_labels = np.asarray(encoded_labels, dtype=int)

## Defining the training, validation and testing sets

In [22]:
training = int(len(padded_reviews) * 0.8)
validation = int(training + len(padded_reviews)*0.1)

train_x = padded_reviews[:training]
train_y = np.array(encoded_labels[:training])

val_x = padded_reviews[training:validation]
val_y = np.array(encoded_labels[training:validation])

test_x = padded_reviews[validation:]
test_y = np.array(encoded_labels[validation:])

In [24]:
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

In [25]:
train_loader = DataLoader(dataset=train_dataset, batch_size=50, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=50, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=50, shuffle=True)

### LSTM Layer

Put the embedding output into the lstm layer
- parameters: input_size, hidden_size, num_layers, batch_first
    - num of recurrent layers. Seting this to 2 stacks two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs from the first LSTM and computing the final results.
    - batch_first: if true then the input and output tensors are provided as (batch, seq, feature)


### Initializing the hidden state

Zero initial hiddenstate is standard and this is the default if we dont pass in a hidden state

initializing the hidden state to zeroes

The hidden and cell state reset to zero for every epoch so you don't need to initialize them unless you are initializing them to something other than zero.

Since I have n_layers equal to 2, the output is a packed sequence. So I need to unpack

The next step is to pass the vector into the fully connected layer. The fc layer expects 1D vectors.
In order to do that I need to flatten the vector -- the resulting shape is going to be (1, rowsxcols)
so in this case is going to be (1, 50*200)


If there is any situation that you don't know how many rows you want but are sure of the number of columns, then you can specify this with a -1. (Note that you can extend this to tensors with more dimensions. Only one of the axis value can be -1). This is a way of telling the library: "give me a tensor that has these many columns and you compute the appropriate number of rows that is necessary to make this happen".

https://stackoverflow.com/questions/42479902/how-does-the-view-method-work-in-pytorch

The view method returns a tensor with the same data as the self tensor (which means that the returned tensor has the same number of elements), but with a different shape. 

you have 10,000 elements each element is represented by 256
The 256 are going to go to the linear and are going to make an output of 1

I'm aware the LSTM cell uses both sigmoid and tanh activation functions internally, however when creating a stacked LSTM architecture does it make sense to pass their outputs through an activation function (e.g. ReLU)?

https://stats.stackexchange.com/questions/444923/activation-function-between-lstm-layers

Given that ReLUs can have quite large outputs, they have traditionally been regarded as inappropriate for use with LSTMs.

a probability of dropout around 0.5 for hidden units and 0.2 for inputs worked well for a variety of tasks.

The core concept of Srivastava el al. (2014) is that “each hidden unit in a neural network trained with dropout must learn to work with a randomly chosen sample of other units. This should make each hidden unit more robust and drive it towards creating useful features on its own without relying on other hidden units to correct its mistakes.”.

embedding = nn.Embedding(vocab_size, 400)
embedding_output = embedding(x)

lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)
lstm_output, hidden = lstm(embedding_output)
lstm_output = lstm_output.contiguous().view(-1, 256)

Dropout = nn.Dropout(0.2)
lstm_output_dropout = Dropout(lstm_output)

fc = nn.Linear(256,1)
fc_output = fc(lstm_output_dropout)

sigmoid = nn.Sigmoid()
sigmoid_output = sigmoid(fc_output)

In [26]:
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU')
else:
    print('GPU not available')

Training on GPU


In [27]:
class RNN(nn.Module):
    
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, drop_prob, out_features):
        
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.out_features = out_features
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout = drop_prob)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, out_features)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        
        batch_size = x.size(0)
        
        embedding_out = self.embedding(x)
        lstm_out, hidden = self.lstm(embedding_out, hidden)

        lstm_out = lstm_out.contiguous().view(-1, 256)

        lstm_out_dropout = self.dropout(lstm_out)
        
        fc_out = self.fc(lstm_out_dropout)

        sig_out = self.sigmoid(fc_out)
    
        
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:,-1]
        
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda())
        else:
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                      weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        
        return hidden
        

In [28]:
num_embeddings = 74072 + 1
batch_size = 50
embedding_dim = 400
hidden_size = 256
num_layers = 2
drop_prob = 0.5
out_features = 1

In [29]:
model = RNN(num_embeddings = num_embeddings, embedding_dim = embedding_dim, hidden_size = hidden_size, 
            num_layers = num_layers, drop_prob = drop_prob, out_features = out_features)

In [30]:
model

RNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [31]:
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

epochs = 4
counter = 0
print_every = 100
clip=5

In [32]:
model.cuda()
model.train()

for e in range(epochs):
    
    h = model.init_hidden(batch_size)
    for x, y in train_loader:
        counter += 1
        
        x, y = x.cuda(), y.cuda()
        x = x.long()
        
        h = tuple([each.data for each in h])
        model.zero_grad()
        
        output, hidden = model(x, h)

        loss = criterion(output.squeeze(), y.float())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
        if counter % print_every == 0:
            
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for x, y in valid_loader:
                x, y = x.cuda(), y.cuda()
                x = x.long()
                
                val_h = tuple([each.data for each in h])
                
                output, val_h = model(x, val_h)
                val_loss = criterion(output.squeeze(), y.float())
                
                val_losses.append(val_loss.item())
            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.693491... Val Loss: 0.686222
Epoch: 1/4... Step: 200... Loss: 0.674710... Val Loss: 0.642303
Epoch: 1/4... Step: 300... Loss: 0.545069... Val Loss: 0.615143
Epoch: 1/4... Step: 400... Loss: 0.681781... Val Loss: 0.538172
Epoch: 2/4... Step: 500... Loss: 0.459832... Val Loss: 0.634001
Epoch: 2/4... Step: 600... Loss: 0.493618... Val Loss: 0.493223
Epoch: 2/4... Step: 700... Loss: 0.571743... Val Loss: 0.452888
Epoch: 2/4... Step: 800... Loss: 0.416146... Val Loss: 0.454839
Epoch: 3/4... Step: 900... Loss: 0.227714... Val Loss: 0.480114
Epoch: 3/4... Step: 1000... Loss: 0.300825... Val Loss: 0.453474
Epoch: 3/4... Step: 1100... Loss: 0.307104... Val Loss: 0.439663
Epoch: 3/4... Step: 1200... Loss: 0.297350... Val Loss: 0.423074
Epoch: 4/4... Step: 1300... Loss: 0.438367... Val Loss: 0.456411
Epoch: 4/4... Step: 1400... Loss: 0.220592... Val Loss: 0.421826
Epoch: 4/4... Step: 1500... Loss: 0.258635... Val Loss: 0.435854
Epoch: 4/4... Step: 1600... Loss: 

In [33]:
# testing

test_losses = []
num_correct = 0

model.eval()

h = model.init_hidden(batch_size)

for x, y in test_loader:
    x, y = x.cuda(), y.cuda()
    x = x.long()
    
    h = tuple([each.data for each in h])
    
    output, h = model(x, h)
    test_loss = criterion(output.squeeze(), y.float())
    test_losses.append(test_loss.item())
    
    pred = torch.round(output.squeeze())
    
    correct_tensor = pred.eq(y.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    
    
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.519
Test accuracy: 0.782


In [34]:
def predict(net, test_review, sequence_length=200):
    
    net.eval()
    
    # tokenize review
    test_ints = encoder.encode(test_review)
    #test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    #features = padding_truncation(test_ints)
    #features = pad_features(test_ints, seq_length)
    features = []
    if len(test_ints) < 200:
        padding = 200 - len(test_ints)
        features = ([0]*padding + test_ints)
    
    # convert to tensor to pass into your model
    features = np.array(features)
    feature_tensor = torch.from_numpy(features)
    feature_tensor = feature_tensor.unsqueeze(0)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = net(feature_tensor.long(), h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")

In [35]:
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'
test_review_neg = 'This movie had the wrost acting and the dialogue was bad.'


# call function
seq_length=200 # good to use the length that was trained on

predict(model, test_review_neg, seq_length)
predict(model, test_review_pos, seq_length)

Prediction value, pre-rounding: 0.051283
Negative review detected.
Prediction value, pre-rounding: 0.962218
Positive review detected!
