# Sentiment Analysis Using RNN

In [1]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [2]:
# Read the whole review corpus and the whole label file into two strings.
# NOTE(review): presumably the two files are line-aligned (one review and one
# label per line) -- confirm against the data set.
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r') as f:
    labels = f.read()

In [3]:
# Receives the imported text (processed character by character) and returns a list of lower-cased lines without punctuation
def clean_text(text):
    """Lower-case ``text``, strip punctuation and split it into lines.

    Args:
        text: Raw text holding one review (or one label) per line.

    Returns:
        A list with one cleaned string per newline-separated line of
        ``text``. If ``text`` ends with a newline, the last element of the
        list is an empty string.
    """
    # str.translate removes every punctuation character in a single pass,
    # replacing the per-character membership test in a Python-level loop.
    without_punctuation = text.translate(str.maketrans('', '', punctuation))
    return without_punctuation.lower().split('\n')

In [4]:
# Replace the raw review text by a list of cleaned reviews (one string per line)
reviews = clean_text(reviews)

In [5]:
# Same cleaning for the labels, which turns them into a list of label strings
labels = clean_text(labels)

In [6]:
class ReviewEncoder:
    """Incrementally map words to integer ids.

    Ids are handed out in order of first appearance, starting at 1, so the
    value 0 is never assigned to a word and stays free (e.g. for padding).
    The vocabulary is shared across all calls to ``encode``.
    """

    def __init__(self):
        self.__words_dict = {}  # word -> id
        self.__indexer = 1      # next id to hand out

    def word_dict(self):
        """Return the word -> id dictionary built so far."""
        return self.__words_dict

    def encode(self, text):
        """Encode ``text`` as a list of word ids.

        The text is split on whitespace; a word seen for the first time is
        added to the vocabulary with the next free id.

        Args:
            text: The review to encode.

        Returns:
            A list with one integer id per word of ``text`` (an empty list
            for empty or whitespace-only text).
        """
        encoded_review = []
        for word in text.split():
            if word not in self.__words_dict:
                self.__words_dict[word] = self.__indexer
                self.__indexer += 1
            encoded_review.append(self.__words_dict[word])
        return encoded_review

    def len_dict(self):
        """Return the number of distinct words encoded so far."""
        return len(self.__words_dict)

In [7]:
encoder = ReviewEncoder()

In [8]:
# encoding the reviews
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(encoder.encode(review))

In [9]:
# 'positive' becomes 1, every other label value becomes 0
encoded_labels = [1 if label == 'positive' else 0 for label in labels]

In [10]:
# Number of distinct words the encoder has assigned an id to
vocab_size = encoder.len_dict()

In [11]:
def drop_empty_reviews(list_of_reviews):
    '''Filter out zero-length reviews.

    The positions of the dropped reviews are printed so the matching
    labels can be removed by the caller; only the non-empty reviews are
    returned, in their original order.'''
    kept = []
    empty_positions = []

    for position, item in enumerate(list_of_reviews):
        if len(item) == 0:
            empty_positions.append(position)
            continue
        kept.append(item)

    print('Indexes to remove from the labels: ', empty_positions)
    return kept

In [12]:
# Keep only the non-empty encoded reviews; the positions of the dropped ones are printed.
# From here on `reviews` holds lists of word ids instead of strings.
reviews = drop_empty_reviews(encoded_reviews)

Indexes to remove from the labels:  [25000]


In [13]:
# Keep only the labels whose review is non-empty. Filtering by position
# against encoded_reviews avoids the hard-coded index 25000, so the cell
# keeps working if the data changes, and it can be re-run safely (a second
# `del encoded_labels[25000]` would raise IndexError).
encoded_labels = [label
                  for review, label in zip(encoded_reviews, encoded_labels)
                  if len(review) != 0]

In [14]:
def padding_truncation(encoded_review_list, seq_length):
    '''Bring every encoded review to exactly seq_length entries.

    Reviews shorter than seq_length get zeroes added on the left, longer
    ones keep only their first seq_length entries, and reviews that
    already have the right length are passed through untouched.'''

    def _fit(tokens):
        missing = seq_length - len(tokens)
        if missing > 0:
            return [0] * missing + tokens
        if missing < 0:
            return tokens[:seq_length]
        return tokens

    return [_fit(tokens) for tokens in encoded_review_list]

In [15]:
# Left-pad with zeroes / truncate every encoded review to exactly 200 ids
padded_reviews = padding_truncation(reviews, 200)

In [17]:
# changing the type of the padded reviews from list to an array of type int
padded_reviews = np.asarray(padded_reviews, dtype=int)

In [19]:
# changing labels from list to array of type int
encoded_labels = np.asarray(encoded_labels, dtype=int)

In [None]:
test = np.array(encoded_reviews)

In [None]:
test = drop_empty_reviews(test)

In [None]:
len(test)

In [None]:
features = pad_features(test, 200)

In [None]:
padded_reviews = features

In [None]:
# Defining training, validation and testing sets

# 80% training, the next 10% validation, the remainder testing
n_reviews = len(padded_reviews)
training = int(n_reviews * 0.8)
validation = int(training + n_reviews*0.1)

# feature slices
train_x, val_x, test_x = (padded_reviews[:training],
                          padded_reviews[training:validation],
                          padded_reviews[validation:])

# label slices, each copied into its own array
train_y, val_y, test_y = (np.array(encoded_labels[:training]),
                          np.array(encoded_labels[training:validation]),
                          np.array(encoded_labels[validation:]))

In [None]:
# Number of labels; should match the number of feature rows checked in the next cell
len(encoded_labels)

In [None]:
len(features)

# NOTE(review): `dataset` is not defined anywhere in the visible notebook, so
# this call would raise NameError if executed. It looks like a pasted
# reference of DataLoader's keyword arguments rather than runnable code --
# confirm.
train_loader = DataLoader(dataset,
    batch_size=1,
    shuffle=False,
    sampler=None,
    batch_sampler=None,
    num_workers=0,
    collate_fn=None,
    pin_memory=False,
    drop_last=False,
    timeout=0,
    worker_init_fn=None,
    multiprocessing_context=None)

In [None]:
train_x.shape

In [None]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

# first cut: the leading `split_frac` share becomes the training set
split_idx = int(len(features)*split_frac)
train_x = features[:split_idx]
remaining_x = features[split_idx:]
train_y = np.array(encoded_labels[:split_idx])
remaining_y = np.array(encoded_labels[split_idx:])

# second cut: the remainder is halved into validation and test
test_idx = int(len(remaining_x)*0.5)
val_x = remaining_x[:test_idx]
test_x = remaining_x[test_idx:]
val_y = remaining_y[:test_idx]
test_y = remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
shape_report = ["Train set: \t\t{}".format(train_x.shape),
                "\nValidation set: \t{}".format(val_x.shape),
                "\nTest set: \t\t{}".format(test_x.shape)]
print(*shape_report)

In [None]:
# Wrap each (features, labels) pair of NumPy arrays as a TensorDataset
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(torch.from_numpy(split_x), torch.from_numpy(split_y))
    for split_x, split_y in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

In [None]:
# All three loaders share the same settings: shuffled mini-batches of 50
train_loader, valid_loader, test_loader = (
    DataLoader(dataset=split, batch_size=50, shuffle=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
### Testing

# Loader that yields one training sample at a time; used below to inspect
# tensor shapes step by step
testing = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)

In [None]:
# One (features, label) batch taken from the loader
dataiter = next(iter(testing))

In [None]:
# NOTE(review): this walks the whole loader and only the last batch is left
# in x / y afterwards; `dataiter` above already holds one batch -- confirm
# the full pass is intended.
for x, y in testing:
    x = x.long()

In [None]:
type(x)

In [None]:
x.shape

In [None]:
y.shape

In [None]:
y.squeeze()

In [None]:
y

In [None]:
# nn.Embedding(num_embeddings, embedding_dim)
# num_embeddings: number of rows in the lookup table; must be larger than
#                 the largest index that is looked up
# embedding_dim: size of the dense vector every word index is mapped to
# NOTE(review): hard-coded; presumably the value of encoder.len_dict() from
# an earlier run -- confirm it still matches the encoder.
vocab_size = 73919

# Word ids start at 1 and 0 is the padding value, so the ids span
# 0..vocab_size and the table needs vocab_size + 1 rows.
embedding = nn.Embedding(vocab_size + 1, 400)

In [None]:
# Look up the embedding vector of every token id in x
embedding_output = embedding(x)

In [None]:
# shape(batch, seq, feature)
embedding_output.shape

### LSTM Layer

Put the embedding output into the lstm layer
- parameters: input_size, hidden_size, num_layers, batch_first
    - num_layers: number of recurrent layers. Setting this to 2 stacks two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs from the first LSTM and computing the final results.
    - batch_first: if true then the input and output tensors are provided as (batch, seq, feature)


### Initializing the hidden state

A zero initial hidden state is standard, and it is the default if we don't pass in a hidden state.

In [None]:
# Two stacked LSTM layers: 400-dim inputs (the embedding size), 256-dim
# hidden state, tensors laid out as (batch, seq, feature)
lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)

initializing the hidden state to zeroes

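When no hidden state is passed in, the LSTM starts every forward call from zero hidden and cell states, so you don't need to initialize them yourself unless you are initializing them to something other than zero (or want to carry the state over from one batch to the next).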

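Even with n_layers equal to 2, the output is a regular tensor of shape (batch, seq, hidden_size) holding the top layer's output for every time step; it is only a packed sequence when the input is one. So there is nothing to unpack, but I still need to flatten the output before the fully connected layer.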

In [None]:
lstm_output, hidden = lstm(embedding_output)

In [None]:
lstm_output.shape

In [None]:
test_hidden = tuple([each.data for each in hidden])

In [None]:
test_hidden.shape

In [None]:
len(test_hidden)

In [None]:
lstm_output.size(0)

The next step is to pass the LSTM output into the fully connected layer. The fc layer acts on the last dimension of its input.
In order to do that I flatten the (batch, seq, hidden) output into a 2-D tensor with one row per time step -- the resulting shape is going to be (batch*seq, hidden)
so in this case it is going to be (50*200, 256)


If there is any situation that you don't know how many rows you want but are sure of the number of columns, then you can specify this with a -1. (Note that you can extend this to tensors with more dimensions. Only one of the axis value can be -1). This is a way of telling the library: "give me a tensor that has these many columns and you compute the appropriate number of rows that is necessary to make this happen".

https://stackoverflow.com/questions/42479902/how-does-the-view-method-work-in-pytorch

The view method returns a tensor with the same data as the self tensor (which means that the returned tensor has the same number of elements), but with a different shape. 

You have 10,000 elements (50 reviews x 200 time steps), and each element is represented by a vector of 256 values.
Each vector of 256 values goes into the linear layer, which produces a single output for it.

In [None]:
# flatten to 2-D: every row is one 256-dim LSTM output vector
lstm_output = lstm_output.contiguous().view(-1, 256)

In [None]:
lstm_output.shape

I'm aware the LSTM cell uses both sigmoid and tanh activation functions internally, however when creating a stacked LSTM architecture does it make sense to pass their outputs through an activation function (e.g. ReLU)?

https://stats.stackexchange.com/questions/444923/activation-function-between-lstm-layers

Given that ReLUs can have quite large outputs, they have traditionally been regarded as inappropriate for use with LSTMs.

A dropout probability of around 0.5 for hidden units and 0.2 for inputs worked well for a variety of tasks.

The core concept of Srivastava et al. (2014) is that “each hidden unit in a neural network trained with dropout must learn to work with a randomly chosen sample of other units. This should make each hidden unit more robust and drive it towards creating useful features on its own without relying on other hidden units to correct its mistakes.”

In [None]:
lstm_output

In [None]:
# Dropout layer with drop probability 0.2
Dropout = nn.Dropout(0.2)

In [None]:
lstm_output_dropout = Dropout(lstm_output)

In [None]:
lstm_output_dropout.shape

In [None]:
lstm_output_dropout

In [None]:
# Fully connected layer: 256 input features -> 1 output value per row
fc = nn.Linear(256,1)

In [None]:
fc_output = fc(lstm_output_dropout)

In [None]:
fc_output.shape

In [None]:
# apply a sigmoid function to transform the output into a probability value
sigmoid = nn.Sigmoid()

In [None]:
sigmoid_output = sigmoid(fc_output)

In [None]:
sigmoid_output[:4]

In [None]:
sigmoid_output.shape

In [None]:
# Reshape into a single row holding all values
sigmoid_out = sigmoid_output.view(1,-1)

In [None]:
sigmoid_out.shape

In [None]:
sigmoid_output[:4]

In [None]:
sigmoid_output.shape

In [None]:
# NOTE(review): this indexes sigmoid_output, not the reshaped sigmoid_out.
# fc produces one value per row, so [:, -1] selects that single column for
# every row; sigmoid_out[:, -1] (the last value only) may have been
# intended -- confirm.
sigmoid_output[:,-1]

In [None]:
y.squeeze()

# Recap of the forward pass explored step by step in the cells above.
# The embedding table has vocab_size + 1 rows: word ids start at 1 and 0 is
# the padding value, so the largest id that can be looked up is vocab_size.
embedding = nn.Embedding(vocab_size + 1, 400)
embedding_output = embedding(x)

# two stacked LSTM layers, then flatten to one 256-dim row per output vector
lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)
lstm_output, hidden = lstm(embedding_output)
lstm_output = lstm_output.contiguous().view(-1, 256)

Dropout = nn.Dropout(0.2)
lstm_output_dropout = Dropout(lstm_output)

# one value per row, squashed into (0, 1) by the sigmoid
fc = nn.Linear(256,1)
fc_output = fc(lstm_output_dropout)

sigmoid = nn.Sigmoid()
sigmoid_output = sigmoid(fc_output)

In [None]:
# num_embeddings = vocab_size
# embedding_dim = embedding_output = 400
class RNN(nn.Module):
    """LSTM-based sentiment classifier.

    Pipeline: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.
    ``forward`` returns, for every sequence in the batch, the sigmoid
    output of the last time step.

    Args:
        num_embeddings: Number of rows of the embedding table; must be
            larger than the largest token id fed to the model.
        embedding_dim: Size of each embedding vector.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        batch_first: If True, inputs are laid out as (batch, seq),
            otherwise as (seq, batch).
        dropout: Dropout probability used inside the stacked LSTM.
        output_features: Output size of the final linear layer.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, batch_first, dropout=0.5, output_features=1):
        super(RNN, self).__init__()

        # Size of the embedding table, taken from the constructor argument
        # so the class does not depend on a module-level variable.
        self.vocab_size = num_embeddings
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.lstm_dropout = dropout
        self.output_features = output_features

        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        # dropout and batch_first are the constructor arguments, not
        # fixed values, so callers actually control them.
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            dropout=dropout, batch_first=batch_first)

        # dropout applied to the LSTM output before the linear layer
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_features)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Run a forward pass.

        Args:
            x: Tensor of token ids, (batch, seq) when ``batch_first`` is
                True, otherwise (seq, batch).
            hidden: Optional ``(h_0, c_0)`` tuple, e.g. from
                ``init_hidden``. With None the LSTM starts from zeros.

        Returns:
            A tuple ``(output, hidden)``: ``output`` holds one value per
            sequence in the batch (the last sigmoid output of the last
            time step) and ``hidden`` is the LSTM's final ``(h_n, c_n)``.
        """
        x = x.long()
        embedding_output = self.embedding(x)

        # The incoming state is handed to the LSTM so that a state passed
        # by the caller is actually used.
        lstm_output, hidden = self.lstm(embedding_output, hidden)
        if not self.batch_first:
            # bring the output to (batch, seq, hidden) so the reshapes
            # below are valid for both layouts
            lstm_output = lstm_output.transpose(0, 1)
        batch_size = lstm_output.size(0)

        # one row per (sequence, time step)
        lstm_output = lstm_output.contiguous().view(-1, self.hidden_size)
        lstm_output_dropout = self.dropout(lstm_output)

        fc_output = self.fc(lstm_output_dropout)
        sigmoid_output = self.sigmoid(fc_output)

        # back to one row per sequence, then keep the last value of each row
        sigmoid_output = sigmoid_output.view(batch_size, -1)
        sigmoid_output = sigmoid_output[:, -1]

        return sigmoid_output, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` for a batch of ``batch_size``.

        Both tensors have shape (num_layers, batch_size, hidden_size) and
        are created with ``new_zeros`` from one of the model's parameters,
        so they share its dtype and device without consulting any global
        GPU flag.
        """
        weight = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [None]:
vocab_size = 73919
num_embeddings = vocab_size + 1
embedding_dim = 400
hidden_size = 256
num_layers = 2
batch_size = 50

In [None]:
model = RNN(num_embeddings=num_embeddings, embedding_dim=embedding_dim, 
            hidden_size = hidden_size, num_layers= num_layers, output_features=1, batch_first=True)

In [None]:
# defining loss and optimization
# Adam over all model parameters with learning rate 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Binary cross-entropy; it expects probabilities, which the model's final
# sigmoid provides
criterion= nn.BCELoss()

In [None]:
train_on_gpu = torch.cuda.is_available()

In [None]:
if train_on_gpu:
    print('training on GPU')
else:
    print('GPU is not available')

In [None]:
# passing the model to gpu
model.cuda()

In [None]:
epochs = 4          # number of passes over the training loader
print_every = 100   # validate and log every this many training batches
counter = 0         # training batches processed so far
clip = 5            # maximum gradient norm passed to clip_grad_norm_

In [None]:
model.train()

for e in range(epochs):

    # start every epoch from a zero hidden/cell state
    hidden = model.init_hidden(batch_size)

    for x, y in train_loader:

        counter += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()

        # Cut the carried state off from the previous batch's graph so
        # backprop stops at the batch boundary.
        hidden = tuple([each.data for each in hidden])

        model.zero_grad()
        output, hidden = model(x, hidden)

        loss = criterion(output.squeeze(), y.float().squeeze())
        loss.backward()

        # clip the gradient norm before the optimizer step
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:

            val_hidden = model.init_hidden(batch_size)
            val_losses = []
            model.eval()

            # No gradients are needed for validation. The validation pass
            # uses its own names so it cannot overwrite the training
            # `hidden`, `x`, `y` or `output`.
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:
                    if train_on_gpu:
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_output, val_hidden = model(val_inputs, val_hidden)
                    val_loss = criterion(val_output.squeeze(), val_targets.float().squeeze())

                    # store plain floats so np.mean works for CPU and GPU runs
                    val_losses.append(val_loss.item())

            model.train()

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.

    Layers: embedding -> stacked LSTM (batch-first) -> dropout -> linear ->
    sigmoid. For every review only the sigmoid output of the last time
    step is returned.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.

        Args:
            vocab_size: Number of rows of the embedding table; must be
                larger than the largest token id fed to the model.
            output_size: Output size of the final linear layer.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Number of features in the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            drop_prob: Dropout probability used inside the stacked LSTM.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.

        Args:
            x: Tensor of token ids with the batch as first dimension.
            hidden: ``(h_0, c_0)`` tuple, e.g. from ``init_hidden``.

        Returns:
            A tuple ``(sig_out, hidden)``: one value per review (the last
            sigmoid output of its last time step) and the LSTM's final
            hidden and cell state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # stack up lstm outputs: one row per (review, time step)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # keep the value of the last time step of each review

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM.
        # new_zeros creates them with the dtype and on the device of the
        # model's parameters, so no global GPU flag has to be consulted.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

        return hidden
        
    

In [None]:
# Instantiate the model w/ hyperparams
# The vocabulary size is read from the encoder instead of being hard-coded,
# so the embedding table always matches the ids that were actually assigned.
vocab_size = encoder.len_dict() + 1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

In [None]:
# Batch size used when creating the hidden state in the training loop
batch_size = 50
# Learning rate for Adam
lr=0.001

# Binary cross-entropy on the sigmoid output, optimized with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [None]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()

            # Validation needs no gradients: under no_grad no graph is
            # built, so the hidden state does not have to be detached
            # between batches. Separate names keep the training batch
            # (`inputs`, `labels`, `output`) untouched.
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader:

                    if(train_on_gpu):
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_labels.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))