# Sentiment Analysis Using RNN

In [1]:
import numpy as np
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [2]:
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r') as f:
    labels = f.read()

In [3]:
# This function will recieve the imported reviews (ch by ch) and return 
def clean_text(text):
    ''' This Function recieves reviews (ch by ch) and returns a list of
    reviews without punctuation and stopwords'''
    # remove punctuation
    s = ''.join(ch.lower() for ch in text if ch not in punctuation)
    
    # separate each review and add to a list so that I have a list of reviews
    separated_reviews = []

    for review in s.split('\n'):
        review = ''.join(review)
        separated_reviews.append(review)
        
        
    # remove stopwords and return a list of reviews
    clean_text = []
    for review in separated_reviews:
        review_no_stopwords = []
        for word in review.split():
            if word not in stopwords.words('english'):
                review_no_stopwords.append(word)
        clean_text.append(' '.join(review_no_stopwords))
        
    return clean_text

In [4]:
# map each word to a number 

In [5]:
reviews = clean_text(reviews)

In [6]:
reviews_copy = reviews

In [42]:
len(reviews_copy)

25001

In [45]:
len(reviews)

25001

In [46]:
class ReviewEncoder:
    def __init__(self):
        self.__words_dict = {}
        self.__indexer = 1
    def encode(self, text):
        encoded_review = []
        words = text.split()
        #print(words)
        for word in words:
            if word in self. __words_dict:
                encoded_review.append(self.__words_dict[word])
            else:
                self.__words_dict[word] = self.__indexer
                self.__indexer += 1
                encoded_review.append(self.__words_dict[word])
        return encoded_review
    
    def len_dict(self):
        return len(self.__words_dict)

In [47]:
encoder = ReviewEncoder()

In [50]:
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(encoder.encode(review))

In [51]:
len(encoded_reviews)

25001

In [52]:
vocab_size = encoder.len_dict()

In [53]:
encoded_labels = encoder.encode(labels)

In [54]:
# create a function to check for reviews with length zero and dropping them

def drop_empty_reviews(text):
    
    full_reviews = []
    
    for index, review in enumerate(text):
        if len(review) != 0:
            full_reviews.append(review)
            
    return full_reviews

In [55]:
reviews = drop_empty_reviews(encoded_reviews)

In [56]:
def padding_truncation(encoded_review_list):
    
    padded_review = []
    for review in encoded_review_list:
        if len(review) < 200:
            padding = 200 - len(review)
            review = ([0]*padding + review)
            padded_review.append(review)
        elif len(review) > 200:
            truncate = len(review) - 200
            review = review[truncate:]
            padded_review.append(review)
        else:
            padded_review.append(review)
            
    return padded_review

In [57]:
padded_reviews = padding_truncation(reviews)

In [58]:
len(padded_reviews)

25000

In [59]:
check = []
for review in padded_reviews:
    if len(review) < 200 or len(review) > 200:
        check.append(review)
print(len(check))

0


In [60]:
# Defining training, validation and testing sets

training = int(len(padded_reviews) * 0.8)
validation = int(training + len(padded_reviews)*0.1)

train_x = np.array(padded_reviews[:training])
train_y = np.array(encoded_labels[:training])

val_x = np.array(padded_reviews[training:validation])
val_y = np.array(encoded_labels[training:validation])

test_x = np.array(padded_reviews[validation:])
test_y = np.array(encoded_labels[validation:])

In [61]:
len(padded_reviews)

25000

train_loader = DataLoader(dataset,
    batch_size=1,
    shuffle=False,
    sampler=None,
    batch_sampler=None,
    num_workers=0,
    collate_fn=None,
    pin_memory=False,
    drop_last=False,
    timeout=0,
    worker_init_fn=None,
    multiprocessing_context=None)

In [62]:
train_x.shape

(20000, 200)

In [356]:
train_dataset = TensorDataset(torch.from_numpy(train_x).long(), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x).long(), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x).long(), torch.from_numpy(test_y))

In [357]:
train_loader = DataLoader(dataset=train_dataset, batch_size=50, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=50, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=50, shuffle=True)

In [424]:
### Testing

testing = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [425]:
dataiter = next(iter(testing))

In [426]:
for x, y in testing:
    x = x.long()

In [427]:
x

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,   601,
          1921,   595, 11799,   387,  6645,  1735,  7818,  3193,    17,    98,
          1275,  2224,  1454,  4142, 11800,    61,  2434,   413,   700,  2750,
           988,  1205,  1222,  1358,   382, 11801,   397,   595,   356,    47,
          1788, 11802,   148,   148,  7818,  3193,   339,   260,    57,   410,
          1704,  1753,  8963,   200,   428,  1467,  1911,  1601,   163, 10219,
         11803,  5251, 11792,  7837,  8019, 11792,  

In [428]:
type(x)

torch.Tensor

In [429]:
x.shape

torch.Size([2, 200])

In [161]:
# Embedding arguments(input, output)
# num_embeddings: size of the vocab
# embedding_dim: the size to which you want to embed. Reduce the input to
embedding = nn.Embedding(vocab_size, 400)

In [162]:
embedding_output = embedding(x)

In [163]:
# shape(batch, seq, feature)
embedding_output.shape

torch.Size([2, 200, 400])

### LSTM Layer

Put the embedding output into the lstm layer
- parameters: input_size, hidden_size, num_layers, batch_first
    - num of recurrent layers. Seting this to 2 stacks two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs from the first LSTM and computing the final results.
    - batch_first: if true then the input and output tensors are provided as (batch, seq, feature)


### Initializing the hidden state

Zero initial hiddenstate is standard and this is the default if we dont pass in a hidden state

In [164]:
lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)

initializing the hidden state to zeroes

The hidden and cell state reset to zero for every epoch so you don't need to initialize them unless you are initializing them to something other than zero.

Since I have n_layers equal to 2, the output is a packed sequence. So I need to unpack

In [165]:
lstm_output, hidden = lstm(embedding_output)

In [166]:
lstm_output.shape

torch.Size([2, 200, 256])

(tensor([[[-0.2082,  0.4071,  0.0266,  ..., -0.1071, -0.1440, -0.0391],
          [-0.3084, -0.1051,  0.0173,  ...,  0.1882, -0.1369, -0.0421]],
 
         [[-0.0206,  0.0158, -0.0287,  ...,  0.0335, -0.0180, -0.0492],
          [-0.0426,  0.0082,  0.0173,  ...,  0.0759, -0.0195, -0.0333]]],
        grad_fn=<StackBackward>),
 tensor([[[-0.3053,  0.6159,  0.0494,  ..., -0.3238, -0.3394, -0.0859],
          [-0.5425, -0.1814,  0.0623,  ...,  0.2383, -0.4649, -0.0806]],
 
         [[-0.0391,  0.0354, -0.0598,  ...,  0.0774, -0.0354, -0.0970],
          [-0.0794,  0.0153,  0.0434,  ...,  0.1414, -0.0426, -0.0596]]],
        grad_fn=<StackBackward>))

In [306]:
test_hidden = tuple([each.data for each in hidden])

In [307]:
test_hidden

(tensor([[[-0.2082,  0.4071,  0.0266,  ..., -0.1071, -0.1440, -0.0391],
          [-0.3084, -0.1051,  0.0173,  ...,  0.1882, -0.1369, -0.0421]],
 
         [[-0.0206,  0.0158, -0.0287,  ...,  0.0335, -0.0180, -0.0492],
          [-0.0426,  0.0082,  0.0173,  ...,  0.0759, -0.0195, -0.0333]]]),
 tensor([[[-0.3053,  0.6159,  0.0494,  ..., -0.3238, -0.3394, -0.0859],
          [-0.5425, -0.1814,  0.0623,  ...,  0.2383, -0.4649, -0.0806]],
 
         [[-0.0391,  0.0354, -0.0598,  ...,  0.0774, -0.0354, -0.0970],
          [-0.0794,  0.0153,  0.0434,  ...,  0.1414, -0.0426, -0.0596]]]))

In [310]:
len(test_hidden)

2

In [167]:
lstm_output.size(0)

2

The next step is to pass the vector into the fully connected layer. The fc layer expects 1D vectors.
In order to do that I need to flatten the vector -- the resulting shape is going to be (1, rowsxcols)
so in this case is going to be (1, 50*200)


If there is any situation that you don't know how many rows you want but are sure of the number of columns, then you can specify this with a -1. (Note that you can extend this to tensors with more dimensions. Only one of the axis value can be -1). This is a way of telling the library: "give me a tensor that has these many columns and you compute the appropriate number of rows that is necessary to make this happen".

https://stackoverflow.com/questions/42479902/how-does-the-view-method-work-in-pytorch

The view method returns a tensor with the same data as the self tensor (which means that the returned tensor has the same number of elements), but with a different shape. 

you have 10,000 elements each element is represented by 256
The 256 are going to go to the linear and are going to make an output of 1

In [168]:
# unpacking
lstm_output = lstm_output.contiguous().view(-1, 256)

In [170]:
lstm_output.shape

torch.Size([400, 256])

I'm aware the LSTM cell uses both sigmoid and tanh activation functions internally, however when creating a stacked LSTM architecture does it make sense to pass their outputs through an activation function (e.g. ReLU)?

https://stats.stackexchange.com/questions/444923/activation-function-between-lstm-layers

Given that ReLUs can have quite large outputs, they have traditionally been regarded as inappropriate for use with LSTMs.

a probability of dropout around 0.5 for hidden units and 0.2 for inputs worked well for a variety of tasks.

The core concept of Srivastava el al. (2014) is that “each hidden unit in a neural network trained with dropout must learn to work with a randomly chosen sample of other units. This should make each hidden unit more robust and drive it towards creating useful features on its own without relying on other hidden units to correct its mistakes.”.

In [171]:
lstm_output

tensor([[-0.0159, -0.0085,  0.0009,  ..., -0.0335, -0.0221,  0.0253],
        [-0.0349, -0.0438, -0.0341,  ...,  0.0064, -0.0208, -0.0084],
        [ 0.0294, -0.0283, -0.0673,  ..., -0.0264, -0.0305,  0.0291],
        ...,
        [ 0.0219,  0.0046,  0.0161,  ...,  0.1062, -0.0486, -0.0518],
        [-0.0319, -0.0146,  0.0177,  ...,  0.0467, -0.0167, -0.0606],
        [-0.0426,  0.0082,  0.0173,  ...,  0.0759, -0.0195, -0.0333]],
       grad_fn=<ViewBackward>)

In [172]:
Dropout = nn.Dropout(0.2)

In [173]:
lstm_output_dropout = Dropout(lstm_output)

In [174]:
lstm_output_dropout.shape

torch.Size([400, 256])

In [175]:
lstm_output_dropout

tensor([[-0.0000, -0.0000,  0.0000,  ..., -0.0419, -0.0276,  0.0316],
        [-0.0436, -0.0000, -0.0000,  ...,  0.0080, -0.0260, -0.0000],
        [ 0.0368, -0.0354, -0.0842,  ..., -0.0330, -0.0000,  0.0363],
        ...,
        [ 0.0273,  0.0000,  0.0201,  ...,  0.1327, -0.0608, -0.0000],
        [-0.0398, -0.0182,  0.0221,  ...,  0.0584, -0.0209, -0.0758],
        [-0.0533,  0.0103,  0.0216,  ...,  0.0949, -0.0244, -0.0416]],
       grad_fn=<MulBackward0>)

In [176]:
fc = nn.Linear(256,1)

In [177]:
fc_output = fc(lstm_output_dropout)

In [178]:
fc_output[:4]

tensor([[-0.1036],
        [-0.0853],
        [-0.0626],
        [-0.1280]], grad_fn=<SliceBackward>)

In [179]:
# apply a sigmoid function to trans the output to a probability value
sigmoid = nn.Sigmoid()

In [194]:
sigmoid_output = sigmoid(fc_output)

In [195]:
sigmoid_output[:4]

tensor([[0.4741],
        [0.4787],
        [0.4844],
        [0.4681]], grad_fn=<SliceBackward>)

In [196]:
sigmoid_output.shape

torch.Size([400, 1])

In [197]:
sigmoid_out = sigmoid_output.view(2,-1)

In [198]:
sigmoid_out.shape

torch.Size([2, 200])

In [189]:
sigmoid_output[:4]

tensor([[0.4741],
        [0.4787],
        [0.4844],
        [0.4681]], grad_fn=<SliceBackward>)

In [190]:
sigmoid_output.shape

torch.Size([400, 1])

In [192]:
sigmoid_output[:,-1].shape

torch.Size([400])

embedding = nn.Embedding(vocab_size, 400)
embedding_output = embedding(x)

lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)
lstm_output, hidden = lstm(embedding_output)
lstm_output = lstm_output.contiguous().view(-1, 256)

Dropout = nn.Dropout(0.2)
lstm_output_dropout = Dropout(lstm_output)

fc = nn.Linear(256,1)
fc_output = fc(lstm_output_dropout)

sigmoid = nn.Sigmoid()
sigmoid_output = sigmoid(fc_output)

In [457]:
# num_embeddings = vocab_size
# embedding_dim = embedding_output = 400
class RNN(nn.Module):
    
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, batch_first, dropout=0.5, output_features=1):
        super(RNN, self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.dropout = dropout
        self.output_features = output_features
        
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=0.5, batch_first = True)
        
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_features)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, x, hidden):

        batch_size = x.size(0)

        embedding_output = self.embedding(x)

        lstm_output, hidden = self.lstm(embedding_output)
        lstm_output = lstm_output.contiguous().view(-1, self.hidden_size)
        lstm_output_dropout = self.dropout(lstm_output)

        fc_output = self.fc(lstm_output_dropout)

        sigmoid_output = self.sigmoid(fc_output)
        
        sigmoid_output = sigmoid_output.view(batch_size, -1)
        sigmoid_output = sigmoid_output[:, -1]

        return sigmoid_out, hidden


    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda(),
                     weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda())
        else:
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                     weight.new(self.num_layers, batch_size, self.hidden_size).zero_())

        return hidden

In [458]:
num_embeddings = vocab_size + 1
embedding_dim = 400
hidden_size = 256
num_layers = 2

In [459]:
model = RNN(num_embeddings=num_embeddings, embedding_dim=embedding_dim, 
            hidden_size = hidden_size, num_layers= num_layers, output_features=1, batch_first=True)

In [460]:
# defining loss and optimization
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion= nn.BCELoss()

In [461]:
train_on_gpu = torch.cuda.is_available()

In [462]:
if train_on_gpu:
    print('training on GPU')
else:
    print('GPU is not available')

training on GPU


In [463]:
# passing the model to gpu
model.cuda()

RNN(
  (embedding): Embedding(73920, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [464]:
epochs = 4
print_every = 100
counter = 0
clip = 5

In [465]:
model.train()

for e in range(epochs):
    
    hidden = model.init_hidden(batch_size)
    for x, y in train_loader:
        
        counter += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
            
        #hidden = tuple([each.data for each in hidden])
        
        model.zero_grad()
        output, hidden = model(x, hidden)
        
        loss = criterion(output, y.float())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
        if counter % print_every == 0:
            
            val_hidden = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            
            for x, y in valid_loader:
                if train_on_gpu:
                    x, y = x.cuda(), y.cuda()
                
                output, hidden = model(x, val_hidden)
                val_loss = criterion(output.squeeze(), y.float())

                val_losses.append(val_loss)

            model.train()

            print("Epoch: {}/{}...".format(e+1, epochs),
                "Step: {}...".format(counter),
                "Loss: {:.4f}...".format(loss.item()),
                "Val Loss: {:.4f}".format(np.mean(val_losses)))

ValueError: Target and input must have the same number of elements. target nelement (50) != input nelement (400)

In [466]:
# embedding layer -> LSTM layer --> sigmoid
# embedding layer: 
    # nn.Embedding
        # Input: (LongTensor) (num_embeddings, embedding_dim)
        # Output: input shape, embedding_dim
# LSTM layer
    # Input: (tensor w initial hidden state for each element in the batch,
         # tensor w the initiall cell state for each elt in the batch)
    # Output: (hidden state for t=seq length, cell state for t=seq_length)
# sigmoid layer

class RNN(nn.Module):
    
    def __init__(self):
        # super is used to execute a method in a parent class
        super(RNN, self).__init__()
        
        # self.embedding is a property of the module class of the type Embedding
        # num_embeddings is the vocab size
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.3)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        
        x = x.long()
        embedding_output = self.embedding(x)
        #output, (hn, cn) = self.lstm(embedding_output, (h0, c0))
        output, (hn, cn) = self.lstm(embedding_output, (h0, c0))
        # the output is a packed sequence
        # after padding, there is a lot more computation than necessary
        # packing flattens the sequences (columns). The flattened version does not include the zeroes.
        # I need to unpack the output
        # THIS GIVES OUTPUTS of the shape (batch_size, lstm_size). You
        # can use these directly for further input but if you want to use the inter
        # mediate outputs as you need to unpack
        output = output.view(seq_len, batch)
        
    