# Sentiment Analysis Using RNN

In [1]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [None]:
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r') as f:
    labels = f.read()

In [None]:
# This function will recieve the imported reviews (ch by ch) and return 
def clean_text(text):
    ''' This Function recieves reviews (ch by ch) and returns a list of
    reviews without punctuation and stopwords'''
    # remove punctuation
    s = ''.join(ch.lower() for ch in text if ch not in punctuation)
    
    # separate each review and add to a list so that I have a list of reviews
    separated_reviews = []

    for review in s.split('\n'):
        review = ''.join(review)
        separated_reviews.append(review)
        
        
    # remove stopwords and return a list of reviews
    clean_text = []
    for review in separated_reviews:
        review_no_stopwords = []
        for word in review.split():
            if word not in stopwords.words('english'):
                review_no_stopwords.append(word)
        clean_text.append(' '.join(review_no_stopwords))
        
    return clean_text

In [None]:
# map each word to a number 

In [None]:
reviews = clean_text(reviews)

In [None]:
class ReviewEncoder:
    def __init__(self):
        self.__words_dict = {}
        self.__indexer = 1
    def encode(self, text):
        encoded_review = []
        words = text.split()
        #print(words)
        for word in words:
            if word in self. __words_dict:
                encoded_review.append(self.__words_dict[word])
            else:
                self.__words_dict[word] = self.__indexer
                self.__indexer += 1
                encoded_review.append(self.__words_dict[word])
        return encoded_review
    
    def len_dict(self):
        return len(self.__words_dict)

In [None]:
encoder = ReviewEncoder()

In [None]:
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(encoder.encode(review))

In [None]:
len(encoded_reviews)

In [None]:
vocab_size = encoder.len_dict()

In [None]:
vocab_size

In [None]:
encoded_labels = encoder.encode(labels)

In [None]:
#df_labels = pd.DataFrame(encoded_labels)

In [None]:
#df_labels.to_csv('padded_labels')

In [None]:
# create a function to check for reviews with length zero and dropping them

def drop_empty_reviews(text):
    
    full_reviews = []
    
    for index, review in enumerate(text):
        if len(review) != 0:
            full_reviews.append(review)
            
    return full_reviews

In [None]:
reviews = drop_empty_reviews(encoded_reviews)

In [None]:
def padding_truncation(encoded_review_list):
    
    padded_review = []
    for review in encoded_review_list:
        if len(review) < 200:
            padding = 200 - len(review)
            review = ([0]*padding + review)
            padded_review.append(review)
        elif len(review) > 200:
            truncate = len(review) - 200
            review = review[truncate:]
            padded_review.append(review)
        else:
            padded_review.append(review)
            
    return padded_review

In [None]:
padded_reviews = padding_truncation(reviews)

In [None]:
len(padded_reviews)

In [None]:
check = []
for review in padded_reviews:
    if len(review) < 200 or len(review) > 200:
        check.append(review)
print(len(check))

In [None]:
# df = pd.DataFrame(padded_reviews)

In [None]:
#df.to_csv('padded_reviews')

In [2]:
df = pd.read_csv('padded_reviews', index_col=0)
df_labels = pd.read_csv('padded_labels', index_col=0)

In [3]:
padded_reviews = df.values.tolist()
padded_labels = df_labels.values.tolist()

In [5]:
padded_labels.info

AttributeError: 'list' object has no attribute 'info'

In [None]:
# Defining training, validation and testing sets

training = int(len(padded_reviews) * 0.8)
validation = int(training + len(padded_reviews)*0.1)

train_x = np.array(padded_reviews[:training])
train_y = np.array(encoded_labels[:training])

val_x = np.array(padded_reviews[training:validation])
val_y = np.array(encoded_labels[training:validation])

test_x = np.array(padded_reviews[validation:])
test_y = np.array(encoded_labels[validation:])

In [None]:
len(padded_reviews)

train_loader = DataLoader(dataset,
    batch_size=1,
    shuffle=False,
    sampler=None,
    batch_sampler=None,
    num_workers=0,
    collate_fn=None,
    pin_memory=False,
    drop_last=False,
    timeout=0,
    worker_init_fn=None,
    multiprocessing_context=None)

In [None]:
train_x.shape

In [None]:
train_dataset = TensorDataset(torch.from_numpy(train_x).long(), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x).long(), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x).long(), torch.from_numpy(test_y))

In [None]:
train_loader = DataLoader(dataset=train_dataset, batch_size=50, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=50, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=50, shuffle=True)

In [None]:
### Testing

testing = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [None]:
dataiter = next(iter(testing))

In [None]:
for x, y in testing:
    x = x.long()

In [None]:
type(x)

In [None]:
x.shape

In [None]:
# Embedding arguments(input, output)
# num_embeddings: size of the vocab
# embedding_dim: the size to which you want to embed. Reduce the input to
embedding = nn.Embedding(vocab_size, 400)

In [None]:
embedding_output = embedding(x)

In [None]:
# shape(batch, seq, feature)
embedding_output.shape

### LSTM Layer

Put the embedding output into the lstm layer
- parameters: input_size, hidden_size, num_layers, batch_first
    - num of recurrent layers. Seting this to 2 stacks two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs from the first LSTM and computing the final results.
    - batch_first: if true then the input and output tensors are provided as (batch, seq, feature)


### Initializing the hidden state

Zero initial hiddenstate is standard and this is the default if we dont pass in a hidden state

In [None]:
lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)

initializing the hidden state to zeroes

The hidden and cell state reset to zero for every epoch so you don't need to initialize them unless you are initializing them to something other than zero.

Since I have n_layers equal to 2, the output is a packed sequence. So I need to unpack

In [None]:
lstm_output, hidden = lstm(embedding_output)

In [None]:
lstm_output.shape

In [None]:
test_hidden = tuple([each.data for each in hidden])

In [None]:
test_hidden

In [None]:
len(test_hidden)

In [None]:
lstm_output.size(0)

The next step is to pass the vector into the fully connected layer. The fc layer expects 1D vectors.
In order to do that I need to flatten the vector -- the resulting shape is going to be (1, rowsxcols)
so in this case is going to be (1, 50*200)


If there is any situation that you don't know how many rows you want but are sure of the number of columns, then you can specify this with a -1. (Note that you can extend this to tensors with more dimensions. Only one of the axis value can be -1). This is a way of telling the library: "give me a tensor that has these many columns and you compute the appropriate number of rows that is necessary to make this happen".

https://stackoverflow.com/questions/42479902/how-does-the-view-method-work-in-pytorch

The view method returns a tensor with the same data as the self tensor (which means that the returned tensor has the same number of elements), but with a different shape. 

you have 10,000 elements each element is represented by 256
The 256 are going to go to the linear and are going to make an output of 1

In [None]:
# unpacking
lstm_output = lstm_output.contiguous().view(-1, 256)

In [None]:
lstm_output.shape

I'm aware the LSTM cell uses both sigmoid and tanh activation functions internally, however when creating a stacked LSTM architecture does it make sense to pass their outputs through an activation function (e.g. ReLU)?

https://stats.stackexchange.com/questions/444923/activation-function-between-lstm-layers

Given that ReLUs can have quite large outputs, they have traditionally been regarded as inappropriate for use with LSTMs.

a probability of dropout around 0.5 for hidden units and 0.2 for inputs worked well for a variety of tasks.

The core concept of Srivastava el al. (2014) is that “each hidden unit in a neural network trained with dropout must learn to work with a randomly chosen sample of other units. This should make each hidden unit more robust and drive it towards creating useful features on its own without relying on other hidden units to correct its mistakes.”.

In [None]:
lstm_output

In [None]:
Dropout = nn.Dropout(0.2)

In [None]:
lstm_output_dropout = Dropout(lstm_output)

In [None]:
lstm_output_dropout.shape

In [None]:
lstm_output_dropout

In [None]:
fc = nn.Linear(256,1)

In [None]:
fc_output = fc(lstm_output_dropout)

In [None]:
fc_output[:4]

In [None]:
# apply a sigmoid function to trans the output to a probability value
sigmoid = nn.Sigmoid()

In [None]:
sigmoid_output = sigmoid(fc_output)

In [None]:
sigmoid_output[:4]

In [None]:
sigmoid_output.shape

In [None]:
sigmoid_out = sigmoid_output.view(2,-1)

In [None]:
sigmoid_out.shape

In [None]:
sigmoid_output[:4]

In [None]:
sigmoid_output.shape

In [None]:
sigmoid_output[:,-1].shape

embedding = nn.Embedding(vocab_size, 400)
embedding_output = embedding(x)

lstm = nn.LSTM(input_size=400, hidden_size=256, num_layers=2, batch_first=True, dropout=0.5)
lstm_output, hidden = lstm(embedding_output)
lstm_output = lstm_output.contiguous().view(-1, 256)

Dropout = nn.Dropout(0.2)
lstm_output_dropout = Dropout(lstm_output)

fc = nn.Linear(256,1)
fc_output = fc(lstm_output_dropout)

sigmoid = nn.Sigmoid()
sigmoid_output = sigmoid(fc_output)

In [None]:
# num_embeddings = vocab_size
# embedding_dim = embedding_output = 400
class RNN(nn.Module):
    
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, batch_first, dropout=0.5, output_features=1):
        super(RNN, self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.dropout = dropout
        self.output_features = output_features
        
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=0.5, batch_first = True)
        
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_features)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, x, hidden):

        batch_size = x.size(0)

        embedding_output = self.embedding(x)

        lstm_output, hidden = self.lstm(embedding_output)
        #lstm_output = lstm_output.contiguous().view(-1, self.hidden_size)
        lstm_output_dropout = self.dropout(lstm_output)

        fc_output = self.fc(lstm_output_dropout)

        sigmoid_output = self.sigmoid(fc_output)
        
        sigmoid_output = sigmoid_output.view(batch_size, -1)
        sigmoid_output = sigmoid_output[:, -1]

        return sigmoid_output, hidden


    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda(),
                     weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda())
        else:
            hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                     weight.new(self.num_layers, batch_size, self.hidden_size).zero_())

        return hidden

In [None]:
vocab_size = 73919
num_embeddings = vocab_size + 1
embedding_dim = 400
hidden_size = 256
num_layers = 2
batch_size = 50

In [None]:
model = RNN(num_embeddings=num_embeddings, embedding_dim=embedding_dim, 
            hidden_size = hidden_size, num_layers= num_layers, output_features=1, batch_first=True)

In [None]:
# defining loss and optimization
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion= nn.BCELoss()

In [None]:
train_on_gpu = torch.cuda.is_available()

In [None]:
if train_on_gpu:
    print('training on GPU')
else:
    print('GPU is not available')

In [None]:
# passing the model to gpu
model.cuda()

In [None]:
epochs = 4
print_every = 100
counter = 0
clip = 5

In [None]:
model.train()

for e in range(epochs):
    
    hidden = model.init_hidden(batch_size)
    
    for x, y in train_loader:
        
        counter += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
            x = x.long()
        #hidden = tuple([each.data for each in hidden])
        
        model.zero_grad()
        output, hidden = model(x, hidden)
        
        loss = criterion(output.squeeze(), y.float())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        
        if counter % print_every == 0:
            
            val_hidden = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            
            for x, y in valid_loader:
                if train_on_gpu:
                    x, y = x.cuda(), y.cuda()
                
                output, hidden = model(x, val_hidden)
                val_loss = criterion(output.squeeze(), y.float())

                val_losses.append(val_loss)

            model.train()

            print("Epoch: {}/{}...".format(e+1, epochs),
                "Step: {}...".format(counter),
                "Loss: {:.4f}...".format(loss.item()),
                "Val Loss: {:.4f}".format(np.mean(val_losses)))