In [43]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [44]:
# Load the raw corpus: `reviews` and `labels` each hold the entire contents of
# their file as one string. The encoding is passed explicitly so the result does
# not depend on the platform's default locale encoding.
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r', encoding='utf-8') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r', encoding='utf-8') as f:
    labels = f.read()

In [3]:
def clean_text(text):
    """Lower-case ``text``, strip ASCII punctuation and split it into lines.

    Args:
        text (str): Raw file contents as a single string, one record per line.

    Returns:
        list[str]: One cleaned string per newline-separated line, in the
        original order. Empty lines are kept (text that ends with a newline
        yields a trailing ``''`` entry), so callers must filter empties
        themselves if they need to stay index-aligned with another list.

    Note:
        Stopwords are NOT removed; only the characters listed in
        ``string.punctuation`` are dropped.
    """
    # Delete punctuation in one pass instead of a per-character Python loop.
    without_punctuation = text.translate(str.maketrans('', '', punctuation))

    # str.split already returns the list of per-line strings.
    return without_punctuation.lower().split('\n')

In [4]:
# Replace the raw review text with a list of cleaned review strings (one per line).
# NOTE(review): this rebinds `reviews`, so the cell is not idempotent; re-running
# it without reloading the file passes the list, not the raw string, to clean_text.
reviews = clean_text(reviews)

In [45]:
# Replace the raw label text with a list of cleaned label strings (one per line).
# NOTE(review): this rebinds `labels`, so the cell is not idempotent; reload the
# labels file before re-running it.
labels = clean_text(labels)

In [6]:
class ReviewEncoder:
    """Incrementally maps whitespace-separated words to integer ids.

    Ids are handed out in order of first appearance, starting at 1. The
    vocabulary is shared across every call to ``encode`` on the same instance.
    """

    def __init__(self):
        self.__words_dict = {}   # word -> id
        self.__indexer = 1       # id that the next unseen word will receive

    def word_dict(self):
        """Return the live word -> id dictionary (not a copy)."""
        return self.__words_dict

    def encode(self, text):
        """Return the list of ids for the words in ``text``.

        Words not seen before are added to the vocabulary as a side effect.
        """
        vocabulary = self.__words_dict
        token_ids = []
        for token in text.split():
            token_id = vocabulary.get(token)
            if token_id is None:
                # First occurrence: register the word under the next free id.
                token_id = self.__indexer
                vocabulary[token] = token_id
                self.__indexer += 1
            token_ids.append(token_id)
        return token_ids

    def len_dict(self):
        """Return the number of distinct words seen so far."""
        return len(self.__words_dict)

In [7]:
# One shared encoder instance: its vocabulary grows with every text it encodes.
encoder = ReviewEncoder()

In [8]:
# Turn every cleaned review into its list of word ids (the vocabulary is built on the fly).
encoded_reviews = [encoder.encode(review) for review in reviews]

In [15]:
# Encode each label line with the same encoder that was used for the reviews.
# NOTE(review): the labels therefore come out as word-vocabulary ids wrapped in
# lists, not as 0/1 classes.
encoded_labels = [encoder.encode(label) for label in labels]

In [17]:
def drop_empty_reviews(text):
    """Return a new list holding only the non-empty entries of ``text``, in order."""
    return [review for review in text if len(review) != 0]

In [18]:
# Collect the positions of every zero-length encoded review.
index_to_remove = [
    position
    for position, review in enumerate(encoded_reviews)
    if len(review) == 0
]
index_to_remove

[25000]

In [19]:
# Remove the label of every empty review listed in `index_to_remove`, so labels
# stay aligned with the reviews once the empty reviews are dropped. Deleting
# from the highest index down keeps the remaining indices valid.
for empty_index in sorted(index_to_remove, reverse=True):
    del encoded_labels[empty_index]

In [20]:
# Discard the zero-length reviews (their labels are removed separately).
encoded_reviews = drop_empty_reviews(encoded_reviews)

In [21]:
def padding_truncation(encoded_review_list, max_review_size=200):
    """Force every encoded review to exactly ``max_review_size`` tokens.

    Shorter reviews are left-padded with 0; longer reviews keep only their
    first ``max_review_size`` tokens.

    Args:
        encoded_review_list: Iterable of lists of integer token ids.
        max_review_size (int): Target length of every returned review.
            Defaults to 200.

    Returns:
        list[list[int]]: A new list with one fixed-length list per input
        review. Input lists are never modified or returned as-is.

    Raises:
        ValueError: If ``max_review_size`` is negative.
    """
    if max_review_size < 0:
        raise ValueError("max_review_size must be non-negative")

    padded_review = []
    for review in encoded_review_list:
        # The slice is a no-op for short reviews; the padding is empty for long ones.
        kept = review[:max_review_size]
        padded_review.append([0] * (max_review_size - len(kept)) + kept)

    return padded_review

In [22]:
# Bring every review to the same fixed length (left-padded with 0 or truncated).
padded_reviews = padding_truncation(encoded_reviews)

# Sanity check: map the first padded review back to its words.
rev = [token_id for token_id in padded_reviews[0] if token_id != 0]  # drop the 0 padding
print(rev)
d = encoder.word_dict()

# Invert the vocabulary once (id -> word) rather than scanning it for every
# token; ids that are not in the vocabulary are shown as '-1'.
index_to_word = {index: word for word, index in d.items()}
print([index_to_word.get(c, str(-1)) for c in rev])

def get_key(d, v):
    """Return the first key of ``d`` whose value equals ``v``, or the string '-1' if none does."""
    return next((key for key, value in d.items() if value == v), str(-1))


# Rebuild the review text from its token ids (the original comprehension had no
# element expression and did not parse).
decoded_rev = ' '.join(get_key(d, c) for c in rev)
print(decoded_rev)

training

validation

# Inspect the container type of the padded reviews.
type(padded_reviews)

In [23]:
# Convert the padded reviews to an integer NumPy array.
# NOTE(review): rebinds `padded_reviews` from a list to an ndarray.
padded_reviews = np.asarray(padded_reviews, dtype=int)

In [24]:
# Confirm that the conversion produced a NumPy array.
type(padded_reviews)

numpy.ndarray

In [26]:
# Inspect the container type of the encoded labels.
type(encoded_labels)

list

In [27]:
# Peek at the first few encoded labels (each one is a list of vocabulary ids).
encoded_labels[:4]

[[1482], [6782], [1482], [6782]]

In [28]:
# Flatten the list of per-label id lists into one flat list of ids.
encoded_labels_ = [item for sublist in encoded_labels for item in sublist]

In [30]:
# Convert the flat label ids to an int32 NumPy array.
# NOTE(review): rebinds `encoded_labels` from a list of lists to a 1-D array.
encoded_labels = np.asarray(encoded_labels_, dtype='int32')

In [47]:
# Binarise the labels: vocabulary id 1482 becomes 1, every other id becomes 0.
# NOTE(review): 1482 is a hard-coded vocabulary id, presumably that of the
# positive label word; confirm against encoder.word_dict().
enc_labels = [1 if label == 1482 else 0 for label in encoded_labels]

In [49]:
# Replace the vocabulary-id labels with the 0/1 labels as an int32 array.
# NOTE(review): rebinds `encoded_labels`; re-running the binarisation cell after
# this one would compare 0/1 values against the vocabulary id.
encoded_labels = np.asarray(enc_labels, dtype='int32')

In [50]:
# Defining training, validation and testing sets

# Features and labels are sliced with the same indices below, so they must have
# the same length; fail loudly instead of training on a misaligned split.
assert len(padded_reviews) == len(encoded_labels), (
    "reviews and labels are misaligned: {} reviews vs {} labels".format(
        len(padded_reviews), len(encoded_labels))
)

# Split boundaries: first 80% -> training, next 10% -> validation, rest -> test.
training = int(len(padded_reviews) * 0.8)
validation = int(training + len(padded_reviews)*0.1)

train_x = padded_reviews[:training]
train_y = encoded_labels[:training]

val_x = padded_reviews[training:validation]
val_y = encoded_labels[training:validation]

test_x = padded_reviews[validation:]
test_y = encoded_labels[validation:]

In [51]:
# Wrap each (features, labels) pair of NumPy arrays in a TensorDataset.
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

In [52]:
# Batch the three datasets. The training loop carries an LSTM hidden state sized
# for exactly 50 samples from one batch to the next, so a smaller final batch
# would raise a shape error; drop_last=True guarantees that every batch is full.
train_loader = DataLoader(dataset=train_dataset, batch_size=50, shuffle=True, drop_last=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=50, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=50, shuffle=True, drop_last=True)

In [53]:
# True when a CUDA device is available; read later to decide where tensors go.
train_on_gpu = torch.cuda.is_available()

In [54]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    LSTM-based sentiment classifier.

    Pipeline: token ids -> embedding -> stacked LSTM -> dropout -> linear ->
    sigmoid. Only the prediction made at the final time step is returned.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            output_size: Number of output units of the final linear layer.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Number of features in the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            drop_prob: Dropout probability passed to the LSTM.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout layer applied to the LSTM output before the classifier
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.

        Args:
            x: Batch of token-id sequences, batch dimension first.
            hidden: ``(h, c)`` tuple such as the one returned by ``init_hidden``.

        Returns:
            tuple: ``(sig_out, hidden)`` where ``sig_out`` has one sigmoid
            output per sample (the last output unit at the last time step)
            and ``hidden`` is the updated LSTM state.
        """
        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step feeds the prediction, so select it before
        # the dropout / linear / sigmoid layers instead of running them on
        # every time step and discarding all but the last result.
        last_step = lstm_out[:, -1, :]

        # dropout and fully-connected layer
        out = self.fc(self.dropout(last_step))
        # sigmoid function
        sig_out = self.sig(out)

        # keep the last output unit of each sample -> one value per sample
        sig_out = sig_out[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """
        Create zeroed hidden and cell states for the LSTM.

        The tensors take their dtype and device from the model's own
        parameters, so they always live where the model lives and no global
        GPU flag is needed.

        Args:
            batch_size: Number of sequences per batch.

        Returns:
            tuple: ``(h, c)``, two zero tensors of shape
            ``(n_layers, batch_size, hidden_dim)``.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
        

In [55]:
# Instantiate the model w/ hyperparams
# The encoder hands out ids 1..len_dict() and id 0 is used for padding, so the
# embedding table needs len_dict() + 1 rows. Deriving the size from the encoder
# keeps it correct if the vocabulary changes.
vocab_size = encoder.len_dict() + 1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [56]:
# loss and optimization functions
lr=0.001  # learning rate handed to Adam below
# Size used when creating the LSTM hidden state in the training loop; it has to
# match the batch_size given to the DataLoaders (also 50).
batch_size = 50

# Binary cross-entropy on probabilities: the model's forward() already applies a sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [57]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0        # number of training batches processed so far
print_every = 100  # run validation and print stats every this many batches
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    # (the loop variables are deliberately not called `labels`, so the
    # notebook-level `labels` list is not overwritten by a batch tensor)
    for inputs, targets in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()

        # Detach the hidden state from the previous batch's graph, otherwise
        # we'd backprop through the entire training history
        h = tuple(each.detach() for each in h)

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), targets.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # No gradients are needed for validation: skipping graph
            # construction saves memory and time without changing the losses.
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    if(train_on_gpu):
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_targets.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.622373... Val Loss: 0.646310
Epoch: 1/4... Step: 200... Loss: 0.672640... Val Loss: 0.672128
Epoch: 1/4... Step: 300... Loss: 0.702704... Val Loss: 0.643794
Epoch: 1/4... Step: 400... Loss: 0.659121... Val Loss: 0.592180
Epoch: 2/4... Step: 500... Loss: 0.554380... Val Loss: 0.586237
Epoch: 2/4... Step: 600... Loss: 0.405791... Val Loss: 0.498888
Epoch: 2/4... Step: 700... Loss: 0.448195... Val Loss: 0.609094
Epoch: 2/4... Step: 800... Loss: 0.370767... Val Loss: 0.475145
Epoch: 3/4... Step: 900... Loss: 0.293247... Val Loss: 0.469440
Epoch: 3/4... Step: 1000... Loss: 0.394853... Val Loss: 0.567648
Epoch: 3/4... Step: 1100... Loss: 0.290704... Val Loss: 0.488442
Epoch: 3/4... Step: 1200... Loss: 0.323962... Val Loss: 0.449413
Epoch: 4/4... Step: 1300... Loss: 0.342523... Val Loss: 0.492758
Epoch: 4/4... Step: 1400... Loss: 0.177499... Val Loss: 0.589974
Epoch: 4/4... Step: 1500... Loss: 0.186152... Val Loss: 0.478905
Epoch: 4/4... Step: 1600... Loss: 