<a href="https://colab.research.google.com/github/SamuelaAnastasi/RNN_Sentiment_Analysis/blob/master/Sentiment_Analysis_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /gdrive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/gdrive')


In [0]:
import numpy as np

# Load the raw reviews and their labels from Drive, each as one big string.
with open('/gdrive/My Drive/Colab Notebooks/sentiment_analysis/data/reviews.txt', 'r') as reviews_file, \
     open('/gdrive/My Drive/Colab Notebooks/sentiment_analysis/data/labels.txt', 'r') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()

In [0]:
# Peek at the start of both raw strings, separated by an empty line.
print(reviews[:200], labels[:26], sep='\n\n')

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  

positive
negative
positive


In [0]:
# Preprocess the raw text: convert to lowercase, then strip punctuation.
from string import punctuation

# string.punctuation is the set of ASCII punctuation characters that get removed
print(punctuation)

reviews = reviews.lower()
# str.translate with a deletion table removes every punctuation character in a
# single pass, instead of testing each character of the corpus in a Python loop.
clean_reviews = reviews.translate(str.maketrans('', '', punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [0]:
# Reviews are separated from each other by '\n' in the cleaned text.
# Keep the per-review pieces in reviews_split, then re-join them with spaces
# so clean_reviews becomes one newline-free string.
reviews_split = clean_reviews.split('\n')
clean_reviews = ' '.join(reviews_split)

In [0]:
# Split the cleaned text on whitespace to get the flat list of all words,
# then display the first 20 as a quick sanity check.
words = clean_reviews.split()
words[:20]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

In [0]:
# Encode every word as an integer id, with the most frequent words getting the lowest ids.
from collections import Counter

# Counter maps each distinct word to the number of times it occurs
w_counts = Counter(words)
# most_common() yields (word, count) pairs ordered by descending count
w_sorted = [word for word, _ in w_counts.most_common()]
# ids start at 1, so the most frequent word is mapped to 1
w_to_int = dict(zip(w_sorted, range(1, len(w_sorted) + 1)))

# One list of word ids per review (reviews_split holds the text of each review)
reviews_ints = [
    [w_to_int[token] for token in review_text.split()]
    for review_text in reviews_split
]



### Test data preprocessing

In [0]:
# Vocabulary statistics: number of distinct words that received an integer id
print('Unique words: ', len((w_to_int)))  # should ~ 74000+
print()

# Show the integer encoding of the first review (a list containing one list of ids)
print('Tokenized review: \n', reviews_ints[:1])

Unique words:  74072

Tokenized review: 
 [[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]


### Convert labels
The labels are the strings `positive` and `negative`; they are converted to 1 and 0 respectively.

In [0]:
# Turn the textual labels into integers: 'positive' -> 1, anything else -> 0
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

### Remove outliers
Some reviews are very long and some are empty, but the model requires every input sequence to have the same length. Zero-length reviews are therefore removed, and the remaining reviews are either truncated or left-padded with zeros to reach a fixed length.

In [0]:
# Histogram of review lengths (length -> number of reviews), used to spot outliers
review_lens = Counter(map(len, reviews_ints))
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [0]:
# Drop reviews that contain no words, together with their labels
print('Number of reviews before removing outliers: ', len(reviews_ints))

# positions of the reviews that have at least one token
non_zero_idx = [idx for idx, tokens in enumerate(reviews_ints) if tokens]

# keep only those positions in both the reviews and the label array
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [0]:
#truncate long reviews or pad the short ones with columns of 0 on the left
def pad_reviews(reviews_ints, r_length):
    """Return a 2D int array with one fixed-length row per review.

    Args:
        reviews_ints: sequence of reviews, each a sequence of integer word ids.
        r_length: number of columns (tokens) every row must have.

    Returns:
        np.ndarray of shape (len(reviews_ints), r_length). Reviews shorter than
        r_length are left-padded with 0; longer reviews keep only their first
        r_length ids. An empty review yields an all-zero row.
    """
    # start from an all-zero matrix so untouched cells act as left padding
    padded_r = np.zeros((len(reviews_ints), r_length), dtype=int)

    for i, review_ints in enumerate(reviews_ints):
        # keep at most r_length ids from the start of the review
        kept = review_ints[:r_length]
        if len(kept) == 0:
            # `-0:` would select the whole row and fail to broadcast an empty
            # array into it, so leave the row as zeros
            continue
        # right-align the ids so that the zeros stay on the left
        padded_r[i, -len(kept):] = kept

    return padded_r

### Test implementation

In [0]:
# Fixed number of tokens per review after padding/truncation
r_length = 200

features = pad_reviews(reviews_ints, r_length=r_length)

# sanity checks: one row per review, r_length columns per row
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==r_length, "Each feature row should contain seq_length values."

# print the first 10 token ids of the first 20 reviews
print(features[:20,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    1   330   578    34     3   162   748  2731     9   325]
 [    9    11 10171  5305  1946   689   444    22   280   673]
 [    0     0     0     0     0     0     0     0     0

### Split data in training, validation and test set

In [0]:
# Split proportions: 0.8 train / 0.1 validation / 0.1 test
split_factor = 0.8
split_index = int(len(features) * split_factor)

# the first split_factor share of the rows is the training set
train_data = features[:split_index]
train_y = encoded_labels[:split_index]

# everything after it is halved into validation and test
rest_of_data = features[split_index:]
rest_of_data_y = encoded_labels[split_index:]

test_index = int(len(rest_of_data) * 0.5)

valid_data = rest_of_data[:test_index]
val_y = rest_of_data_y[:test_index]
test_data = rest_of_data[test_index:]
test_y = rest_of_data_y[test_index:]

print("Train set: \t\t{}".format(train_data.shape),
      "\nValidation set: \t{}".format(valid_data.shape),
      "\nTest set: \t\t{}".format(test_data.shape))

Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 50


def _to_dataset(features_np, labels_np):
    """Wrap a (features, labels) pair of NumPy arrays in a TensorDataset."""
    return TensorDataset(torch.from_numpy(features_np), torch.from_numpy(labels_np))


# tensor datasets for the three splits
train_set = _to_dataset(train_data, train_y)
valid_set = _to_dataset(valid_data, val_y)
test_set = _to_dataset(test_data, test_y)

# shuffled mini-batch loaders
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [0]:
# Pull a single batch from the training loader to inspect its shapes.
# The iterator is created here so the cell works on a fresh kernel, and the
# built-in next() is used because it is the standard iterator protocol.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,    4,   11,   18],
        [ 281,   21, 1236,  ...,    9,   11,    8],
        [  11,   18,   14,  ...,   82,    2,   11],
        ...,
        [  54,   10,   14,  ...,   93,    8,   61],
        [   0,    0,    0,  ...,  164,  104,  544],
        [7785,  743,    1,  ...,    6, 7785,  743]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
        0, 1])


### Create model

In [0]:
# Record whether CUDA is usable and report which device training will run on
train_on_gpu = torch.cuda.is_available()

print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

Training on GPU.


In [0]:
import torch.nn as nn

class SentimentNet(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear -> sigmoid classifier.

    The forward pass returns, for every sequence in the batch, the sigmoid
    output computed at the last time step.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            output_size: number of output units of the final linear layer.
            embedding_dim: size of each embedding vector (LSTM input size).
            hidden_dim: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability passed to the LSTM.
        """
        super(SentimentNet, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout applied to the LSTM outputs before the linear layer
        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run a batch of token-id sequences through the network.

        Args:
            x: tensor of token ids whose first dimension is the batch
               (the LSTM is built with batch_first=True).
            hidden: (h, c) tuple of LSTM states, e.g. from init_hidden().

        Returns:
            (sig_out, hidden): sig_out has one value per batch element, taken
            from the last position of the flattened per-step outputs; hidden
            is the updated LSTM state tuple.
        """
        batch_size = x.size(0)

        # embedding lookup needs integer (long) indices
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # flatten batch and time so the linear layer sees one row per time step
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # back to one row per batch element, then keep only the last column,
        # i.e. the output at the final time step
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Create zero-initialised hidden and cell states for the LSTM.

        The tensors are allocated with the dtype and device of the model's own
        parameters, so they always match wherever the model currently lives
        instead of depending on a notebook-level GPU flag.

        Args:
            batch_size: number of sequences in the batch the state is for.

        Returns:
            Tuple (h0, c0), each of shape (n_layers, batch_size, hidden_dim).
        """
        weight = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(state_shape),
                  weight.new_zeros(state_shape))

        return hidden

In [0]:
# Instantiate the model w/ hyperparams
vocab_size = len(w_to_int) + 1 
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentNet(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


## Training

In [0]:
# learning rate for the optimizer
lr=0.001

# binary cross-entropy on the sigmoid output; Adam updates all model parameters
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [0]:

# number of passes over the training set
epochs = 4

counter = 0
# run validation and print losses every `print_every` training steps
print_every = 100
# gradient clipping
clip = 5

if train_on_gpu:
    net.cuda()

net.train()

for e in range(epochs):

    # fresh zero hidden state at the start of every epoch
    h = net.init_hidden(batch_size)

    # loop variables are named `targets` (not `labels`) so the raw `labels`
    # string loaded from disk is not overwritten by a tensor
    for inputs, targets in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        # take the raw state values so backprop does not reach into the
        # graphs of earlier batches
        h = tuple([each.data for each in h])

        net.zero_grad()

        output, h = net(inputs, h)

        loss = criterion(output.squeeze(), targets.float())
        loss.backward()
        # prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:

            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # no gradients are needed for validation, so skip building the
            # autograd graph
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    val_h = tuple([each.data for each in val_h])

                    if train_on_gpu:
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_targets.float())

                    val_losses.append(val_loss.item())

            # back to training mode (re-enables dropout)
            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.601795... Val Loss: 0.647134
Epoch: 1/4... Step: 200... Loss: 0.629885... Val Loss: 0.610831
Epoch: 1/4... Step: 300... Loss: 0.617199... Val Loss: 0.701001
Epoch: 1/4... Step: 400... Loss: 0.668857... Val Loss: 0.519369
Epoch: 2/4... Step: 500... Loss: 0.467429... Val Loss: 0.536550
Epoch: 2/4... Step: 600... Loss: 0.265516... Val Loss: 0.495581
Epoch: 2/4... Step: 700... Loss: 0.415150... Val Loss: 0.453255
Epoch: 2/4... Step: 800... Loss: 0.694688... Val Loss: 0.489844
Epoch: 3/4... Step: 900... Loss: 0.303322... Val Loss: 0.454577
Epoch: 3/4... Step: 1000... Loss: 0.285727... Val Loss: 0.570366
Epoch: 3/4... Step: 1100... Loss: 0.252437... Val Loss: 0.454075
Epoch: 3/4... Step: 1200... Loss: 0.148807... Val Loss: 0.414570
Epoch: 4/4... Step: 1300... Loss: 0.201714... Val Loss: 0.465686
Epoch: 4/4... Step: 1400... Loss: 0.129139... Val Loss: 0.484931
Epoch: 4/4... Step: 1500... Loss: 0.211350... Val Loss: 0.526655
Epoch: 4/4... Step: 1600... Loss: 

## Testing

In [0]:
# Evaluate the trained network on the held-out test set
test_losses = []
num_correct = 0

h = net.init_hidden(batch_size)

net.eval()

# inference only: disable gradient tracking for the whole loop
with torch.no_grad():
    # `targets` rather than `labels`, so the raw `labels` string is left alone
    for inputs, targets in test_loader:

        h = tuple([each.data for each in h])

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        output, h = net(inputs, h)

        test_loss = criterion(output.squeeze(), targets.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output.squeeze())

        # count matches on the tensor itself; works the same on CPU and GPU
        correct_tensor = pred.eq(targets.float().view_as(pred))
        num_correct += correct_tensor.sum().item()


print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over every example in the test dataset
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.550
Test accuracy: 0.783


In [0]:
# negative test review
# hand-written negative sample used to try out the trained model
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'

In [0]:
from string import punctuation

def tokenize_review(test_review, vocab=None):
    """Convert a raw review string into a batch of one list of word ids.

    The text is lowercased, punctuation characters are removed, and the result
    is split on whitespace. Words missing from the vocabulary are skipped
    rather than raising KeyError, so arbitrary user text can be tokenized.

    Args:
        test_review: the review text.
        vocab: optional word -> int mapping; defaults to the notebook-level
            `w_to_int` dictionary.

    Returns:
        A list containing a single list of integer word ids.
    """
    if vocab is None:
        vocab = w_to_int

    test_review = test_review.lower()
    test_text = ''.join([c for c in test_review if c not in punctuation])
    test_words = test_text.split()

    # wrap in an outer list so the result looks like a batch with one review
    test_ints = [[vocab[word] for word in test_words if word in vocab]]

    return test_ints
  
# tokenize the negative sample and show the resulting word ids
test_ints = tokenize_review(test_review_neg)
print(test_ints)

[[1, 247, 18, 10, 28, 108, 113, 14, 388, 2, 10, 181, 60, 273, 144, 11, 18, 68, 76, 113, 2, 1, 410, 14, 539]]


In [0]:

# pad the tokenized sample to the sequence length used for the training data
seq_length=200
# NOTE(review): this rebinds `features`, which earlier held the padded matrix
# of all reviews; re-run the earlier cells before reusing that matrix
features = pad_reviews(test_ints, seq_length)

print(features)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   1 247  18  10  28
  108 113  14 388   2  10 181  60 273 144  11  18  68  76 113   2   1 410
   14 539]]


In [0]:

# convert the padded sample to a tensor and check its size
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 200])


In [0]:
def predict(net, test_review, sequence_length=200):
    """Print the network's sentiment prediction for a single review string.

    Args:
        net: the trained network; must provide init_hidden() and return
            (output, hidden) when called.
        test_review: raw review text.
        sequence_length: number of tokens the review is padded/truncated to.

    Prints the sigmoid output before rounding and whether the review is
    classified as positive (rounded value 1) or negative. Returns None.
    """
    net.eval()

    # tokenize and pad the review into a (1, sequence_length) integer tensor
    test_ints = tokenize_review(test_review)
    features = pad_reviews(test_ints, sequence_length)
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    h = net.init_hidden(batch_size)

    if train_on_gpu:
        feature_tensor = feature_tensor.cuda()

    # inference only: no autograd graph is needed
    with torch.no_grad():
        output, h = net(feature_tensor, h)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    if pred.item() == 1:
        print("Positive review detected!")
    else:
        print("Negative review detected.")

In [0]:
# positive test review
# hand-written positive sample
# NOTE(review): not passed to predict() in the cells shown here
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

In [0]:
# pad/truncate length passed to predict()
seq_length=200 

# classify the negative sample review with the trained network
predict(net, test_review_neg, seq_length)

Prediction value, pre-rounding: 0.007705
Negative review detected.
