In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from bs4 import BeautifulSoup
import unicodedata
import string
import re
import nltk
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

In [19]:
stopwords = nltk.corpus.stopwords.words('english')

In [20]:
all_letters = string.ascii_letters + " .,;'-"
df = pd.read_csv("./data/IMDB Dataset.csv")
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [21]:
def preprocess(text):
    cleanr = re.compile('<.*?>')
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = re.sub(cleanr, ' ', text)
    text = text.lower()
    text = regex.sub(' ', text)
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
    return text
def word_process(text, stopwords = False):
    text = text.split(" ")
    if stopwords : text = [word for word in text if word not in stopwords]
    return text

def encoding(text):
    text = text.split(" ")
    text = [word2index[word] for word in text]
    return text
    

In [22]:
text = df.copy()
text['review'] = df['review'].apply(preprocess)
text.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming t...,positive


In [8]:
all_words = word_process(" ".join(list(text['review'])))
words = list(set(all_words))
print(len(words))

100030


In [140]:
word2index = {}
index2word = {}
for i,word in enumerate(words):
    index2word[i] = word
    word2index[word] = i

In [141]:
encoding(" ".join(word_process(preprocess(df['review'][0]))))

[30482,
 7466,
 67263,
 8532,
 0,
 93487,
 42939,
 69621,
 0,
 85329,
 0,
 72411,
 55666,
 0,
 0,
 94012,
 74387,
 10869,
 93487,
 91285,
 36907,
 61570,
 29167,
 0,
 64685,
 85329,
 19231,
 34575,
 0,
 27740,
 0,
 96170,
 45917,
 92200,
 23292,
 0,
 96170,
 26675,
 85395,
 68598,
 78449,
 0,
 87408,
 29167,
 0,
 91181,
 0,
 48839,
 54272,
 19231,
 0,
 0,
 47900,
 93487,
 49320,
 20035,
 95388,
 71730,
 65258,
 92398,
 69748,
 0,
 32613,
 7330,
 66830,
 30510,
 0,
 25182,
 93144,
 30317,
 47186,
 56725,
 17848,
 12636,
 78327,
 0,
 11595,
 56471,
 6277,
 0,
 81356,
 30510,
 39839,
 58703,
 0,
 6564,
 0,
 29084,
 0,
 69313,
 0,
 35011,
 0,
 4203,
 0,
 92669,
 0,
 83956,
 0,
 0,
 0,
 4530,
 0,
 60911,
 28920,
 0,
 63592,
 63210,
 767,
 76942,
 26773,
 55357,
 73584,
 0,
 0,
 32471,
 24801,
 93736,
 47918,
 96170,
 84710,
 74184,
 16191,
 71512,
 65165,
 0,
 76320,
 30620,
 39277,
 90879,
 74620,
 94064,
 0,
 76320,
 62709,
 0,
 76320,
 1870,
 0,
 0,
 93487,
 58876,
 46156,
 0,
 94012,
 4

In [61]:
count_words = Counter(all_words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)

In [62]:
vocab_to_int = {w:i for i, (w,c) in enumerate(sorted_words)}

In [63]:
reviews = list(text['review'])

In [64]:
reviews_encoded = []
for review in reviews:
    r = [vocab_to_int[w] for w in review.split()]
    reviews_encoded.append(r)
print (reviews_encoded[0:3])

[[28, 4, 1, 77, 2038, 46, 1050, 11, 100, 149, 41, 3061, 394, 20, 230, 29, 3174, 32, 25, 203, 14, 10, 6, 613, 47, 592, 17, 68, 1, 87, 148, 11, 3217, 68, 44, 3061, 13, 90, 5322, 2, 14820, 135, 4, 559, 61, 265, 8, 203, 37, 1, 647, 141, 1722, 68, 10, 6, 23, 3, 116, 16, 1, 7810, 2312, 40, 11302, 10, 116, 2571, 56, 5848, 17, 5442, 5, 1452, 371, 40, 559, 90, 6, 3784, 8, 1, 355, 356, 4, 1, 647, 7, 6, 433, 3061, 14, 11, 6, 1, 11473, 357, 5, 1, 14535, 6752, 2517, 1031, 50211, 7, 2684, 1399, 22, 22659, 518, 34, 4620, 2439, 4, 1, 1180, 115, 30, 1, 6931, 27, 2881, 11786, 2, 385, 50212, 36, 16327, 6, 23, 297, 22, 1, 4836, 2907, 518, 6, 340, 5, 107, 24380, 8063, 39249, 14536, 4993, 7691, 2426, 2, 52, 36, 43676, 324, 8981, 7236, 12281, 2, 8579, 31241, 25, 112, 223, 240, 9, 60, 132, 1, 280, 1315, 4, 1, 116, 6, 680, 5, 1, 192, 11, 7, 266, 115, 77, 274, 572, 21, 2982, 816, 182, 1287, 4124, 16, 2475, 1213, 816, 1418, 816, 863, 3061, 152, 21, 938, 184, 1, 87, 394, 9, 123, 209, 3217, 68, 14, 36, 1603, 7, 13

In [65]:
review_lens = [len(enc) for enc in reviews_encoded]

In [66]:
max(review_lens)

2492

In [67]:
np.mean(np.array(review_lens))


234.08298

In [68]:
labels = list(text['sentiment'])
encoded_labels = [1 if label =='positive' else 0 for label in labels]
encoded_labels = np.array(encoded_labels)

In [69]:
reviews_encoded = [ reviews_encoded[i] for i, l in enumerate(review_lens) if l<500 ]
encoded_labels = np.array([ encoded_labels[i] for i, l in enumerate(review_lens) if l< 500 ])

In [70]:
len(reviews_encoded), len(encoded_labels)

(45957, 45957)

In [71]:
def pad_features(reviews_int, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(reviews_int), seq_length), dtype = int)
    
    for i, review in enumerate(reviews_int):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

In [72]:
features = pad_features(reviews_encoded, 500)

In [73]:
split_frac = 0.8
len_feat = features.shape[0]
train_x = features[0:int(split_frac*len_feat)]
train_y = encoded_labels[0:int(split_frac*len_feat)]
remaining_x = features[int(split_frac*len_feat):]
remaining_y = encoded_labels[int(split_frac*len_feat):]
valid_x = remaining_x[0:int(len(remaining_x)*0.5)]
valid_y = remaining_y[0:int(len(remaining_y)*0.5)]
test_x = remaining_x[int(len(remaining_x)*0.5):]
test_y = remaining_y[int(len(remaining_y)*0.5):]

In [74]:
#pytorch Datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50
# make sure to SHUFFLE your data

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [76]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length # batch_size sentences at once
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size # batch_size labels at once
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 500])
Sample input: 
 tensor([[    0,     0,     0,  ...,     1,    49,   160],
        [    0,     0,     0,  ...,     7,    43,     4],
        [    0,     0,     0,  ...,     4,     1,    15],
        ...,
        [    0,     0,     0,  ...,   745,     4, 13258],
        [    0,     0,     0,  ...,   206,     1, 17716],
        [    0,     0,     0,  ...,     5,    65,   105]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 0], dtype=torch.int32)


In [79]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

In [80]:
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentLSTM(
  (embedding): Embedding(100031, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)
