# Sentiment_analysis_using_RNN/MDS201803

In [1]:
import sys
import numpy as np
from numpy.random import randn

# Only used for data preprocessing
import re 
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
   
# Only used for generating summary stats
from scipy import stats

# Only used for keep tracking on the progress bar
from time import sleep
from tqdm.notebook import tqdm

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/subhasish/anaconda3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/subhasish/anaconda3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Combining & Merging all the training reviews and test reviews in single files

In [4]:
train_data = []
for line in open('aclImdb/movie_data/full_train.txt', 'r'):
    train_data.append(line.strip())
    
test_data = []
for line in open('aclImdb/movie_data/full_test.txt', 'r'):
    test_data.append(line.strip())

In [5]:
# For labels 1 : positive & 0 : negative

train_labels = np.concatenate((np.ones(12500),np.zeros(12500))).astype(int)
test_labels = np.concatenate((np.ones(12500),np.zeros(12500))).astype(int)

### Preprocessing of the data

After examining the raw corpus, it is observed that the data contains punctuations, symbols etc. Since we are intereted in the english words only we preprocess the data to remove them.

Regular Expressions are used for preprocessing of the data

In [6]:
punctuations = re.compile("[!#$%&*.;:!\'?,\"()\[\]]")
symbols = re.compile("(\-)|(\/)|(<br\s*/><br\s*/>)")                          

# The following function removes all punctuations & symbols/syntax from the data
# also makes the data lower case

def preprocess(data):
    
    output = []
    for reviews in data:
        reviews = punctuations.sub("", reviews)
        reviews = symbols.sub("", reviews)
        reviews = reviews.lower()
        output.append(reviews)
        
    return output

processed_train = preprocess(train_data)
processed_test = preprocess(test_data)

### Removing the stopwords

Words having too high or too low frequencies are of no use in our study. Most commonly occuring words like "the", "a" etc are present in all the reviews and contribute nothing in our analysis. Similarly there are some rare words also which have no discriminating power. We remove all such words.

In [7]:
combined_text_train = ' '.join(processed_train)
combined_text_test = ' '.join(processed_test)
combined_text = combined_text_train + combined_text_test

word_list = combined_text.split()

print("Total number of words in our corpus : ",len(word_list))

Total number of words in our corpus :  11313560


In [8]:
stop_words = set(stopwords.words('english')) 

In [9]:
word_list_new = []
for i in word_list:
    if i not in stop_words:
        word_list_new.append(i)

### Lemmatizing the words

In [10]:
lemmatizer = WordNetLemmatizer() 

word_list_lem = []
for word in word_list_new:
    word_list_lem.append(lemmatizer.lemmatize(word))

In [11]:
print("So we are able to reduce our vocab size by " + str(len(set(word_list_new))-len(set(word_list_lem))) + " using lemmatization")

So we are able to reduce our vocab size by 11169 using lemmatization


### Stemming the words

In [12]:
ps = PorterStemmer() 
  
word_list_stem = []
for word in word_list_lem:
    word_list_stem.append(ps.stem(word))    

In [13]:
print("So we are able to reduce our vocab size by " + str(len(set(word_list_lem))-len(set(word_list_stem))) + " using stemming")

So we are able to reduce our vocab size by 29027 using stemming


### Tokenizing the words

In [14]:
# Identifying the unique words and sorting them w.r.t their counts

# Dictionary of the unique words along with their counts
vocab_dict = {}

for i in word_list_stem:
    try:
        vocab_dict[i] += 1
    except KeyError:
        vocab_dict[i] = 1

# Sorting the vocab_dict w.r.t their counts        
vocab_dict_sorted = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True)}

### Filtering words

Here we remove the words with extreme low frequency

In [15]:
freq_list = []
for i in enumerate(vocab_dict_sorted):
    freq_list.append(vocab_dict_sorted[i[1]])
    
# summary statistics for the frequency distributions of the words    
stats.describe(np.array(freq_list))

DescribeResult(nobs=184069, minmax=(1, 98933), mean=32.56473387697005, variance=285532.9681308725, skewness=86.03721368463076, kurtosis=12502.3455189253)

In [16]:
np.sum(freq_list[:10000])/np.sum(freq_list)

0.9205986895907649

Thus we can see that the top $10k$ most frequent words account for $92\%$ of the total frequency. Hence in order to reduce computation time and increase efficienc we restrict our vocabulary to $10k$ words only.

In [17]:
vocab_dict_new = {}
count = 0
for i in vocab_dict_sorted.keys():
    vocab_dict_new[i] = vocab_dict_sorted[i]
    count += 1
    if count >= 9999:
        break

In [18]:
# Mapping each unique word to integers starting from 1
vocab_int_mapping = {c:w+1 for (w,c) in enumerate(vocab_dict_new)}

print("Total number of unique words in our corpus : ",len(vocab_int_mapping))

Total number of unique words in our corpus :  9999


### Encoding the reviews

In [19]:
# Encoding each word in the reviews by their corresponding token numbers/indices
# encoded_reviews is a list of list, where each sub-list contains the reviews which are encoded

def encode(data):
    
    encoded_reviews = []

    for review in data:

        r = []
        for word in review.split():
            try:
                r.append(vocab_int_mapping[word])
            except KeyError:
                pass
        encoded_reviews.append(r)
    
    return encoded_reviews
    
encoded_reviews_train = encode(processed_train)
encoded_reviews_test = encode(processed_test)

### Resizing reviews

In order to input the review vectors in a RNN we need all of them of the same size, i.e. this sequence length is same as number of time steps for the RNN. To deal with both short and long reviews, we will pad the small ones and  truncate all the large reviews to a specific length. We denote this length by **review_length** .<br>
For reviews shorter than *review_length*, we will pad with 0s. For reviews longer than *review_length* we will truncate them to the first *review_length* words.

In [20]:
train_review_len = []
for i in encoded_reviews_train:
    train_review_len.append(len(i))

stats.describe(np.array(train_review_len))

DescribeResult(nobs=25000, minmax=(1, 958), mean=80.39876, variance=3695.447808374735, skewness=2.2009391036269865, kurtosis=7.491137450034261)

Hence, we take *review_length* = 80.<br>
modifying all the reviews w.r.t the review_length.

In [112]:
review_length = 200

# This function truncates or pads the reviews as necessary based on their lengths
def resize(data):
    
    features = []
    
    for i in data:

        if len(i)>review_length:
            features.append(i[:review_length])
        else:
            features.append([0]*(review_length-len(i)) + i)
    return features


features_train = resize(encoded_reviews_train)
features_test = resize(encoded_reviews_test)

### Creating Inputs

So far we have converted te reviews into list of integers which denotes the index of the pericular word in the vocabulary. Now to input a review into the Neural Network we need to encode each word using One Hot Encoding. Since we have $10000$ unique words in our vocabulary , each word in the review will be represented as a vector of size $10000 \times 1$.

In [113]:
def one_hot_encoding(review):
    
    vocab_size = 10000
    inputs = []
    
    for w in review:
       
        v = np.zeros((vocab_size, 1))
        v[w] = 1
        inputs.append(v)
    
    return inputs

### Building the RNN

In [114]:
class RNN:

    def __init__(self, input_size, output_size, hidden_size=10):
        
        # Initializaing the weights randomly from uniform distribution
        self.Wh = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (hidden_size, hidden_size))
        self.Wx = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (hidden_size, input_size))
        self.Wy = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (output_size, hidden_size))

        # Initializing the biases to zero np arrays
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
    
    def forward(self, inputs):
        
        # Here the function takes a singe review
        # input is a list of length 116 & vocab_size is 50,000
        # each element of the input list is one_hot_vector of size (vocab_size, 1)
        
        # initializing the Zero-th hidden layer (one corresponding to 0-th time stamp)
        h = np.zeros((self.Wh.shape[0], 1))
        
        # Dictionary for storing the hidden layer units
        # keys : time index ; values : hidden layer vector
        self.hidden_layer = {}
        
        # storing the inputs and the initial hidden layer for future use
        self.last_inputs = inputs
        self.hidden_layer[0] = h 
        
        # Performing the forward pass for each time step 
        for i, j in enumerate(inputs):

            h = np.tanh(np.matmul(self.Wx, j) + np.matmul(self.Wh, h) + self.bh)
            self.hidden_layer[i + 1] = h

        # Final computation of the output using the last hidden layer
        output = np.matmul(self.Wy, h) + self.by
        
        return output
    
    
    def softmax(self, x):
        
        # Softmax Function for the output array.
        # Outputs 2-class probabilities which sums up to 1
        return np.exp(x) / sum(np.exp(x))
    
    
    def BPTT(self, d_y, learn_rate=0.1):
        
        # This function performs backpropagation through time (BPTT)
        # We denote, (dL/dy) : d_y  it has shape (output_size, 1).
        # dL/dy = p_i     , if i != c
        #         p_i - 1 , if i == c
        

        # Derivative arrays initialized to zero 
        d_Wh = np.zeros(self.Wh.shape)
        d_Wx = np.zeros(self.Wx.shape)
        d_bh = np.zeros(self.bh.shape)
        
        # Derivatives dL/dWy and dL/dby.
        d_Wy = np.matmul(d_y, self.hidden_layer[len(self.last_inputs)].T)
        d_by = d_y

        # Calculate dL/dh for the last hidden layer.
        d_h = np.matmul(self.Wy.T, d_y)

        # BPTT loop with the derivatives 
        for t in range(len(self.last_inputs))[::-1]:
            
            temp = ((1 - self.hidden_layer[t + 1] ** 2) * d_h)

            d_bh += temp
            d_Wh += np.matmul(temp, self.hidden_layer[t].T)
            d_Wx += np.matmul(temp, self.last_inputs[t].T)
            d_h = np.matmul(self.Wh, temp)  
          
        # Clip to prevent exploding gradients.
        #for d in [d_Wx, d_Wh, d_Wy, d_bh, d_by]:
        #    np.clip(d, -1, 1, out=d)

        # Update weights and biases using gradient descent.
        self.Wh -= learn_rate * d_Wh
        self.Wx -= learn_rate * d_Wx
        self.Wy -= learn_rate * d_Wy
        self.bh -= learn_rate * d_bh
        self.by -= learn_rate * d_by
        

In [115]:
def processData(features, labels, backprop=True):

    loss = 0
    correct_pred = 0
    
    # Randomly picking up the training data
    l = np.arange(len(features))
    np.random.shuffle(l)
    
    with tqdm(total=len(features), file=sys.stdout) as pbar:
        
        for i in l:

            inputs = one_hot_encoding(features[i])
            target = labels[i]

            # Forward pass
            prediction = rnn.forward(inputs)
            probs = rnn.softmax(prediction)

            # Calculating loss & accuracy
            loss -= np.log(probs[target])

            if np.argmax(probs) == target:
                correct_pred += 1

            if backprop:
                # Build dL/dy
                d_L_d_y = probs

                d_L_d_y[target] -= 1

                # Backward
                rnn.BPTT(d_L_d_y)
            
            # for updating the tqdm progress bar
            pbar.update(1)
            
            
    return loss / len(features), correct_pred / len(features)

In [116]:
np.random.seed(seed=1)

# Initialize the RNN class
rnn = RNN(10000, 2)

In [117]:
# Training through epochs
for epoch in range(2):
    train_loss, train_acc = processData(features_train, train_labels, backprop=True)
 
    print('--- Epoch %d' % (epoch + 1))
    print('Train:\tLoss %.3f | Accuracy: %.3f' % (train_loss, train_acc))

    test_loss, test_acc = processData(features_test, test_labels, backprop=False)
    print('Test:\tLoss %.3f | Accuracy: %.3f' % (test_loss, test_acc))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))


--- Epoch 1
Train:	Loss 0.693 | Accuracy: 0.534


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))


Test:	Loss 0.708 | Accuracy: 0.544


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))


--- Epoch 2
Train:	Loss 0.673 | Accuracy: 0.584


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))


Test:	Loss 0.691 | Accuracy: 0.556


In [62]:
acc

0.99496

In [61]:
_, acc = processData(features_train, train_labels, backprop=True)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [100]:
l

array([2, 3, 1, 4, 0])

In [74]:
#inputs = np.array(features_train[1]).reshape(228,1)
inputs = one_hot_encoding(features_train[900])
out = rnn.forward(inputs)
probs = rnn.softmax(out)
print(probs) # [[0.50000095], [0.49999905]]

[[9.99813265e-01]
 [1.86735251e-04]]


In [64]:
count = 0
for i in range(0,25000):
    inputs = one_hot_encoding(features_train[i])
    out = rnn.forward(inputs)
    probs = rnn.softmax(out)
    target = np.argmax(probs)
    if target == 1:
        count += 1

In [76]:
2e-2

0.02

In [None]:
gunzip -c aclImdb_v1.tar.gz | tar xopf -

cd aclImdb

mkdir movie_data

# puts four files in the combined_files directory:
# full_train.txt, full_test.txt, original_train_ratings.txt, and original_test_ratings.txt
for split in train test;
do

  for sentiment in pos neg;
  do 
    
    for file in $split/$sentiment/*; 
    do
              cat $file >> movie_data/full_${split}.txt; 
              echo >> movie_data/full_${split}.txt; 

	     # This line adds files containing the original reviews if desired
             # echo $file | cut -d '_' -f 2 | cut -d "." -f 1 >> combined_files/original_${split}_ratings.txt; 
    done;
  done;
done;