<a href="https://colab.research.google.com/github/Subhasishbasak/NLP/blob/master/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment_analysis_using_RNN/MDS201803

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import sys
import numpy as np
from numpy.random import randn

# Only used for data preprocessing
import re 
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
   
# Only used for generating summary stats
from scipy import stats

# Only used for keep tracking on the progress bar
from time import sleep
from tqdm.notebook import tqdm

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Combining & Merging all the training reviews and test reviews in single files

In [0]:
train_data = []
for line in open('/content/gdrive/My Drive/movie_data/full_train.txt', 'r'):
    train_data.append(line.strip())
    
test_data = []
for line in open('/content/gdrive/My Drive/movie_data/full_test.txt', 'r'):
    test_data.append(line.strip())

In [0]:
# For labels 1 : positive & 0 : negative

train_labels = np.concatenate((np.ones(12500),np.zeros(12500))).astype(int)
test_labels = np.concatenate((np.ones(12500),np.zeros(12500))).astype(int)

### Preprocessing of the data

After examining the raw corpus, it is observed that the data contains punctuations, symbols etc. Since we are intereted in the english words only we preprocess the data to remove them.

Regular Expressions are used for preprocessing of the data

In [0]:
punctuations = re.compile("[!#$%&*.;:!\'?,\"()\[\]]")
symbols = re.compile("(\-)|(\/)|(<br\s*/><br\s*/>)")                          

# The following function removes all punctuations & symbols/syntax from the data
# also makes the data lower case

def preprocess(data):
    
    output = []
    for reviews in data:
        reviews = punctuations.sub("", reviews)
        reviews = symbols.sub("", reviews)
        reviews = reviews.lower()
        output.append(reviews)
        
    return output

processed_train = preprocess(train_data)
processed_test = preprocess(test_data)

### Removing the stopwords

Words having too high or too low frequencies are of no use in our study. Most commonly occuring words like "the", "a" etc are present in all the reviews and contribute nothing in our analysis. Similarly there are some rare words also which have no discriminating power. We remove all such words.

In [0]:
combined_text_train = ' '.join(processed_train)
combined_text_test = ' '.join(processed_test)
combined_text = combined_text_train + combined_text_test

word_list = combined_text.split()

print("Total number of words in our corpus : ",len(word_list))

Total number of words in our corpus :  11313560


In [0]:
stop_words = set(stopwords.words('english')) 

In [0]:
word_list_new = []
for i in word_list:
    if i not in stop_words:
        word_list_new.append(i)

### Lemmatizing the words

In [0]:
lemmatizer = WordNetLemmatizer() 

word_list_lem = []
for word in word_list_new:
    word_list_lem.append(lemmatizer.lemmatize(word.lower()))

In [0]:
print("So we are able to reduce our vocab size by " + str(len(set(word_list_new))-len(set(word_list_lem))) + " using lemmatization")

So we are able to reduce our vocab size by 11169 using lemmatization


### Stemming the words

In [0]:
ps = PorterStemmer() 
  
word_list_stem = []
for word in word_list_lem:
    word_list_stem.append(ps.stem(word))    

In [0]:
print("So we are able to reduce our vocab size by " + str(len(set(word_list_lem))-len(set(word_list_stem))) + " using stemming")

So we are able to reduce our vocab size by 29027 using stemming


### Tokenizing the words

In [0]:
# Identifying the unique words and sorting them w.r.t their counts

# Dictionary of the unique words along with their counts
vocab_dict = {}

for i in word_list_stem:
    try:
        vocab_dict[i] += 1
    except KeyError:
        vocab_dict[i] = 1

# Sorting the vocab_dict w.r.t their counts        
vocab_dict_sorted = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True)}

### Filtering words

Here we remove the words with extreme low frequency

In [0]:
freq_list = []
for i in enumerate(vocab_dict_sorted):
    freq_list.append(vocab_dict_sorted[i[1]])
    
# summary statistics for the frequency distributions of the words    
stats.describe(np.array(freq_list))

DescribeResult(nobs=184069, minmax=(1, 98933), mean=32.56473387697005, variance=285532.9681308725, skewness=86.03721368463076, kurtosis=12502.3455189253)

In [0]:
np.sum(freq_list[:5000])/np.sum(freq_list)

0.8655294037961628

Thus we can see that the top $5k$ most frequent words account for $86\%$ of the total frequency. Hence in order to reduce computation time and increase efficienc we restrict our vocabulary to $5k$ words only.

In [0]:
vocab_dict_new = {}
count = 0
for i in vocab_dict_sorted.keys():
    vocab_dict_new[i] = vocab_dict_sorted[i]
    count += 1
    if count >= 4999:
        break

In [0]:
# Mapping each unique word to integers starting from 1
vocab_int_mapping = {c:w+1 for (w,c) in enumerate(vocab_dict_new)}

print("Total number of unique words in our corpus : ",len(vocab_int_mapping))

Total number of unique words in our corpus :  4999


### Encoding the reviews

In [0]:
# Encoding each word in the reviews by their corresponding token numbers/indices
# encoded_reviews is a list of list, where each sub-list contains the reviews which are encoded

def encode(data):
    
    encoded_reviews = []

    for review in data:

        r = []
        for word in review.split():
            try:
                r.append(vocab_int_mapping[word])
            except KeyError:
                pass
        encoded_reviews.append(r)
    
    return encoded_reviews
    
encoded_reviews_train = encode(processed_train)
encoded_reviews_test = encode(processed_test)

### Resizing reviews

In order to input the review vectors in a RNN we need all of them of the same size, i.e. this sequence length is same as number of time steps for the RNN. To deal with both short and long reviews, we will pad the small ones and  truncate all the large reviews to a specific length. We denote this length by **review_length** .<br>
For reviews shorter than *review_length*, we will pad with 0s. For reviews longer than *review_length* we will truncate them to the first *review_length* words.

In [0]:
train_review_len = []
for i in encoded_reviews_train:
    train_review_len.append(len(i))

stats.describe(np.array(train_review_len))

DescribeResult(nobs=25000, minmax=(1, 784), mean=68.08048, variance=2589.7879144861795, skewness=2.1846289316690877, kurtosis=7.3331451151791835)

Hence, we take *review_length* = 300.<br>
modifying all the reviews w.r.t the review_length.

In [0]:
review_length = 300

# This function truncates or pads the reviews as necessary based on their lengths
def resize(data):
    
    features = []
    
    for i in data:

        if len(i)>review_length:
            features.append(i[:review_length])
        else:
            features.append([0]*(review_length-len(i)) + i)
    return features


features_train = resize(encoded_reviews_train)
features_test = resize(encoded_reviews_test)

### Creating Inputs

So far we have converted te reviews into list of integers which denotes the index of the pericular word in the vocabulary. Now to input a review into the Neural Network we need to encode each word using One Hot Encoding. Since we have $5000$ unique words in our vocabulary , each word in the review will be represented as a vector of size $5000 \times 1$.

In [0]:
def one_hot_encoding(review):
    
    vocab_size = 5000
    inputs = []
    
    for w in review:
       
        v = np.zeros((vocab_size, 1))
        v[w] = 1
        inputs.append(v)
    
    return inputs

### Building the RNN

In [0]:
class RNN:

    def __init__(self, input_size, output_size, hidden_size=10):
        
        # Initializaing the weights randomly from uniform distribution
        self.Wh = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (hidden_size, hidden_size))
        self.Wx = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (hidden_size, input_size))
        self.Wy = np.random.uniform(-np.sqrt(1./input_size), np.sqrt(1./input_size), (output_size, hidden_size))

        # Initializing the biases to zero np arrays
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
    
    def forward(self, inputs):
        
        # Here the function takes a singe review
        # input is a list of length 300 & vocab_size is 5000
        # each element of the input list is one_hot_vector of size (vocab_size, 1)
        
        # initializing the Zero-th hidden layer (one corresponding to 0-th time stamp)
        h = np.zeros((self.Wh.shape[0], 1))
        
        # List for storing the hidden layer units
        self.hidden_layer = list()
        
        # storing the inputs and the initial hidden layer for future use
        self.inputs = inputs
        self.hidden_layer.append(h)
        
        # Performing the forward pass for each time step 
        for i, j in enumerate(inputs):

            h = np.tanh(np.matmul(self.Wx, j) + np.matmul(self.Wh, h) + self.bh)
            self.hidden_layer.append(h)

        # Final computation of the output using the last hidden layer
        output = np.matmul(self.Wy, h) + self.by
        
        # Softmax Function for the output array.
        # Outputs 2-class probabilities which sums up to 1
        return np.exp(output) / sum(np.exp(output))
    
    
    def BPTT(self, d_y, rate=0.2, bounds = [-1.5, 1.5]):
        
        # This function performs backpropagation through time (BPTT)
        # Initialization : Derivative arrays initialized to zero 
        d_Wh = np.zeros(self.Wh.shape)
        d_Wx = np.zeros(self.Wx.shape)
        d_bh = np.zeros(self.bh.shape)
        
        # Direct computations 

        d_Wy = np.matmul(d_y, self.hidden_layer[len(self.inputs)].T)
        d_by = d_y
        d_h = np.matmul(self.Wy.T, d_y)

        # Updating the gradients

        self.Wy -= rate * np.clip(d_Wy, a_min=bounds[0], a_max=bounds[1])
        self.by -= rate * np.clip(d_by, a_min=bounds[0], a_max=bounds[1])

        # recursive computations
        
        for i in range(len(self.inputs))[::-1]:

            d_Wh += np.matmul(((1 - np.square(self.hidden_layer[i + 1]))*d_h), self.hidden_layer[i].T)
            d_Wx += np.matmul(((1 - np.square(self.hidden_layer[i + 1]))*d_h), self.inputs[i].T)
            d_bh += ((1 - np.square(self.hidden_layer[i + 1]))*d_h)
            d_h = np.matmul(self.Wh, ((1 - np.square(self.hidden_layer[i + 1]))*d_h))  

        # Updating the gradients
        
        self.Wh -= rate * np.clip(d_Wh, a_min=bounds[0], a_max=bounds[1])
        self.Wx -= rate * np.clip(d_Wx, a_min=bounds[0], a_max=bounds[1])
        self.bh -= rate * np.clip(d_bh, a_min=bounds[0], a_max=bounds[1])
        
        

In [0]:
def run_RNN(features, labels, training=True):

    # The "training" flag is used to indicate whether to update the hyperparameters using backpropagation or not
    # We denote, (dL/dy) : d_y  it has shape (output_size, 1).
    # dL/dy = p_i     , if i != c
    #         p_i - 1 , if i == c

    loss = 0
    correct_pred = 0
    
    # Randomly picking up the training data
    l = np.arange(len(features))
    # Shuffelling is needed so that te inputs are randomly fed to the network
    np.random.shuffle(l)
    
    with tqdm(total=len(features), file=sys.stdout) as pbar:
        
        for i in l:

            # Probability computation
            probs = rnn.forward(one_hot_encoding(features[i]))

            # loss & accuracy
            loss -= np.log(probs[labels[i]])

            if np.argmax(probs) == labels[i]:
                correct_pred += 1

            if training:

                d_y = probs
                d_y[labels[i]] = d_y[labels[i]] - 1
                rnn.BPTT(d_y, rate=0.2, bounds = [-1.5, 1.5])
            
            # for updating the tqdm progress bar
            pbar.update(1)

    # computing loss & accuracy

    loss_func = loss / len(features)
    accuracy = correct_pred / len(features)
            
    return loss_func, accuracy 

In [0]:
np.random.seed(seed=1)

# Initialize the RNN class
rnn = RNN(5000, 2)

In [0]:
for epoch in range(5):
    print('Epoch Number : ',(epoch + 1))

    train_loss, train_acc = run_RNN(features_train, train_labels)
    test_loss, test_acc = run_RNN(features_test, test_labels, training=False)
    
    print('Training Loss : %f and Training Accuracy : %f' % (train_loss, train_acc))
    print('Test Loss : %f and Test Accuracy : %f' % (test_loss, test_acc))

Epoch Number :  1


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))


Training Loss : 0.795744 and Training Accuracy : 0.517760
Test Loss : 1.078472 and Test Accuracy : 0.499920
Epoch Number :  2


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))