In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import re
import contractions
import nltk
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
import torch
import torch.nn.functional as F
import torch.nn as nn
import time
import math
import gensim
from gensim.models import Word2Vec

from collections import defaultdict
nltk.download('wordnet',quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt',quiet=True)
 

True

### 1. Dataset Generation

In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

* Read the complete dataset from "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz"
* Create three-class classification problem according to the rating.
* Performed the below data cleaning on complete data
    * convert the all reviews into the lower case
    * remove html and url
    * remove non-alphabetical chars
    * remove extra spaces
    * perform contractions
* Selectd 20,000 random reviews from each rating class and created a balanced dataset. 


* Iterate through each review
* If word in review is in Word embedding, get the word vector
* Compute the average vector of sentence.
* Split the dataset into train and test set    

In [3]:
complete_reviews = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz"
complete_data = pd.read_csv(complete_reviews, sep='\t', on_bad_lines='skip', low_memory=False)
complete_data.dropna(inplace=True)

warnings.filterwarnings("ignore")

full_data = complete_data[['star_rating', 'review_body']]
full_data.dropna(inplace=True)
full_data['star_rating'] = full_data['star_rating'].astype('int32')

for index, row in full_data.iterrows():
    if row['star_rating'] in {1,2}:
        full_data.loc[index, 'star_rating'] = 1
    elif row['star_rating'] in {3}:
        full_data.loc[index, 'star_rating'] = 2
    elif row['star_rating'] in {4,5}:
        full_data.loc[index, 'star_rating'] = 3

def clean_data(data):
    #covert to lower
    data= data.lower()
    #remove html and url
    data = re.sub(r'http\S+', '', data)
    urls = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    data = re.sub(urls, '', data)
    #remove non-alphabetical chars
    non_alpha = re.compile('[^a-zA-Z]')
    data = non_alpha.sub(' ', data)
    #remove extra spaces
    data = re.sub(' +', ' ', data)
    #perform contractions
    data = contractions.fix(data)
    return data

balanced_data = full_data.groupby('star_rating').apply(lambda group: group.sample(20000)).reset_index(drop = True)
balanced_data['review_body'] = balanced_data.apply(lambda row : clean_data(row['review_body']), axis = 1)


In [6]:
balanced_data = shuffle(balanced_data)
test_reviews = balanced_data['review_body']
avg_review_vectors = []
for review in test_reviews:
    words = review.split()
    rv = np.zeros(300)
    for word in words:
        if word in wv:
            rv += wv[word]
    if(len(words)):
        rv /= len(words)
    avg_review_vectors.append(rv)

avg_review_vectors = np.array(avg_review_vectors)
review_ratings = balanced_data['star_rating']
train_X = avg_review_vectors[:int(0.8 * len(avg_review_vectors))]
train_Y = review_ratings[:int(0.8 * len(avg_review_vectors))]
test_X = avg_review_vectors[int(0.8 * len(avg_review_vectors)):]
test_Y =  review_ratings[int(0.8 * len(avg_review_vectors)):]
train_X=np.nan_to_num(train_X, copy=True, nan=0.0, posinf=None, neginf=None)
test_X=np.nan_to_num(test_X,copy=True, nan=0.0, posinf=None, neginf=None)

### 2. Word Embedding

To learn the semantic similarity, I am considering the below three examples:
* Finding words similar to "happy" using its vector
* Performing "big - large + small = tiny" using word vector
* Finding cosine similarity between words "love" and "like"

#### (a) Pre-trained Word2Vec Model

In [5]:
print("Similar words to happy:")
res = wv.similar_by_vector(wv["happy"], topn=5)
for word, score in res:
    print("\t{}: {:.4f}".format(word, score))
print()

big = wv['big']
large = wv['large']
small = wv['small']
result = big-large+small
similarity = wv.similar_by_vector(result)
print("Most similar words to 'big-large+small':")
for word, score in similarity:
    print("\t{}: {:.4f}".format(word, score))
print()    
cosine = wv.similarity("love", "like")
print("Cosine similarity between love and like:", cosine)
print()

Similar words to happy:
	happy: 1.0000
	glad: 0.7409
	pleased: 0.6632
	ecstatic: 0.6627
	overjoyed: 0.6599

Most similar words to 'big-large+small':
	big: 0.7968
	small: 0.6329
	bigger: 0.5330
	huge: 0.4986
	little_bitty: 0.4698
	biggest: 0.4613
	tiny: 0.4609
	Small: 0.4602
	nice: 0.4599
	abig: 0.4512

Cosine similarity between love and like: 0.36713877



#### (b) Training Word2Vec on Review dataset

In [6]:
r_data =balanced_data['review_body']
sentences = []
for s in r_data:
    sentences.append(list(s.split(" ")))
    
my_model = Word2Vec(sentences, vector_size=300, window=13, min_count=9)
my_model.train(sentences, total_examples=my_model.corpus_count, epochs=my_model.epochs)

print("Similar words to happy:")
res = my_model.wv.similar_by_vector(wv["happy"], topn=5)
for word, score in res:
    print("\t{}: {:.4f}".format(word, score))
print()

big_2 = my_model.wv['big']
large_2 = my_model.wv['large']
small_2 = my_model.wv['small']
result_2 = big_2-large_2+small_2
similarity_2 = my_model.wv.similar_by_vector(result_2)
print("Most similar words to 'big-large+small':")
for word, score in similarity_2:
    print("\t{}: {:.4f}".format(word, score))
print()

cosine = my_model.wv.similarity("love", "like")
print("Cosine similarity between love and like:", cosine)
print()

Similar words to happy:
	chest: 0.2259
	perspiration: 0.2153
	cheeks: 0.2106
	rash: 0.2047
	underarm: 0.2008

Most similar words to 'big-large+small':
	big: 0.8179
	small: 0.7084
	huge: 0.5634
	tiny: 0.5382
	tall: 0.4235
	practical: 0.3925
	wasteful: 0.3881
	cute: 0.3819
	pricey: 0.3770
	breaker: 0.3611

Cosine similarity between love and like: 0.25789207



#### Conclusions:

* For the first example, pretrained models gives much better result compared to our custom model. Custome model process the word in refrence to the dataset that it has i.e with respect to reviews. Hence the similar words suggested by custom model are not synonyms of "happy"

* For the second example "big-large+small=tiny", both the models are performing considerably well. The score returned for "tiny" by the custom model is slightly more compared to  pretrained model.

* For the thrid example, cosine similarity score given by pretrained model is higher but difference is negligible. 

Overall, pretrained model performs much better in comparision to the custom model. The main reason being, custom model has smaller dataset (only the reviews) where as pretrained model has very vast dataset.

## 3. Simple models

In [7]:
def findAccuracy(y_test, y_pred):
    report = classification_report(y_test, y_pred, output_dict=True, digits=4)
    df = pd.DataFrame(report).transpose()
    return 100*accuracy_score(y_test, y_pred)

### Perceptron

In [8]:
from gensim.models import KeyedVectors
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

perceptron = Perceptron(max_iter=9000)
perceptron.fit(train_X, train_Y)
perceptron_predictions = perceptron.predict(test_X)
print("Perceptron Accuracy using Word2Vec  = ", findAccuracy(test_Y, perceptron_predictions))

###########Compute using TFIDF#####################
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopword(text):
    text_tokens = word_tokenize(text)
    tokens_without_sw =  " ".join([word for word in text_tokens if word not in stop_words])
    return tokens_without_sw
def lemmetize(text):    
    text_tokens = word_tokenize(text)
    lemmatized_string = " ".join([lemmatizer.lemmatize(words) for words in text_tokens])
    return lemmatized_string
balanced_data['cleaned_data'] = balanced_data.apply(lambda row : remove_stopword(row['review_body']), axis = 1)
balanced_data['cleaned_data'] = balanced_data.apply(lambda row : lemmetize(row['review_body']), axis = 1)
tfidf_vectorizer = TfidfVectorizer(min_df=9, ngram_range=(1,2))
tfidf_vector = tfidf_vectorizer.fit_transform(balanced_data['cleaned_data'])
X_tfidf = tfidf_vector
y_tfidf = balanced_data['star_rating']
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state=42)
perceptron_model = Perceptron(max_iter=10000)
perceptron_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_perceptron = perceptron_model.predict(X_test_tfidf)
print("\nPerceptron Accuracy using TF-IDF = ", findAccuracy(y_test_tfidf, y_pred_perceptron))


Perceptron Accuracy using Word2Vec  =  59.550000000000004

Perceptron Accuracy using TF-IDF =  65.78333333333333


In [9]:
from sklearn.svm import LinearSVC

svm = SVC(C=0.1)
svm.fit(train_X, train_Y)
svm_predictions = svm.predict(test_X)
print("SVM Accuracy using Word2Vec (%) = ", findAccuracy(test_Y, svm_predictions))

######################TF-IDF####################################

svm_model = LinearSVC(C=0.1).fit(X_train_tfidf, y_train_tfidf)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("\nSVM Accuracy using TF-IDF (%)= ", findAccuracy(y_test_tfidf, y_pred_svm))

SVM Accuracy using Word2Vec (%) =  65.925

SVM Accuracy using TF-IDF (%)=  74.45


#### Conclusion:

From above test accuarcy values, we can conclude that TF-IDF model performing better than Word2Vec for both SM and Perceptron.It's also worth noting that TFIDF and Word2Vec capture different aspects of language. TFIDF is a statistical measure of how important a word is to a document in a corpus, while Word2Vec is a neural network-based model that captures the semantic relationships between words. 

There could be several reasons why the model using TFIDF is performing better than the Word2Vec model.
* Word2Vec models are trained on general language data, and may not perform as well on domain-specific language. In this case reviews are specific to beauty products. Hence review dataset might have many words repeated lot of times.
* Word2Vec requires a large amount of data to accurately capture the nuances of language. If you have a small dataset, the Word2Vec model may not have enough data to work with and may not be able to capture the meaning of words accurately

## 4. Feedforward Neural Networks

##### Workflow:

* Defined a class 'Feedforward_MLP' which is used for both 4.a and 4.b

* __init__ method takes an input_size as argument which specifies the size of the input data.
    * self.fc1: The first fully connected layer takes the input of size 'input_size' and produces an output of size 100.
    * self.dropout: A dropout layer that randomly sets input elements to zero with a given probability during training. This helps prevent overfitting and improves the generalization of the model.
    * self.fc2: The second fully connected layer takes the output from the previous layer (size 100) and produces an output of size 10.
    * self.fc3: The third fully connected layer takes the output from the previous layer (size 10) and produces an output of size 3.
    * self.softmax: A softmax function applied to the output of the last layer to convert the output values to probabilities that sum to 1 over the 3
    
* **forward method**  applies the layers defined in the __init__ method in a feedforward manner to produce the output.
    * x = F.relu(self.fc1(x)): The input x is passed through the first fully connected layer, self.fc1, followed by the ReLU activation function. This produces an output x of size 100.
    * x = self.dropout(x): The output x from the first layer is passed through the dropout layer defined in __init__.
    * x = F.relu(self.fc2(x)): The output x from the dropout layer is passed through the second fully connected layer, self.fc2, followed by the ReLU activation function. This produces an output x of size 10.
    * x = self.dropout(x): The output x from the second layer is passed through the dropout layer defined in __init__.
    * x = self.fc3(x): The output x from the second dropout layer is passed through the third fully connected layer, self.fc3, which produces an output x of size 3.
    * x = self.softmax(x): The output x from the third layer is passed through the softmax function defined in __init__, which converts the output values to probabilities that sum to 1 over the 3 output classes.
    
**Optimizers** : I have used different optimizers for 4.a and 4.b. MLP model with average vectors performed better with SGD optimizer. Where as MLP model with concatenated words performed better with Adam optimizer.

**Batch Size** : 20

In [10]:
#adjust the class labels to start from 0
train_label_0 = [x-1 for x in train_Y]
test_label_0 = [x-1 for x in test_Y]

#### (a) Using average Word2Vec vectors

In [31]:
class Feedforward_MLP(torch.nn.Module):
        def __init__(self, input_size):
            super(Feedforward_MLP, self).__init__()
            self.input_size = input_size
            
            self.fc1 = torch.nn.Linear(self.input_size, 100)
            self.dropout = torch.nn.Dropout(0.2)            
#             self.relu = torch.nn.ReLU()
            self.fc2 = torch.nn.Linear(100, 10)
            self.fc3 = torch.nn.Linear(10, 3)
#             self.sigmoid = torch.nn.Sigmoid()
            self.softmax = torch.nn.Softmax(dim=1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.dropout(x)
            x = F.relu(self.fc2(x))
            x = self.dropout(x)
            x = self.fc3(x)
            x= self.softmax(x)
            return x
        
mlp_model = Feedforward_MLP(300)

In [38]:
tensor_trainX = torch.Tensor(train_X)
tensor_trainY = torch.Tensor(train_label_0).type(torch.LongTensor)
tensor_testX = torch.Tensor(test_X)
tensor_testY = torch.Tensor(test_label_0).type(torch.LongTensor)

# Create Torch datasets
tensorset_train = torch.utils.data.TensorDataset(tensor_trainX, tensor_trainY)
tensorset_test = torch.utils.data.TensorDataset(tensor_testX, tensor_testY)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(tensorset_train, batch_size=20,shuffle=False)
test_loader = torch.utils.data.DataLoader(tensorset_test, batch_size=20, shuffle=False)

optimizer = torch.optim.SGD(mlp_model.parameters(),lr=0.005)
criterion = torch.nn.CrossEntropyLoss()
test_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(100):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    mlp_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = mlp_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
    train_loss = train_loss/len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 0.844494
Epoch: 2 	Training Loss: 0.843197
Epoch: 3 	Training Loss: 0.841609
Epoch: 4 	Training Loss: 0.840858
Epoch: 5 	Training Loss: 0.840465
Epoch: 6 	Training Loss: 0.838932
Epoch: 7 	Training Loss: 0.838825
Epoch: 8 	Training Loss: 0.837016
Epoch: 9 	Training Loss: 0.836743
Epoch: 10 	Training Loss: 0.837468
Epoch: 11 	Training Loss: 0.836967
Epoch: 12 	Training Loss: 0.836400
Epoch: 13 	Training Loss: 0.836334
Epoch: 14 	Training Loss: 0.834739
Epoch: 15 	Training Loss: 0.835276
Epoch: 16 	Training Loss: 0.834645
Epoch: 17 	Training Loss: 0.834795
Epoch: 18 	Training Loss: 0.835128
Epoch: 19 	Training Loss: 0.834249
Epoch: 20 	Training Loss: 0.834046
Epoch: 21 	Training Loss: 0.834135
Epoch: 22 	Training Loss: 0.833982
Epoch: 23 	Training Loss: 0.833966
Epoch: 24 	Training Loss: 0.833890
Epoch: 25 	Training Loss: 0.834418
Epoch: 26 	Training Loss: 0.832815
Epoch: 27 	Training Loss: 0.834343
Epoch: 28 	Training Loss: 0.833762
Epoch: 29 	Training Loss: 0.8

In [39]:
test_loader_ffn = torch.utils.data.DataLoader(tensorset_test, batch_size=1, shuffle=False)
y_pred_mlp = []
mlp_model.eval()
for i, (review, rating) in enumerate(test_loader_ffn):
    outputs = mlp_model(review)
    predicted = torch.argmax(outputs).item() 
    y_pred_mlp.append(predicted)
print("Feedforward Neural Network Accuracy (Avg Vectors) = ", findAccuracy(tensor_testY.tolist(), y_pred_mlp))

Feedforward Neural Network Accuracy (Avg Vectors) =  67.28333333333333


#### (b) Input feature as concatenated vectors 

In [52]:
def generate_input_feature_vector(reviews):
    review_words = [review.split() for review in reviews]    
    vector_size = wv.vector_size    
    num_reviews = len(reviews)
    input_features = np.zeros((num_reviews, 10*vector_size))
    
    for i, words in enumerate(review_words):
        vectors = []
        for j in range(min(len(words), 10)):
            word = words[j]
            if word in wv:
                vectors.append(wv[word])
            else:
                pass
        
        num_missing_vectors = max(0, 10 - len(vectors))
        padded_vectors = vectors + [np.zeros(vector_size)]*num_missing_vectors        
        feature_vector = np.concatenate(padded_vectors)        
        input_features[i,:] = feature_vector
    
    return input_features

input_feature = generate_input_feature_vector(balanced_data['review_body'])
train_data_concat = input_feature[:int(0.8 * len(input_feature))]
train_label_concat = balanced_data['star_rating'][:int(0.8 * len(input_feature))]
test_data_concat = input_feature[int(0.8 * len(input_feature)):]
test_label_concat =  balanced_data['star_rating'][int(0.8 * len(input_feature)):]

tensor_trainX_concat = torch.Tensor(train_data_concat)
tensor_trainY_concat = torch.Tensor(train_label_0).type(torch.LongTensor)
tensor_testX_concat = torch.Tensor(test_data_concat)
tensor_testY_concat = torch.Tensor(test_label_0).type(torch.LongTensor)

# Create Torch datasets
train_tensorset_concat = torch.utils.data.TensorDataset(tensor_trainX_concat, tensor_trainY_concat)
test_tensorset_concat = torch.utils.data.TensorDataset(tensor_testX_concat, tensor_testY_concat)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train_tensorset_concat, batch_size=20,shuffle=False)
# test_loader = torch.utils.data.DataLoader(test_tensorset_concat, batch_size=20, shuffle=False)

In [53]:
concatenated_model = Feedforward_MLP(3000)
optimizer_concat = torch.optim.Adam(concatenated_model.parameters(),lr=0.005, betas=(0.9,0.999),eps=1e-08,weight_decay=5e-5)
test_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(100):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    concatenated_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer_concat.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = concatenated_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer_concat.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss
        ))

Epoch: 1 	Training Loss: 1.005786
Epoch: 2 	Training Loss: 0.985832
Epoch: 3 	Training Loss: 0.975598
Epoch: 4 	Training Loss: 0.971429
Epoch: 5 	Training Loss: 0.965208
Epoch: 6 	Training Loss: 0.961411
Epoch: 7 	Training Loss: 0.958249
Epoch: 8 	Training Loss: 0.957097
Epoch: 9 	Training Loss: 0.956316
Epoch: 10 	Training Loss: 0.954477
Epoch: 11 	Training Loss: 0.953982
Epoch: 12 	Training Loss: 0.952753
Epoch: 13 	Training Loss: 0.952201
Epoch: 14 	Training Loss: 0.951212
Epoch: 15 	Training Loss: 0.951829
Epoch: 16 	Training Loss: 0.949436
Epoch: 17 	Training Loss: 0.950420
Epoch: 18 	Training Loss: 0.950830
Epoch: 19 	Training Loss: 0.952146
Epoch: 20 	Training Loss: 0.950115
Epoch: 21 	Training Loss: 0.954124
Epoch: 22 	Training Loss: 0.949655
Epoch: 23 	Training Loss: 0.949825
Epoch: 24 	Training Loss: 0.951329
Epoch: 25 	Training Loss: 0.949513
Epoch: 26 	Training Loss: 0.950727
Epoch: 27 	Training Loss: 0.949899
Epoch: 28 	Training Loss: 0.950814
Epoch: 29 	Training Loss: 0.9

In [54]:
test_loader_concat = torch.utils.data.DataLoader(test_tensorset_concat, batch_size=1, shuffle=False)
# concat_model = torch.load('loa.pt')
y_pred_concat = []
concatenated_model.eval()
for i, (review, rating) in enumerate(test_loader_concat):
    outputs = concatenated_model(review)
    predicted = torch.argmax(outputs).item() 
    y_pred_concat.append(predicted)
print("FNN Accuracy (concatenated words) in %  = ", findAccuracy(tensor_testY_concat.tolist(), y_pred_concat))

FNN Accuracy (concatenated words) in %  =  56.40833333333334


#### Conclusion:
**Test Accuracy values**

(a) MLP  Accuracy (avg vectors)  = 66.86%

(b) MLP Accuracy (Concatenated) = 56.4%

On running the multiple times, accuracy was varying between 65-67% for (a) and 54-56% for (b)

MLP with averaged vectors has higher accuray compared to MLP with first 10 concatenated vectors. Possible reasons could be,

* **Information loss**: The concatenation of the first 10 words as input feature may result in information loss since the first 10 words do not necessarily capture the context of the entire sentence. As a result, the model that takes this input feature may not have enough information to accurately predict the target variable. For example: Consider 'I bought this product one month ago in a sale. It is not good'. Here the actual sentiment of review is in the last part of sentence which will be discarded by model.

* **Word importance**: The first 10 words of a sentence may not always be the most important for predicting the target variable. The model that takes the average of all vectors as input feature considers all words in the sentence to be equally important, which may be more accurate for certain tasks. For example: 'It did not work for first one week of usage. But worked wonders after using for few weeks'

* **Overfitting**: The model that takes the concatenation of the first 10 words as input feature may be overfitting to the training data, since the input feature is specific to the first 10 words of the sentence. On the other hand, the model that takes the average of all vectors as input feature may be more generalizable since it is based on the entire sentence.

In comparision to Simple Models ( SVM and Perceptron), 
* MLP with average vectors performs better than both SVM and Perceptron because of its non-linear decision boundaries and hidden layers
* SVM performs bettern than MLP with concatenated input vectors (b), maily because of the loss of information cause by considering only 10 words.

## 5. Recurrent Neural Networks

##### Workflow:

I have three seperate classes for Simple RNN, Gated RNN and LSTM

1. Simple RNN:
    Classname : RNN
    Arguments: 
        * input_size : 300
        * output_size : 3
        * hidden_size : 20
        
   Optimizer : Adam optimizer with learning rate 0.0001
   
2. Gated RNN:
    Classname: GatedNet
    Arguments:
        * input_dim: 300
        * hidden_dim : 20
        * output_dim : 3
        
    Optimizer : Adam optimizer with learning rate 0.0001
    
    
3. LSTM:
    Classname: LSTMModel
    Arguments:
        * input_dim: 300
        * hidden_dim : 20
        * output_dim : 3
        
    Optimizer : Adam optimizer with learning rate 0.0001
    

##### Data generation:

Generate input feature by taking maximum review length of 20. Truncat longer reviews and pad shorter reviews with a null value

In [17]:
def generate_input_vec_20(reviews):
    sequences =  []
    for review in reviews:
        text_tokens = word_tokenize(review)        
        vectors=[]
        for i,word in enumerate(text_tokens):
            if word in wv:
                vectors.append(wv[word].reshape((1,300)))
                
        if(len(vectors)>=20):
            padded_vectors = vectors[:20]
        else:
            num_missing_vectors = 20-len(vectors)
            padded_vectors = vectors + [np.zeros((1,300))]*num_missing_vectors
            
        sequences.append(padded_vectors)
            
    return sequences
            
input_feature_rnn = generate_input_vec_20(balanced_data['review_body'])
train_data_rnn = input_feature_rnn[:int(0.8 * len(input_feature_rnn))]
test_data_rnn = input_feature_rnn[int(0.8 * len(input_feature_rnn)):]

x_train_rnn = torch.Tensor(train_data_rnn)
y_train_rnn = torch.Tensor(train_label_0).type(torch.LongTensor)
x_cv_rnn = torch.Tensor(test_data_rnn)
y_cv_rnn = torch.Tensor(test_label_0).type(torch.LongTensor)

# Create Torch datasets
train_rnn = torch.utils.data.TensorDataset(x_train_rnn, y_train_rnn)
test_rnn = torch.utils.data.TensorDataset(x_cv_rnn, y_cv_rnn)

# Create Data Loaders
train_loader_rnn = torch.utils.data.DataLoader(train_rnn, batch_size=1,shuffle=False)
test_loader_rnn = torch.utils.data.DataLoader(test_rnn, batch_size=1, shuffle=False)

### (a) Simple RNN

In [23]:
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

rnn_model = RNN(300,20,3)
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=0.0001, betas=(0.9,0.999),eps=1e-08,weight_decay=5e-5)

In [24]:
def train(line_tensor, category):
    hidden = rnn_model.initHidden()
    rnn_model.zero_grad()

    for i in range(line_tensor.shape[1]):
        inp_tensor = line_tensor[0][i]
        output, hidden = rnn_model(inp_tensor, hidden)        
    
    loss = criterion(output, category)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 5)
    optimizer.step()

    for p in rnn_model.parameters():
        p.data.add_(p.grad.data, alpha=-0.0001)
    
    return output, loss.item()

print_every = 10000
plot_every = 1000

current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for epoch in range(8):
    print_counter = 0
    for i, (review, rating) in enumerate(train_loader_rnn):
        output, loss = train(review, rating)
        current_loss += loss
        # Print iter number, loss, name and guess
        if print_counter % print_every == 0:
            guess = torch.argmax(output).item()
            correct = '✓' if guess == rating.item() else '✗ (%s)' % rating.item()
            print('%d %d%% (%s) %.4f / %s %s' % (print_counter, (print_counter / 48000) * 100, timeSince(start), loss, guess, correct))

        # Add current loss avg to list of losses
        if print_counter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
            
        print_counter += 1

0 0% (0m 0s) 1.1941 / 2 ✗ (0)
10000 20% (1m 14s) 0.5954 / 0 ✓
20000 41% (2m 33s) 2.7528 / 2 ✗ (1)
30000 62% (3m 48s) 1.3645 / 2 ✗ (1)
40000 83% (5m 1s) 1.9480 / 1 ✗ (0)
0 0% (6m 2s) 1.2555 / 1 ✗ (0)
10000 20% (7m 16s) 0.4120 / 0 ✓
20000 41% (8m 29s) 3.3803 / 2 ✗ (1)
30000 62% (9m 45s) 0.8490 / 2 ✗ (1)
40000 83% (11m 1s) 1.9096 / 1 ✗ (0)
0 0% (12m 0s) 1.4055 / 1 ✗ (0)
10000 20% (13m 14s) 0.1714 / 0 ✓
20000 41% (14m 27s) 3.7929 / 2 ✗ (1)
30000 62% (15m 41s) 1.5246 / 2 ✗ (1)
40000 83% (16m 56s) 1.0004 / 1 ✗ (0)
0 0% (18m 3s) 0.6036 / 0 ✓
10000 20% (19m 20s) 0.0353 / 0 ✓
20000 41% (20m 35s) 4.1500 / 2 ✗ (1)
30000 62% (21m 51s) 2.4909 / 2 ✗ (1)
40000 83% (23m 7s) 0.9554 / 1 ✗ (0)
0 0% (24m 8s) 0.2894 / 0 ✓
10000 20% (25m 22s) 0.0804 / 0 ✓
20000 41% (26m 38s) 4.3747 / 2 ✗ (1)
30000 62% (27m 52s) 2.1210 / 2 ✗ (1)
40000 83% (29m 7s) 1.1327 / 1 ✗ (0)
0 0% (30m 7s) 0.2013 / 0 ✓
10000 20% (31m 23s) 0.0678 / 0 ✓
20000 41% (32m 42s) 4.4907 / 2 ✗ (1)
30000 62% (33m 56s) 2.0769 / 2 ✗ (1)
40000 83% (3

In [25]:
y_pred_rnn = []
y_test_rnn = []
rnn_model.eval()
for i, (review_test_rnn, rating_test_rnn) in enumerate(test_loader_rnn):
    hidden = rnn_model.initHidden()
    for j in range(review_test_rnn.shape[1]):
        inp_tensor = review_test_rnn[0][j]
        guess, hidden = rnn_model(inp_tensor, hidden)
    guess = torch.argmax(guess).item()
    y_test_rnn.append(rating_test_rnn.item())
    y_pred_rnn.append(guess)

print("Simple RNN Accuracy in % = ", findAccuracy( y_test_rnn, y_pred_rnn))


Simple RNN Accuracy in % =  61.06666666666667


### (b) Gated RNN

In [26]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=1, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers        
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)        
        
    def forward(self, x):
        # Initializing hidden state for first input with zeros
        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim)

        # Forward propagation by passing in the input and hidden state into the model
        out, _ = self.gru(x, h0)

        # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
        # so that it can fit into the fully connected layer
        out = out[:, -1, :]

        # Convert the final state to our desired output shape (batch_size, output_dim)
        out = self.fc(out)

        return out
    
gru_model = GRUNet(300,20,3)
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.0005, betas=(0.9,0.999),eps=1e-08,weight_decay=5e-5)

In [27]:
def train_gru(line_tensor, category):
    gru_model.zero_grad()
    line_tensor = line_tensor.reshape(-1, 20, 300)
    output = gru_model(line_tensor)        
    
    loss = criterion(output, category)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(gru_model.parameters(), 5)
    optimizer.step()

    for p in gru_model.parameters():
        p.data.add_(p.grad.data, alpha=-0.005)
    
    return output, loss.item()
# Keep track of losses for plotting
train_loader_gru = torch.utils.data.DataLoader(train_rnn, batch_size=20,shuffle=False)
test_loader_gru = torch.utils.data.DataLoader(test_rnn, batch_size=20, shuffle=False)

current_loss = 0
for epoch in range(20):
    print_counter = 0
    for i, (review, rating) in enumerate(train_loader_gru):
        print_counter += 1
        output, loss = train_gru(review, rating)        
        current_loss += loss
        if print_counter%1000 == 0:
            print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, print_counter, len(train_loader_rnn), current_loss/print_counter))
            current_loss = 0
        

Epoch 0......Step: 1000/48000....... Average Loss for Epoch: 1.0269569813609123
Epoch 0......Step: 2000/48000....... Average Loss for Epoch: 0.4375462125837803
Epoch 1......Step: 1000/48000....... Average Loss for Epoch: 1.1419159276485442
Epoch 1......Step: 2000/48000....... Average Loss for Epoch: 0.39694790993630885
Epoch 2......Step: 1000/48000....... Average Loss for Epoch: 1.0919201246798038
Epoch 2......Step: 2000/48000....... Average Loss for Epoch: 0.38405050249397754
Epoch 3......Step: 1000/48000....... Average Loss for Epoch: 1.0658256910443307
Epoch 3......Step: 2000/48000....... Average Loss for Epoch: 0.37548590371012686
Epoch 4......Step: 1000/48000....... Average Loss for Epoch: 1.0467317140996457
Epoch 4......Step: 2000/48000....... Average Loss for Epoch: 0.3687788166999817
Epoch 5......Step: 1000/48000....... Average Loss for Epoch: 1.031130910217762
Epoch 5......Step: 2000/48000....... Average Loss for Epoch: 0.3631645495742559
Epoch 6......Step: 1000/48000....... A

In [28]:
y_pred_gru = []
y_test_gru = []
gru_model.eval()
for i, (review, rating) in enumerate(test_loader_rnn):
    line_tensor = review.reshape(-1, 20, 300)
    output = gru_model(line_tensor) 
    output = torch.argmax(output).item()
    y_test_gru.append(rating.item())
    y_pred_gru.append(output)

print("Gated RNN Accuracy = ", findAccuracy(y_test_gru, y_pred_gru))

Gated RNN Accuracy =  65.88333333333334


### (c) LSTM

In [29]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, layer_dim=1):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim

        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)

        out, (hn, cn) = self.lstm(x, (h0, c0))

        out = self.fc(out[:, -1, :]) 
        return out

lstm_model = LSTMModel(300, 20, 3)
optimizer_lstm = torch.optim.Adam(lstm_model.parameters(), lr=0.0005, betas=(0.9,0.999),eps=1e-08,weight_decay=5e-5)  


def train_lstm(line_tensor, category):
    lstm_model.zero_grad()
    line_tensor = line_tensor.reshape(-1, 20, 300)
    output = lstm_model(line_tensor)        
    
    loss = criterion(output, category)
    optimizer_lstm.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), 5)
    optimizer_lstm.step()

    for p in lstm_model.parameters():
        p.data.add_(p.grad.data, alpha=-0.005)
    
    return output, loss.item()

current_loss = 0
for epoch in range(20):
    print_counter = 0
    for i, (review, rating) in enumerate(train_loader_gru):
        print_counter += 1
        output, loss = train_lstm(review, rating)       
        current_loss += loss
        if print_counter%1000 == 0:
            print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, print_counter, len(train_loader_rnn), current_loss/print_counter))
            current_loss = 0

Epoch 0......Step: 1000/48000....... Average Loss for Epoch: 1.0319135826826096
Epoch 0......Step: 2000/48000....... Average Loss for Epoch: 0.4492488602101803
Epoch 1......Step: 1000/48000....... Average Loss for Epoch: 1.1977822415828705
Epoch 1......Step: 2000/48000....... Average Loss for Epoch: 0.4116702942252159
Epoch 2......Step: 1000/48000....... Average Loss for Epoch: 1.1262562407255172
Epoch 2......Step: 2000/48000....... Average Loss for Epoch: 0.3949509975016117
Epoch 3......Step: 1000/48000....... Average Loss for Epoch: 1.0924741402566434
Epoch 3......Step: 2000/48000....... Average Loss for Epoch: 0.38503675700724127
Epoch 4......Step: 1000/48000....... Average Loss for Epoch: 1.069533033490181
Epoch 4......Step: 2000/48000....... Average Loss for Epoch: 0.3772322600930929
Epoch 5......Step: 1000/48000....... Average Loss for Epoch: 1.0511162491440773
Epoch 5......Step: 2000/48000....... Average Loss for Epoch: 0.37053066992759703
Epoch 6......Step: 1000/48000....... Av

In [25]:
y_pred_lstm = []
y_test_lstm = []
lstm_model.eval()
for i, (review, rating) in enumerate(test_loader_rnn):
    line_tensor = review.reshape(-1, 20, 300)
    output = lstm_model(line_tensor) 
    output = torch.argmax(output).item()
    y_test_lstm.append(rating.item())
    y_pred_lstm.append(output)

print("LSTM Accuracy in % = ", findAccuracy(y_test_lstm, y_pred_lstm))

LSTM Accuracy in % =  66.03333333333333


#### Conclusion

**Test Accuracy values**

Simple RNN = 61.02% (varied between 59 - 61)

Gated RNN = 65.88% (varied between 65 - 67%)

LSTM = 66.03% (varied between 66-67%)

Gated RNN models are designed to address the vanishing gradient problem that can occur in simple RNN models, where the gradient signal becomes too small to propagate through the network during backpropagation. This can lead to difficulty in capturing long-term dependencies in the data. The vanishing gradient problem can occur when the recurrent weights in an RNN are repeatedly multiplied by small values causing the gradient signal to shrink exponentially over time.

Gated RNN models, on the other hand, use gating mechanisms to selectively update the hidden state and control the flow of information through the network. The gating mechanisms allow the model to remember information over longer periods of time and avoid the vanishing gradient problem.

Therefore, Gated RNN models outperform the simple RNN model, because they are better at capturing long-term dependencies in the data. 

LSTM has more parameters compared to a simple RNN model, which means that it has more capacity to learn complex patterns in the data. This can also contribute to the improved performance of LSTM over the simple RNN model.

Also, RNN has better performance overall when compared to FFN because FFN considered the average of review vectors which might suffer probelm of outliers ( too high or too low values). In contrast, RNN considered each word seperatly allowing them to better capture patterns in sequential data.

### References:

* https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
* https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook
* https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_lstm_neuralnetwork/
* https://blog.floydhub.com/gru-with-pytorch/
* https://pythonguides.com/adam-optimizer-pytorch/
* https://towardsdatascience.com/building-rnn-lstm-and-gru-for-time-series-using-pytorch-a46e5b094e7b
* https://stackoverflow.com/questions/70006954/pytorch-rnn-loss-does-not-decrease-and-validate-accuracy-remains-unchanged