## Dependencies
### Gensim - 3.4.0
### Pandas - 0.23.4
### Numpy - 1.15.4
### NLTK - 3.4
### Contractions - 0.0.52
### Pytorch - 1.9.0+cu102.
### Note: The RNN models might give poor accuracies on the first run. Re-running the cell should give accuracies close to those reported in this file. I have attached screenshots of the accuracies for all the models, as I was not able to run the entire notebook in one go.

# Imports

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import csv
import contractions
pd.set_option('display.max_colwidth', -1)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import gensim.downloader as api
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import gensim.models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import multiprocessing
from torch.autograd import Variable

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  pd.set_option('display.max_colwidth', -1)


# Reading the CSV File

In [None]:
df = pd.read_csv("data.tsv", sep='\t', error_bad_lines=False, quoting=csv.QUOTE_NONE)

## Dropping N/A Values in Reviews and Ratings

In [None]:
# Keep only the rating and the review text, and discard every row in which
# either of the two is missing.
df = df[['star_rating', 'review_body']].dropna(subset=['star_rating', 'review_body'])
# Ratings are handled as integers from here on.
df['star_rating'] = df['star_rating'].astype(int)

## Assigning Sentiment based on Reviews

In [None]:
def sentiment(star_rating):
    """Map a star rating to a sentiment class label.

    Returns 0 for a rating above 3, 1 for a rating below 3, and 2 in every
    other case (i.e. a rating of exactly 3).
    """
    if star_rating > 3:
        return 0
    return 1 if star_rating < 3 else 2
df['Sentiment'] = [sentiment(x) for x in df['star_rating']]

## Sampling 50k reviews for each rating to create a dataset of 250k Reviews

In [None]:
df_1 = df[df['star_rating'] == 1].sample(50000)
df_2 = df[df['star_rating'] == 2].sample(50000)
df_3 = df[df['star_rating'] == 3].sample(50000)
df_4 = df[df['star_rating'] == 4].sample(50000)
df_5 = df[df['star_rating'] == 5].sample(50000)

frames = [df_1, df_2, df_3, df_4, df_5]
df_sampled = pd.concat(frames)

In [None]:
#df_sampled.to_csv('250kSamples.csv', index = False)

In [None]:
#df_sampled = pd.read_csv('250kSamples.csv')

## Loading the GoogleWord2Vec model

In [None]:
wv = api.load('word2vec-google-news-300')

### Examples of Semantic Relationships of the Word Vectors using the GoogleWord2Vec Model


In [None]:
pairs = [
    ('card', 'poker'),  
    ('card', 'credit'),  
    ('card', 'birthday'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))
    
result = wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
result = wv.most_similar(positive=['woman', 'prince'], negative=['man'], topn=1)
print(result)
result = wv.most_similar(positive=['splendid', 'horrible'], negative=['good'], topn=1)
print(result)

## Data Cleaning and Preprocessing

### Converting all Reviews to Lower Case

In [None]:
df_sampled['review_body'] = df_sampled['review_body'].str.lower()

### Removing HTML Markup (note: this step only strips HTML tags; URLs written as plain text are not removed here)

In [None]:
def remove_HTML(review):
    """Return the text content of `review` with its HTML markup stripped."""
    # Parse with the built-in "html.parser" backend and keep only the text.
    return BeautifulSoup(review, "html.parser").text

df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : remove_HTML(str(review)))

### Keeping only Letters, Spaces and Apostrophes in the Reviews (digits and all other characters are removed)

In [None]:
regex = re.compile('[^a-zA-Z \']')
df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : regex.sub('', review))

### Deleting Extra spaces in the Reviews 

In [None]:
# Collapse every run of consecutive spaces into a single space.
# NOTE: the pattern is ' +' (one or more spaces), not the character class
# '[ +]', which matches a single space or a literal '+' and would therefore
# leave runs of spaces untouched.
regex = re.compile(' +')
df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : regex.sub(' ', review))

### Removing Contractions

In [None]:
def contractionfunction(s):
    """Return `s` after passing it through `contractions.fix`."""
    expanded = contractions.fix(s)
    return expanded
df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : contractionfunction(review))

### Removing Stop Words

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopWords(review):
    """Drop every token of `review` that is contained in `stop_words`.

    The review is split with `word_tokenize`; the surviving tokens are
    joined back together with single spaces.
    """
    return " ".join(
        token for token in word_tokenize(review) if token not in stop_words
    )

df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : remove_stopWords(review))

### Lemmatizing the Reviews

In [None]:
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize_text(review):
    """Lemmatize each whitespace-separated token of `review`.

    Tokens come from the module-level `tokenizer`, each one is passed to
    `lemmatizer.lemmatize`, and the results are joined with single spaces.
    """
    tokens = tokenizer.tokenize(review)
    return " ".join(map(lemmatizer.lemmatize, tokens))

df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : lemmatize_text(review))

### Removing Words from Reviews which are not part of the GoogleWord2Vec Model

In [None]:
meaningfulWords = set(wv.vocab)
def removeUnecssaryWords(review):
    """Tokenize `review` and keep the tokens known to `meaningfulWords`.

    A token is kept when its lower-cased form is a member of
    `meaningfulWords`. The result is a list of tokens, not a joined string.
    """
    kept = []
    for token in word_tokenize(review):
        if token.lower() in meaningfulWords:
            kept.append(token)
    return kept
    
df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : removeUnecssaryWords(review))

## Training our own Word2Vec Model using Gensim
### References: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
###                     https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html


In [None]:
cores = multiprocessing.cpu_count()
print(cores)

In [None]:
model = gensim.models.Word2Vec(df_sampled['review_body'], window=11, min_count=10, workers=cores-1, size=300)

### Removing Words from Reviews which are not part of our Word2Vec Model

In [None]:
meaningfulWords = set(model.wv.vocab)
def removeUnecssaryWords(review):
    """Filter an already-tokenized review against `meaningfulWords`.

    `review` is iterated token by token; a token is kept when its
    lower-cased form is a member of `meaningfulWords`. Returns a list.
    This definition replaces the earlier function of the same name.
    """
    return list(filter(lambda token: token.lower() in meaningfulWords, review))
    
df_sampled['review_body'] = df_sampled['review_body'].apply(lambda review : removeUnecssaryWords(review))

In [None]:
pairs = [
    ('card', 'poker'),  
    ('card', 'credit'),
    ('card', 'birthday'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, model.wv.similarity(w1, w2)))
result = model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
result = model.wv.most_similar(positive=['woman', 'prince'], negative=['man'], topn=1)
print(result)
result = model.wv.most_similar(positive=['splendid', 'horrible'], negative=['good'], topn=1)
print(result)

### Comparing the above results for the GoogleWord2Vec model and our trained Word2Vec model, we can see that for the specific case where we compare "card" to words like "poker", "credit" and "birthday", our Word2Vec model is able to encode semantic similarities better, as words like these might have appeared in the reviews in the same context.
### When we try more general comparisons such as woman+king-man, woman+prince-man and splendid+horrible-good, we can see that the GoogleWord2Vec model does better at giving us meaningful results (queen, princess and hideous) because it is trained on a much bigger and richer dataset. In comparison, the dataset used to train our Word2Vec model is inadequate and too specific for such general comparisons, so it gives results (wallace, groomsman, unboxing) which don't make as much semantic sense as those of the GoogleWord2Vec model.

## Removing empty Reviews from the Dataset

In [None]:
df_sampled = df_sampled[df_sampled['review_body'].map(lambda d: len(d)) > 0]

## Creating a Binary Dataset by removing Neutral Reviews

In [None]:
df_binary = df_sampled[df_sampled['Sentiment'] != 2]

## Train-Test Split for Binary and Ternary Datasets (80% Training, 20% Test)

In [None]:
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(df_binary['review_body'],df_binary['Sentiment'], test_size=0.2, random_state=0)
X_train_ternary, X_test_ternary, y_train_ternary, y_test_ternary = train_test_split(df_sampled['review_body'],df_sampled['Sentiment'], test_size=0.2, random_state=0)

### Creating TF-IDF Vectors for Reviews 


In [None]:
def makeSentences(review):
    """Join the tokens of `review` into one space-separated string."""
    separator = " "
    return separator.join(review)

X_train_binary_sentences = X_train_binary.apply(lambda review : makeSentences(review))
X_test_binary_sentences = X_test_binary.apply(lambda review : makeSentences(review))

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X_train_binary_sentences = vectorizer.fit_transform(X_train_binary_sentences)
X_test_binary_sentences = vectorizer.transform(X_test_binary_sentences)

### Main Driver Function to Create all the Required Datasets for the Upcoming Models
#### The below function is used to create 3 different types of Datasets
#### 1. Average Word Vector for the Entire Review
#### 2. First 10 Word Vectors for Each Review(if a review is less than 10 words, pad it with zeros, or if it's more than 10 words limit it to 10 words). Shape (3000)
#### 3. First 50 Word Vectors for Each Review(if a review is less than 50 words, pad it with zeros, or if it's more than 50 words limit it to 50 words). Shape (50,300)

#### We use the below function to create datasets for (Binary, Ternary) * (GoogleWord2Vec, myWord2Vec)

In [None]:
def word2VecMap(data, word2VecModel):
    """Build three embedding representations for every tokenized review.

    Args:
        data: Iterable of reviews, each an iterable of word tokens. Every
            token is looked up with `word2VecModel[word]`, so it must be
            known to the model.
        word2VecModel: Object that returns the embedding vector of a word
            via `word2VecModel[word]`.

    Returns:
        A tuple `(mean_vector_list, vector_list_10, vector_list_RNN)` with
        one entry per review in each list:
        - mean_vector_list: mean of the review's word vectors (a zero
          vector for a review without tokens, instead of NaN).
        - vector_list_10: the first 10 word vectors, right-padded with zero
          vectors, flattened to a 1-D array of length 10 * dim.
        - vector_list_RNN: a list of exactly 50 word vectors, truncated or
          right-padded with zero vectors.
    """
    # Embedding width: read from the model when it exposes `vector_size`,
    # otherwise fall back to the 300 dimensions used throughout this file.
    dim = getattr(word2VecModel, "vector_size", 300)
    zero_vector = np.zeros(dim, dtype=float)

    def _fit_length(vectors, length):
        """Truncate `vectors` to `length` entries or right-pad with zero vectors."""
        clipped = vectors[:length]
        return clipped + [zero_vector] * (length - len(clipped))

    mean_vector_list = []
    vector_list_10 = []
    vector_list_RNN = []
    for sentence in data:
        embeddings = [word2VecModel[word] for word in sentence]
        if embeddings:
            mean_vector_list.append(np.mean(embeddings, axis=0))
        else:
            # np.mean of an empty list would yield a NaN scalar and break
            # the fixed (dim,) shape the downstream models rely on.
            mean_vector_list.append(zero_vector.copy())
        vector_list_10.append(np.reshape(_fit_length(embeddings, 10), (10 * dim,)))
        vector_list_RNN.append(_fit_length(embeddings, 50))
    return mean_vector_list, vector_list_10, vector_list_RNN

#### Binary GoogleWord2Vec Word Embedding Dataset (Average, 10 word and 50 word) 

In [None]:
X_train_binary_google_vector_mean, X_train_binary_google_vector_10, X_train_binary_google_vector_50 = word2VecMap(X_train_binary, wv)
X_test_binary_google_vector_mean,X_test_binary_google_vector_10, X_test_binary_google_vector_50 = word2VecMap(X_test_binary, wv)

#### Ternary GoogleWord2Vec Word Embedding Dataset (Average, 10 word and 50 word) 

In [None]:
X_train_ternary_google_vector_mean, X_train_ternary_google_vector_10, X_train_ternary_google_vector_50 = word2VecMap(X_train_ternary, wv)
X_test_ternary_google_vector_mean, X_test_ternary_google_vector_10, X_test_ternary_google_vector_50 = word2VecMap(X_test_ternary, wv)

#### Binary myWord2Vec Word Embedding Dataset (Average, 10 word and 50 word) 

In [None]:
X_train_binary_myWord2Vec_vector_mean, X_train_binary_myWord2Vec_vector_10, X_train_binary_myWord2Vec_vector_50 = word2VecMap(X_train_binary, model.wv)
X_test_binary_myWord2Vec_vector_mean,X_test_binary_myWord2Vec_vector_10, X_test_binary_myWord2Vec_vector_50  = word2VecMap(X_test_binary, model.wv)

#### Ternary myWord2Vec Word Embedding Dataset (Average, 10 word and 50 word) 

In [None]:
X_train_ternary_myWord2Vec_vector_mean, X_train_ternary_myWord2Vec_vector_10, X_train_ternary_myWord2Vec_vector_50 = word2VecMap(X_train_ternary, model.wv)
X_test_ternary_myWord2Vec_vector_mean, X_test_ternary_myWord2Vec_vector_10, X_test_ternary_myWord2Vec_vector_50 = word2VecMap(X_test_ternary, model.wv)

## Perceptron

In [None]:
from sklearn.linear_model import Perceptron

clf = Perceptron(random_state=36)

clf.fit(X_train_binary_google_vector_mean, y_train_binary)

y_pred_train = clf.predict(X_train_binary_google_vector_mean)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_google_vector_mean)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f for Perceptron on Binary-Google" % (accuracy_test))

clf = Perceptron(random_state=4)

clf.fit(X_train_binary_myWord2Vec_vector_mean, y_train_binary)

y_pred_train = clf.predict(X_train_binary_myWord2Vec_vector_mean)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_myWord2Vec_vector_mean)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f for Perceptron on Binary-myWord2Vec" % (accuracy_test))

clf = Perceptron(random_state=0)

clf.fit(X_train_binary_sentences, y_train_binary)

y_pred_train = clf.predict(X_train_binary_sentences)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_sentences)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f for Perceptron on Binary- TfIdf" % (accuracy_test))

### Perceptron (Binary-Google) - 80%
### Perceptron (Binary-myWord2Vec) - 81.40%
### Perceptron (Binary- TfIdf) - 81.65%

#### We can see that the TF-IDF feature dataset did better than the Word2Vec embeddings when using the Perceptron as the classifier. We can also see that our Word2Vec model gave a better accuracy than the GoogleWord2Vec model; this might be because our Word2Vec model is trained specifically on the dataset we are using, so it is able to model that dataset better.

# SVM

In [None]:
from sklearn.svm import LinearSVC

clf = LinearSVC()

clf.fit(X_train_binary_google_vector_mean, y_train_binary)

y_pred_train = clf.predict(X_train_binary_google_vector_mean)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_google_vector_mean)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f Precision %2.4f Recall %2.4f and f1-score %2.4f for SVM on test data" % (accuracy_test, precision_score_test, recall_score_test, f1_score_test))

clf = LinearSVC()

clf.fit(X_train_binary_myWord2Vec_vector_mean, y_train_binary)

y_pred_train = clf.predict(X_train_binary_myWord2Vec_vector_mean)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_myWord2Vec_vector_mean)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f Precision %2.4f Recall %2.4f and f1-score %2.4f for SVM on test data" % (accuracy_test, precision_score_test, recall_score_test, f1_score_test))

clf = LinearSVC()

clf.fit(X_train_binary_sentences, y_train_binary)

y_pred_train = clf.predict(X_train_binary_sentences)
accuracy_train = accuracy_score(y_train_binary, y_pred_train)
precision_score_train = precision_score(y_train_binary, y_pred_train)
recall_score_train = recall_score(y_train_binary, y_pred_train)
f1_score_train = f1_score(y_train_binary, y_pred_train)

y_pred_test= clf.predict(X_test_binary_sentences)
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_score_test = precision_score(y_test_binary, y_pred_test)
recall_score_test = recall_score(y_test_binary, y_pred_test)
f1_score_test = f1_score(y_test_binary, y_pred_test)

print("Accuracy %2.4f Precision %2.4f Recall %2.4f and f1-score %2.4f for SVM on test data" % (accuracy_test, precision_score_test, recall_score_test, f1_score_test))


### SVM (Binary-Google) - 81.74%
### SVM (Binary-myWord2Vec) - 84.61%
### SVM (Binary- TfIdf) - 86.99%

#### We can see that this is the exact same case as the Perceptron where the TF-IDF feature set did better than the word2Vec models and also our word2Vec Model did better than the GoogleWord2Vec Model


# FNN
### References: https://medium.com/analytics-vidhya/pytorch-for-deep-learning-binary-classification-logistic-regression-382abd97fb43

### Scaling the Mean vector of a Review for better performance

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
X_train_binary_google_vector_mean_scaled = sc.fit_transform(X_train_binary_google_vector_mean)
X_test_binary_google_vector_mean_scaled = sc.transform(X_test_binary_google_vector_mean)

In [None]:
X_train_ternary_google_vector_mean_scaled = sc.fit_transform(X_train_ternary_google_vector_mean)
X_test_ternary_google_vector_mean_scaled = sc.transform(X_test_ternary_google_vector_mean)

In [None]:
X_train_binary_myWord2Vec_vector_mean_scaled = sc.fit_transform(X_train_binary_myWord2Vec_vector_mean)
X_test_binary_myWord2Vec_vector_mean_scaled = sc.transform(X_test_binary_myWord2Vec_vector_mean)

In [None]:
X_train_ternary_myWord2Vec_vector_mean_scaled = sc.fit_transform(X_train_ternary_myWord2Vec_vector_mean)
X_test_ternary_myWord2Vec_vector_mean_scaled = sc.transform(X_test_ternary_myWord2Vec_vector_mean)

### Driver Class(dataset) for Datasets which derives from the Dataset class of Pytorch and which implements basic functions such as lookup, length and converting to Pytorch Tensors

In [None]:
class dataset(Dataset):
    """In-memory feature/label pairs exposed through the PyTorch Dataset interface."""

    def __init__(self, x, y):
        # Features and labels are both stored as float32 tensors.
        features = torch.tensor(x, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.float32)
        self.x, self.y = features, labels
        # Number of samples = size of the first feature dimension.
        self.length = features.shape[0]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # Return the (features, label) pair stored at position `idx`.
        return self.x[idx], self.y[idx]

### Using the GPU

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Converting our datasets to Dataloader classes to enable batching

In [None]:
trainset = dataset(X_train_binary_google_vector_mean_scaled,np.array(y_train_binary))
train_loader = DataLoader(trainset,batch_size=64,shuffle=False)

testset = dataset(X_test_binary_google_vector_mean_scaled,np.array(y_test_binary))
test_loader = DataLoader(testset,batch_size=1,shuffle=False)

trainset_myWord2Vec = dataset(X_train_binary_myWord2Vec_vector_mean_scaled,np.array(y_train_binary))
train_loader_myWord2Vec = DataLoader(trainset_myWord2Vec,batch_size=64,shuffle=False)

testset_myWord2Vec = dataset(X_test_binary_myWord2Vec_vector_mean_scaled,np.array(y_test_binary))
test_loader_myWord2Vec = DataLoader(testset_myWord2Vec,batch_size=1,shuffle=False)

### FNN for Binary Classification
#### Input Layer Dimension: 300
#### Hidden Layer Dimensions: 50 and 10
#### Output Layer Dimension: 1
#### Batch Normalisation Layer 1:  50
#### Batch Normalisation Layer 2: 10
#### Using Relu between Input and Hidden layers and Hidden layer and output

In [None]:
class Net(nn.Module):
    """Feed-forward binary classifier: 300 -> 50 -> 10 -> 1 (raw logit output)."""

    def __init__(self):
        super().__init__()
        # Fully connected stack; the network expects 300 input features.
        self.layer_1 = nn.Linear(in_features=300, out_features=50)
        self.layer_2 = nn.Linear(in_features=50, out_features=10)
        self.layer_out = nn.Linear(in_features=10, out_features=1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(num_features=50)
        self.batchnorm2 = nn.BatchNorm1d(num_features=10)

    def forward(self, inputs):
        """Return one un-activated output value per input row."""
        # Each hidden stage is Linear -> ReLU -> BatchNorm.
        hidden_1 = self.batchnorm1(self.relu(self.layer_1(inputs)))
        hidden_2 = self.batchnorm2(self.relu(self.layer_2(hidden_1)))
        # Dropout is applied only before the output layer.
        return self.layer_out(self.dropout(hidden_2))

## FNN Google Binary Average

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
learning_rate = 0.001
epochs = 10

model_google = Net()
model_google.to(device)
optimizer = torch.optim.Adam(model_google.parameters(),lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

### Helper function to get accuracy for binary classes
#### 1. Apply Sigmoid on the predicted value to squash it between 0 and 1
#### 2. Then round off the value to either 1 or 0
#### 3. Compare with the actual value and compute accuracy by dividing total number of correctly predicted values with total number of values

In [None]:
def binary_acc(y_pred, y_test):
    """Return the percentage of predictions in `y_pred` that match `y_test`.

    `y_pred` is passed through a sigmoid and rounded to obtain 0/1 tags,
    which are compared element-wise with `y_test`. The number of matches is
    divided by `y_test.shape[0]` and scaled to a percentage (a tensor).
    """
    predicted_tags = torch.sigmoid(y_pred).round()
    matches = predicted_tags.eq(y_test).sum().float()
    return matches / y_test.shape[0] * 100

### Training the model
#### Outer for loop runs for number of epochs specified
#### Inner for loop runs for number of batches in Train set
#### Within the inner for loop, for each batch: zero out the gradients, predict values, compute the loss, backpropagate it and update the weights

In [None]:
model_google.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        #print(X_batch.size())
        y_pred = model_google(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

### Testing the model 
#### Make sure the gradients are not updated by computing test set values under torch.no_grad()
#### Get the list of predicted values and compare against actual values to get the test accuracy of the model

In [None]:
y_pred_list = [] 

model_google.eval()
with torch.no_grad():
    for X_batch,y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model_google(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### FNN(Average-Binary-Google) - 84.2%

## FNN myWord2Vec Binary Average

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
learning_rate = 0.001
epochs = 10

model_myWord2Vec = Net()
model_myWord2Vec.to(device)
optimizer = torch.optim.Adam(model_myWord2Vec.parameters(),lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

#print(model_myWord2Vec)

In [None]:
model_myWord2Vec.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_myWord2Vec:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model_myWord2Vec(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_myWord2Vec):.3f}')

In [None]:
y_pred_list = [] 

model_myWord2Vec.eval()
with torch.no_grad():
    for X_batch,y_batch in test_loader_myWord2Vec:
        X_batch = X_batch.to(device)
        y_test_pred = model_myWord2Vec(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### FNN(Average-Binary-myWord2Vec) - 86.25%


## FNN Google Ternary Average
### References: https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist

### FNN for Ternary Classification
#### Input Layer Dimension: 300
#### Hidden Layer Dimensions: 50 and 10
#### Output Layer Dimension: 3
#### Batch Normalisation Layer 1:  50
#### Batch Normalisation Layer 2: 10
#### Using Relu between Input and Hidden layers, Hidden layer and output

In [None]:
trainset_ternary = dataset(X_train_ternary_google_vector_mean_scaled,np.array(y_train_ternary))
train_loader_ternary = DataLoader(trainset_ternary,batch_size=128,shuffle=False)

testset_ternary = dataset(X_test_ternary_google_vector_mean_scaled,np.array(y_test_ternary))
test_loader_ternary = DataLoader(testset_ternary,batch_size=1,shuffle=False)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

class MultiClassNN(nn.Module):
    """Feed-forward three-class classifier: 300 -> 50 -> 10 -> 3 (raw scores)."""

    def __init__(self):
        super().__init__()
        # Fully connected stack; the network expects 300 input features.
        self.layer_1 = nn.Linear(in_features=300, out_features=50)
        self.layer_2 = nn.Linear(in_features=50, out_features=10)
        self.layer_out = nn.Linear(in_features=10, out_features=3)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(num_features=50)
        self.batchnorm2 = nn.BatchNorm1d(num_features=10)

    def forward(self, inputs):
        """Return three un-activated class scores per input row."""
        # Each hidden stage is Linear -> ReLU -> BatchNorm.
        hidden_1 = self.batchnorm1(self.relu(self.layer_1(inputs)))
        hidden_2 = self.batchnorm2(self.relu(self.layer_2(hidden_1)))
        # Dropout is applied only before the output layer.
        return self.layer_out(self.dropout(hidden_2))

### Helper function to get accuracy for Ternary classes
#### 1. Apply log-softmax on the predicted values to get the log-probabilities of each class
#### 2. Get the index of the class with maximum probability
#### 3. Compare with the actual value and compute accuracy by dividing total number of correctly predicted values with total number of values

In [None]:
def multi_acc(y_pred, y_test):
    """Return the accuracy of `y_pred` against `y_test` as a whole-number percentage.

    The predicted class of each row is the index of the largest
    log-softmax value along dim 1. The fraction of rows whose predicted
    class equals `y_test` is scaled to a percentage and rounded (a tensor).
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    y_pred_tags = log_probs.max(dim=1).indices
    hits = (y_pred_tags == y_test).float()
    return torch.round(hits.sum() / len(hits) * 100)

### Learning Rate = 0.01
### Epochs = 50
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss

In [None]:
learning_rate = 0.01
epochs = 50

model_ternary = MultiClassNN()
model_ternary.to(device)
optimizer = torch.optim.Adam(model_ternary.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

#print(model_ternary)

In [None]:
model_ternary.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_ternary:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        y_pred = model_ternary(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_ternary):.5f} | Acc: {epoch_acc/len(train_loader_ternary):.3f}')

In [None]:
y_pred_list = [] 

model_ternary.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_ternary:
        X_batch = X_batch.to(device)
        y_test_pred = model_ternary(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### FNN(Average-Ternary-Google) - 67.97%

##  FNN myWord2Vec Ternary Average

In [None]:
trainset_ternary_myWord2Vec = dataset(X_train_ternary_myWord2Vec_vector_mean_scaled,np.array(y_train_ternary))
train_loader_ternary_myWord2Vec = DataLoader(trainset_ternary_myWord2Vec,batch_size=128,shuffle=False)

testset_ternary_myWord2Vec = dataset(X_test_ternary_myWord2Vec_vector_mean_scaled,np.array(y_test_ternary))
test_loader_ternary_myWord2Vec = DataLoader(testset_ternary_myWord2Vec,batch_size=1,shuffle=False)

### Learning Rate = 0.01
### Epochs = 50
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss

In [None]:
learning_rate = 0.01
epochs = 50

model_ternary_myWord2Vec = MultiClassNN()
model_ternary_myWord2Vec.to(device)
optimizer = torch.optim.Adam(model_ternary_myWord2Vec.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

#print(model_ternary_myWord2Vec)

In [None]:
model_ternary_myWord2Vec.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_ternary_myWord2Vec:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)        
        y_pred = model_ternary_myWord2Vec(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_ternary_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_ternary_myWord2Vec):.3f}')

In [None]:
y_pred_list = [] 

model_ternary_myWord2Vec.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_ternary_myWord2Vec:
        X_batch = X_batch.to(device)
        y_test_pred = model_ternary_myWord2Vec(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### FNN(Average-Ternary-myWord2Vec) - 71.1%


### FNN(Average-Binary-Google) - 84.2%
### FNN(Average-Binary-myWord2Vec) - 86.25%
### FNN(Average-Ternary-Google) - 67.97%
### FNN(Average-Ternary-myWord2Vec) - 71.1%

### Comparing the accuracies of the simple models with those of the Feed-Forward Neural Networks (binary classification), we can see that the FNN achieves a higher overall accuracy (86.25% for myWord2Vec and 84.2% for Google Word2Vec) than both the Perceptron (81.40% for myWord2Vec and 80% for Google Word2Vec) and the SVM (84.61% for myWord2Vec and 81.74% for Google Word2Vec).

### The ternary classification models have a lower accuracy than the binary models because of the ambiguity of the neutral class. Intuitively, it is difficult to determine whether a review is neutral, because a review usually discusses both the pros and the cons of the product. This explains why even our models are not able to pick up this differentiation successfully. We are also limited to a lower number of neutral reviews (50k) compared to positive and negative reviews (100k each).

# FNN 10 Word Vectors

In [None]:
trainset_10 = dataset(X_train_binary_google_vector_10,np.array(y_train_binary))
train_loader_10 = DataLoader(trainset_10,batch_size=64,shuffle=False)

testset_10 = dataset(X_test_binary_google_vector_10,np.array(y_test_binary))
test_loader_10 = DataLoader(testset_10,batch_size=1,shuffle=False)

### FNN for Binary Classification of 10 word reviews
#### Input Layer Dimension: 3000 (10x300 tensor is reshaped to 3000)
#### Hidden Layer Dimension: 50
#### Output Layer Dimension: 1
#### Batch Normalisation Layer 1:  50
#### Batch Normalisation Layer 2: 10
#### Using Relu between Input and Hidden layers and Hidden layer and output

In [None]:
class Net_10(nn.Module):
    """Feed-forward binary classifier for flattened 10-word reviews.

    The network maps a 3000-feature input through two hidden layers
    (3000 -> 50 -> 10), each followed by ReLU and batch normalisation,
    applies dropout (p=0.1) and projects to a single output unit.
    No sigmoid is applied here: the output is a raw logit.
    """

    def __init__(self):
        super().__init__()

        # Fully connected layers.
        self.layer_1 = nn.Linear(3000, 50)
        self.layer_2 = nn.Linear(50, 10)
        self.layer_out = nn.Linear(10, 1)

        # Activation, regularisation and normalisation layers.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(50)
        self.batchnorm2 = nn.BatchNorm1d(10)

    def forward(self, inputs):
        """Return one logit per row of ``inputs`` (expected width: 3000)."""
        # Each hidden block is Linear -> ReLU -> BatchNorm.
        hidden_1 = self.batchnorm1(self.relu(self.layer_1(inputs)))
        hidden_2 = self.batchnorm2(self.relu(self.layer_2(hidden_1)))

        # Dropout is applied only once, right before the output projection.
        return self.layer_out(self.dropout(hidden_2))

## FNN Google Binary 10Word

### Learning Rate = 0.0001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
learning_rate = 0.0001
epochs = 10

model_google_10 = Net_10()
model_google_10.to(device)
optimizer = torch.optim.Adam(model_google_10.parameters(),lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

In [None]:
model_google_10.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_10:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        #print(X_batch.size())
        y_pred = model_google_10(X_batch)
        #print(y_pred)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_10):.5f} | Acc: {epoch_acc/len(train_loader_10):.3f}')

In [None]:
y_pred_list = [] 

model_google_10.eval()
with torch.no_grad():
    for X_batch,y_batch in test_loader_10:
        X_batch = X_batch.to(device)
        y_test_pred = model_google_10(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### FNN(10Word-Binary-Google) - 75.15%


## FNN myWord2Vec Binary 10Word

In [None]:
trainset_myWord2Vec_10 = dataset(X_train_binary_myWord2Vec_vector_10,np.array(y_train_binary))
train_loader_myWord2Vec_10 = DataLoader(trainset_myWord2Vec_10,batch_size=32,shuffle=False)

testset_myWord2Vec_10 = dataset(X_test_binary_myWord2Vec_vector_10,np.array(y_test_binary))
test_loader_myWord2Vec_10 = DataLoader(testset_myWord2Vec_10,batch_size=1,shuffle=False)

### Learning Rate = 0.0001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
learning_rate = 0.0001
epochs = 10

model_myWord2Vec_10 = Net_10()
model_myWord2Vec_10.to(device)
optimizer = torch.optim.Adam(model_myWord2Vec_10.parameters(),lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Train the binary 10-word FNN on the myWord2Vec features.
# The model being optimised here is model_myWord2Vec_10, so that is the model
# that must be switched to training mode (the cell previously toggled
# model_google_10 by mistake, which left this model in eval mode whenever the
# cell was re-run after evaluation).
model_myWord2Vec_10.train()
for epoch in range(epochs):
    # Running totals for this epoch; only used by the optional progress print.
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader_myWord2Vec_10:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model_myWord2Vec_10(X_batch)

        # The model emits one logit per row, so the labels get a matching
        # trailing dimension before the loss/accuracy are computed.
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_myWord2Vec_10):.5f} | Acc: {epoch_acc/len(train_loader_myWord2Vec_10):.3f}')

In [None]:
y_pred_list = [] 

model_myWord2Vec_10.eval()
with torch.no_grad():
    for X_batch,y_batch in test_loader_myWord2Vec_10:
        X_batch = X_batch.to(device)
        y_test_pred = model_myWord2Vec_10(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### FNN(10Word-Binary-myWord2Vec) - 76.8%

## FNN Google Ternary 10Word


In [None]:
trainset_google_10_ternary = dataset(X_train_ternary_google_vector_10,np.array(y_train_ternary))
train_loader_google_10_ternary= DataLoader(trainset_google_10_ternary,batch_size=32,shuffle=False)

testset_google_10_ternary = dataset(X_test_ternary_google_vector_10,np.array(y_test_ternary))
test_loader_google_10_ternary = DataLoader(testset_google_10_ternary,batch_size=1,shuffle=False)

### FNN for Ternary Classification of 10 word reviews
#### Input Layer Dimension: 3000 (10x300 tensor is reshaped to 3000)
#### Hidden Layer Dimension: 50
#### Output Layer Dimension: 3
#### Batch Normalisation Layer 1:  50
#### Batch Normalisation Layer 2: 10
#### Using Relu between Input and Hidden layers and Hidden layer and output

In [None]:
class MultiClassNN_10(nn.Module):
    """Feed-forward three-class classifier for flattened 10-word reviews.

    Layout: 3000 -> 50 -> 10 -> 3. Both hidden layers are followed by ReLU
    and batch normalisation; dropout (p=0.1) is applied before the output
    layer. The three outputs are raw class scores (no softmax).
    """

    def __init__(self):
        super().__init__()

        # Fully connected layers.
        self.layer_1 = nn.Linear(3000, 50)
        self.layer_2 = nn.Linear(50, 10)
        self.layer_out = nn.Linear(10, 3)

        # Activation, regularisation and normalisation layers.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(50)
        self.batchnorm2 = nn.BatchNorm1d(10)

    def forward(self, inputs):
        """Return three class scores per row of ``inputs`` (expected width: 3000)."""
        # First hidden block: Linear -> ReLU -> BatchNorm.
        first = self.batchnorm1(self.relu(self.layer_1(inputs)))

        # Second hidden block, followed by dropout.
        second = self.batchnorm2(self.relu(self.layer_2(first)))
        regularised = self.dropout(second)

        return self.layer_out(regularised)

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy

In [None]:
learning_rate = 0.001
epochs = 10

model_ternary_10 = MultiClassNN_10()
model_ternary_10.to(device)
optimizer = torch.optim.Adam(model_ternary_10.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

#print(model_ternary_10)

In [None]:
model_ternary_10.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_google_10_ternary:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        y_pred = model_ternary_10(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_google_10_ternary):.5f} | Acc: {epoch_acc/len(train_loader_google_10_ternary):.3f}')

In [None]:
y_pred_list = [] 

model_ternary_10.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_google_10_ternary:
        X_batch = X_batch.to(device)
        y_test_pred = model_ternary_10(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### FNN(10Word-Ternary-Google) - 60.1%


## FNN myWord2Vec Ternary 10Word

In [None]:
trainset_myWord2Vec_10_ternary = dataset(X_train_ternary_myWord2Vec_vector_10,np.array(y_train_ternary))
train_loader_myWord2Vec_10_ternary= DataLoader(trainset_myWord2Vec_10_ternary,batch_size=32,shuffle=False)

testset_myWord2Vec_10_ternary = dataset(X_test_ternary_myWord2Vec_vector_10,np.array(y_test_ternary))
test_loader_myWord2Vec_10_ternary = DataLoader(testset_myWord2Vec_10_ternary,batch_size=1,shuffle=False)

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy

In [None]:
learning_rate = 0.001
epochs = 10

model_ternary_myWord2Vec_10 = MultiClassNN_10()
model_ternary_myWord2Vec_10.to(device)
optimizer = torch.optim.Adam(model_ternary_myWord2Vec_10.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

#print(model_ternary_10)

In [None]:
model_ternary_myWord2Vec_10.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_myWord2Vec_10_ternary:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        y_pred = model_ternary_myWord2Vec_10(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_myWord2Vec_10_ternary):.5f} | Acc: {epoch_acc/len(train_loader_myWord2Vec_10_ternary):.3f}')

In [None]:
y_pred_list = [] 

model_ternary_myWord2Vec_10.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_myWord2Vec_10_ternary:
        X_batch = X_batch.to(device)
        y_test_pred = model_ternary_myWord2Vec_10(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### FNN(10Word-Ternary-myWord2Vec) - 61.7%


### FNN(10Word-Binary-Google) - 75.15%
### FNN(10Word-Binary-myWord2Vec) - 76.8%
### FNN(10Word-Ternary-Google) - 60.1%
### FNN(10Word-Ternary-myWord2Vec) - 61.7%

## We can see that when we use just the first 10 words of the review, the accuracy suffers. The first 10 words might not be enough to determine the sentiment of a review, as reviews are usually much longer than 10 words. Another factor that might be affecting the accuracy is that we pad the input with zero vectors when a review has fewer than 10 words.

# RNN
### References: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [None]:
trainset_50 = dataset(X_train_binary_google_vector_50,np.array(y_train_binary))
train_loader_50 = DataLoader(trainset_50,batch_size=64,shuffle=False)

testset_50 = dataset(X_test_binary_google_vector_50,np.array(y_test_binary))
test_loader_50 = DataLoader(testset_50,batch_size=1,shuffle=False)

### RNN for Binary Classification of 50 word reviews
#### Input Layer Dimension: 300 (shape of input - batch_size x 50 x 300)
#### Hidden State Size Dimension: 50
#### Output Layer Dimension: 1
#### Using Relu between RNN and Full Connected Layer

In [None]:
class RNN(nn.Module):
    """Single-layer vanilla RNN followed by a linear read-out.

    Args:
        input_dim: size of each time-step feature vector.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Classify each sequence in ``x`` from the RNN's final hidden state."""
        # Only the final hidden state is used; per-step outputs are discarded.
        _, final_hidden = self.rnn(x)

        # The hidden state carries a leading layer dimension of size 1
        # (single-layer RNN); drop it to get one row per sequence.
        activated = self.relu(final_hidden.squeeze(0))

        return self.fc(activated)

## RNN Google Binary 

### Learning Rate = 0.0001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 1
epochs = 10

net_RNN = RNN(Input_dim, Hidden_dim, Output_dim)
net_RNN = net_RNN.to(device)

optimizer = torch.optim.Adam(net_RNN.parameters(),lr=0.0001)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#print(net_RNN)

In [None]:
net_RNN.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_50:
        optimizer.zero_grad()
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = net_RNN(X_batch).squeeze(1)
        
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50):.5f} | Acc: {epoch_acc/len(train_loader_50):.3f}')

In [None]:
y_pred_list = [] 

net_RNN.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_50:
        X_batch = X_batch.to(device)
        y_test_pred = net_RNN(X_batch).squeeze(1)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### RNN (Binary-Google) - 83%


## RNN myWord2Vec Binary 

In [None]:
trainset_50_myWord2Vec = dataset(X_train_binary_myWord2Vec_vector_50,np.array(y_train_binary))
train_loader_50_myWord2Vec = DataLoader(trainset_50_myWord2Vec,batch_size=64,shuffle=False)

testset_50_myWord2Vec = dataset(X_test_binary_myWord2Vec_vector_50,np.array(y_test_binary))
test_loader_50_myWord2Vec = DataLoader(testset_50_myWord2Vec,batch_size=1,shuffle=False)

### Learning Rate = 0.0001
### Epochs = 30
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 1
epochs = 30

net_RNN_myWord2Vec = RNN(Input_dim, Hidden_dim, Output_dim)
net_RNN_myWord2Vec = net_RNN_myWord2Vec.to(device)

optimizer = torch.optim.Adam(net_RNN_myWord2Vec.parameters(),lr=0.0001)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#print(net_RNN_myWord2Vec)

In [None]:
net_RNN_myWord2Vec.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_50_myWord2Vec:
        optimizer.zero_grad()
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = net_RNN_myWord2Vec(X_batch).squeeze(1)
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_50_myWord2Vec):.3f}')

In [None]:
y_pred_list = [] 

net_RNN_myWord2Vec.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_50_myWord2Vec:
        X_batch = X_batch.to(device)
        y_test_pred = net_RNN_myWord2Vec(X_batch).squeeze(1)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### RNN (Binary-myWord2Vec) - 78.31%


## RNN Google Ternary 

In [None]:
trainset_50_ternary = dataset(X_train_ternary_google_vector_50,np.array(y_train_ternary))
train_loader_50_ternary = DataLoader(trainset_50_ternary,batch_size=64,shuffle=False)

testset_50_ternary = dataset(X_test_ternary_google_vector_50,np.array(y_test_ternary))
test_loader_50_ternary = DataLoader(testset_50_ternary,batch_size=1,shuffle=False)

### RNN for Ternary Classification of 50 word reviews
#### Input Layer Dimension: 300 (shape of input - batch_size x 50 x 300)
#### Hidden State Size Dimension: 50
#### Output Layer Dimension: 3
#### Using Relu between RNN and Full Connected Layer

In [None]:
class RNN_Ternary(nn.Module):
    """Single-layer vanilla RNN with a linear read-out, used for three classes.

    Args:
        input_dim: size of each time-step feature vector.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of class scores produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class scores computed from the RNN's final hidden state."""
        # Per-step outputs are not needed, only the final hidden state.
        _, last_state = self.rnn(x)
        # Drop the leading single-layer dimension before the read-out.
        features = self.relu(last_state.squeeze(0))
        return self.fc(features)

### Learning Rate = 0.0001
### Epochs = 50
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 3
epochs = 50

net_RNN_ternary = RNN_Ternary(Input_dim, Hidden_dim, Output_dim)
net_RNN_ternary = net_RNN_ternary.to(device)

optimizer = torch.optim.Adam(net_RNN_ternary.parameters(),lr=0.0001)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

#print(net_RNN_ternary)

In [None]:
net_RNN_ternary.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_50_ternary:
        optimizer.zero_grad()
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = net_RNN_ternary(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_ternary):.5f} | Acc: {epoch_acc/len(train_loader_50_ternary):.3f}')

In [None]:
y_pred_list = [] 

net_RNN_ternary.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_50_ternary:
        X_batch = X_batch.to(device)
        y_test_pred = net_RNN_ternary(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### RNN(Ternary-Google) - 69%


## RNN myWord2Vec Ternary



In [None]:
trainset_50_ternary_myWord2Vec = dataset(X_train_ternary_myWord2Vec_vector_50,np.array(y_train_ternary))
train_loader_50_ternary_myWord2Vec = DataLoader(trainset_50_ternary_myWord2Vec,batch_size=64,shuffle=False)

testset_50_ternary_myWord2Vec = dataset(X_test_ternary_myWord2Vec_vector_50,np.array(y_test_ternary))
test_loader_50_ternary_myWord2Vec = DataLoader(testset_50_ternary_myWord2Vec,batch_size=1,shuffle=False)

### Learning Rate = 0.0001
### Epochs = 50
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 3
epochs = 50

net_RNN_ternary_myWord2Vec = RNN_Ternary(Input_dim, Hidden_dim, Output_dim)
net_RNN_ternary_myWord2Vec = net_RNN_ternary_myWord2Vec.to(device)

optimizer = torch.optim.Adam(net_RNN_ternary_myWord2Vec.parameters(),lr=0.0001)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

#print(net_RNN_ternary_myWord2Vec)

In [None]:
net_RNN_ternary_myWord2Vec.train()
for epoch in range(epochs):    
    epoch_loss = 0
    epoch_acc = 0    
    for X_batch, y_batch in train_loader_50_ternary_myWord2Vec:
        optimizer.zero_grad()
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = net_RNN_ternary_myWord2Vec(X_batch)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_ternary_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_50_ternary_myWord2Vec):.3f}')

In [None]:
y_pred_list = [] 

net_RNN_ternary_myWord2Vec.eval()
with torch.no_grad():
    for X_batch, _ in test_loader_50_ternary_myWord2Vec:
        X_batch = X_batch.to(device)
        y_test_pred = net_RNN_ternary_myWord2Vec(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### RNN(Ternary-myWord2Vec) - 61.80%


### RNN (Binary-Google) - 83%
### RNN (Binary-myWord2Vec) - 78.31%
### RNN (Ternary-Google) - 69%
### RNN (Ternary-myWord2Vec) - 61.80%

## Comparing the RNN accuracies to the FNN accuracies, we can see that the accuracies are lower in the case of the RNN. This might be caused by vanishing/exploding gradients: the earliest time steps either miss out on updates (when the gradients are very small) or dominate the final result (when the gradients are very large). The gradients can be clipped manually to prevent the exploding case, but we can also address the problem by using an LSTM/GRU cell instead. We also see that in this case the Google Word2Vec model does better in both the binary and ternary cases. This might be because the RNN takes the states of the previous inputs into account, and the Google Word2Vec embeddings are able to generalize over this history of words better than the embeddings of our own Word2Vec model.

# GRU Google Binary 
### References: https://blog.floydhub.com/gru-with-pytorch/
### https://github.com/hpanwar08/sentence-classification-pytorch/blob/master/Sentiment%20analysis%20pytorch.ipynb

### GRU for Binary Classification of 50 word reviews
#### Input Layer Dimension: 300 (shape of input - batch_size x 50 x 300)
#### Number of GRU layers:  1
#### We have to initialize a hidden layer of zeros with the shape (1, batch_size, 50) which is given as an input to the GRU cell. 
#### Hidden State Size Dimension: 50
#### Output Layer Dimension: 1
#### The last row of the hidden layer in our Model will have the predicted output
#### The last row is sent through a Relu function after which it's sent to the Fully Connected Layer

In [None]:
class GRU(nn.Module):
    """GRU sequence classifier with a linear read-out on the last layer's state.

    Args:
        input_dim: size of each time-step feature vector.
        hidden_dim: size of the GRU hidden state.
        n_layers: number of stacked GRU layers.
        output_dim: number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, output_dim):
        super().__init__()

        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim

        # batch_first=True: inputs are laid out as (batch, time, features).
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h, batch_size):
        """Run the GRU over ``x`` starting from a fresh zero hidden state.

        Args:
            x: input batch laid out as (batch, time, input_dim).
            h: accepted for interface compatibility only. It is NOT fed to
                the GRU; it is returned to the caller unchanged.
            batch_size: number of sequences in ``x``; used to size the
                zero initial state.

        Returns:
            A ``(pred, h)`` tuple: ``pred`` holds ``output_dim`` values per
            sequence, ``h`` is the argument passed in.
        """
        # Every call starts from zeros, so no state leaks between batches.
        self.h = self.init_hidden(batch_size)

        # Per-step outputs are discarded; only the final hidden state is used.
        _, self.h = self.gru(x, self.h)

        # self.h[-1] is the final state of the last (top-most) GRU layer.
        pred = self.fc(self.relu(self.h[-1]))

        return pred, h

    def init_hidden(self, batch_size):
        """Return a zero initial state shaped (n_layers, batch_size, hidden_dim).

        The tensor is created with the dtype and device of the model's own
        parameters, so it works on CPU as well as on GPU and follows the
        model when it is moved with ``.to(device)``.
        """
        reference = next(self.parameters())
        return reference.new_zeros((self.n_layers, batch_size, self.hidden_dim))

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss
### Batch Size = 64

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 1
n_layers = 1
epochs = 10
batch_size = 64

net_GRU = GRU(Input_dim, Hidden_dim, n_layers, Output_dim)
net_GRU = net_GRU.to(device)

optimizer = torch.optim.Adam(net_GRU.parameters(),lr=0.001)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#print(net_GRU)

In [None]:
# Train the binary GRU classifier on the 50-word Google Word2Vec sequences.
net_GRU.train()
for epoch in range(epochs):  
    
    # Running totals for this epoch; only used by the optional progress print.
    epoch_loss = 0
    epoch_acc = 0    
    
    # Placeholder hidden state. GRU.forward builds its own zero state from the
    # batch size it is given and hands this tensor back unchanged, so `h` does
    # not influence the predictions.
    h = net_GRU.init_hidden(64)
    
    for X_batch, y_batch in train_loader_50:
        
        optimizer.zero_grad()
        
        # Detach the placeholder from any autograd history.
        h = h.data
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # Actual number of sequences in this batch, read from the tensor itself
        # rather than assumed to be 64.
        batch = X_batch.size(0)
  
        y_pred,h = net_GRU(X_batch, h, batch)
        
        # Output_dim is 1, so drop the trailing dimension to line the
        # predictions up with y_batch.
        y_pred = y_pred.squeeze(1)
        
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50):.5f} | Acc: {epoch_acc/len(train_loader_50):.3f}')

In [None]:
# Collect one 0/1 prediction per test sample from the binary GRU model.
y_pred_list = [] 

net_GRU.eval()
with torch.no_grad():
    # Placeholder only: GRU.forward creates its own zero hidden state sized by
    # `batch` and returns this tensor unchanged.
    h = net_GRU.init_hidden(64)
    for X_batch, _ in test_loader_50:
        X_batch = X_batch.to(device)
        batch = X_batch.size(0)
        y_test_pred,h = net_GRU(X_batch,h,batch)
        y_test_pred = y_test_pred.squeeze(1)
        # Convert the logit to a probability and threshold it at 0.5 by rounding.
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

# Each array becomes a Python list; with the batch_size=1 test loader that is
# a one-element list per sample.
y_pred_list = [a.tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### GRU(Binary-Google) - 87%


## GRU myWord2Vec Binary


### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Binary Cross Entropy with Logits Loss
### Batch Size = 64

In [None]:
Input_dim = 300
Hidden_dim = 50
Output_dim = 1
n_layers = 1
epochs = 10
batch_size = 64

net_GRU_myWord2Vec = GRU(Input_dim, Hidden_dim, n_layers, Output_dim)
net_GRU_myWord2Vec = net_GRU_myWord2Vec.to(device)

optimizer = torch.optim.Adam(net_GRU_myWord2Vec.parameters(),lr=0.001)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#print(net_GRU_myWord2Vec)

In [None]:
net_GRU_myWord2Vec.train()
for epoch in range(epochs):  
    
    epoch_loss = 0
    epoch_acc = 0    
    
    h = net_GRU_myWord2Vec.init_hidden(64)
    
    for X_batch, y_batch in train_loader_50_myWord2Vec:
        
        optimizer.zero_grad()
        
        h = h.data
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        batch = X_batch.size(0)
  
        y_pred,h = net_GRU_myWord2Vec(X_batch, h, batch)
        
        y_pred = y_pred.squeeze(1)
        
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_50_myWord2Vec):.3f}')

In [None]:
y_pred_list = [] 

net_GRU_myWord2Vec.eval()
with torch.no_grad():
    h = net_GRU_myWord2Vec.init_hidden(64)
    for X_batch, _ in test_loader_50_myWord2Vec:
        X_batch = X_batch.to(device)
        batch = X_batch.size(0)
        y_test_pred,h = net_GRU_myWord2Vec(X_batch,h,batch)
        y_test_pred = y_test_pred.squeeze(1)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_binary, y_pred_list))
print(accuracy_score(y_test_binary, y_pred_list))

### GRU(Binary-myWord2Vec) - 86.6%


## GRU Google Ternary

### GRU for Ternary Classification of 50 word reviews
#### Input Layer Dimension: 300 (shape of input - batch_size x 50 x 300)
#### Number of GRU layers:  1
#### We have to initialize a hidden layer of zeros with the shape (1, batch_size, 50) which is given as an input to the GRU cell. 
#### Hidden State Size Dimension: 50
#### Output Layer Dimension: 3
#### The last row of the hidden layer in our Model will have the predicted output
#### The last row is sent through a Relu function after which it's sent to the Fully Connected Layer

In [None]:
# Hyper-parameters for the ternary GRU classifier (Google Word2Vec features).
Input_dim = 300
Hidden_dim = 50
Output_dim = 3
n_layers = 1
epochs = 10
batch_size = 64

net_GRU_ternary = GRU(Input_dim, Hidden_dim, n_layers, Output_dim)
# Keep the moved model bound to its own name. Assigning the result to
# `net_GRU` (as this cell used to do) overwrote the trained binary GRU model
# with this untrained ternary one.
net_GRU_ternary = net_GRU_ternary.to(device)

optimizer = torch.optim.Adam(net_GRU_ternary.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

#print(net_GRU_ternary)

#print(net_GRU_ternary)

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss
### Batch Size = 64

In [None]:
net_GRU_ternary.train()
for epoch in range(epochs):  
    
    epoch_loss = 0
    epoch_acc = 0    
    
    h = net_GRU_ternary.init_hidden(64)
    
    for X_batch, y_batch in train_loader_50_ternary:
        
        optimizer.zero_grad()
        
        h = h.data
        
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        batch = X_batch.size(0)
  
        y_pred,h = net_GRU_ternary(X_batch, h, batch)
        
        y_pred = y_pred.squeeze(1)
        
        loss = criterion(y_pred, y_batch.long())
        acc = multi_acc(y_pred, y_batch.long())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_ternary):.5f} | Acc: {epoch_acc/len(train_loader_50_ternary):.3f}')

In [None]:
# Collect one predicted class index per test sample from the ternary GRU model.
y_pred_list = []

net_GRU_ternary.eval()
with torch.no_grad():
    # Use the ternary model's own init_hidden; this cell previously reached
    # for `net_GRU`, i.e. a different model object. The tensor is only a
    # placeholder: GRU.forward builds its own zero state sized by `batch`.
    h = net_GRU_ternary.init_hidden(64)
    for X_batch, _ in test_loader_50_ternary:
        X_batch = X_batch.to(device)
        batch = X_batch.size(0)
        y_test_pred,h = net_GRU_ternary(X_batch, h, batch)
        # The predicted class is the index of the largest score along dim 1.
        _, y_pred_tags = torch.max(y_test_pred.squeeze(1), dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

# The test loader uses batch_size=1, so each array squeezes to a single scalar.
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
#print(confusion_matrix(y_test_ternary, y_pred_list))
print(accuracy_score(y_test_ternary, y_pred_list))

### GRU(Ternary - Google) - 71.7%


## GRU myWord2Vec Ternary

### Learning Rate = 0.001
### Epochs = 10
### Optimiser = torch.optim.Adam
### Loss Function = Cross Entropy Loss
### Batch Size = 64

In [None]:
# Hyper-parameters for the ternary GRU trained on the self-trained Word2Vec features.
Input_dim = 300
Hidden_dim = 50
Output_dim = 3
n_layers = 1
epochs = 10
batch_size = 64

# Build the model and move it to the target device in one step.
net_GRU_ternary_myWord2Vec = GRU(Input_dim, Hidden_dim, n_layers, Output_dim).to(device)

# Adam over the model's parameters, created after the model is on the device.
optimizer = torch.optim.Adam(net_GRU_ternary_myWord2Vec.parameters(), lr=0.001)

# Cross-entropy loss for the 3-class output, also placed on the device.
criterion = nn.CrossEntropyLoss().to(device)

# Uncomment to inspect the architecture:
#print(net_GRU_ternary_myWord2Vec)

In [None]:
# Training loop for the ternary GRU on the self-trained Word2Vec features:
# makes `epochs` passes over train_loader_50_ternary_myWord2Vec and updates
# the weights after every mini-batch.
net_GRU_ternary_myWord2Vec.train()
for epoch in range(epochs):

    # Per-epoch running totals; they only feed the optional progress print
    # at the bottom of the loop.
    epoch_loss = 0
    epoch_acc = 0

    # Start every epoch from a freshly initialised hidden state, sized from
    # the notebook's `batch_size` setting instead of a repeated literal 64.
    h = net_GRU_ternary_myWord2Vec.init_hidden(batch_size)

    for X_batch, y_batch in train_loader_50_ternary_myWord2Vec:

        optimizer.zero_grad()

        # Drop the autograd history carried over from the previous batch so
        # this batch's backward pass stops at the current hidden state.
        # detach() is the supported replacement for the legacy `.data` attribute.
        h = h.detach()

        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # Cast the labels to int64 once; reused by both the loss and the accuracy.
        target = y_batch.long()

        # Size of the first dimension of this batch, forwarded to the model.
        batch = X_batch.size(0)

        y_pred, h = net_GRU_ternary_myWord2Vec(X_batch, h, batch)
        y_pred = y_pred.squeeze(1)

        loss = criterion(y_pred, target)
        acc = multi_acc(y_pred, target)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Uncomment to log the mean loss / accuracy of each epoch:
    #print(f'Epoch {epoch+0:03}: | Loss: {epoch_loss/len(train_loader_50_ternary_myWord2Vec):.5f} | Acc: {epoch_acc/len(train_loader_50_ternary_myWord2Vec):.3f}')

In [None]:
# Run the trained myWord2Vec ternary GRU over its test loader and collect the
# predicted class index for every test batch.
net_GRU_ternary_myWord2Vec.eval()

y_pred_list = []
with torch.no_grad():
    h = net_GRU_ternary_myWord2Vec.init_hidden(64)
    for X_batch, _ in test_loader_50_ternary_myWord2Vec:
        X_batch = X_batch.to(device)
        batch = X_batch.size(0)
        y_test_pred, h = net_GRU_ternary_myWord2Vec(X_batch, h, batch)
        # Predicted class = index of the largest score along dim 1.
        scores = y_test_pred.squeeze(1)
        _, y_pred_tags = torch.max(scores, dim=1)
        # Store the batch's predictions as plain Python values right away.
        y_pred_list.append(y_pred_tags.cpu().numpy().squeeze().tolist())

In [None]:
# Report the test accuracy of the myWord2Vec ternary GRU predictions collected above.
# Optional per-class breakdown:
#print(confusion_matrix(y_test_ternary, y_pred_list))
test_accuracy = accuracy_score(y_test_ternary, y_pred_list)
print(test_accuracy)

### GRU(Ternary - myWord2Vec) - 71.1%

### GRU(Binary-Google) - 87%
### GRU(Binary-myWord2Vec) - 86.6%
### GRU(Ternary - Google) - 71.7%
### GRU(Ternary - myWord2Vec) - 71.1%

## With just 10 epochs, we can see that the accuracies for the GRU model are much better than those of the RNN, and better than those of all the other models we've trained so far. The GRU cell mitigates the vanishing/exploding gradient problem using its update and reset gates. The GRU model gives us the best accuracies for the Ternary Dataset as well. Just like in the case of the RNN, we can see that the Google Word2Vec model does better than our own trained Word2Vec model.