In [1]:
# all the necessary imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torch
import torchtext
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import Iterator, BucketIterator
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Yelp review rating prediction 

Millions of people share a great number of reviews about business on [Yelp.com](https://www.yelp.com/) and Yelp mobile app everyday. These reviews and ratings help other users to make a choice. We used [Yelp APIs (application programming interface)](https://www.yelp.ca/developers) to collect over 35,000 reviews of 1,000 restaurants in New York City. We split this dataset into 90\% TRAIN set (28,000 reviews), 10\% DEV set (3,500 reviews), and 10\% TEST set (3,500 reviews). Each review has text review content and a corresponding label (i.e., 5-level rating star). This table shows the class ditribution of TRAIN and DEV sets.

|    Rating  |   # of Train   reviews| # of Dev reviews    |  
| ---------- | -----------------  |-----|  
| 1star      | 5,619              | 683 |  
| 2star      | 5,616              | 677 |  
| 3star      | 5,583              | 713 |  
| 4star      | 5,532              | 733 |  
| 5star      | 5,650              | 694 |  


In directory `./data/yelp_review/`, we provide the `TRAIN` and `DEV` sets with the corresponding labels.
We use the TRAIN and DEV sets to develop a classification system for this task. 

## Mounting the drive on colab

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Preprocessing

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
import string
spacy_en = spacy.load('en_core_web_sm')
# punctuation = set(string.punctuation)
# stop_words = set(stopwords.words('english'))
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
      tokens.append(tok.text)

    return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
TEXT = Field(sequential=True, tokenize=tokenize_en, lower=False)
LABEL = Field(sequential=False, unk_token = None)

In [5]:
train,val,_ = TabularDataset.splits(path= "/yelp/",train='train.tsv', validation="val.tsv", test="test.tsv", # file names
    format='tsv',
    skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
    fields=[('tweet', TEXT), ('label', LABEL)])

_,_,test = TabularDataset.splits(path= "/yelp/",train='train.tsv', validation="val.tsv", test="test.tsv", # file names
    format='tsv',
    skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
    fields=[('tweet', TEXT)])

In [6]:
TEXT.build_vocab(train) # builds vocabulary based on all the words that occur at least twice in the training set
LABEL.build_vocab(train)

In [7]:
print(len(TEXT.vocab.stoi))
print(len(LABEL.vocab.stoi))

53723
5


In [8]:
print(LABEL.vocab.stoi)

defaultdict(None, {'5star': 0, '1star': 1, '2star': 2, '3star': 3, '4star': 4})


In [9]:
#Creating the val, test and train iterators.

val_iter = Iterator(
 dataset = val, # we pass in the datasets we want the iterator to draw data from
 batch_size= 128,
 sort_key= None, 
 sort= False,
# A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding. 
 sort_within_batch= False,
 train= False
)


test_iter = Iterator(
 dataset = test, # we pass in the datasets we want the iterator to draw data from
 batch_size= 128,
 sort_key= None, 
 sort= False,
# A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding. 
 sort_within_batch= False,
 train= False
)


train_iter,_,_ = BucketIterator.splits(
 (train,val, test), # we pass in the datasets we want the iterator to draw data from
 batch_sizes= (128,128,128),
 sort_key=lambda x: len(x.tweet), 
 sort= True,
# A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding. 
 sort_within_batch= True
)

## Model Evaluation

In [10]:
## GRU model for evaluation.

import torch.nn as nn
class GRUmodel(nn.Module):
  
  def __init__(self, embedding_size, vocab_size, output_size, hidden_size, layers):
    # In the constructor we define the layers for our model
    super(GRUmodel, self).__init__()
    # word embedding lookup table
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
    # core GRU module
    self.GRU_layer = nn.GRU(input_size=embedding_size, hidden_size= hidden_size, num_layers=layers) 
    # activation function
    self.activation_fn = nn.Tanh()
    # classification related modules
    self.linear_layer = nn.Linear(hidden_size, output_size) 
    self.softmax_layer = nn.LogSoftmax(dim=1)
    self.debug = False
  
  def forward(self, x):
    # In the forward function we define the forward propagation logic
    if self.debug:
        print("input word indices shape = ", x.size())
    out = self.embedding(x)
    if self.debug:
        print("word embeddings shape = ", out.size())
    out, _ = self.GRU_layer(out) # since we are not feeding h_0 explicitly, h_0 will be initialized to zeros by default
    if self.debug:
        print("RNN output (features from last layer of RNN for all timesteps) shape = ", out.size())
    # classify based on the hidden representation after RNN processes the last token
    out = out[-1]
    if self.debug:
        print("Tweet embeddings or RNN output (features from last layer of RNN for the last timestep only) shape = ", out.size())
    out = self.activation_fn(out)
    if self.debug:
        print("ReLU output shape = ", out.size())
    out = self.linear_layer(out)
    if self.debug:
        print("linear layer output shape = ", out.size())
    out = self.softmax_layer(out) # accepts 2D or more dimensional inputs
    if self.debug:
        print("softmax layer output shape = ", out.size())
    return out

In [11]:
## Evaluation function

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train(loader,model,criterion,optimizer,device):
    total_loss = 0.0
    # iterate throught the data loader
    num_sample = 0
    for batch in loader:
        # load the current batch
        batch_input = batch.tweet
        batch_output = batch.label
        
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        # forward propagation
        # pass the data through the model
        model_outputs = model(batch_input)
        # compute the loss
        cur_loss = criterion(model_outputs, batch_output)
        total_loss += cur_loss.item()

        # backward propagation (compute the gradients and update the model)
        # clear the buffer
        optimizer.zero_grad()
        # compute the gradients
        cur_loss.backward()
        # update the weights
        optimizer.step()

        num_sample += batch_output.shape[0]
    return total_loss/num_sample

# evaluation logic based on classification accuracy
def evaluate(loader,model,criterion,device):
    all_pred=[]
    all_label = []
    with torch.no_grad(): # impacts the autograd engine and deactivate it. reduces memory usage and speeds up computation
        for batch in loader:
             # load the current batch
            batch_input = batch.tweet
            batch_output = batch.label

            batch_input = batch_input.to(device)
            # forward propagation
            # pass the data through the model
            model_outputs = model(batch_input)
            # print(model_outputs.shape)
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists

            all_pred.extend(predicted)
            all_label.extend(batch_output)
      
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return accuracy,f1score

# Hyper-parameter optimization

In [None]:


# from numpy.lib.histograms import histogramdd
# from torch._C import JitType
import scipy.stats

LEARNING_RATE= 0.1
MAX_EPOCHS=10
EMBEDDING_SIZE = 300
VOCAB_SIZE = len(TEXT.vocab.stoi)
NUM_CLASSES = len(LABEL.vocab.stoi)

def random_search(hidden_units_list, layers_list):
    results = []
    for i in layers_list:
        for j in hidden_units_list:
            config = {
                #define hyperparameters here
                "layers": i,
                "hidden_size": j
                }
            print(config)
            # model = ConvNet(config["layers"],3,config["filters"],nn.ReLU(),output_size=3, VOCAB_SIZE=VOCAB_SIZE, WORD_VEC_SIZE=WORD_VEC_SIZE)
            model = LSTMmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, config["hidden_size"], config["layers"]) 
            model.to(device)
            #print﴾model﴿
            criterion = nn.NLLLoss()
            optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
            max_val = 0
            best_epoch = 0
            for epoch in range(MAX_EPOCHS):
                # train the model for one pass over the data
                train_loss = train(train_iter,model,criterion,optimizer,device)
                # compute the training accuracy
                train_acc = evaluate(train_iter,model,criterion,device)
                # compute the validation accuracy
                val_acc = evaluate(val_iter,model,criterion,device)
                if val_acc[0] > max_val:
                    max_val = val_acc[0]
                    best_epoch = epoch+1
                # print the loss for every epoch
                print('epoch ',epoch+1,'loss ', train_loss,'Train Accuracy & F1',train_acc,'Validation Accuracy & F1 ', val_acc)
                # Append the results for every epoch
                results.append((max_val,best_epoch,config))
    return results

In [None]:
# Calling the hyperparamater optimization for LSTM
layers_list = [1, 2, 3]
hidden_size_list = [100, 200, 300, 400, 500]
random_search(hidden_size_list,layers_list)

{'layers': 1, 'hidden_size': 100}
epoch  1 loss  0.012513524643012456 Train Accuracy & F1 (0.21967857142857142, 0.1561862096076843) Validation Accuracy & F1  (0.20057142857142857, 0.08284198878417495)
epoch  2 loss  0.012396843305655889 Train Accuracy & F1 (0.25292857142857145, 0.21729026335388588) Validation Accuracy & F1  (0.206, 0.09951114433113004)
epoch  3 loss  0.012215453054223741 Train Accuracy & F1 (0.27625, 0.2517508151616518) Validation Accuracy & F1  (0.21, 0.11226215087233997)
epoch  4 loss  0.011965337663888931 Train Accuracy & F1 (0.30864285714285716, 0.29423468799825697) Validation Accuracy & F1  (0.23285714285714285, 0.15803692391593838)
epoch  5 loss  0.011633581097636904 Train Accuracy & F1 (0.3452857142857143, 0.33655521453760784) Validation Accuracy & F1  (0.27485714285714286, 0.23471415949363372)
epoch  6 loss  0.01109419098496437 Train Accuracy & F1 (0.38425, 0.37559391462502256) Validation Accuracy & F1  (0.3417142857142857, 0.32342508351012683)
epoch  7 loss  0

[(0.20057142857142857, 1, {'hidden_size': 100, 'layers': 1}),
 (0.206, 2, {'hidden_size': 100, 'layers': 1}),
 (0.21, 3, {'hidden_size': 100, 'layers': 1}),
 (0.23285714285714285, 4, {'hidden_size': 100, 'layers': 1}),
 (0.27485714285714286, 5, {'hidden_size': 100, 'layers': 1}),
 (0.3417142857142857, 6, {'hidden_size': 100, 'layers': 1}),
 (0.388, 7, {'hidden_size': 100, 'layers': 1}),
 (0.388, 7, {'hidden_size': 100, 'layers': 1}),
 (0.3945714285714286, 9, {'hidden_size': 100, 'layers': 1}),
 (0.3945714285714286, 9, {'hidden_size': 100, 'layers': 1}),
 (0.20142857142857143, 1, {'hidden_size': 200, 'layers': 1}),
 (0.2057142857142857, 2, {'hidden_size': 200, 'layers': 1}),
 (0.21457142857142858, 3, {'hidden_size': 200, 'layers': 1}),
 (0.23285714285714285, 4, {'hidden_size': 200, 'layers': 1}),
 (0.32371428571428573, 5, {'hidden_size': 200, 'layers': 1}),
 (0.35428571428571426, 6, {'hidden_size': 200, 'layers': 1}),
 (0.37114285714285716, 7, {'hidden_size': 200, 'layers': 1}),
 (0.371

In [None]:
# Calling the hyperparamater optimization for GRU
layers_list = [1, 2, 3]
hidden_size_list = [100, 200, 300, 400, 500]
random_search(hidden_size_list,layers_list)

{'layers': 1, 'hidden_size': 100}
epoch  1 loss  0.012467223639999117 Train Accuracy & F1 (0.24025, 0.2009991723701005) Validation Accuracy & F1  (0.20457142857142857, 0.09401845629764669)
epoch  2 loss  0.012156574325902121 Train Accuracy & F1 (0.29267857142857145, 0.27796937765618857) Validation Accuracy & F1  (0.22457142857142856, 0.14935808339511786)
epoch  3 loss  0.01180074484007699 Train Accuracy & F1 (0.33153571428571427, 0.32245941371911524) Validation Accuracy & F1  (0.2782857142857143, 0.23607915813859828)
epoch  4 loss  0.01137922260590962 Train Accuracy & F1 (0.3699642857142857, 0.36316473653574527) Validation Accuracy & F1  (0.32285714285714284, 0.2928692996212117)
epoch  5 loss  0.010735602063792092 Train Accuracy & F1 (0.427, 0.4101477712656755) Validation Accuracy & F1  (0.37714285714285717, 0.3299930264341712)
epoch  6 loss  0.010071007668972015 Train Accuracy & F1 (0.46267857142857144, 0.4481340561097754) Validation Accuracy & F1  (0.39285714285714285, 0.351791871747

[(0.20457142857142857, 1, {'hidden_size': 100, 'layers': 1}),
 (0.22457142857142856, 2, {'hidden_size': 100, 'layers': 1}),
 (0.2782857142857143, 3, {'hidden_size': 100, 'layers': 1}),
 (0.32285714285714284, 4, {'hidden_size': 100, 'layers': 1}),
 (0.37714285714285717, 5, {'hidden_size': 100, 'layers': 1}),
 (0.39285714285714285, 6, {'hidden_size': 100, 'layers': 1}),
 (0.422, 7, {'hidden_size': 100, 'layers': 1}),
 (0.42457142857142854, 8, {'hidden_size': 100, 'layers': 1}),
 (0.44285714285714284, 9, {'hidden_size': 100, 'layers': 1}),
 (0.44285714285714284, 9, {'hidden_size': 100, 'layers': 1}),
 (0.2, 1, {'hidden_size': 200, 'layers': 1}),
 (0.20114285714285715, 2, {'hidden_size': 200, 'layers': 1}),
 (0.23485714285714285, 3, {'hidden_size': 200, 'layers': 1}),
 (0.236, 4, {'hidden_size': 200, 'layers': 1}),
 (0.36228571428571427, 5, {'hidden_size': 200, 'layers': 1}),
 (0.4154285714285714, 6, {'hidden_size': 200, 'layers': 1}),
 (0.4154285714285714, 6, {'hidden_size': 200, 'layers'

In [None]:
# Calling the hyperparamater optimization for Ensemble of GRU,LSTM
layers_list = [1,2]
hidden_size_list = [100,200]
random_search(hidden_size_list,layers_list)

{'layers': 1, 'hidden_size': 100}
epoch  1 loss  0.012522491071905409 Train Accuracy & F1 (0.21735714285714286, 0.15361606304701242) Validation Accuracy & F1  (0.19542857142857142, 0.06754167280268557)
epoch  2 loss  0.012450981387070247 Train Accuracy & F1 (0.24471428571428572, 0.21402334972287748) Validation Accuracy & F1  (0.19485714285714287, 0.06744818503972153)
epoch  3 loss  0.012249883941241673 Train Accuracy & F1 (0.28214285714285714, 0.2544283262226445) Validation Accuracy & F1  (0.19514285714285715, 0.06798020554900973)
epoch  4 loss  0.011688113033771515 Train Accuracy & F1 (0.3407857142857143, 0.32119285104039097) Validation Accuracy & F1  (0.1957142857142857, 0.06809411879594567)
epoch  5 loss  0.011006993519408361 Train Accuracy & F1 (0.39964285714285713, 0.362908521237034) Validation Accuracy & F1  (0.1957142857142857, 0.06805006334620607)
epoch  6 loss  0.010523114174604416 Train Accuracy & F1 (0.4205, 0.38319304061201426) Validation Accuracy & F1  (0.196, 0.0686036433

[(0.19542857142857142, 1, {'hidden_size': 100, 'layers': 1}),
 (0.19542857142857142, 1, {'hidden_size': 100, 'layers': 1}),
 (0.19542857142857142, 1, {'hidden_size': 100, 'layers': 1}),
 (0.1957142857142857, 4, {'hidden_size': 100, 'layers': 1}),
 (0.1957142857142857, 4, {'hidden_size': 100, 'layers': 1}),
 (0.196, 6, {'hidden_size': 100, 'layers': 1}),
 (0.196, 6, {'hidden_size': 100, 'layers': 1}),
 (0.196, 6, {'hidden_size': 100, 'layers': 1}),
 (0.28114285714285714, 9, {'hidden_size': 100, 'layers': 1}),
 (0.28114285714285714, 9, {'hidden_size': 100, 'layers': 1}),
 (0.19485714285714287, 1, {'hidden_size': 200, 'layers': 1}),
 (0.19485714285714287, 1, {'hidden_size': 200, 'layers': 1}),
 (0.19542857142857142, 3, {'hidden_size': 200, 'layers': 1}),
 (0.19542857142857142, 3, {'hidden_size': 200, 'layers': 1}),
 (0.19542857142857142, 3, {'hidden_size': 200, 'layers': 1}),
 (0.19542857142857142, 3, {'hidden_size': 200, 'layers': 1}),
 (0.19885714285714284, 7, {'hidden_size': 200, 'laye

# Final model training and evaluation

{'layers': 2, 'hidden_size': 100}

In [12]:
EMBEDDING_SIZE = 300
VOCAB_SIZE = len(TEXT.vocab.stoi)
NUM_CLASSES = len(LABEL.vocab.stoi)
HIDDEN_SIZE = 100
NUM_LAYERS = 2


model = GRUmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, HIDDEN_SIZE, NUM_LAYERS) 
print(model)
model.to(device)

GRUmodel(
  (embedding): Embedding(53723, 300)
  (GRU_layer): GRU(300, 100, num_layers=2)
  (activation_fn): Tanh()
  (linear_layer): Linear(in_features=100, out_features=5, bias=True)
  (softmax_layer): LogSoftmax(dim=1)
)


GRUmodel(
  (embedding): Embedding(53723, 300)
  (GRU_layer): GRU(300, 100, num_layers=2)
  (activation_fn): Tanh()
  (linear_layer): Linear(in_features=100, out_features=5, bias=True)
  (softmax_layer): LogSoftmax(dim=1)
)

In [13]:
LEARNING_RATE = 0.01
criterion = nn.NLLLoss()
# create an instance of SGD with required hyperparameters
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [14]:
import os
if not os.path.exists("./drive/MyDrive/checkpoint/"): # check if the directory doesn't exist already
    os.mkdir("./drive/MyDrive/checkpoint/")

MAX_EPOCHS = 10
for epoch in range(MAX_EPOCHS):
    # train the model for one pass over the data
    train_loss = train(train_iter,model,criterion,optimizer,device)  
    # compute the training accuracy
    train_acc = evaluate(train_iter,model,criterion,device)
    # compute the validation accuracy
    val_acc = evaluate(val_iter,model,criterion,device)
    
    # print the loss for every epoch
    print('epoch ',epoch+1,'loss ', train_loss,'Train Accuracy & F1',train_acc,'Validation Accuracy & F1 ', val_acc)
    
    # save model, optimizer, and number of epoch to a dictionary
    model_save = {
            'epoch': epoch,  # number of epoch
            'model_state_dict': model.state_dict(), # model parameters 
            'optimizer_state_dict': optimizer.state_dict(), # save optimizer 
            'loss': train_loss # training loss
            }
    
    # use torch.save to store 
    torch.save(model_save, "./drive/MyDrive/checkpoint/model_{}.pt".format(epoch))

epoch  1 loss  0.008283695816993713 Train Accuracy & F1 (0.6234285714285714, 0.6003394299877461) Validation Accuracy & F1  (0.524, 0.4845806076611903)
epoch  2 loss  0.006178911034549986 Train Accuracy & F1 (0.7830357142857143, 0.7816272535409867) Validation Accuracy & F1  (0.5914285714285714, 0.5850062893863802)
epoch  3 loss  0.005233762388782842 Train Accuracy & F1 (0.8058214285714286, 0.8025334131242282) Validation Accuracy & F1  (0.5745714285714286, 0.5648060747426003)
epoch  4 loss  0.004621785409748554 Train Accuracy & F1 (0.8279285714285715, 0.8261795274606903) Validation Accuracy & F1  (0.5708571428571428, 0.5646306792226341)
epoch  5 loss  0.004181097785277026 Train Accuracy & F1 (0.8375, 0.8356076297073939) Validation Accuracy & F1  (0.5708571428571428, 0.5653740134206102)
epoch  6 loss  0.003946723548429353 Train Accuracy & F1 (0.83625, 0.8352716220424211) Validation Accuracy & F1  (0.5525714285714286, 0.5489130315748321)
epoch  7 loss  0.003886275543698243 Train Accuracy &

In [16]:
# The best epoch is 2 an we will be generating our predictions based on the best model.

EMBEDDING_SIZE = 300
VOCAB_SIZE = 53723
NUM_CLASSES = len(LABEL.vocab.stoi)
HIDDEN_SIZE = 100
NUM_LAYERS = 2
# define a new model

model = GRUmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, HIDDEN_SIZE, NUM_LAYERS) 

# load checkpoint

checkpoint = torch.load("./drive/MyDrive/checkpoint/model_1.pt")

# assign the parameters of checkpoint to this new model

model.load_state_dict(checkpoint['model_state_dict'])
# model.to(device)

print(model) # can be used for inference or for further training

GRUmodel(
  (embedding): Embedding(53723, 300)
  (GRU_layer): GRU(300, 100, num_layers=2)
  (activation_fn): Tanh()
  (linear_layer): Linear(in_features=100, out_features=5, bias=True)
  (softmax_layer): LogSoftmax(dim=1)
)


In [17]:
## Funtion to generate the predictions.

def inference(loader):
    all_pred=[]
    all_label = []
    with torch.no_grad(): # impacts the autograd engine and deactivate it. reduces memory usage and speeds up computation
        for batch in loader:
            # load the current batch
            batch_input = batch.tweet
            # forward propagation
            # pass the data through the model
            model_outputs = model(batch_input)
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)

    return all_pred

In [18]:
pred_list = inference(val_iter)

In [19]:

test_list = inference(test_iter)


In [20]:

pred_list = [x.item() for x in pred_list]
pred_list = [LABEL.vocab.itos[x] for x in pred_list]

In [21]:
test_list = [x.item() for x in test_list]
test_list = [LABEL.vocab.itos[x] for x in test_list]

##  Generating Submission File

In [22]:
def out_prediction(first_name, last_name, prediction_list, dataset, submit_number):
    """
    out_prediction takes three input varibles: first_name, last_name, prediction_list
    <first_name>, string, your first name, e.g., Tom
    <last_name>, string, your last name, e.g., Smith
    <prediction_list>, list of string which includes all your predications of TEST samples
                        e.g., ['1star','5star','3star']
    <dataset> dev or test
    <submit_number> index of your submission
                        
    Generate a file is named with <yourfirstname>_<yourlastname>_PRED.txt in current directory
    """
    output_file = open("{}_{}_{}_{}.txt".format(first_name,last_name, dataset, submit_number),'w')
    for item in prediction_list:
        output_file.write(item+"\n")
    output_file.close() 

A example of using `out_prediction` funtion. You can find a file `Tom_Smith_PRED.txt` in your diretory.

In [24]:
out_prediction("Varadraj", "Poojari",test_list, "test", "4")

## Steps Taken


1. Hyper parameter optimization 

- Based on results of the hyperparameter optimization, the number of layers chosen are 2 and number of hidden units are 100. 

- Hyperparamters used :

   EMBEDDING_SIZE = 300
   
   VOCAB_SIZE = len(TEXT.vocab.stoi). # Vocab size used is 10002 as we set max_size to 10000 while creating vocabulary
   
   NUM_CLASSES = len(LABEL.vocab.stoi)
   
   HIDDEN_SIZE = 100
   
   NUM_LAYERS = 2
   
   
2. Strategies attempted :

- Tried running three models - GRU, LSTM and CONVNet. Based on the results, it seemed that the GRU model performed the best.
- Took the top 10000 words while building vocabulary by adding `max_size = 10000`
- Chose embedding size as 300 for the model.
- Ran hyperparameter optimization with a small dataset first on all the above mentioned models and then based on the results, ran the model on the complete train set for each of the above model. Tried different values for number of layers, hidden size and learning rate. The best possible parameters that I considered for GRU model based on the results are `num_layers` = 2 and `hidden_size` = 100