Ran the file on google colab

# 1. Dataset Generation

In [115]:
!pip install contractions
#! pip install bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [116]:
import pandas as pd
import numpy as np
# import nltk
# nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import contractions

reading just the review body and star rating columns

In [4]:
df = pd.read_csv("data.tsv", usecols = ['star_rating','review_body'], 
                 sep='\t', on_bad_lines='skip')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df

Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...
...,...,...
1767046,4,It is nice looking and everything (it is sterl...
1767047,4,"my boyfriend bought me this last christmas, an..."
1767048,4,This is a great way to quickly start learning ...
1767049,5,the 14kt gold earrings look remarkable...would...


checking for null values

In [6]:
df.isnull().sum()

star_rating      9
review_body    244
dtype: int64

removing the null values

In [7]:
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

star_rating    0
review_body    0
dtype: int64

In [9]:
#checking for unique labels in our dataframe
df.star_rating.unique()

array([5, 1, 4, 3, 2, '5', '1', '3', '4', '2'], dtype=object)

In [10]:
df.star_rating.value_counts()

5    1040896
4     259019
3     153660
1     150441
2      97259
5      40015
4      11411
3       5999
1       4566
2       3541
Name: star_rating, dtype: int64

In [11]:
df.dtypes

star_rating    object
review_body    object
dtype: object

In [12]:
#converting the columns to required datatypes, integer and string
df['star_rating'] = df['star_rating'].astype(int)
df['review_body'] = df['review_body'].astype('string')

In [13]:
#checking for unique labels again to make sure we have the 5 classes
df.star_rating.unique()

array([5, 1, 4, 3, 2])

In [14]:
df.star_rating.value_counts()

5    1080911
4     270430
3     159659
1     155007
2     100800
Name: star_rating, dtype: int64

In [15]:
df.dtypes

star_rating     int64
review_body    string
dtype: object

In [16]:
#checking for empty reviews before cleaning 
c=0
for i in df['review_body']:
  if i == '':
    c+=1
c

0

cleaning the data

In [17]:
df['review_body'] = df['review_body'].str.lower()

In [18]:
def clean(row):
    

    soup = BeautifulSoup(row.review_body, "html.parser")

    #this extracts all the text from the document and removes html tags
    text1 = soup.get_text(' ')
    
    #removing any urls
    text2 = re.sub(r'http\S+', '', text1)
    #ftext = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)

    #performing contractions
    text3 = contractions.fix(text2)

    #removing non-alphabetic characters
    regex = re.compile('[^a-zA-Z ]')
    text4 = regex.sub('', text3)
    

    #removing extra white spaces
    text5 = re.sub(' +', ' ',  text4)

    row.review_body = text5

    return row

In [19]:
df = df.apply(clean, axis='columns')

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [20]:
#checking for empty reviews after cleaning
c=0
for i in df['review_body']:
  if i == '':
    c+=1
c

1132

In [21]:
#replacing empty reiews with nan value
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [22]:
df.isnull().sum()

star_rating       0
review_body    1178
dtype: int64

In [23]:
#removing reviews with nan value
df.dropna(inplace=True)

In [24]:
df

Unnamed: 0,star_rating,review_body
0,5,so beautiful even though clearly not high end ...
1,5,great product i got this set for my mother as ...
2,5,exactly as pictured and my daughters friend lo...
3,5,love it fits great super comfortable and neat ...
4,5,got this as a mothers day gift for my mom and ...
...,...,...
1767046,4,it is nice looking and everything it is sterli...
1767047,4,my boyfriend bought me this last christmas and...
1767048,4,this is a great way to quickly start learning ...
1767049,5,the kt gold earrings look remarkablewould defi...


shuffling the rows

In [25]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,star_rating,review_body
0,5,just received this ring as a replacement for m...
1,5,these do run a bit small originally ordered th...
2,1,these are not well made and look much better o...
3,4,brautiful earings only thing the picture made ...
4,5,beautiful
...,...,...
1765624,5,this was a christmas present from my husband i...
1765625,5,this combo always appears very much in style a...
1765626,5,not bad could be dangerously sharp on neck and...
1765627,5,nice looking ring very happy fits a little snu...


selecting 20000 reviews randomly from each rating class.

In [26]:
s1 = df[df.star_rating.eq(1)].sample(20000)
s2 = df[df.star_rating.eq(2)].sample(20000)
s3 = df[df.star_rating.eq(3)].sample(20000)
s4 = df[df.star_rating.eq(4)].sample(20000)
s5 = df[df.star_rating.eq(5)].sample(20000)

In [27]:
newdf = pd.concat([s1,s2,s3,s4,s5], ignore_index=True)

In [28]:
newdf

Unnamed: 0,star_rating,review_body
0,1,pretty sure this ring is make out of plastic i...
1,1,a full inch less in fact i returned this chain...
2,1,they were too small even for a baby i regret b...
3,1,nice looking chain nice thickness but after we...
4,1,a good friend purchased this bracelet for me i...
...,...,...
99995,5,great deal for look of rose gold
99996,5,beautiful pendant but chain was broken
99997,5,very nice chain for the price it looks very sp...
99998,5,love it best gift ever


In [29]:
newdf.star_rating.value_counts()

1    20000
2    20000
3    20000
4    20000
5    20000
Name: star_rating, dtype: int64

In [30]:
newdf.isnull().sum()

star_rating    0
review_body    0
dtype: int64

In [31]:
c=0
for i in newdf['review_body']:
  if i=='':
    c+=1
c

0

Saving the cleaned data so we can load it and use directly later on.

In [32]:
newdf.to_csv("cleaneddata.csv", index=False)

# Restart from here if the system crashes

loading saved cleaned data

In [117]:
import pandas as pd
import numpy as np

In [2]:
newdf = pd.read_csv("cleaneddata.csv")
#newdf = newdf[['star_rating','review_body']]
newdf

Unnamed: 0,star_rating,review_body
0,1,pretty sure this ring is make out of plastic i...
1,1,a full inch less in fact i returned this chain...
2,1,they were too small even for a baby i regret b...
3,1,nice looking chain nice thickness but after we...
4,1,a good friend purchased this bracelet for me i...
...,...,...
99995,5,great deal for look of rose gold
99996,5,beautiful pendant but chain was broken
99997,5,very nice chain for the price it looks very sp...
99998,5,love it best gift ever


In [55]:
newdf.isnull().sum()

star_rating    0
review_body    0
dtype: int64

# 2. Word Embeddings

upgrading gensim and loading the word2vec api

In [118]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [119]:
import gensim.models
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [7]:
#splitting each review into individual words
newdf['review_body'] = newdf['review_body'].str.split()
newdf['review_body']

#saving the splitted reviews in a list
sentences = newdf['review_body'].tolist()
#sentences

In [8]:
#confirming the shape of word2vec vectors
wv['king'].shape

(300,)

In [9]:
#a=np.zeros(300)

In [10]:
#a.shape

# 2 (a)

In [11]:
wv['dissappointed']

array([-1.46484375e-02,  3.29589844e-02,  1.81640625e-01, -5.61523438e-02,
       -2.69531250e-01,  1.58203125e-01,  3.24218750e-01, -4.63867188e-03,
        4.49218750e-02,  1.63085938e-01,  2.47802734e-02, -1.91650391e-02,
       -1.10839844e-01, -1.02539062e-01,  7.91015625e-02,  9.22851562e-02,
        1.13281250e-01,  9.13085938e-02, -8.30078125e-02,  9.08203125e-02,
        7.17773438e-02,  2.26562500e-01,  5.54687500e-01, -1.08398438e-01,
        1.77734375e-01,  1.68457031e-02,  1.16210938e-01, -2.97851562e-02,
        3.47656250e-01, -1.66015625e-01,  1.19140625e-01,  1.82617188e-01,
       -1.23901367e-02, -1.00097656e-01,  2.71484375e-01,  4.73022461e-03,
        8.78906250e-02,  9.37500000e-02, -1.08886719e-01, -1.33789062e-01,
        2.21679688e-01,  4.27246094e-03,  1.70898438e-01,  1.27929688e-01,
        1.02539062e-01, -3.67187500e-01, -1.06445312e-01, -1.57226562e-01,
       -2.69531250e-01, -9.52148438e-02,  6.39648438e-02, -6.00585938e-02,
       -5.37109375e-02,  

In [12]:
wv.similarity('king', 'queen')

0.6510957

In [13]:
wv.similarity('awesome','nice')

0.6404187

In [15]:
wv.similarity('bad','useless')

0.32079965

In [19]:
wv.similarity('best','worst')

0.58351105

In [16]:
wv.most_similar(positive=['great', 'nice'], topn=5)

[('terrific', 0.8015964031219482),
 ('fantastic', 0.7995432615280151),
 ('wonderful', 0.7832685708999634),
 ('good', 0.7787696719169617),
 ('awesome', 0.7119093537330627)]

In [17]:
wv.most_similar(positive=['good', 'bad'], negative=['excellent'], topn=5)

[('terrible', 0.5714249610900879),
 ('Bad', 0.5631329417228699),
 ('horrible', 0.5500000715255737),
 ('coulda_shoulda_woulda', 0.5419288873672485),
 ('lousy', 0.5275474190711975)]

In [18]:
wv.doesnt_match(['shorts', 'shirt', 'watch', 'pant', 'shoes', 'hankerchief'])

'watch'

In [None]:
# newdf['review_body'] = newdf['review_body'].str.split()
# newdf['review_body']

0        [i, was, disaapointed, on, how, small, the, cr...
1        [nothing, like, the, picture, its, dingy, and,...
2        [the, butterfly, is, red, and, purple, not, pi...
3        [the, picture, is, accurate, but, causes, the,...
4                                            [they, broke]
                               ...                        
99995    [i, loved, it, it, was, just, like, the, pictu...
99996    [i, must, have, purchased, this, after, the, g...
99997    [these, pearl, earrings, are, a, perfect, size...
99998    [bought, these, as, a, family, inspirational, ...
99999                               [simple, and, refined]
Name: review_body, Length: 100000, dtype: object

In [None]:
# newdf.isnull().sum()

star_rating    0
review_body    0
dtype: int64

In [None]:
# import gensim.models

# sentences = newdf['review_body'].tolist()
# sentences


In [None]:
# type(sentences[99][7])

str

In [None]:
# sentences[4040:4050]

In [None]:
# k=sentences[:4050]

# 2 (b)

In [20]:
model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, window=11, min_count=10)

In [21]:
# from gensim.test.utils import common_texts
# common_texts

In [22]:
model.wv.similarity('king', 'queen')

0.3779027

In [25]:
model.wv.similarity('awesome','nice')

0.46043932

In [26]:
model.wv.similarity('bad','useless')

0.11656557

In [27]:
model.wv.similarity('best','worst')

0.6234996

In [28]:
model.wv.most_similar(positive=['great', 'nice'], topn=5)

[('good', 0.8338949084281921),
 ('fantastic', 0.7374886870384216),
 ('wonderful', 0.7228246927261353),
 ('decent', 0.7185121774673462),
 ('cool', 0.6643893122673035)]

In [29]:
model.wv.most_similar(positive=['good', 'bad'], negative=['excellent'], topn=5)

[('cool', 0.5621054768562317),
 ('big', 0.479255348443985),
 ('cheap', 0.44816023111343384),
 ('funny', 0.4401387572288513),
 ('great', 0.4369107484817505)]

In [30]:
model.wv.doesnt_match(['shorts', 'shirt', 'watch', 'pant', 'shoes', 'hankerchief'])



'watch'

In [32]:
#model.wv['prince']

**CONCLUSION**


* Comparing vectors generated by our data and the pretrained model show that our dataset has limited
vocabulary as is evident in the doesnt_match example.

* Our model sometimes predict incorrect words eg. in 

*model.wv.most_similar(positive=['good', 'bad'], negative=['excellent'], topn=5) *

it predicts cool, funny, great which do not make sence in this case. 
However, the pretrained model predicts perfectly.

* They work equally well in the following case,

*model.wv.most_similar(positive=['great', 'nice'], topn=5)*

* The pretrained model encodes semantic similarities between words better than our model as is evident from the first three examples of similarity between,
king, queen
awesome, nice
bad, useless
However, in the case of similarity between best and worst, our model encodes the similarity better than the pretrained model.

* From these test cases the pretrained model is better than our model in terms of encoding semantic similarities between words.





# 3. Simple models

average of word vectors

In [80]:
X_avgvec = []

In [81]:
t=0
c=0
for lst in sentences:
  l=0
  sm=np.zeros(300)
  for wrd in lst:
    t+=1
    try:
      sm+=wv[wrd]
      l+=1
    except:
      c+=1
  if l!=0:
    avg=sm/l
  X_avgvec.append(avg)

In [82]:
type(X_avgvec)

list

In [83]:
len(X_avgvec)

100000

In [84]:
X_avgvec[9].shape

(300,)

In [85]:
k=np.array(X_avgvec)

In [86]:
k.shape

(100000, 300)

In [87]:
X_avgvec[0].shape

(300,)

calculating percentage of words not in the pretrained model.

In [42]:
c

337310

In [45]:
t

3635802

In [46]:
c*100/t

9.277457903373175

# Train-Test Split

In [120]:
from sklearn.model_selection import train_test_split

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X_avgvec, 
                                      newdf['star_rating'], test_size=0.2)

In [90]:
type(X_train)

list

In [91]:
y_train.value_counts()

3    16076
1    16021
2    16005
4    15951
5    15947
Name: star_rating, dtype: int64

In [92]:
y_test.value_counts()

5    4053
4    4049
2    3995
1    3979
3    3924
Name: star_rating, dtype: int64

In [121]:
from sklearn.metrics import f1_score, recall_score, classification_report, precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Perceptron

In [122]:
from sklearn.linear_model import Perceptron

In [95]:
p = Perceptron()
p.fit(X_train,y_train)

Perceptron()

In [96]:
predictions = p.predict(X_test)

In [97]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.69      0.17      0.28      3979
           2       0.28      0.82      0.42      3995
           3       0.34      0.07      0.11      3924
           4       0.41      0.21      0.28      4049
           5       0.60      0.66      0.63      4053

    accuracy                           0.39     20000
   macro avg       0.47      0.39      0.34     20000
weighted avg       0.47      0.39      0.34     20000



# SVM

In [123]:
from sklearn.svm import LinearSVC
svm = LinearSVC()


In [99]:
svm.fit(X_train,y_train)

LinearSVC()

In [100]:
predictions = svm.predict(X_test)

In [101]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.51      0.72      0.59      3979
           2       0.38      0.25      0.30      3995
           3       0.39      0.36      0.38      3924
           4       0.45      0.32      0.37      4049
           5       0.59      0.76      0.66      4053

    accuracy                           0.48     20000
   macro avg       0.46      0.48      0.46     20000
weighted avg       0.47      0.48      0.46     20000



**CONCLUSION**

For tfidf the accuracy values were 41% and 50% for Perceptron and SVM
respectively. These values are very close to the accuracy values these models are 
giving after being trained on word2vec.
Thus it seems that training these two models on word level embeddings compared 
to document level embeddings does not lead to any significant improvement in 
accuracy of these models for predicting the ratings.

# 4. Feedforward Neural Networks

In [124]:
import torch
from torch.utils.data import DataLoader, Dataset
#import torchvision
import torchtext.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
#import matplotlib.pyplot as plt
#import numpy as np
#import pandas as pd

In [103]:
dtype=torch.cuda.FloatTensor

In [104]:
class Dataset(object):
    """An abstract class representing a Dataset.
    All other datasets should subclass it. All subclasses should override
    ``__len__``, that provides the size of the dataset, and ``__getitem__``,
    supporting integer indexing in range from 0 to len(self) exclusive.
    """

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __add__(self, other):
        return ConcatDataset([self, other])

In [106]:
class TrainRW(Dataset):
    
    def __init__(self, review, rating):
        self.labels = rating
        self.text = review
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        # load the label and text for the given index

         label = self.labels[index]
         text = self.text[index]
        
        # if self.transform is not None:
        #     text = self.transform(text)
            
        # return text, label

         return torch.from_numpy(text.astype(np.float32)), label

In [107]:
class TestRW(Dataset):
    
    def __init__(self, review, rating):
        self.labels = rating
        self.text = review
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        # load text for the given index
        text = self.text[index]
        
        # if self.transform is not None:
        #     text = self.transform(text)
            
        #return text
        return torch.from_numpy(text.astype(np.float32))

In [125]:
import torch
torch.cuda.is_available()

False

In [109]:
torch.__version__

'1.12.1+cu113'

One hot encode the training labels to feed into our model.

In [126]:
from sklearn.preprocessing import OneHotEncoder
ohe    = OneHotEncoder(sparse=False)
j= y_train.values.reshape(len(y_train),1)
y_ohe = ohe.fit_transform(j)
y_ohe.shape

(80000, 5)

# 4(a)

 Tutorial used as reference
https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist

In [111]:
train_data = TrainRW(X_train, y_ohe)
test_data = TestRW(X_test, y_test)

In [112]:
type(train_data[1][1])

numpy.ndarray

visualizing the training data

In [113]:
train_data[9]

(tensor([ 7.2510e-03,  7.1411e-03,  1.4453e-02,  1.3276e-01, -9.6436e-02,
         -4.6875e-02,  6.2732e-02, -8.5596e-02,  1.2100e-01,  5.2844e-02,
          1.2878e-02, -1.4171e-01,  1.6223e-02, -5.9717e-02, -8.1445e-02,
         -1.3867e-02,  7.3511e-02,  7.2119e-02,  4.9988e-02, -8.7866e-02,
          4.3750e-02, -1.4258e-02, -2.3828e-02, -4.1901e-03,  1.1982e-01,
         -4.3506e-02, -3.2788e-02,  1.0159e-01,  9.5847e-02,  7.9248e-02,
          4.3457e-03,  2.4683e-02,  5.4395e-02,  8.2581e-03, -3.3350e-02,
         -4.9353e-02,  3.1250e-03, -1.8628e-02,  5.9708e-02,  7.0858e-02,
          4.1406e-02,  4.2480e-03,  1.5635e-01, -2.2952e-02, -5.1941e-02,
          1.9485e-02, -1.2061e-01, -3.8477e-02,  1.1567e-01,  7.3486e-03,
          6.1224e-02, -6.0461e-02,  7.6074e-02, -6.2744e-02, -7.5317e-03,
          8.9771e-02,  8.4644e-02,  1.4575e-02, -1.7786e-02, -2.5781e-02,
         -9.1846e-02,  7.6563e-02, -6.4502e-02, -1.2896e-01, -4.2236e-03,
          1.1987e-02, -5.3711e-04,  1.

In [114]:
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20
# percentage of training set to use as validation
valid_size = 0.2


# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
    sampler=train_sampler, num_workers=num_workers,)
valid_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, 
    sampler=valid_sampler, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)

In [127]:
import torch.nn as nn
import torch.nn.functional as F

# define the NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # number of hidden nodes in each layer
        hidden_1 = 50
        hidden_2 = 10
        # linear layer (300 -> hidden_1)
        self.fc1 = nn.Linear(300, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 5)
        self.fc3 = nn.Linear(hidden_2, 5)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

# initialize the NN
model = Net()
#model.cuda()
print(model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [116]:
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01

#optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [117]:
# number of epochs to train the model
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval() # prep model for evaluation
    for data, target in valid_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.054113 	Validation Loss: 0.242447
Validation loss decreased (inf --> 0.242447).  Saving model ...
Epoch: 2 	Training Loss: 0.988229 	Validation Loss: 0.239729
Validation loss decreased (0.242447 --> 0.239729).  Saving model ...
Epoch: 3 	Training Loss: 0.971067 	Validation Loss: 0.234142
Validation loss decreased (0.239729 --> 0.234142).  Saving model ...
Epoch: 4 	Training Loss: 0.962908 	Validation Loss: 0.237177
Epoch: 5 	Training Loss: 0.957198 	Validation Loss: 0.232682
Validation loss decreased (0.234142 --> 0.232682).  Saving model ...
Epoch: 6 	Training Loss: 0.950669 	Validation Loss: 0.231263
Validation loss decreased (0.232682 --> 0.231263).  Saving model ...
Epoch: 7 	Training Loss: 0.948470 	Validation Loss: 0.230644
Validation loss decreased (0.231263 --> 0.230644).  Saving model ...
Epoch: 8 	Training Loss: 0.942286 	Validation Loss: 0.230347
Validation loss decreased (0.230644 --> 0.230347).  Saving model ...
Epoch: 9 	Training Loss: 0.937937 

Load the Model with the Lowest Validation Loss

In [131]:
model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [132]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size=1, 
    num_workers=num_workers)

In [133]:
def predict(model, dataloader):
    prediction_list = []
    for i, batch in enumerate(dataloader):
        outputs = model(batch)
        _, predicted = torch.max(outputs.data, 1) 
        #print(outputs.data)
        #print(torch.max(outputs.data, 1))
        prediction_list.append(predicted.cpu())
    return prediction_list

In [134]:
predictions = predict(model,test_loader)

In [135]:
len(predictions)

20000

In [136]:
type(predictions[0].tolist())

list

In [137]:
predictions[0].tolist()[0]

0

In [138]:
# converting the list of tensors to a list of integers and adding 1 to change 
# from indices(0 to 4) to labels(1 to 5)

pred=[]
for i in predictions:
  pred.append(i.tolist()[0]+1)

In [139]:
#pred

In [140]:
#predictions = np.array(predictions)

In [141]:
#predictions

In [142]:
type(y_test)

pandas.core.series.Series

In [143]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.58      0.63      0.61      3979
           2       0.39      0.38      0.38      3995
           3       0.42      0.38      0.40      3924
           4       0.46      0.34      0.39      4049
           5       0.61      0.78      0.69      4053

    accuracy                           0.50     20000
   macro avg       0.49      0.50      0.49     20000
weighted avg       0.49      0.50      0.49     20000



# 4(b)

Concatination of word vectors

In [144]:
X_concat=[]

for lst in sentences:
  try:
    concat = wv[lst[0]]
  except:
    concat = np.zeros(300)
  
  for i in range(1,10):
    
    try:
      concat = np.concatenate((concat,wv[lst[i]]))
    except:
      concat = np.concatenate((concat,np.zeros(300)))
  
  X_concat.append(concat)


In [146]:
X_concat[9].shape

(3000,)

In [147]:
len(X_concat)

100000

In [148]:
len(X_concat[1])

3000

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X_concat, 
                                      newdf['star_rating'], test_size=0.2)

In [150]:
# one hot encoding the labels

from sklearn.preprocessing import OneHotEncoder
ohe    = OneHotEncoder(sparse=False)
j= y_train.values.reshape(len(y_train),1)
y_ohe = ohe.fit_transform(j)
y_ohe.shape

(80000, 5)

In [152]:
train_data = TrainRW(X_train, y_ohe)
test_data = TestRW(X_test, y_test)

In [153]:
len(train_data[0][0])

3000

In [154]:
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20
# percentage of training set to use as validation
valid_size = 0.2

# convert data to torch.FloatTensor
#transform = transforms.ToTensor()

# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
    sampler=train_sampler, num_workers=num_workers,)
valid_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, 
    sampler=valid_sampler, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)

In [155]:
import torch.nn as nn
import torch.nn.functional as F

# define the NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # number of hidden nodes in each layer
        hidden_1 = 50
        hidden_2 = 10
        # linear layer (3000 -> hidden_1)
        self.fc1 = nn.Linear(3000, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 5)
        self.fc3 = nn.Linear(hidden_2, 5)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

# initialize the NN
model = Net()
#model.cuda()
print(model)

Net(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [194]:
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01

#optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [195]:
# number of epochs to train the model
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval() # prep model for evaluation
    for data, target in valid_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model2.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.972874 	Validation Loss: 0.268727
Validation loss decreased (inf --> 0.268727).  Saving model ...
Epoch: 2 	Training Loss: 0.943562 	Validation Loss: 0.272710
Epoch: 3 	Training Loss: 0.922142 	Validation Loss: 0.274256
Epoch: 4 	Training Loss: 0.902834 	Validation Loss: 0.278536
Epoch: 5 	Training Loss: 0.887126 	Validation Loss: 0.283988
Epoch: 6 	Training Loss: 0.872651 	Validation Loss: 0.291932
Epoch: 7 	Training Loss: 0.858009 	Validation Loss: 0.296557
Epoch: 8 	Training Loss: 0.845718 	Validation Loss: 0.302367
Epoch: 9 	Training Loss: 0.835409 	Validation Loss: 0.308386
Epoch: 10 	Training Loss: 0.824048 	Validation Loss: 0.314525
Epoch: 11 	Training Loss: 0.813987 	Validation Loss: 0.319673
Epoch: 12 	Training Loss: 0.806649 	Validation Loss: 0.327686
Epoch: 13 	Training Loss: 0.796446 	Validation Loss: 0.337366
Epoch: 14 	Training Loss: 0.788575 	Validation Loss: 0.339223
Epoch: 15 	Training Loss: 0.783078 	Validation Loss: 0.346340
Epoch: 16 	Trai

In [207]:
model.load_state_dict(torch.load('model2.pt'))

<All keys matched successfully>

In [208]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size=1, 
    num_workers=num_workers)

In [209]:
def predict(model, dataloader):
    prediction_list = []
    for i, batch in enumerate(dataloader):
        outputs = model(batch)
        _, predicted = torch.max(outputs.data, 1) 
        #print(outputs.data)
        #print(torch.max(outputs.data, 1))
        prediction_list.append(predicted.cpu())
    return prediction_list

In [210]:
predictions = predict(model,test_loader)

In [211]:
#converting the list of tensors to a list of integers and adding 1 to change from 
#indices(0 to 4) to labels(1 to 5)

pred=[]
for i in predictions:
  pred.append(i.tolist()[0]+1)

In [212]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.46      0.50      0.48      3996
           2       0.33      0.27      0.30      4048
           3       0.34      0.35      0.34      3966
           4       0.36      0.23      0.28      4022
           5       0.48      0.70      0.57      3968

    accuracy                           0.41     20000
   macro avg       0.39      0.41      0.39     20000
weighted avg       0.39      0.41      0.39     20000



**CONCLUSION**

The feedforward neural networks got an accuracy of 50% and 41% for the average and concatenated word2vec vectors respectively. This is very close to the accuracies we got for the Perceptron and SVM models (39% and 48% respectively).

Thus, the neural networks do not provide any significant improvement over the "Simple Models" for this classification problem.

Also, the performance we got for the  average Word2Vec vectors with FNN, closely aligns with the performance we got for SVM, and the performance we got for the  concatenated Word2Vec vectors with FNN resembles that of Perceptron.  

# 5. Recurrent Neural Networks

# **If the system crashes restart from here for the rest of the assignment.**

Used 
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html as reference

preparing input

In [1]:
import torch

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
torch.cuda.device_count()

1

In [4]:
#torch.cuda.get_device_name(0)

In [5]:
import pandas as pd
import numpy as np

!pip install --upgrade gensim
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [14]:
newdf = pd.read_csv("cleaneddata.csv")
#newdf = newdf[['star_rating','review_body']]
newdf

newdf['review_body'] = newdf['review_body'].str.split()
newdf['review_body']


sentences = newdf['review_body'].tolist()
#sentences



In [15]:
newdf.isnull().sum()

star_rating    0
review_body    0
dtype: int64

preparing data

In [16]:
batch_size = 1

X_rnn=[]

for lst in sentences:
  #print(lst)
  t = torch.zeros(20, batch_size, 300).to(device)
  for i in range(20):
    
    try:
      #print(t[i][0])
      t[i][0] = torch.from_numpy(wv[lst[i]])
      #print(wv[lst[i]])
      #print(t[i][0])
    except:
      pass
  #print(t)
  X_rnn.append(t)


In [9]:
X_rnn[0].is_cuda

False

In [10]:
len(X_rnn)

100000

In [11]:
X_rnn[1][2][0]

tensor([-0.3008,  0.0737,  0.0972, -0.1318,  0.3281,  0.1797,  0.0593,  0.0581,
        -0.0791,  0.2676, -0.0952,  0.1592,  0.1719, -0.1167,  0.2275, -0.0645,
         0.0228,  0.1797, -0.0033, -0.0437, -0.4336, -0.0820, -0.1738,  0.0479,
        -0.2100,  0.0337, -0.2178,  0.3359,  0.1680, -0.0286, -0.1582,  0.1729,
         0.0272, -0.0542, -0.1445, -0.0085,  0.1514,  0.1445, -0.4043,  0.0967,
         0.3828,  0.0723,  0.2002, -0.3262, -0.0175, -0.0815, -0.1006,  0.1226,
         0.2041, -0.5664,  0.0447, -0.0618, -0.0669, -0.1865,  0.2031,  0.2070,
        -0.0698, -0.0339, -0.0449,  0.1426,  0.0498,  0.1328, -0.2695, -0.1855,
         0.0593, -0.0527, -0.0635,  0.3535, -0.0615,  0.4766,  0.0415,  0.0284,
         0.0654, -0.2061, -0.3359, -0.0227, -0.0276, -0.2969,  0.2598, -0.1240,
         0.0117, -0.0186, -0.0057,  0.1729, -0.0554,  0.1377,  0.1943,  0.3633,
         0.2910,  0.1133, -0.0649, -0.0165, -0.0107,  0.4355, -0.1846, -0.2812,
         0.0801,  0.0364,  0.3770,  0.01

In [12]:
X_rnn[0].shape

torch.Size([20, 1, 300])

In [13]:
#torch.zeros(2,1,10)

In [14]:
type(X_rnn[0][0][0])

torch.Tensor

Splitting the data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_rnn, 
                                      newdf['star_rating'], test_size=0.2)

In [19]:
# from sklearn.preprocessing import OneHotEncoder
# ohe    = OneHotEncoder(sparse=False)
# j= y_train.values.reshape(len(y_train),1)
# y_ohe = ohe.fit_transform(j)
# y_ohe.shape

In [20]:
y_tensor=[]
for i in y_train:
   y_tensor.append(torch.tensor([i-1]).to(device))

In [19]:
type(y_tensor[0])

torch.Tensor

In [20]:
y_tensor[0].is_cuda

False

In [21]:
X_train[1].size()

torch.Size([20, 1, 300])

# 5 (a)

Building the RNN

In [50]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 20
rnn = RNN(300, n_hidden, 5)

testing on a single input word

In [51]:
input = X_rnn[70][1]
hidden = torch.zeros(1, 20)

output, next_hidden = rnn(input, hidden)

In [52]:
input.shape

torch.Size([1, 300])

In [53]:
next_hidden

tensor([[-0.0510,  0.0822, -0.0475, -0.0424,  0.0198, -0.0118, -0.0505, -0.1642,
          0.0293, -0.0511, -0.0993, -0.0210, -0.0721,  0.0308,  0.0806,  0.0274,
          0.0301,  0.0286, -0.0825, -0.1470]], grad_fn=<AddmmBackward0>)

In [54]:
output

tensor([[-1.6628, -1.5736, -1.5548, -1.6330, -1.6269]],
       grad_fn=<LogSoftmaxBackward0>)

understanding the working of topk funtion

In [55]:
x = torch.arange(1., 6.)
print(x)
top_n, top_i = torch.topk(x, 1)

tensor([1., 2., 3., 4., 5.])


In [56]:
top_i[0].item()+1


5

In [57]:
# def categoryFromOutput(output):
#     top_n, top_i = output.topk(1)
#     category_i = top_i[0].item()
#     return category_i+1

In [30]:
#defining loss function
criterion = nn.NLLLoss()

In [41]:
learning_rate = 0.001 

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

training

In [58]:
epochs=8
for e in range(epochs):
  for i in range(80000):
      output, loss = train(y_tensor[i], X_train[i])
  print(loss)


1.6044834852218628
1.6049904823303223
1.6604976654052734
1.730271577835083
1.6326987743377686
1.642892599105835
1.6316699981689453
1.6272491216659546


predictions


In [59]:
def evaluate(line_tensor):
    hidden = rnn.initHidden()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    return output


In [60]:
def predict(input):
    with torch.no_grad():
        output = evaluate(input)
    return output

In [61]:
pred=[]
for i in range(20000):
  out = predict(X_test[i])
  top_n, top_i = torch.topk(out, 1)
  pred.append(top_i[0].item()+1)

In [62]:
#pred

In [63]:
from sklearn.metrics import f1_score, recall_score, classification_report, precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [64]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.43      0.75      0.54      3957
           2       0.29      0.11      0.16      4038
           3       0.34      0.43      0.38      4063
           4       0.39      0.30      0.34      3996
           5       0.62      0.53      0.58      3946

    accuracy                           0.42     20000
   macro avg       0.41      0.42      0.40     20000
weighted avg       0.41      0.42      0.40     20000



In [65]:
print(accuracy_score(y_test, pred))

0.422


**CONCLUSION**

Comparing RNN to FNN we see that RNN is giving an accuracy of around 43% which lies within the range of accuracy given by FNN for averaged and concatenated datasets. Thus, we can say that RNN does not perform better than FNN in this case. This may be due to the fact that our dataset is very simple, that is, reviews having negative words are low rated and reviews having positive words are highly rated. The reviews having negative words with high ratings and reviews with positive words having low ratings may be very less in our dataset. So, not much use of memory of RNNs as long term dependencies do not come into play much. 

# 5 (b)

In [32]:
import torch.nn as nn

class GRU2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRU2, self).__init__()

        self.hidden_size = hidden_size

        #self.gru = nn.GRU(input_size, hidden_size)
        self.i2h = nn.GRU(input_size, hidden_size)
        #self.i2h = nn.GRU(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        #self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        #combined = torch.cat((input,hidden), 1)
        #combined = reduce(lambda x,y: torch.cat((x,y)), list_tensor[:-1])
        #hidden = self.initHidden()
        output, hidden = self.i2h(input)
        output = self.i2o(output)
        output = self.softmax(output)
        return output, hidden             

    def initHidden(self):
        return torch.zeros(1, self.hidden_size).to(device)

    # def forward(self, x):
    #     h0 = torch.zeros(x.size(0), self.hidden_size)
        
    #     out,_ = self.gru(x, h0)
    #     out = out.reshape(out.shape[0], -1)
    #     out = self.fc(out)
    #     return out



n_hidden = 20
gru = GRU2(300, n_hidden, 5).to(device)

In [67]:
print(device)

cpu


In [None]:
X_rnn[0].shape

torch.Size([20, 1, 300])

testing on a single input

In [33]:
input = X_rnn[70000][0].to(device)
hidden = torch.zeros(1, 20).to(device)


#output, next_hidden = gru(input, hidden)
output, next_hidden = gru(input)

In [94]:
input.shape

torch.Size([1, 300])

In [95]:
type(output)

torch.Tensor

In [96]:
output

tensor([[-1.5371, -1.5556, -1.6692, -1.6139, -1.6796]],
       grad_fn=<LogSoftmaxBackward0>)

In [97]:
output.size()

torch.Size([1, 5])

In [98]:
output.shape[0]

1

In [99]:
#output.reshape(output.shape[1], -1)

In [100]:
type(next_hidden)

torch.Tensor

In [101]:
hidden

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [102]:
next_hidden

tensor([[-0.1024, -0.0620,  0.0651, -0.0901,  0.1541,  0.3548,  0.2269,  0.2714,
         -0.1405,  0.2131,  0.0745,  0.0784, -0.1761,  0.0017, -0.3040, -0.2906,
          0.3127,  0.0436, -0.1989,  0.1522]], grad_fn=<SqueezeBackward1>)

In [34]:
o2, h2 = gru(X_rnn[70001][0])

In [105]:
h2

tensor([[-0.0179, -0.1378, -0.0002,  0.0898, -0.0494,  0.0860,  0.1210,  0.0348,
         -0.0361,  0.0052, -0.1434,  0.0590,  0.0380,  0.1283,  0.0090, -0.0451,
          0.0064,  0.1396,  0.1589,  0.0355]], grad_fn=<SqueezeBackward1>)

In [106]:
#defining loss function
criterion = nn.NLLLoss()

In [107]:
learning_rate = 0.002

def traingru(category_tensor, line_tensor):
    hidden = gru.initHidden()
    #hidden = torch.zeros(1, 20)

    gru.zero_grad()

    for i in range(line_tensor.size()[0]):
        #output, hidden = gru(line_tensor[i], hidden)
        output, hidden = gru(line_tensor[i])

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in gru.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [108]:
epochs=5
for e in range(epochs):
  for i in range(80000):
      #try:
        output, loss = traingru(y_tensor[i], X_train[i])
      #except:
      #  print(i)
  print(loss)


1.5669699907302856
1.5284433364868164
1.5298733711242676
1.5444746017456055
1.5598870515823364


In [109]:
def evaluate(line_tensor):
    hidden = gru.initHidden()
    #hidden = torch.zeros(1, 20)

    for i in range(line_tensor.size()[0]):
        #output, hidden = gru(line_tensor[i], hidden)
        output, hidden = gru(line_tensor[i])
    return output


In [110]:
def predict(input):
    with torch.no_grad():
        output = evaluate(input)
    return output

In [111]:
pred=[]
for i in range(20000):
  
  out = predict(X_test[i])
  
  top_n, top_i = torch.topk(out, 1)
  pred.append(top_i[0].item()+1)

In [30]:
from sklearn.metrics import f1_score, recall_score, classification_report, precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [113]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.26      0.19      0.22      3957
           2       0.24      0.14      0.18      4038
           3       0.23      0.25      0.24      4063
           4       0.25      0.07      0.11      3996
           5       0.24      0.56      0.33      3946

    accuracy                           0.24     20000
   macro avg       0.24      0.24      0.22     20000
weighted avg       0.24      0.24      0.22     20000



In [114]:
print(accuracy_score(y_test, pred))

0.2407


**CONCLUSION**

GRU does not seem to do any better than RNN. It may due to the fact that our dataset is very simple and hence long range dependencies do not come into play. So, reviews with negative words are low rated and reviews with positive words are high rated. The number of reviews having positive words with low rating or reviews with negative words having high rating seem to be very small. Also, I trained the GRU and RNN for very few epochs. Training them for more epochs could increase their accuracy.

# Testing a variation of GRU where input + hidden vectors are fed as input to the GRU 

In [21]:
import torch.nn as nn

class GRU2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRU2, self).__init__()

        self.hidden_size = hidden_size

        #self.gru = nn.GRU(input_size, hidden_size)
        #self.i2h = nn.GRU(input_size, hidden_size)
        self.i2h = nn.GRU(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size * 2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        #self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        combined = torch.cat((input,hidden), 1)
        #combined = reduce(lambda x,y: torch.cat((x,y)), list_tensor[:-1])
        #hidden = self.initHidden()
        output, hidden = self.i2h(combined)
        combined = torch.cat((output,hidden), 1)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden             

    def initHidden(self):
        return torch.zeros(1, self.hidden_size).to(device)

    # def forward(self, x):
    #     h0 = torch.zeros(x.size(0), self.hidden_size)
        
    #     out,_ = self.gru(x, h0)
    #     out = out.reshape(out.shape[0], -1)
    #     out = self.fc(out)
    #     return out



n_hidden = 20
gru = GRU2(300, n_hidden, 5).to(device)

In [22]:
#defining loss function
criterion = nn.NLLLoss()

In [23]:
learning_rate = 0.002

def traingru(category_tensor, line_tensor):
    hidden = gru.initHidden()
    #hidden = torch.zeros(1, 20)

    gru.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
        #output, hidden = gru(line_tensor[i])

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in gru.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [24]:
epochs=2
for e in range(epochs):
  for i in range(80000):
      #try:
        output, loss = traingru(y_tensor[i], X_train[i])
      #except:
      #  print(i)
  print(loss)


1.5117350816726685
1.5787842273712158


In [25]:
def evaluate(line_tensor):
    hidden = gru.initHidden()
    #hidden = torch.zeros(1, 20)

    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
        #output, hidden = gru(line_tensor[i])
    return output


In [26]:
def predict(input):
    with torch.no_grad():
        output = evaluate(input)
    return output

In [27]:
pred=[]
for i in range(20000):
  
  out = predict(X_test[i])
  
  top_n, top_i = torch.topk(out, 1)
  pred.append(top_i[0].item()+1)

In [31]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.27      0.20      0.23      4019
           2       0.32      0.01      0.01      4034
           3       0.23      0.32      0.27      4020
           4       0.23      0.18      0.20      3927
           5       0.26      0.55      0.36      4000

    accuracy                           0.25     20000
   macro avg       0.26      0.25      0.21     20000
weighted avg       0.26      0.25      0.21     20000



INSIGHT - The variation on GRU gives similar accuracy as the general GRU.

# Generating requirements file

In [128]:
#uncomment the line below to generate a requirements file

#!pip freeze > requirements.txt