# **Amazon Reviews sentiment Analysis**

### **Imports**

In [283]:
from torchtext import datasets
import pandas as pd
import numpy as np
import spacy
from torch.utils.data import DataLoader,Dataset
import torch
from torch import nn
import  torch.nn.functional as F

### **loading dataset**

In [284]:
train,test = datasets.AmazonReviewFull(root='data/')

In [285]:
train = pd.read_csv('data/AmazonReviewFull/amazon_review_full_csv/train.csv',nrows=5000,header=None)

In [286]:
train.dropna(inplace=True)

In [287]:
test = pd.read_csv('data/AmazonReviewFull/amazon_review_full_csv/test.csv',nrows=1000,header=None)

In [288]:
test.dropna(inplace=True)

In [289]:
test.dropna(inplace=True)

### **preprocessing**

#### **merging**

In [290]:
train['reviews'] = train.iloc[:,1]+ ' ' +train.iloc[:,2]

In [291]:
train

Unnamed: 0,0,1,2,reviews
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...,more like funchuck Gave this to my dad for a g...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...,Too good to be true Probably the greatest soun...
...,...,...,...,...
4995,2,Waste oF $MONEY$,Waste Of Time & Money... I found that if you w...,Waste oF $MONEY$ Waste Of Time & Money... I fo...
4996,4,trying to win better,This book cuts down the odds with a better cha...,trying to win better This book cuts down the o...
4997,2,"don""t waste your money just buy the the lotter...",this book have almost the same information as ...,"don""t waste your money just buy the the lotter..."
4998,2,The odds are against you not for you.,Winning is pure luck. There is no amount of st...,The odds are against you not for you. Winning ...


#### **dropping other merged individual columns**

In [292]:
train.drop([1,2],axis=1,inplace=True)

#### **encoding** labels(1-5) -> (0-4)

In [293]:
#some ordinal encoding on labels
train[0] = train[0].apply(lambda x: x-1)

In [294]:
train[0].unique()

array([2, 4, 3, 0, 1])

**get spacy work done**

In [295]:
nlp = spacy.load('en_core_web_sm')

In [296]:
nlp

<spacy.lang.en.English at 0x7fb30d55b100>

#### **tokenization, lemmatization, punctuation removal**

In [297]:
def prep(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

In [298]:
text = 'It can be done better than this, because this version little bit harsh when it comes to prep'
print(prep(text))

['well', 'version', 'little', 'bit', 'harsh', 'come', 'prep']


### **textual prep. is almost done, yet word 2 vector**

In [299]:
from sklearn.model_selection import train_test_split
from torchtext.vocab import FastText


In [300]:
word_indexer = FastText('simple')

In [301]:
#Fast text has vocab, it can map words to indexes in that vocab
#we cleaned text, got token lemmatized, now we can get indexes for each word
word_indexer.stoi['chicken']

5160

In [302]:
# # but there is a case when word is not existed in that vocab
# word_indexer.stoi('Saidalikhon')
# #we should handle it

**token endcoding**

In [303]:
def token_encoder(token,word_indexer):
    if token == '<pad>':
        return 1
    else:
        try:
            return word_indexer.stoi[token]
        except:
            if type(token) != str :
                print(f'expected str, but got {type(token)} instead.')
            else:
                return 0

In [304]:
token_encoder('Saidalikhon',word_indexer)

0

**text encoding**

In [305]:
def text_encoder(tokens,word_indexer):
    '''
    input - list of lemmatized tokens
    returns - list of encoded tokens
    '''
    return [token_encoder(token,word_indexer) for token in tokens]

In [306]:
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
text_encoder(prep(text),word_indexer)



[624, 216, 1044, 0, 13200, 178, 6975, 220]

**padding**

In [307]:
#we have reviews, with different length
#but our model expects the same dimensionality across its life cycle
#so we should add padding(if review has less amount word than max_length)
#or we should slice the review if it has more than max_length
# 5 * [1] -> [1,1,1,1,1], 0 * [1] -> [], -x * [1] -> []
def padding(list_indexed,max_length,pad=1):
    #padding short reviews
    res = list_indexed + (max_length - len(list_indexed))*[pad]
    # slicing prior to return, if review is longer
    return res[:max_length]

In [308]:
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
padding(text_encoder(prep(text),word_indexer),10)

[624, 216, 1044, 0, 13200, 178, 6975, 220, 1, 1]

In [309]:
padding(text_encoder(prep(text),word_indexer),5)

[624, 216, 1044, 0, 13200]

**splitting into train and val**

In [310]:
x = train['reviews'].values
y=train[0].values
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.2,random_state=42)

**Custom Dataset**

In [311]:
class CustomDataset(Dataset):
    def __init__(self,x,y,max_length=32):
        self.max_length = max_length
        self.vec = FastText('simple')
        self.vectorizer = lambda x: self.vec.vectors[x]
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])
        self.vec.vectors[0] = -torch.ones(self.vec.vectors[0].shape[0])
        self.labels = y
        self.inputs = [padding(text_encoder(prep(review),self.vec),self.max_length) for review in x]
        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        assert len(self.inputs[i]) == self.max_length
        return self.inputs[i],self.labels[i]



#### we create our custom collate function, later use inside dataloader, as collate function to batchify

In [312]:
trainset = CustomDataset(x_train,y_train)

In [313]:
#each item of the the set, contains (input,label)
print(trainset[0][1],'\n',trainset[0][0])

3 
 [2847, 930, 0, 0, 31413, 2895, 5828, 5072, 3776, 0, 216, 229, 338, 14262, 930, 437, 0, 437, 16633, 30659, 3192, 1368, 1358, 14262, 1948, 148, 0, 2895, 6143, 1736, 5828, 0]


In [314]:
trainset = CustomDataset(x_train,y_train)
def collate(batch,vectorizer=FastText('simple').vectors):
    #inner torch.stack is stacking vectorized words into review tensor
    #outer torch.stack is stacking that review into batch tensor
    inputs = torch.stack([torch.stack([vectorizer[token] for token in item[0]]) for item in batch])
    #
    #converting labels into Long type tensors, as criterion functions expects that dtype
    labels = torch.LongTensor([item[1] for item in batch])
    return inputs, labels
dataloader_tr = DataLoader(trainset,batch_size=64,collate_fn=collate)


In [315]:
next(iter(dataloader_tr))[0].shape


torch.Size([64, 32, 300])

In [316]:
valset = CustomDataset(x_val,y_val)
dataloader_val = DataLoader(valset,batch_size=64,collate_fn=collate)


## Model

In [322]:
embed_dim =300
class Model(nn.Module):
    def __init__(self,hidden1,hidden2,out_dim,max_length=32):
        super(Model,self).__init__()
        self.fc = nn.Linear(embed_dim*max_length,hidden1)
        self.fc2 = nn.Linear(hidden1,hidden2)
        self.fc3 = nn.Linear(hidden2,out_dim)
    
    def forward(self,x):
        z = F.relu(self.fc(x))
        z = F.relu(self.fc2(z))
        z = F.log_softmax(self.fc3(z),dim=1)

        return z

        


In [323]:
from torch.optim import Adam
model = Model(100,50,5)
criterion = nn.NLLLoss()
optimizer = Adam(model.parameters(),lr=0.003)
epochs = 2
for epoch in range(epochs):
    print(f'running {epoch+1} out of {epochs} epochs ')
    model.train()
    for i,(x_tr,y_tr) in enumerate(iter(dataloader_tr)):
        optimizer.zero_grad()
        print(i)
        pred_tr = model.forward(x_tr.view(x_tr.shape[0],-1))
        loss = criterion(pred_tr,y_tr)
        loss.backward()
        optimizer.step()
    print(loss.item())

    

running 1 out of 2 epochs 
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
1.5515178442001343
running 2 out of 2 epochs 
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
1.0193592309951782
