# **Amazon Reviews sentiment Analysis**

### **Imports**

In [169]:
from torchtext import datasets
import pandas as pd
import numpy as np
import spacy
from torch.utils.data import DataLoader,Dataset
import torch
from torch import nn
import  torch.nn.functional as F

### **loading dataset**

In [170]:
train,test = datasets.AmazonReviewFull(root='data/')

In [171]:
train = pd.read_csv('data/AmazonReviewFull/amazon_review_full_csv/train.csv',nrows=500,header=None)

In [172]:
train.dropna(inplace=True)

In [173]:
test = pd.read_csv('data/AmazonReviewFull/amazon_review_full_csv/test.csv',nrows=100,header=None)

In [174]:
test.dropna(inplace=True)

In [175]:
test.dropna(inplace=True)

### **preprocessing**

#### **merging**

In [176]:
train['reviews'] = train.iloc[:,1]+ ' ' +train.iloc[:,2]

In [177]:
train

Unnamed: 0,0,1,2,reviews
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...,more like funchuck Gave this to my dad for a g...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...,Too good to be true Probably the greatest soun...
...,...,...,...,...
495,2,not really worth it,I wore this for my wedding since I hadn't lost...,not really worth it I wore this for my wedding...
496,2,Painful Experience,I think the concept is great but with this par...,Painful Experience I think the concept is grea...
497,3,It works but it hurts,It sucked all of me in & I looked great. I wor...,It works but it hurts It sucked all of me in &...
498,3,not what I expected,"Based on the positive reviews, I was excited t...",not what I expected Based on the positive revi...


#### **dropping other merged individual columns**

In [178]:
train.drop([1,2],axis=1,inplace=True)

#### **encoding** labels(1-5) -> (0-4)

In [179]:
#some ordinal encoding on labels
train[0] = train[0].apply(lambda x: x-1)

In [180]:
train[0].unique()

array([2, 4, 3, 0, 1])

**get spacy work done**

In [181]:
nlp = spacy.load('en_core_web_sm')

In [182]:
nlp

<spacy.lang.en.English at 0x7f9fa9c4cc40>

#### **tokenization, lemmatization, punctuation removal**

In [183]:
def prep(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

In [184]:
text = 'It can be done better than this, because this version little bit harsh when it comes to prep'
print(prep(text))

['well', 'version', 'little', 'bit', 'harsh', 'come', 'prep']


### **textual prep. is almost done, yet word 2 vector**

In [185]:
from sklearn.model_selection import train_test_split
from torchtext.vocab import FastText


In [186]:
word_indexer = FastText('simple')

In [187]:
#Fast text has vocab, it can map words to indexes in that vocab
#we cleaned text, got token lemmatized, now we can get indexes for each word
word_indexer.stoi['chicken']

5160

In [188]:
# # but there is a case when word is not existed in that vocab
# word_indexer.stoi('Saidalikhon')
# #we should handle it

**token endcoding**

In [189]:
def token_encoder(token,word_indexer):
    if token == '<pad>':
        return 1
    else:
        try:
            return word_indexer.stoi[token]
        except:
            if type(token) != str :
                print(f'expected str, but got {type(token)} instead.')
            else:
                return 0

In [190]:
token_encoder('Saidalikhon',word_indexer)

0

**text encoding**

In [191]:
def text_encoder(tokens,word_indexer):
    '''
    input - list of lemmatized tokens
    returns - list of encoded tokens
    '''
    return [token_encoder(token,word_indexer) for token in tokens]

In [192]:
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
text_encoder(prep(text),word_indexer)



[624, 216, 1044, 0, 13200, 178, 6975, 220]

**padding**

In [193]:
#we have reviews, with different length
#but our model expects the same dimensionality across its life cycle
#so we should add padding(if review has less amount word than max_length)
#or we should slice the review if it has more than max_length
# 5 * [1] -> [1,1,1,1,1], 0 * [1] -> [], -x * [1] -> []
def padding(list_indexed,max_length,pad=1):
    #padding short reviews
    res = list_indexed + (max_length - len(list_indexed))*[pad]
    # slicing prior to return, if review is longer
    return res[:max_length]

In [194]:
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
padding(text_encoder(prep(text),word_indexer),10)

[624, 216, 1044, 0, 13200, 178, 6975, 220, 1, 1]

In [195]:
padding(text_encoder(prep(text),word_indexer),5)

[624, 216, 1044, 0, 13200]

**splitting into train and val**

In [196]:
x = train['reviews'].values
y=train[0].values
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.2,random_state=42)

**Custom Dataset**

In [197]:
class CustomDataset(Dataset):
    def __init__(self,x,y,max_length=32):
        self.max_length = max_length
        self.vec = FastText('simple')
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])
        self.vec.vectors[0] = -torch.ones(self.vec.vectors[0].shape[0])
        self.labels = y
        self.inputs = [padding(text_encoder(prep(review),self.vec),self.max_length) for review in x]
        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        assert len(self.inputs[i]) == self.max_length
        return self.inputs[i],self.labels[i]



#### we create our custom collate function, later use inside dataloader, as collate function to batchify

In [198]:
#each item of the the set, contains (input,label)

In [199]:
trainset = CustomDataset(x_train,y_train)
def collate(batch,vectorizer=FastText('simple').vectors):
    #inner torch.stack is stacking vectorized words into review tensor
    #outer torch.stack is stacking that review into batch tensor
    inputs = torch.stack([torch.stack([vectorizer[token] for token in item[0]]) for item in batch])
    #
    #converting labels into Long type tensors, as criterion functions expects that dtype
    labels = torch.LongTensor([item[1] for item in batch])
    return inputs, labels
dataloader_tr2save = DataLoader(trainset,batch_size=64,collate_fn=collate)
torch.save(dataloader_tr2save,'train.pth')
dataloader_tr = torch.load('train.pth')


AttributeError: Can't pickle local object 'CustomDataset.__init__.<locals>.<lambda>'

In [None]:
valset = CustomDataset(x_val,y_val)
dataloader_val2save = DataLoader(valset,batch_size=64,collate_fn=collate)
torch.save(dataloader_val2save,'test.pth')
dataloader_val = torch.load('test.pth')


## Model

In [None]:
embed_dim =300
class Model(nn.Module):
    def __init__(self,hidden1,hidden2,out_dim,max_length=32):
        super(Model,self).__init__()
        self.fc = nn.Linear(max_length*embed_dim,hidden1)
        self.fc2 = nn.Linear(hidden1,hidden2)
        self.fc3 = nn.Linear(hidden2,out_dim)
    
    def forward(self,x):
        z = F.relu(self.fc(x))
        z = F.relu(self.fc2(z))
        z = F.log_softmax(self.fc3(z),dim=1)

        return z

        


In [None]:
from torch.optim import Adam
model = Model(100,50,5)
criterion = nn.NLLLoss()
optimizer = Adam(model.parameters(),lr=0.003)
epochs = 2
pr = 100
for epoch in range(epochs):
    losses_val = []
    losses_tr = []
    accs_val = []
    accs_tr = []
    acc_tr = 0
    acc_val = 0
    loss_tr = 0
    loss_val = 0

    print(f'running {epoch+1} out of {epochs} epochs ')
    model.train()
    for i,(x_tr,y_tr) in enumerate(iter(dataloader_tr)):
        optimizer.zero_grad()
        pred_tr = model.forward(x_tr.view(x_tr.shape[0],-1))
        pred_tr_ps = torch.exp(pred_tr.detach())
        loss_t = criterion(pred_tr,y_tr)
        loss_t.backward()
        optimizer.step()

        #metrics
        loss_tr += loss_t.item()
        acc_tr += y_tr.eq(pred_tr_ps.argmax()).float().mean().item()
        if (i+1) % pr == 0 :
            print(f'\trunning train iteration {i+1}')
            print(f'\t\ttrain loss : {loss_tr/pr:.3f}\t accuracy : {acc_tr/pr:.3f}')
            losses_tr.append(loss_tr/pr)
            accs_tr.append(acc_tr/pr)
            loss_tr = 0
            acc_tr = 0
    print(f'train loss : {np.mean(losses_tr)}\t accuracy : {np.mean(accs_tr)}')

    
    model.eval()
    with torch.no_grad():
        for j,(x_val,y_val) in enumerate(iter(dataloader_val)):
            pred_val = model.forward(x_val.view(x_val.shape[0],-1))
            pred_val_ts = torch.exp(pred_val.detach())
            loss_v = criterion(pred_val,y_val)
            #metrics
            loss_val += loss_v.item()
            acc_val += y_val.eq(torch.exp(pred_val).detach().argmax()).float().mean().item()
            if (j+1) % pr == 0 :
                print(f'\trunning validation iteration {j+1}')
                print(f'\t\tvalidation loss : {loss_val/pr:.3f}\t accuracy : {acc_val/pr:.3f}')
                losses_val.append(loss_val/pr)
                accs_val.append(acc_val/pr)
                loss_val = 0
                acc_val = 0
        print(f'** train loss : {np.mean(losses_val):.3f}\t accuracy : {np.mean(accs_val):3f} **')


    

running 1 out of 2 epochs 
	running train iteration 100
		train loss : 1.546	 accuracy : 0.002
	running train iteration 200
		train loss : 1.478	 accuracy : 0.004
	running train iteration 300
		train loss : 1.444	 accuracy : 0.010
	running train iteration 400
		train loss : 1.427	 accuracy : 0.004
	running train iteration 500
		train loss : 1.421	 accuracy : 0.006
	running train iteration 600
		train loss : 1.406	 accuracy : 0.002
train loss : 1.4536795826752982	 accuracy : 0.004609375
	running validation iteration 100
		validation loss : 1.411	 accuracy : 0.002
** train loss : 1.4105382359027863	 accuracy : 0.00171875 **
running 2 out of 2 epochs 
	running train iteration 100
		train loss : 1.335	 accuracy : 0.002
	running train iteration 200
		train loss : 1.304	 accuracy : 0.002
	running train iteration 300
		train loss : 1.294	 accuracy : 0.000
	running train iteration 400
		train loss : 1.286	 accuracy : 0.002
	running train iteration 500
		train loss : 1.291	 accuracy : 0.004
	ru

In [None]:
#testing
with torch.no_grad():
    model.eval()
    for 