# **Amazon Reviews Sentiment Analysis**

### **Imports**

In [1]:
from torchtext import datasets
import pandas as pd
import numpy as np
import spacy
from torch.utils.data import DataLoader,Dataset
import torch
from torch import nn
import  torch.nn.functional as F

In [2]:
# datasets.AmazonReviewFull(root='data')

### **loading dataset**

In [3]:
def load(path,nrows=None):
    """
    Read a header-less review CSV and return review texts with zero-based labels.

    Column 0 is used as the star rating; columns 1 and 2 are two text
    fields that are joined with a single space into one review string.

    takes - path: str (anything pandas.read_csv accepts)
            nrows: int or None, number of rows to read (None reads the whole file)
    returns -> features-ndarray (review strings) and labels-ndarray (rating - 1)
    """
    df = pd.read_csv(path,header=None,nrows=nrows)
    # shift the ratings down by one so that class labels start at 0
    df['stars'] = df[0] - 1
    # merge the two text columns into a single review string
    df['reviews'] = df[1] + ' ' + df[2]
    # keep only the derived columns, then discard incomplete rows: a missing
    # text field propagates as NaN through the concatenation above
    df = df.drop([0,1,2],axis=1).dropna()

    return df['reviews'].values,df['stars'].values

In [4]:
def split(path,nrows,test_size=0.2):
    '''
    Load the reviews from `path` and divide them into train and validation parts.

    takes path, number of rows to read, test ratio
    returns x_tr, x_val, y_train, y_val (the result of train_test_split with random_state=0)
    '''
    features,targets = load(path,nrows)
    print('splitted')

    parts = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=0,
    )
    return parts

### **preprocessing**

**Load the spaCy pipeline**

In [5]:
# Load spaCy's 'en_core_web_sm' English pipeline once; prep() below reuses it for every review.
nlp = spacy.load('en_core_web_sm')

In [6]:
nlp

<spacy.lang.en.English at 0x7fa940081190>

#### **tokenization, lemmatization, punctuation and stop-word removal**

In [7]:
def prep(text):
    """
    Turn raw text into a list of lemmas.

    The text is processed by the module-level spaCy pipeline `nlp`;
    punctuation tokens and stop words are skipped, and the lemma of every
    remaining token is collected in document order.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_punct or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [8]:
# Quick check of prep() on a sample sentence: only the lemmas of tokens that are
# neither punctuation nor stop words are printed.
text = 'It can be done better than this, because this version little bit harsh when it comes to prep'
print(prep(text))

['well', 'version', 'little', 'bit', 'harsh', 'come', 'prep']


### **Text preprocessing is almost done; next step: word-to-vector mapping**

In [9]:
from sklearn.model_selection import train_test_split
from torchtext.vocab import FastText


In [10]:
# FastText vectors loaded with the 'simple' language code; the cells below use
# its `stoi` mapping to turn tokens into vocabulary indices.
word_indexer = FastText('simple')

In [11]:
# FastText comes with a vocabulary; `stoi` maps each word to its index in it.
# The text is already cleaned and lemmatized, so each token can now be looked up.
word_indexer.stoi['chicken']

5160

In [12]:
# # but a word may not exist in that vocab, in which case the lookup fails
# word_indexer.stoi['Saidalikhon']
# # so we have to handle that case

**token encoding**

In [13]:
def token_encoder(token,word_indexer):
    """
    Map a single token to its integer index in the vocabulary.

    takes - token: str
            word_indexer: object exposing a `stoi` mapping (token -> index)
    returns - 1 for the literal '<pad>' token,
              the vocabulary index for a token found in `stoi`,
              0 for any token that cannot be looked up
    """
    if token == '<pad>':
        return 1
    try:
        return word_indexer.stoi[token]
    except (KeyError,TypeError):
        # KeyError: token is not in the vocabulary.
        # TypeError: token is unhashable and cannot be a key at all.
        if not isinstance(token,str):
            print(f'expected str, but got {type(token)} instead.')
        # always fall back to the unknown index so that the encoded
        # sequence never contains None
        return 0

In [14]:
# A word that is missing from the vocabulary is encoded as 0.
token_encoder('Saidalikhon',word_indexer)

0

**text encoding**

In [15]:
def text_encoder(tokens,word_indexer):
    '''
    Encode every token of a review with token_encoder, keeping the order.

    input - list of lemmatized tokens
    returns - list of encoded tokens
    '''
    encoded = []
    for tok in tokens:
        encoded.append(token_encoder(tok,word_indexer))
    return encoded

In [16]:
# Encode a sample sentence end to end: prep() yields lemmas, text_encoder() turns
# each lemma into its vocabulary index (0 where the lemma is not in the vocabulary).
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
text_encoder(prep(text),word_indexer)



[624, 216, 1044, 0, 13200, 178, 6975, 220]

**padding**

In [17]:
# Reviews come in different lengths,
# but the model expects the same input dimensionality throughout its life cycle.
# So we pad a review that has fewer than max_length tokens,
# and we truncate a review that has more than max_length tokens.
# List repetition handles every case: 5 * [1] -> [1,1,1,1,1], 0 * [1] -> [], -x * [1] -> []
def padding(list_indexed,max_length,pad=1):
    """
    Return a new list of exactly `max_length` entries built from `list_indexed`.

    Longer inputs are cut to their first `max_length` entries; shorter
    inputs are extended on the right with `pad`.
    """
    # truncate first; a slice never fails, even when the list is shorter
    clipped = list_indexed[:max_length]
    # a non-positive count makes the repetition below an empty list
    missing = max_length - len(clipped)
    return clipped + [pad] * missing

In [18]:
# max_length larger than the encoded review: the pad index (1) is appended until the length is 10.
text = 'It might be the case , when you got job done in Pytorch, then realise there is a better and efficient way of it.'
padding(text_encoder(prep(text),word_indexer),10)

[624, 216, 1044, 0, 13200, 178, 6975, 220, 1, 1]

In [19]:
# max_length smaller than the encoded review: only the first 5 indices are kept.
padding(text_encoder(prep(text),word_indexer),5)

[624, 216, 1044, 0, 13200]

**Custom Dataset**

In [20]:
class CustomDataset(Dataset):
    """
    Dataset of reviews stored as fixed-length lists of vocabulary indices.

    Every review is passed through prep -> text_encoder -> padding once, at
    construction time, so __getitem__ is a plain list lookup.
    """
    def __init__(self,x,y,max_length=32):
        """
        takes - x: iterable of review strings
                y: labels aligned with x (must support len() and indexing)
                max_length: number of indices kept per review
        """
        self.max_length = max_length
        self.vec = FastText('simple')
        dim = self.vec.vectors.shape[1]
        # Give the two special indices produced by the encoding step their own
        # vectors: 0 is returned for words missing from the vocabulary, 1 is the
        # padding index. Assigning both vectors to index 0 would discard the
        # zero vector and leave index 1 as an ordinary word vector.
        # NOTE(review): assumes zeros are meant for unknown words and -1s for padding - confirm.
        self.vec.vectors[0] = torch.zeros(dim)
        self.vec.vectors[1] = -torch.ones(dim)
        self.labels = y
        # only the vocabulary (stoi) of self.vec is needed for the encoding itself
        self.inputs = [padding(text_encoder(prep(review),self.vec),self.max_length) for review in x]


    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return (list of max_length indices, label) for sample i."""
        # internal sanity check: padding() must have produced max_length entries
        assert len(self.inputs[i]) == self.max_length
        return self.inputs[i],self.labels[i]



#### We define a custom collate function and pass it to the DataLoader as `collate_fn` to build the batches

In [21]:

# Cache for the default embedding matrix, so that it is loaded at most once
# and only when collate() is actually called without an explicit vectorizer.
_DEFAULT_VECTORS = None


def _default_vectors():
    """Load the FastText 'simple' embedding matrix on first use and reuse it afterwards."""
    global _DEFAULT_VECTORS
    if _DEFAULT_VECTORS is None:
        _DEFAULT_VECTORS = FastText('simple').vectors
    return _DEFAULT_VECTORS


def collate(batch,vectorizer=None):
    """
    Turn a list of (indices, label) samples into a pair of batch tensors.

    takes - batch: list of (list of vocabulary indices, label) pairs; every
                   index list must have the same length
            vectorizer: 2-D tensor whose row i is the vector of index i;
                        None (default) uses the FastText 'simple' vectors
    returns - inputs: tensor of shape (batch, tokens, vector size)
              labels: int64 tensor of shape (batch,)
    """
    if vectorizer is None:
        vectorizer = _default_vectors()
    # one indexing operation looks up every token of every review at once,
    # giving the same result as stacking the vectors token by token
    token_ids = torch.tensor([item[0] for item in batch],dtype=torch.long)
    inputs = vectorizer[token_ids]
    # labels are converted to int64 because the loss function expects that dtype
    labels = torch.tensor([item[1] for item in batch],dtype=torch.long)
    return inputs, labels


In [22]:
#train_path = 'data/AmazonReviewFull/amazon_review_full_csv/train.csv'
# x_tr,x_val,y_tr,y_val = split(train_path,nrows=600000)# 

In [23]:
# trainset = CustomDataset(x_tr,y_tr)
# dataloader_tr2save = DataLoader(trainset,batch_size=64,collate_fn=collate)
# torch.save(dataloader_tr2save,'dataloader_tr.pth')

In [24]:
# valset = CustomDataset(x_val,y_val)
# dataloader_val2save = DataLoader(valset,batch_size=64,collate_fn=collate)
# torch.save(dataloader_val2save,'dataloader_val.pth')

In [25]:
# Restore the previously saved train / validation DataLoader objects from disk.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load files you created yourself.
# NOTE(review): confirm that 'dataloader_val.pth' is the file name the validation loader was saved under.
dataloader_tr = torch.load('dataloader_tr.pth')
dataloader_val = torch.load('dataloader_val.pth')

In [26]:
# test_path = 'data/AmazonReviewFull/amazon_review_full_csv/test.csv'
# x_test,y_test = load(test_path,nrows=120000)
# trainset = CustomDataset(x_test,y_test)
# dataloader_tr2save = DataLoader(trainset,batch_size=64,collate_fn=collate)
# torch.save(dataloader_tr2save,'dataloader_test.pth')

## Model

In [27]:
embed_dim =300
class Model(nn.Module):
    """
    Fully connected classifier over a flattened sequence of word vectors.

    Four linear layers with ReLU between them; the output is a row of
    log-probabilities per sample (log_softmax over dim 1).
    """
    def __init__(self,hidden1,hidden2,hidden3,out_dim,max_length=32,embed_dim=embed_dim):
        """
        takes - hidden1, hidden2, hidden3: widths of the three hidden layers
                out_dim: number of classes
                max_length: number of tokens per sample
                embed_dim: size of one word vector (defaults to the module-level embed_dim)
        """
        super().__init__()
        # attribute names are kept as fc, fc2, fc3, fc4 so that saved state_dicts still load
        self.fc = nn.Linear(max_length*embed_dim,hidden1)
        self.fc2 = nn.Linear(hidden1,hidden2)
        self.fc3 = nn.Linear(hidden2,hidden3)
        self.fc4 = nn.Linear(hidden3,out_dim)

    def forward(self,x):
        """
        takes - x: tensor with the batch on dim 0; all remaining dims are
                   flattened, so (batch, features) and (batch, tokens, embed_dim)
                   are both accepted
        returns - tensor (batch, out_dim) of log-probabilities
        """
        # a no-op for input that is already (batch, features)
        x = x.flatten(start_dim=1)
        z = F.relu(self.fc(x))
        z = F.relu(self.fc2(z))
        z = F.relu(self.fc3(z))
        z = F.log_softmax(self.fc4(z),dim=1)

        return z

        


In [30]:
from torch.optim import Adam


def _batch_accuracy(log_probs,targets):
    """Fraction of samples in the batch whose highest-scoring class equals the target."""
    # arg-max over log-probabilities selects the same class as arg-max over
    # probabilities, so no exp() is needed
    return targets.eq(log_probs.argmax(dim=1)).float().mean().item()


model = Model(1000,200,50,5)
# Model.forward ends in log_softmax, so NLLLoss is the matching criterion
criterion = nn.NLLLoss()
optimizer = Adam(model.parameters(),lr=0.00001)
epochs = 20
pr = 250  # print running metrics every `pr` batches
for epoch in range(epochs):
    # averages of every completed `pr`-batch window of this epoch
    losses_val = []
    losses_tr = []
    accs_val = []
    accs_tr = []
    # running sums for the window currently being filled
    acc_tr = 0
    acc_val = 0
    loss_tr = 0
    loss_val = 0
    # whole-epoch sums, so that the epoch summary also covers the batches that
    # come after the last complete window and never averages an empty list
    total_loss_tr = 0.0
    total_acc_tr = 0.0
    total_loss_val = 0.0
    total_acc_val = 0.0
    n_tr = 0
    n_val = 0

    print(f'running {epoch+1} out of {epochs} epochs ')
    model.train()
    for i,(x_tr,y_tr) in enumerate(dataloader_tr):
        optimizer.zero_grad()
        # flatten every sample into one feature vector; calling the module
        # (rather than .forward) keeps registered hooks working
        pred_tr = model(x_tr.view(x_tr.shape[0],-1))
        loss_t = criterion(pred_tr,y_tr)
        loss_t.backward()
        optimizer.step()

        #metrics
        batch_loss = loss_t.item()
        batch_acc = _batch_accuracy(pred_tr.detach(),y_tr)
        loss_tr += batch_loss
        acc_tr += batch_acc
        total_loss_tr += batch_loss
        total_acc_tr += batch_acc
        n_tr += 1
        if (i+1) % pr == 0 :
            print(f'\trunning train iteration {i+1}')
            print(f'\t\ttrain loss : {loss_tr/pr:.3f}\t accuracy : {acc_tr/pr:.3f}')
            losses_tr.append(loss_tr/pr)
            accs_tr.append(acc_tr/pr)
            loss_tr = 0
            acc_tr = 0
    # max(..., 1) keeps an empty loader from dividing by zero
    print(f'train loss : {total_loss_tr/max(n_tr,1)}\t accuracy : {total_acc_tr/max(n_tr,1)}')


    model.eval()
    with torch.no_grad():
        for j,(x_val,y_val) in enumerate(dataloader_val):
            pred_val = model(x_val.view(x_val.shape[0],-1))
            loss_v = criterion(pred_val,y_val)
            #metrics
            batch_loss = loss_v.item()
            batch_acc = _batch_accuracy(pred_val,y_val)
            loss_val += batch_loss
            acc_val += batch_acc
            total_loss_val += batch_loss
            total_acc_val += batch_acc
            n_val += 1
            if (j+1) % pr == 0 :
                print(f'\trunning validation iteration {j+1}')
                print(f'\t\tvalidation loss : {loss_val/pr:.3f}\t accuracy : {acc_val/pr:.3f}')
                losses_val.append(loss_val/pr)
                accs_val.append(acc_val/pr)
                loss_val = 0
                acc_val = 0
    # these are validation metrics, so they are labelled as such
    print(f'** validation loss : {total_loss_val/max(n_val,1):.3f}\t accuracy : {total_acc_val/max(n_val,1):.3f} **')
    #saving: every second epoch, and always after the final epoch so that the
    #last trained weights are never lost
    if epoch%2==0 or epoch==epochs-1:
        torch.save(model.state_dict(),'model_semanticer.pth')


running 1 out of 20 epochs 
	running train iteration 250
		train loss : 1.607	 accuracy : 0.231
	running train iteration 500
		train loss : 1.588	 accuracy : 0.268
	running train iteration 750
		train loss : 1.551	 accuracy : 0.312
	running train iteration 1000
		train loss : 1.508	 accuracy : 0.339
	running train iteration 1250
		train loss : 1.479	 accuracy : 0.350
	running train iteration 1500
		train loss : 1.452	 accuracy : 0.366
	running train iteration 1750
		train loss : 1.432	 accuracy : 0.366
	running train iteration 2000
		train loss : 1.414	 accuracy : 0.379
	running train iteration 2250
		train loss : 1.412	 accuracy : 0.378
	running train iteration 2500
		train loss : 1.404	 accuracy : 0.379
	running train iteration 2750
		train loss : 1.397	 accuracy : 0.384
	running train iteration 3000
		train loss : 1.384	 accuracy : 0.390
	running train iteration 3250
		train loss : 1.382	 accuracy : 0.394
	running train iteration 3500
		train loss : 1.380	 accuracy : 0.392
	running 

### testing