In [None]:
import torch
import torch.nn as nn
import os
import pandas as pd
import numpy as np
from torchtext.legacy.data import BucketIterator,TabularDataset,Field,LabelField
import spacy
import nltk
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
import random

In [None]:
# Load the raw review CSV into pandas purely for eyeballing; the torchtext
# pipeline further down reads the same file on its own.
data = pd.read_csv(
    "/content/drive/MyDrive/dataset/RNN_E-Commerce_review/RNN_Dataset.csv"
)
data.head()

Unnamed: 0,text,label
0,absolutely wonderful silky and sexy and comfor...,1
1,love this dress it is sooo pretty i happened t...,1
2,i had such high hopes for this dress and reall...,0
3,i love love love this jumpsuit it is fun flirt...,1
4,this shirt is very flattering to all due to th...,1


In [None]:
# Build torchtext's "basic_english" tokenizer.
# NOTE(review): `tokenizer` is not referenced anywhere else in the visible
# notebook; the TEXT field is configured with the spaCy tokenizer instead.
tokenizer=get_tokenizer("basic_english")

In [None]:
# Review text column: tokenised with spaCy's small English pipeline.
TEXT = Field(tokenize="spacy", tokenizer_language="en_core_web_sm")

# Target column: emitted as float tensors, which is what the
# BCEWithLogitsLoss criterion used later in this notebook consumes.
LABEL = LabelField(dtype=torch.float)

In [None]:
# (attribute name, field) pairs handed to TabularDataset via `fields=FIELDS`.
FIELDS = [("text",TEXT),("label",LABEL)]

In [None]:
# Parse the CSV through the torchtext fields declared in FIELDS;
# skip_header=True drops the column-name row.
training_data = TabularDataset(
    path="/content/drive/MyDrive/dataset/RNN_E-Commerce_review/RNN_Dataset.csv",
    format="csv",
    fields=FIELDS,
    skip_header=True,
)
# Peek at one parsed example: a token list plus the raw label string.
print(vars(training_data[3]))

{'text': ['i', 'love', 'love', 'love', 'this', 'jumpsuit', 'it', 'is', 'fun', 'flirty', 'and', 'fabulous', 'every', 'time', 'i', 'wear', 'it', 'i', 'get', 'nothing', 'but', 'great', 'compliments'], 'label': '1'}


In [None]:
# 70/30 train/validation split with a fixed seed.
# random.seed() returns None, so the earlier form
# `random_state=random.seed(2020)` actually passed random_state=None and was
# reproducible only through the side effect of seeding inside the argument
# list. Seed explicitly and hand split() the resulting RNG state instead.
random.seed(2020)
train_data, valid_data = training_data.split(
    split_ratio=0.7,
    random_state=random.getstate(),
)
print(len(train_data), len(valid_data))

15849 6792


In [None]:
# Report how many examples ended up in each split.
# NOTE: the line captioned "Testing" reports `valid_data`, the held-out
# validation split; no separate test set is created in the visible notebook.
print("Full dataset size ",len(training_data))
print("Training dataset size ",len(train_data))
print("Testing dataset size ",len(valid_data))

Full dataset size  22641
Training dataset size  15849
Testing dataset size  6792


In [None]:
# Upper bound on the number of distinct tokens kept in the TEXT vocabulary.
MAX_VOCAB_SIZE = 25_000

# Build both vocabularies from the training split only, so tokens that occur
# solely in the validation split are treated as unknown at evaluation time
# (building on the full dataset leaks validation data into the vocabulary).
# MAX_VOCAB_SIZE was previously defined but never applied; pass it as max_size.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [None]:
# Vocabulary sizes; the recorded TEXT vocabulary lists the '<unk>' and '<pad>'
# entries first, so they are part of the TEXT count.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 14070
Unique tokens in LABEL vocabulary: 2


In [None]:
# The 20 most frequent tokens with their raw counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 76167), ('i', 67219), ('it', 49287), ('and', 49009), ('a', 43018), ('is', 38324), ('this', 25762), ('to', 24637), ('in', 20753), ('not', 18281), ('but', 16556), ('on', 15328), ('for', 14000), ('of', 13476), ('was', 12938), ('with', 12803), ('so', 12023), ('am', 11760), ('my', 11028), ('dress', 10567)]


In [None]:
# First ten index -> token entries of the TEXT vocabulary.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'i', 'it', 'and', 'a', 'is', 'this', 'to']


In [None]:
# Label string -> index mapping used as the training target.
# NOTE(review): the recorded output maps '1' -> 0 and '0' -> 1, i.e. the
# numeric target is the inverse of the CSV label. Anything that interprets the
# model's sigmoid output must account for this; confirm it is intended.
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1})


In [None]:
BATCH_SIZE = 64

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bucket examples of similar length together to limit padding per batch.
# `device` must be passed explicitly: without it the iterators create batches
# on the CPU, which mismatches a model that is later moved to `device`.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    batch_size=BATCH_SIZE,
    device=device)

In [None]:
# Pull one training batch and inspect the text tensor's shape; the recorded
# output is [109, 64], where 64 matches BATCH_SIZE.
text,_=next(iter(train_iterator))
text.shape

torch.Size([109, 64])

In [None]:
# Same check for one validation batch; the recorded output is [9, 64].
text2,_=next(iter(valid_iterator))
text2.shape

torch.Size([9, 64])

In [None]:
class RNN(nn.Module):
  """Embedding -> dropout -> single-layer nn.RNN -> linear head.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the recurrent hidden state.
    output_size: number of output units produced by the linear head.
  """

  def __init__(self,vocab_size,embedding_dim,hidden_dim,output_size):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    # Size the head from the constructor argument. The earlier code read a
    # notebook-level global `output_dim` here, so the class only worked after
    # an unrelated cell had run and silently ignored `output_size`.
    self.fc = nn.Linear(hidden_dim, output_size)
    self.dropout = nn.Dropout(p=0.3)

  def forward(self,text):
    """Map token indices to logits.

    Args:
      text: LongTensor of token indices, [sentence_length, batch_size].

    Returns:
      Tensor of shape [batch_size, output_size] holding raw logits.
    """
    # text = [sentence_length, batch_size]
    embedded = self.dropout(self.embedding(text))
    # embedded = [sentence_length, batch_size, embedding_dim]
    output, hidden = self.rnn(embedded)
    # Sanity check: for this one-layer, one-direction RNN the last time step of
    # `output` is the final hidden state.
    assert torch.equal(output[-1,:,:], hidden.squeeze(0))
    return self.fc(hidden.squeeze(0))


In [None]:
# Model hyperparameters.
vocab_size = len(TEXT.vocab)  # one embedding row per vocabulary entry
embedding_dim = 100
hidden_dim = 150
output_dim = 1  # a single logit per review (binary target)

In [None]:

# Instantiate the classifier with the hyperparameters defined above.
model=RNN(vocab_size,embedding_dim,hidden_dim,output_dim)

In [None]:
# for text,label in train_iterator:
#   print(model(text),"label ",label)
#   break

In [None]:
# Scratch: a standalone embedding table (1000 rows, 128 dims) used only to
# illustrate what nn.Embedding returns; it is not part of the model.
emb=nn.Embedding(1000,128)

In [None]:
# Look up the embedding rows for indices 3 and 4.
emb(torch.LongTensor([3,4]))

tensor([[ 1.5901,  1.0236, -0.0419,  0.0341, -0.8468, -1.1274, -1.7975, -0.0831,
         -0.4116,  0.0686,  0.4389,  0.3112,  0.1420, -0.1316,  1.7200, -0.2779,
         -0.8220, -0.9431,  0.8283,  0.4614, -0.1204,  1.0759, -0.7698, -0.9323,
         -0.6768,  0.8141,  0.2555, -1.0896,  0.9790,  0.6021, -1.2100,  0.2789,
          0.7198, -1.0660,  0.3831,  0.6329,  1.1118, -0.4526, -0.7986, -1.0991,
         -1.0717,  0.2905,  0.6722,  2.0561, -0.4112, -0.1748,  1.6049, -1.8833,
          0.8247,  1.1390,  0.3200, -0.2319, -0.6911, -0.2879, -0.2936, -0.5025,
          0.5543,  0.1409, -1.8505,  0.3478,  1.6374, -0.3293, -0.6548, -0.8412,
         -0.3300,  2.0884,  0.1128,  1.6142, -0.1688,  0.1149,  1.0879,  1.0631,
          0.2627,  0.3531, -1.5252,  0.7875,  0.9610,  0.3846, -0.7143,  0.1425,
         -0.0856,  0.3615,  0.2744,  0.5631, -0.3674, -1.8275, -0.3024, -1.6035,
          1.0152, -1.5461,  1.3092, -0.4289, -3.1407, -0.3735, -1.2941,  0.6659,
         -1.0245,  0.3620,  

In [None]:
def count_parameters(model):
  """Return the total element count of all parameters that require gradients."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

In [None]:
# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,444,951 trainable parameters


In [None]:
# Scratch: shows that the literal 1e-2 (the SGD learning rate used below) is 0.01.
1e-2

0.01

In [None]:
import torch.optim as optim

# Plain SGD over all model parameters with learning rate 0.01.
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [None]:
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss),
# so the model's forward pass returns unnormalised scores.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 8 of 10 right -> 0.8).

    `preds` are raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # Cast the boolean matches to float so the sum can be divided.
    matches = torch.eq(hard_labels, y).float()
    return matches.sum() / len(matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `.text` and `.label`. Returns a tuple
    (mean loss per batch, mean accuracy per batch).
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model emits one value per example in dim 1; drop that dim so the
        # logits line up with batch.label.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Each batch must expose `.text` and `.label`. Returns a tuple
    (mean loss per batch, mean accuracy per batch).
    """
    model.eval()

    loss_total, acc_total = 0, 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
N_EPOCHS = 20
# Directory on the mounted drive where the best checkpoint is written.
dirs="/content/drive/MyDrive/dataset/RNN_E-Commerce_review/"
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(dirs, 'tut1-model2.pt'))

    # Label every report with its epoch number so the log lines can be told
    # apart (the previous header was commented out because it referenced
    # timing variables that were never computed).
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.484 | Train Acc: 81.16%
	 Val. Loss: 0.604 |  Val. Acc: 63.99%
	Train Loss: 0.479 | Train Acc: 81.37%
	 Val. Loss: 0.596 |  Val. Acc: 66.41%
	Train Loss: 0.478 | Train Acc: 81.50%
	 Val. Loss: 0.587 |  Val. Acc: 69.16%
	Train Loss: 0.478 | Train Acc: 81.52%
	 Val. Loss: 0.585 |  Val. Acc: 71.58%
	Train Loss: 0.477 | Train Acc: 81.58%
	 Val. Loss: 0.576 |  Val. Acc: 74.24%
	Train Loss: 0.477 | Train Acc: 81.73%
	 Val. Loss: 0.567 |  Val. Acc: 76.46%
	Train Loss: 0.477 | Train Acc: 81.69%
	 Val. Loss: 0.563 |  Val. Acc: 77.77%
	Train Loss: 0.476 | Train Acc: 81.92%
	 Val. Loss: 0.557 |  Val. Acc: 78.94%
	Train Loss: 0.475 | Train Acc: 81.96%
	 Val. Loss: 0.553 |  Val. Acc: 79.78%
	Train Loss: 0.474 | Train Acc: 81.96%
	 Val. Loss: 0.548 |  Val. Acc: 80.30%
	Train Loss: 0.474 | Train Acc: 81.97%
	 Val. Loss: 0.544 |  Val. Acc: 80.56%
	Train Loss: 0.474 | Train Acc: 82.02%
	 Val. Loss: 0.541 |  Val. Acc: 80.87%
	Train Loss: 0.475 | Train Acc: 82.00%
	 Val. Loss: 0.537 |  Val

In [None]:
# spaCy is already imported in the first cell; the import is repeated here.
import spacy
# Load the same small English pipeline that the TEXT field names as its
# tokenizer language, so inference-time tokenisation uses the same model.
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    """Return the sigmoid of the model's logit for a single raw sentence.

    Args:
        model: trained classifier taking a [sentence_length, 1] LongTensor.
        sentence: raw text; tokenised with the module-level spaCy `nlp`.

    Returns:
        float in [0, 1].

    NOTE(review): the recorded LABEL.vocab.stoi maps '1' -> 0 and '0' -> 1, so
    this value is presumably the probability of the CSV label '0'; confirm
    before reading it as "probability of positive sentiment".
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Add a batch dimension of 1: [sentence_length] -> [sentence_length, 1].
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip autograd bookkeeping, as evaluate() does.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Score the second review of the DataFrame (row 1, CSV label 1).
# NOTE(review): interpret the value using the LABEL.vocab.stoi mapping
# printed earlier, which inverts the CSV labels.
predict_sentiment(model,data.text[1])

0.4321616291999817

In [None]:
# Grab one review as a (text, label) tuple for manual inspection.
a=data.text[22634],data.label[22634]

In [None]:
# Display the (text, label) tuple selected above.
a

('i was surprised at the positive reviews for this product its terrible it cuts you in a weird place to make you look wide the skirt is also not like the picture its darker and heavier the material is not great i had to return ',
 0)

In [None]:
# Show the last ten rows of the raw DataFrame.
data.tail(10)

Unnamed: 0,text,label
22631,cute dress but not for me the waist is too hig...,1
22632,these bottoms are very cute but defiantly chee...,1
22633,i am so impressed with the beautiful color com...,1
22634,i was surprised at the positive reviews for th...,0
22635,so i was not sure about ordering this skirt be...,1
22636,i was very happy to snag this dress at such a ...,1
22637,it reminds me of maternity clothes soft stretc...,1
22638,this fit well but the top was very see through...,0
22639,i bought this dress for a wedding i have this ...,1
22640,this dress in a lovely platinum is feminine an...,1
