# Scrap Amazon reviews

In [None]:
import pandas as pd
import numpy as np

Next two functions loads the reviews from amazon website for the given product asin and number of pages.

In [None]:
!pip install requests_html --quiet
from requests_html import HTMLSession

class Reviews:
  def __init__(self, asin):
    self.asin = asin
    self.session = HTMLSession()
    self.headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    self.url = f'https://www.amazon.co.uk/product-reviews/{self.asin}/ref=cm_cr_arp_d_viewopt_srt?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber='
  def pagination(self, page):
    r = self.session.get(self.url + str(int(page)), headers=self.headers)
    if not r.html.find('div[data-hook=review]'):
      return False 
    else:
      return r.html.find('div[data-hook=review]')
  def parse(self, reviews):
    total = []
    if reviews:
      for review in reviews:
        rating = review.find('i[data-hook=review-star-rating] span', first=True).text
        title = review.find('a[data-hook=review-title]', first=True).text
        body = review.find('span[data-hook=review-body] span', first=True).text.replace('\n','').strip()

        data= { 'title': title, 'rating' : float(rating[:3]), 'body': body }
        total.append(data)
    return total


In [None]:
import time
n=100
asin = 'B07L5GDTYY'

def get_reviews(n, asin):
  """
  Input
    n - number of pages
    asin - product id
  Output
    reviews from first n pages
  """
  reviews=[]
  amz=Reviews(asin)
  for x in range(1, n):
    time.sleep(0.15)
    page=amz.pagination(x)
    current = amz.parse(page)
    if current != []:
      for review in current:
        reviews.append(review)
  reviews=pd.DataFrame(reviews)
  return reviews

In [None]:
data = get_reviews(n, asin)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   990 non-null    object 
 1   rating  990 non-null    float64
 2   body    990 non-null    object 
dtypes: float64(1), object(2)
memory usage: 23.3+ KB


In [None]:
data['text'] = data['title'] + ' ' + data['body'] 
data.head()

Unnamed: 0,title,rating,body,text
0,Page turning buttons are a reel asset.,5.0,"Built in light is brilliant. I brought two, on...",Page turning buttons are a reel asset. Built i...
1,Outstanding,5.0,One of the best devices around. Perfect for tr...,Outstanding One of the best devices around. Pe...
2,"If you like books, buy a book, if you like rea...",5.0,The larger screen and the buttons make all the...,"If you like books, buy a book, if you like rea..."
3,Replaced my older kindle,5.0,This is a great kindle and I love the buttons ...,Replaced my older kindle This is a great kindl...
4,Perfect. Does everything I hoped it would.,5.0,I bought a Kindle years ago but it was the Fir...,Perfect. Does everything I hoped it would. I b...


# Pre-Trained Transformer: bert-base-multilingual-uncased-sentiment
This a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish and Italian. It predicts the sentiment of the review as a number of stars (between 1 and 5).

This model is intended for direct use as a sentiment analysis model for product reviews in any of the six languages above, or for further finetuning on related sentiment analysis tasks.

https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment?text=I+like+you.+I+love+you


In [None]:
!pip install transformers --quiet

In [119]:
# import modules
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [None]:
def sentiment_score(review):
    """ Tokenize review and predict sentiment class """
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

def sentiment_confidence(review):
    """ Tokenize review and return sentiment class probability for the most likely class """
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    logits=result.logits.detach().numpy()
    logits=np.exp(logits)/np.sum(np.exp(logits))
    ind=int(np.argmax(logits))
    return logits[0][ind]

In [None]:
data['sentiment'] = data['text'].apply(lambda x: sentiment_score(x[:512]))
data['sent_confidence'] = data['text'].apply(lambda x: sentiment_confidence(x[:512]))

In [None]:
data[['title','rating','sentiment','sent_confidence']].sort_values(by = 'sent_confidence', ascending = False).head(20)

Unnamed: 0,title,rating,sentiment,sent_confidence
471,Short battery life,3.0,3,0.984622
843,My favourite purchase of the year,5.0,5,0.981894
741,Best kindle ever,5.0,5,0.975159
150,Best kindle ever 😃,5.0,5,0.970501
442,This kindle is crap,4.0,1,0.968494
525,Best kindle ever. In love.,5.0,5,0.968025
833,Definitely recommended. Brilliant!,5.0,5,0.967772
375,Battery life,4.0,4,0.966709
5,Horrible most expensive screw up I ever mad,1.0,1,0.966579
304,Brilliant,5.0,5,0.966486


In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(data.rating.values, data.sentiment.values))

[[ 86  24   1   0   2]
 [ 20  38   9   3   0]
 [ 13  52  41   7   0]
 [  4  14  34  57   9]
 [  5  10  21 125 415]]


In [None]:
acc_exact = sum(data['rating'] == data['sentiment'])/len(data)
acc_by_1 = sum(np.abs(data['rating'] - data['sentiment']) <= 1)/len(data)
print(f'Accuracy (exact)  is {round(acc_exact, 2)}')
print(f'Accuracy (off-by-1) is {round(acc_by_1, 2)}')

Accuracy  is 0.64
Accuracy (off-by-1) is 0.93


$\bullet$ Accuracy (exact) is the exact match on the number of stars.

$\bullet$ Accuracy (off-by-1) is the percentage of reviews where the number of stars the model predicts differs by a maximum of 1 from the number given by the human reviewer.

In [120]:
# personal review
def predict_my_rating():
  review = input('Insert your review : ')
  print(f"Your rating for the product is {sentiment_score(review)}")

predict_my_rating()

Insert your review : Give me my money back!
Your rating for the product is 1


In [89]:
predict_my_rating()

Insert your review : I can not believe I paid money for this product :/
Your rating for the product is 1


In [96]:
predict_my_rating()

Insert your review : Been using it for 2 month. So far it is good
Your rating for the product is 4


#  Custom LSTM network

Here, we use the model we built for amazon reviews dataset (see "Sentiment analysis for Amazon reviews.ipynb"). This model predicts whether the review is positive or negative.


In [101]:
# first drop neutral reviews 
# binary classification: 1,2 rankings are considered as a positive case
# load model and tokenizer from file

from copy import deepcopy
df = deepcopy(data[data.rating != 3][['text','rating']])
df['rating'] = (df['rating']<3).astype('int')

texts = df['text'].values
labels = df['rating'].values

texts[0]

"Page turning buttons are a reel asset. Built in light is brilliant. I brought two, one for myself and one for a Birthday present. Mine is fine as I have had kindles before but the present is a pain as it's all in dollars not stering, tryed lots of things to change it, and can't order any books, not happy as I brought it as a present, its embarrassing. I will have to get in touch with Amazon."

In [102]:
# normalize texts
import re

NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
texts = normalize_texts(texts)

In [103]:
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

#load tokenizer and the model
Tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
model = pickle.load(open('model.pkl','rb'))

# tokenize and padd the sequences
max_length = 241
texts = Tokenizer.texts_to_sequences(texts)
texts = pad_sequences(texts, maxlen=max_length)

# predict the sentiment class
preds = model.predict(texts)




In [73]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score
print('AUC score: {:0.4}'.format(roc_auc_score(labels, preds)))
print('Accuracy score: {:0.4}'.format(accuracy_score(labels, 1 * (preds > 0.85))))
print(classification_report(labels, 1 * (preds > 0.85))) # imbalanced - less positive class

AUC score: 0.9753
Accuracy score: 0.951
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       694
           1       0.85      0.92      0.89       183

    accuracy                           0.95       877
   macro avg       0.92      0.94      0.93       877
weighted avg       0.95      0.95      0.95       877



In [113]:
# predict your own review with the custom model
def predict_sentiment():
  review = input('Insert your review: ')
  review = Tokenizer.texts_to_sequences([review])
  review = pad_sequences(review, maxlen=max_length)
  print(f'Good review prob: {1-model.predict([review])[0][0]}')
  print(f'Bad review prob: {model.predict([review])[0][0]}')

In [115]:
predict_sentiment()

Insert your review: I would recommend it to a friend
Good review prob: 0.946056392043829
Bad review prob: 0.053943607956171036


In [116]:
predict_sentiment()

Insert your review: The previous model was 10 times better
Good review prob: 0.34038639068603516
Bad review prob: 0.6596136093139648
