# Amazon Product Review Scraper

## Imports

In [151]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib import parse
from numpy import random
from time import sleep

## Request URL Setup

The URL points to the first (or any subsequent) verified-purchase customer review page of an Amazon product. The review data is used to find insights into consumers' flavour preferences for a particular product, so the product is expected to be offered in several flavours, i.e. each review is expected to carry a "Flavour Name" descriptor.

A common browser user agent is passed to `requests.get` through its `headers` parameter to mitigate the risk of Amazon restricting access. This user-agent header lets the program imitate a request from a regular web browser.

In [141]:
# Request inputs: the starting review-page URL and the browser-like headers
# that are sent with every request.
url = (
    "https://www.amazon.ca/Pure-Protein-Chocolate-Deluxe-6-Count/product-reviews/B00BMHB51I/"
    "ref=cm_cr_arp_d_viewopt_rvwer?ie=UTF8&reviewerType=avp_only_reviews&pageNumber=1"
)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    "Accept-Language": "en-US, en;q=0.5",
}

In [127]:
def get_page(url, timeout=10):
    """Send a GET request to ``url`` and return the parsed HTML.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        ``requests`` waits indefinitely and a stalled connection would hang
        the whole scrape.

    Returns
    -------
    BeautifulSoup
        Soup object built from the response body with the "html.parser" backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is not
        parsed as if it were a review page.
    requests.exceptions.RequestException
        For connection problems or when ``timeout`` is exceeded.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on error statuses instead of handing an error page to the parser.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup

## Review Scraping

For each review, a dictionary is created that stores its ID, title, content, flavour purchased and a boolean stating whether the review was from a foreign country.

Output: reviews.json

In [146]:
# scrape reviews
#
# Builds `rev_dicts`, a list with one dict per review holding the keys
# 'id', 'foreign', 'title', 'flavour' and 'content'.
rev_dicts = []

# Walk the pagination with a local cursor so the starting `url` is left
# untouched; the cell can then be re-run without re-executing the setup cell.
page_url = url
while page_url:
    try:
        # random wait time between requests
        sleep(random.uniform(0, 0.5))
        soup = get_page(page_url)
        reviews = soup.find_all(name="div", attrs={"class": "a-section celwidget"})
        # find_all returns an empty result list (never None) when nothing matches
        if not reviews:
            print("Error: no reviews found")
            print(page_url)
            break

        for item in reviews:
            rev_dict = {}

            # get review id; .get() keeps a div without an id attribute on the
            # reported-error path below instead of raising KeyError
            m = re.search(r"^customer_review(_foreign)?-(.+)$", item.get("id", ""))
            if not m:
                print("Error: ID not found in customer review")
                print(item)
                break
            rev_dict['id'] = m.group(2)

            # mark if review was foreign (the optional "_foreign" group matched)
            rev_dict['foreign'] = bool(m.group(1))

            # get review title: foreign reviews keep it in a <span>, others in an <a>
            title_tag_name = "span" if rev_dict['foreign'] else "a"
            title_tag = item.find(title_tag_name, attrs={"data-hook": "review-title"})
            rev_dict['title'] = title_tag.get_text().strip()

            # get flavour name; a review without a format strip is recorded as
            # 'NA', the same as one whose strip has no "Flavour Name" entry
            strip_tag = item.find("a", attrs={"data-hook": "format-strip"})
            info_strip = strip_tag.get_text(separator=",", strip=True) if strip_tag else ""
            m = re.search(r"Flavour Name: (.+?)(,|$)", info_strip)
            rev_dict['flavour'] = m.group(1) if m else 'NA'

            # get review content, turning <br> tags into newlines first
            content = item.find("div", attrs={"class": "a-row a-spacing-small review-data"})
            for br in content.find_all("br"):
                br.replace_with("\n")

            rev_dict['content'] = content.get_text()
            rev_dicts.append(rev_dict)

        # go to next page; any missing piece of the pagination markup
        # (container, "a-last" item, link, or href) ends the crawl
        next_div = soup.find("div", attrs={"class": "a-form-actions a-spacing-top-extra-large"})
        li = next_div.find("li", attrs={"class": "a-last"}) if next_div else None
        a_tag = li.find("a") if li else None
        rel_url = a_tag.get("href") if a_tag else None
        # Without an href, urljoin would hand back the current URL and loop forever.
        page_url = parse.urljoin(page_url, rel_url) if rel_url else None
    # catching url where error occurs
    except Exception:
        print(page_url)
        raise

In [148]:
# Sanity check: report how many reviews were collected
print(f"{len(rev_dicts)}")

2089


In [152]:
# Output
with open("reviews.json","w") as f:
    json.dump(rev_dicts,f)