# Web Scraping of Amazon Product Reviews for Television

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm_notebook

In [None]:
# Output columns, in the order they appear in the returned dataframe.
_REVIEW_COLUMNS = ('Name', 'Ratings', 'Header', 'Date', 'Product_Details', 'Review', 'Helpful')


def _text_or_none(getter):
  '''Return getter(), or None when the tag or text it looks for is missing from the card.'''
  try:
    return getter()
  except (AttributeError, IndexError, ValueError):
    # AttributeError : find()/findChild() returned None, so there is no `.text` to read.
    # IndexError / ValueError : the rating text was empty or did not start with a digit.
    # Anything else is unexpected and is deliberately allowed to propagate.
    return None


def _parse_review_card(card):
  '''Pull the seven review fields out of one review card; fields that are absent become None.'''
  return {
      'Name': _text_or_none(lambda: card.find('span', class_='a-profile-name').text),
      # only the first character of the rating text is converted (presumably the star count)
      'Ratings': _text_or_none(lambda: int(card.find('span', class_="a-icon-alt").text[0])),
      'Header': _text_or_none(lambda: card.find('a', class_="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold").findChild().text),
      # the first 24 characters are dropped (presumably the fixed lead-in that precedes the date)
      'Date': _text_or_none(lambda: card.find('span', class_="a-size-base a-color-secondary review-date").text[24:]),
      'Product_Details': _text_or_none(lambda: card.find('a', attrs={'class':"a-size-mini a-link-normal a-color-secondary", 'data-hook':'format-strip'}).text),
      'Review': _text_or_none(lambda: card.find('span', attrs={'class':"a-size-base review-text review-text-content", 'data-hook':'review-body'}).findChild('span').text),
      'Helpful': _text_or_none(lambda: card.find('span', class_="a-size-base a-color-tertiary cr-vote-text").text),
  }


def scrapeAmazonReviews(url, no_of_pages = 100, max_retries = 5, retry_delay = 1.0, timeout = 30):
  '''Scrape the review pages of an amazon.in product into a dataframe.

     Review URL example : 'https://www.amazon.in/Fire-TV-Stick-Alexa-Voice-Remote-3rd-Gen/product-reviews/B08R6QR863/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'

     Parameters
     ----------
     url : "all reviews" URL of the product. It must start with 'https://www.amazon.in/' and end with
           'reviewerType=all_reviews'; '&pageNumber=<n>' is appended to it for every page requested.
     no_of_pages : maximum number of review pages to request (by default = 100).
     max_retries : how many extra requests are made for a page that does not answer with HTTP 200
                   (by default = 5).
     retry_delay : base pause in seconds between attempts; the pause grows linearly with the attempt
                   number (by default = 1.0).
     timeout : timeout in seconds handed to every HTTP request (by default = 30).

     Returns
     -------
     A dataframe with the columns Name, Ratings, Header, Date, Product_Details, Review and Helpful,
     one row per review card. A field that is missing from a card is stored as None.
     Scraping stops early when a page has no review cards, or when a page still does not return
     HTTP 200 after all retries; the reviews collected up to that point are returned.

     Raises
     ------
     AssertionError : the URL is not an amazon.in all-reviews URL, or a page answered with HTTP 404.
     Exceptions raised by the HTTP request itself (connection errors, timeouts) are not caught.'''

  import time  # function-scope import so the notebook's import cell does not have to change

  # only amazon page (raised explicitly so the check also runs under `python -O`)
  if not url.startswith('https://www.amazon.in/'):
    raise AssertionError("Only Amazon pages can be scraped using this.")

  # only review pages
  if not url.endswith('reviewerType=all_reviews'):
    raise AssertionError("Not the right page. Please use the all reviews page as the URL.")

  # always make at least one request per page, even if max_retries is zero or negative
  attempts_per_page = max(1, max_retries + 1)
  records = []

  # scraping through pages
  for page_number in tqdm_notebook(range(1, no_of_pages + 1), desc = 'Scraping : '):
    page_url = f"{url}&pageNumber={page_number}"

    # bounded retry with a growing pause instead of hammering the server forever
    for attempt in range(attempts_per_page):
      if attempt:
        time.sleep(retry_delay * attempt)
      page = requests.get(page_url, timeout = timeout)

      # if page not found 404 error
      if page.status_code == 404:
        raise AssertionError("Error 404 : Page Not Found.")
      if page.status_code == 200:
        break
    else:
      # every attempt failed : keep what has been scraped so far and stop
      print(f'Giving up on page {page_number} after {attempts_per_page} attempts (last HTTP status {page.status_code}).')
      break

    scrape = BeautifulSoup(page.content, "html.parser")

    # get all the review cards in a page
    cards = scrape.find_all('div', class_='a-section review aok-relative')
    # if no cards present in the page, end scraping
    if not cards:
      print('Reached end of reviews.')
      break

    records.extend(_parse_review_card(card) for card in cards)

  # one list per column, so the dataframe is built exactly once
  reviews = {column: [record[column] for record in records] for column in _REVIEW_COLUMNS}

  return pd.DataFrame(reviews)

In [None]:
# The scraper appends '&pageNumber=<n>' itself and only accepts URLs that end with
# 'reviewerType=all_reviews', so the URL must not carry its own pageNumber parameter.
url ='https://www.amazon.in/Mi-Driver-Earphones-Tangle-Free-Assistant/product-reviews/B084J4MKZG/ref=cm_cr_arp_d_paging_btm_next_101?ie=UTF8&reviewerType=all_reviews'

In [None]:
# Request up to 1000 review pages; scraping ends earlier once a page has no review cards.
reviews_df = scrapeAmazonReviews(url, no_of_pages = 1000)
reviews_df

Scraping :   0%|          | 0/1000 [00:00<?, ?it/s]

Reached end of reviews.


Unnamed: 0,Name,Ratings,Header,Date,Product_Details,Review,Helpful
0,Suraj Chand,4,"Mi Smart Tv 32"" :- 7/10 average.",30 August 2022,Pattern name: 2K Android SeriesSize name: 32 i...,,198 people found this helpful
1,mazharkhan07,4,Great experience with above expectations,4 October 2022,Pattern name: 4K Android SeriesSize name: 50 i...,,28 people found this helpful
2,Garima,4,SO FAR GOOD TV IN PRICE RANGE,2 September 2022,Pattern name: 4K Android SeriesSize name: 50 i...,I am writing this after using for it for more ...,63 people found this helpful
3,Baru g.,4,Ok good..,10 October 2022,Pattern name: 2K Android SeriesSize name: 32 i...,ముఖ్యంగా పిక్చర్ కలర్ అనుకున్నంత లేదు. పోనీ సె...,10 people found this helpful
4,shashikanth,4,Review after using for 2 months,23 September 2022,Pattern name: 4K Android SeriesSize name: 50 i...,Display is simply extraordinary at this price ...,26 people found this helpful
...,...,...,...,...,...,...,...
4995,Prasanna V.,1,Review after one month,15 October 2021,Pattern name: 4K Android SeriesSize name: 50 i...,"Hi, I am writing this review after one month o...",
4996,Anil Dhankhar,1,Need refund,6 May 2021,Pattern name: 4K Android SeriesSize name: 50 i...,"Being a prime user of amazon, didn't like tv ...",2 people found this helpful
4997,Amit,1,Wifi properly not working,26 March 2022,Pattern name: 2K Android SeriesSize name: 32 i...,Wifi not working,One person found this helpful
4998,Irfan,1,Slow operating,28 February 2022,Pattern name: 2K Android SeriesSize name: 32 i...,Dela in operating and also struck alot.,2 people found this helpful


In [None]:
# Persist the scraped reviews as 'Reviews.csv' in the working directory, leaving out the row index.
reviews_df.to_csv(path_or_buf='Reviews.csv', index=False)