In [15]:
# import module
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
HEADERS = {'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/90.0.4430.212 Safari/537.36'}

# Cookies need to be added from the webpage. They will be in the 'Network' tab of the 'Inspect' page. 
# The format is COOKIES = {'key1': 'value1', 'key2': 'value2' ...}
COOKIES = {}

In [3]:
user_query = 'boat+earphones'

In [4]:
# Following function fetches the html data from the sent URL and returns it.

def get_data(url:str):
    page = requests.get(url, cookies=COOKIES, headers=HEADERS)
    return page

In [5]:
# Following function fetches asin_number (A unique identification number given to each product in amazon) 
# of each of the product in the search results

def asin_number(soup):
    data_asins = []
    
    for item in soup.find_all("div", {"data-component-type": "s-search-result"}):
        data_asins.append(item['data-asin'])
    return data_asins

In [6]:
# At the bottom of the webpage of every product, there will be a link 'all_reviews'. We need to go to this page 
# to start scarping the reviews. The following function fetches the same link for all of the products in 
# the search results using the asin_numbers.

def fetch_href(soup):
    links = []
  
    for item in soup.findAll("a",{'data-hook':"see-all-reviews-link-foot"}):
        links.append(item['href'])
    
    return links[0]

In [7]:
# Following page fetches all of the reviews from the all_reviews page. 

def customer_review(soup):
    data_str = ""
  
    for item in soup.find_all("span", class_="a-size-base review-text review-text-content"):
        data_str = data_str + item.get_text()
  
    result = data_str.split("\n")
    return (result)
  

In [8]:
# Original URL
url = f"https://www.amazon.in/s?k={user_query}"

In [9]:
response=get_data(url)
soup=BeautifulSoup(response.content)

In [10]:
# Fetch all of data asin ids
data_asins = asin_number(soup)

In [11]:
# A small example to show how data sins look like
data_asins[0:4]

['B07S9S86BF', 'B097DTJRWZ', 'B071Z8M4KX', 'B07S9S86BF']

In [12]:
all_reviews = []
review_data = []

In [None]:
# Iterate through data-asin IDs. For this example i'm only going through the first and second product.
for data_asin in data_asins[0:2]:
    url = f"https://www.amazon.in/dp/{data_asin}"
    
    # Extract all_reviews link for each of the product 
    response=get_data(url)
    soup=BeautifulSoup(response.content)
    link = fetch_href(soup)
    
    # Fetch all of the reviws from the extracted all_reviews link 
    i = 0
    
    print(f"Fetching reviews from the product: {data_asin}")
    while 1:
        i += 1
        url = f"https://www.amazon.in{link}&pageNumber={i}"
        response=get_data(url)
        soup=BeautifulSoup(response.text)
        review_data = customer_review(soup)
        review_data = [review for review in review_data if len(review) > 0]
        if len(review_data) == 0:
            break
        
        all_reviews += review_data
        print(f"Total reviews scraped: {len(all_reviews)}")

In [16]:
reviews_df = pd.DataFrame({'reviews': all_reviews})

In [17]:
reviews_df.to_csv('amazon_reviews.csv', index=False)