# Amazon Reviews Scraping


In [1]:
# Import packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Header to set the requests as a browser requests
headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

In [3]:
# URL of The amazon Review page
reviews_url = 'https://www.amazon.in/Ben-Martin-Relaxed-BMW-JJ3-DARK-p4-32_Dark-Blue_32/dp/B019XSHB7O/ref=sr_1_1_sspa?crid=ZRRTER88CBQ4&dib=eyJ2IjoiMSJ9._8LkrBa0cefvZUwNaVvPDwxS-bJ84ET3FOIvmeyWNQQBHx0sHuCgoHytp1-X0iIvjv9RvxclP8Vf7NmwOn7PEnrUELuiNK8ikbOoVCJ5iqCnOUrrpNzrQP39CSJVSXhOvQrEtr-DmkofDqgRD3N2mH74h-Q8Eg8G9CUPYMXPhXa_pROtwbhxzq9NFNOhX9sIqp07mC7a_N3veM8PQsp1cp0g-4f0ZErXTmTA532MUPHBFpiPe2Y0Guc99lKmH49-qge5gveoxBptf6zT9S1LlM2nwlfot0pBqrfH6qfCA7c.pdPsu5MXSNOWS5Ly9J_77dyCcLN0CZxn2FpOD_L7omo&dib_tag=se&keywords=jeans+for+men&qid=1715680009&sprefix=jea%2Caps%2C1041&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&psc=1'

In [4]:
# Define Page No
len_page = 4

# Functions

In [5]:
# Extra Data as Html object from amazon Review page
def reviewsHtml(url, len_page):
    
    # Empty List define to store all pages html data
    soups = []
    
    # Loop for gather all 3000 reviews from 300 pages via range
    for page_no in range(1, len_page + 1):
        
        # parameter set as page no to the requests body
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }
        
        # Request make for each page
        response = requests.get(url, headers=headers)
        
        # Save Html object by using BeautifulSoup4 and lxml parser
        soup = BeautifulSoup(response.text, 'lxml')
        
        # Add single Html page data in master soups list
        soups.append(soup)
        
    return soups

In [6]:
# Grab Reviews name, description, date, stars, title from HTML
def getReviews(html_data):

    # Create Empty list to Hold all data
    data_dicts = []
    
    # Select all Reviews BOX html using css selector
    boxes = html_data.select('div[data-hook="review"]')
    
    # Iterate all Reviews BOX 
    for box in boxes:
        
        # Select Name using css selector and cleaning text using strip()
        # If Value is empty define value with 'N/A' for all.
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            # Convert date str to dd/mm/yyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        # create Dictionary with al review data 
        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description
        }

        # Add Dictionary in master empty List
        data_dicts.append(data_dict)
    
    return data_dicts

# Data Process

In [7]:
# Grab all HTML
html_datas = reviewsHtml(reviews_url, len_page)

In [8]:
# Empty List to Hold all reviews data
reviews = []

In [9]:
# Iterate all Html page 
for html_data in html_datas:
    
    # Grab review data
    review = getReviews(html_data)
    
    # add review data in reviews empty list
    reviews += review

In [10]:
# Create a dataframe with reviews Data
df_reviews = pd.DataFrame(reviews)

In [11]:
df_reviews

Unnamed: 0,Name,Stars,Title,Date,Description
0,Amazon Customer,5.0,5.0 out of 5 stars\nWhite Jeans,,"Quality, material, fit, comfort, elasticity al..."
1,Amazon Customer,4.0,4.0 out of 5 stars\nExcellent product,,Second time purchase same items. Quality need ...
2,Value for money,4.0,4.0 out of 5 stars\nQuality is good but length...,,Good quality material.Waist size is matching b...
3,Vijay Singh,3.0,"3.0 out of 5 stars\nQuality, Different product...",,The actual price is wrong as per quality. Give...
4,Debabrata Mishra,5.0,5.0 out of 5 stars\nValue for money,,Good product and fittings. Overall value for ...
5,Chandana Banerjee,2.0,2.0 out of 5 stars\nVery Very Very Cheap Quali...,,This jeans is coloured with very low quality d...
6,Prem j.,5.0,5.0 out of 5 stars\nGood looking,,Fitting perfect colours same as shown . Comfor...
7,Sateesh,4.0,4.0 out of 5 stars\nGood and Comfortable,,These pair of jeans are pretty comfortable. Co...
8,Amazon Customer,5.0,5.0 out of 5 stars\nWhite Jeans,,"Quality, material, fit, comfort, elasticity al..."
9,Amazon Customer,4.0,4.0 out of 5 stars\nExcellent product,,Second time purchase same items. Quality need ...


In [13]:
# Save data
df_reviews.to_csv('reviews.csv', index=False)

In [12]:
# import pyodbc

# # Define the connection parameters
# server = 'LAPTOP-HPNNDUV2\SQLEXPRESS'
# database = 'Amazon'

# # Define the connection string for Windows authentication
# conn_str = f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'

# try:
#     # Establish a connection
#     conn = pyodbc.connect(conn_str)

#     # Create a cursor object to execute SQL queries
#     cursor = conn.cursor()

#     # Example: Execute a query
#     cursor.execute("SELECT * FROM table1")

#     # Fetch and print results
#     rows = cursor.fetchall()
#     for row in rows:
#         print(row)

#     # Don't forget to close cursor and connection
#     cursor.close()
#     conn.close()

# except Exception as e:
#     print("Error:", e)
