# Install Required Libraries

In [408]:
%pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


# Import Necessary Modules

In [474]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import csv
import math
import numpy as np


# Define Function to Scrape Data

In [420]:


def get_data(pageNo):
    # Define header for CSV columns
    header = ["Name", "Image URL", "Author", "Rating", "Users Rated", "Price"]
    filename = "amazon_products.csv"
    
    # Check if the file already exists
    file_exists = os.path.isfile(filename)
    
    # Determine the write mode: append if exists, else write
    mode = "a" if file_exists else "w"
    
    # Prepare Request
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            " AppleWebKit/537.36 (KHTML, like Gecko)"
            " Chrome/131.0.0.0 Safari/537.36"
        )
    }
    
    # Build the URL
    url = (
        "https://www.amazon.ca/best-sellers-books-Amazon/zgbs/books/"
        f"ref=zg_bs_pg_{pageNo}_books?_encoding=UTF8&pg={pageNo}"
    )

    # Fetch the page
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find product items
    products = soup.findAll("div", attrs={"id": "gridItemRoot"})
    
    # Open the CSV file in the correct mode
    with open(filename, mode, newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        
        # If the file did not exist, write the header first
        if not file_exists:
            writer.writerow(header)
            file_exists = True

        # Initialize a set for existing data to prevent duplicates
        existing_data = set()

        # Check if the file exists and is not empty, then read existing rows
        if file_exists and os.path.getsize(filename) > 0:
            with open(filename, "r", newline="", encoding="utf-8") as read_file:
                reader = csv.reader(read_file)
                next(reader) 
                for row in reader:
                    existing_data.add((row[0], row[1]))  
        
        # Extract data for each product and write rows if not a duplicate
        for product in products:
            name = (
                product.find("div", class_="_cDEzb_p13n-sc-css-line-clamp-2_EWgCb") 
                or product.find("div", class_="_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y")                 
            )
            
            if not name:
                name = product.find("span", class_="a-size-medium")
            
            name_text = name.get_text(strip=True) if name else "No Name"

            image = product.find("img", alt=True)
            image_url = image["src"] if image else "No Image"

            author = product.find("a", attrs={"class": "a-link-child"})
            author_name = author.text.strip() if author else "No Author"

            rating = product.find("span", attrs={"class": "a-icon-alt"})
            rating_text = rating.text.strip() if rating else "No Rating"
            
            users_rated = product.find("span", attrs={"class": "a-size-small"})
            if users_rated:
                users_rated_c = users_rated.text.replace(",", "")
                if users_rated_c.isdigit():
                    users_rated_text = users_rated_c
                else:
                    users_rated_text = "No Users Rated"
            else:
                users_rated_text = "No Users Rated"

            price = product.find("span", attrs={"class": "_cDEzb_p13n-sc-price_3mJ9Z"})
            price_text = price.text.strip() if price else "No Price"

            if (name_text, image_url) not in existing_data:
                data = [
                    name_text,
                    image_url,
                    author_name,
                    rating_text,
                    users_rated_text,
                    price_text
                ]
                
                # Write the row
                writer.writerow(data)
                existing_data.add((name_text, image_url))
    
    # After writing, read the entire CSV and return the rows
    with open(filename, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        rows = list(reader)
        
    return rows


get_data(1)
time.sleep(3)
get_data(2)


[['Name', 'Image URL', 'Author', 'Rating', 'Users Rated', 'Price'],
 ['Throne of Glass Paperback Box Set',
  'https://images-na.ssl-images-amazon.com/images/I/71FT4rsHBRL._AC_UL300_SR300,200_.jpg',
  'Sarah J. Maas',
  '4.6 out of 5 stars',
  '48862',
  '$126.00'],
 ['Ansel Adams 2025 Wall Calendar: Authorized Edition: 13-Month Nature Photography Collection (Monthly Calendar)',
  'https://images-na.ssl-images-amazon.com/images/I/81uWc4cHukL._AC_UL300_SR300,200_.jpg',
  'No Author',
  '4.9 out of 5 stars',
  'No Users Rated',
  '$17.48'],
 ['Nexus: A Brief History of Information Networks from the Stone Age to AI',
  'https://images-na.ssl-images-amazon.com/images/I/71KlKhOycHL._AC_UL300_SR300,200_.jpg',
  'Yuval Noah Harari',
  '4.5 out of 5 stars',
  '3421',
  '$25.99'],
 ['The Muscle Ladder: Get Jacked Using Science',
  'https://images-na.ssl-images-amazon.com/images/I/81HNjepZ1OL._AC_UL300_SR300,200_.jpg',
  'Jeff Nippard',
  '4.7 out of 5 stars',
  '102',
  '$54.60'],
 ['Fourth Wing

In [426]:

df = pd.read_csv("amazon_products.csv")


df.head(61)

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
0,Throne of Glass Paperback Box Set,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.6 out of 5 stars,48862,$126.00
1,Ansel Adams 2025 Wall Calendar: Authorized Edi...,https://images-na.ssl-images-amazon.com/images...,No Author,4.9 out of 5 stars,No Users Rated,$17.48
2,Nexus: A Brief History of Information Networks...,https://images-na.ssl-images-amazon.com/images...,Yuval Noah Harari,4.5 out of 5 stars,3421,$25.99
3,The Muscle Ladder: Get Jacked Using Science,https://images-na.ssl-images-amazon.com/images...,Jeff Nippard,4.7 out of 5 stars,102,$54.60
4,Fourth Wing,https://images-na.ssl-images-amazon.com/images...,Rebecca Yarros,4.8 out of 5 stars,293212,$22.39
5,Cozy Spaces: Coloring Book for Adults and Teen...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.7 out of 5 stars,1140,$10.82
6,The Living by the Let Them Go Principle: The G...,https://images-na.ssl-images-amazon.com/images...,No Author,No Rating,No Users Rated,$23.00
7,Ocean Scene: Coloring Book for Adults and Kids...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.8 out of 5 stars,209,$11.08
8,Stop Overthinking: 23 Techniques to Relieve St...,https://images-na.ssl-images-amazon.com/images...,Nick Trenton,4.3 out of 5 stars,11746,$14.92
9,The Lost Bookshop: The most charming and uplif...,https://images-na.ssl-images-amazon.com/images...,Evie Woods,4.3 out of 5 stars,118957,$18.66


In [428]:
df.shape

(60, 6)

In [430]:
df.head(61)

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
0,Throne of Glass Paperback Box Set,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.6 out of 5 stars,48862,$126.00
1,Ansel Adams 2025 Wall Calendar: Authorized Edi...,https://images-na.ssl-images-amazon.com/images...,No Author,4.9 out of 5 stars,No Users Rated,$17.48
2,Nexus: A Brief History of Information Networks...,https://images-na.ssl-images-amazon.com/images...,Yuval Noah Harari,4.5 out of 5 stars,3421,$25.99
3,The Muscle Ladder: Get Jacked Using Science,https://images-na.ssl-images-amazon.com/images...,Jeff Nippard,4.7 out of 5 stars,102,$54.60
4,Fourth Wing,https://images-na.ssl-images-amazon.com/images...,Rebecca Yarros,4.8 out of 5 stars,293212,$22.39
5,Cozy Spaces: Coloring Book for Adults and Teen...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.7 out of 5 stars,1140,$10.82
6,The Living by the Let Them Go Principle: The G...,https://images-na.ssl-images-amazon.com/images...,No Author,No Rating,No Users Rated,$23.00
7,Ocean Scene: Coloring Book for Adults and Kids...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.8 out of 5 stars,209,$11.08
8,Stop Overthinking: 23 Techniques to Relieve St...,https://images-na.ssl-images-amazon.com/images...,Nick Trenton,4.3 out of 5 stars,11746,$14.92
9,The Lost Bookshop: The most charming and uplif...,https://images-na.ssl-images-amazon.com/images...,Evie Woods,4.3 out of 5 stars,118957,$18.66


In [434]:
df['Rating'] = df['Rating'].apply(lambda x: x.split()[0])

In [440]:
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

In [442]:
df["Price"] = df["Price"].str.replace('$', '')

In [444]:
df["Price"] = df["Price"].str.replace(',', '')

In [446]:
df['Price'] = df['Price'].apply(lambda x: x.split('.')[0])

In [448]:
df['Price'] = df['Price'].astype(int)

In [464]:
df.head()

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
0,Throne of Glass Paperback Box Set,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.6,48862.0,126
1,Ansel Adams 2025 Wall Calendar: Authorized Edi...,https://images-na.ssl-images-amazon.com/images...,No Author,4.9,,17
2,Nexus: A Brief History of Information Networks...,https://images-na.ssl-images-amazon.com/images...,Yuval Noah Harari,4.5,3421.0,25
3,The Muscle Ladder: Get Jacked Using Science,https://images-na.ssl-images-amazon.com/images...,Jeff Nippard,4.7,102.0,54
4,Fourth Wing,https://images-na.ssl-images-amazon.com/images...,Rebecca Yarros,4.8,293212.0,22


In [466]:
df.dtypes

Name            object
Image URL       object
Author          object
Rating         float64
Users Rated    float64
Price            int32
dtype: object

In [476]:
df.replace(str(0), np.nan, inplace=True)
df.replace(0, np.nan, inplace=True)

In [480]:
count_nan = len(df) - df.count()

In [482]:
count_nan

Name            0
Image URL       0
Author          0
Rating          5
Users Rated    11
Price           0
dtype: int64

In [484]:
df = df.dropna()

In [486]:
data = df.sort_values(["Price"], axis=0, ascending=False)[:15]

In [488]:
data

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
0,Throne of Glass Paperback Box Set,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.6,48862.0,126
46,A Court of Thorns and Roses Paperback Box Set ...,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.8,39890.0,69
3,The Muscle Ladder: Get Jacked Using Science,https://images-na.ssl-images-amazon.com/images...,Jeff Nippard,4.7,102.0,54
20,We Who Wrestle with God: Perceptions of the Di...,https://images-na.ssl-images-amazon.com/images...,Jordan B. Peterson,4.6,632.0,33
18,Eat Like a Girl: 100+ Delicious Recipes to Bal...,https://images-na.ssl-images-amazon.com/images...,Dr. Mindy Pelz,4.5,234.0,31
30,The Let Them Theory: A Life-Changing Tool That...,https://images-na.ssl-images-amazon.com/images...,Mel Robbins,4.6,775.0,27
44,The Feel-Good Meal Plan: A Fresh Take on Meal ...,https://images-na.ssl-images-amazon.com/images...,Lindsay Pleskot,5.0,19.0,26
2,Nexus: A Brief History of Information Networks...,https://images-na.ssl-images-amazon.com/images...,Yuval Noah Harari,4.5,3421.0,25
53,The 48 Laws of Power,https://images-na.ssl-images-amazon.com/images...,Robert Greene,4.6,79564.0,23
39,The Livy Method: Weight Loss Program Guide,https://images-na.ssl-images-amazon.com/images...,Gina Livy,4.8,189.0,23


In [490]:
from bokeh.models import ColumnDataSource
from bokeh.transform import dodge
import math
from bokeh.io import curdoc
curdoc().clear()
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import Legend
output_notebook()

In [534]:
data_sorted = df.sort_values(["Price"], axis=0, ascending=False)[:15]

output_notebook()

# Create the figure
p = figure(x_range=data_sorted['Name'].tolist(), width=1000, height=700, title="Authors Highest Priced Books", toolbar_location=None, tools="")

# Create a vertical bar plot
p.vbar(x=data_sorted['Name'], top=data_sorted['Price'], width=0.9)

# Customize the plot appearance
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.major_label_orientation = math.pi/2 

# Show the plot
show(p, notebook_handle=True)

In [536]:
show(p)

In [546]:
data = df[df['Users Rated'] > 1000]

In [548]:
data = data.sort_values(['Rating'],axis=0, ascending=False)[:15]

In [550]:
data

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
4,Fourth Wing,https://images-na.ssl-images-amazon.com/images...,Rebecca Yarros,4.8,293212.0,22
31,Little Corner: Coloring Book for Adults and Te...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.8,2747.0,10
27,The Wild Robot (Volume 1),https://images-na.ssl-images-amazon.com/images...,Peter Brown,4.8,12678.0,11
21,Bluey 5-Minute Stories: 6 Stories in 1 Book? H...,https://images-na.ssl-images-amazon.com/images...,Penguin Young Readers Licenses,4.8,2194.0,10
46,A Court of Thorns and Roses Paperback Box Set ...,https://images-na.ssl-images-amazon.com/images...,Sarah J. Maas,4.8,39890.0,69
55,The Pivot Year,https://images-na.ssl-images-amazon.com/images...,Brianna Wiest,4.7,1651.0,18
37,Atomic Habits: An Easy & Proven Way to Build G...,https://images-na.ssl-images-amazon.com/images...,James Clear,4.7,172018.0,23
26,Think And Grow Rich,https://images-na.ssl-images-amazon.com/images...,Napoleon Hill,4.7,30806.0,15
36,Stress Relief: Coloring Book for Adults and Ki...,https://images-na.ssl-images-amazon.com/images...,Coco Wyo,4.7,1670.0,10
19,Rich Dad Poor Dad: What the Rich Teach Their K...,https://images-na.ssl-images-amazon.com/images...,Robert T. Kiyosaki,4.7,98890.0,12


In [572]:
filtered_data = data[data['Users Rated'] > 1000]

# Create a Bokeh figure
p = figure(x_range=filtered_data['Name'].tolist(), width=1000, height=800, title="Top Rated Books with more than 1000 Customer Ratings", toolbar_location=None, tools="")

# Plot the bars
p.vbar(x=filtered_data['Name'], top=filtered_data['Rating'], width=0.9)

# Adjust grid and axis properties
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.major_label_orientation = math.pi / 2

# Show the plot
show(p, notebook_handle=True)

In [563]:
show(p)

In [580]:
data = df.sort_values(["Users Rated"], axis=0, ascending=False)[:20]

In [582]:
data

Unnamed: 0,Name,Image URL,Author,Rating,Users Rated,Price
41,The Housemaid,https://images-na.ssl-images-amazon.com/images...,Freida McFadden,4.4,416357.0,14
17,It Ends with Us: A Novel (Volume 1),https://images-na.ssl-images-amazon.com/images...,Colleen Hoover,4.6,375563.0,16
4,Fourth Wing,https://images-na.ssl-images-amazon.com/images...,Rebecca Yarros,4.8,293212.0,22
51,Atomic Habits,https://images-na.ssl-images-amazon.com/images...,James Clear,4.7,172018.0,21
37,Atomic Habits: An Easy & Proven Way to Build G...,https://images-na.ssl-images-amazon.com/images...,James Clear,4.7,172018.0,23
25,The Housemaid Is Watching,https://images-na.ssl-images-amazon.com/images...,Freida McFadden,4.0,167117.0,20
9,The Lost Bookshop: The most charming and uplif...,https://images-na.ssl-images-amazon.com/images...,Evie Woods,4.3,118957.0,18
19,Rich Dad Poor Dad: What the Rich Teach Their K...,https://images-na.ssl-images-amazon.com/images...,Robert T. Kiyosaki,4.7,98890.0,12
42,The Psychology of Money: Timeless lessons on w...,https://images-na.ssl-images-amazon.com/images...,Morgan Housel,4.7,86012.0,17
15,My First Learn-to-Write Workbook: Practice for...,https://images-na.ssl-images-amazon.com/images...,Crystal Radke,4.7,81984.0,13


In [584]:
from bokeh.transform import factor_cmap
from bokeh.models import Legend
from bokeh.palettes import Dark2_5 as palette
import itertools
from bokeh.palettes import d3
#colors has a list of colors which can be used in plots
colors = itertools.cycle(palette)

palette = d3['Category20'][20]

In [586]:
index_cmap = factor_cmap('Author', palette=palette,
                         factors=data["Author"])

In [600]:
p = figure(width=700, height=700, title="Top Authors: Rating vs. Customers Rated")

p.scatter('Rating', 'Users Rated', source=data, fill_alpha=0.6, fill_color=index_cmap, size=20, legend_field='Author')

# Customize the plot
p.xaxis.axis_label = 'RATING'
p.yaxis.axis_label = 'Users Rated'
p.legend.location = 'top_left'

# Show the plot
output_notebook()

In [598]:
show(p)