# Amazon Best Sellers

# Import Libraries

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns 
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'png'

In [4]:
# Project: scrape the best-selling products on amazon.sa

# Best Sellers in Electronics (Page 1)

In [5]:
header = {'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
url = "https://www.amazon.sa/-/en/gp/bestsellers/electronics/?ie=UTF8&ref_=sv_sv_elec_all_1"

In [6]:
response = requests.get(url, {'headers':header})
response.status_code

200

In [6]:
# Preview the first 1000 characters of the response body.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

# BeautifulSoup Basics

In [7]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [8]:
# Print the whole parsed document with BeautifulSoup's indented formatting.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [9]:
# Text of the second <a> element inside the first 'zg-item-immersion' list item.
soup.find_all('li', {'class':'zg-item-immersion'})[0].find_all('a')[1].text

'\n3.0 out of 5 stars\n'

In [10]:
# Create helper functions to fetch and clean the scraped data

In [11]:
def clean_num(s):
    """Keep only the numeric parts of scraped text.

    Every run of digits (optionally containing one decimal point) is
    extracted and the runs are concatenated, so labels and thousands
    separators disappear: "SAR 2,299.00" -> "2299.00".

    Parameters:
    s (String): raw text, or None

    Returns:
    String: concatenated numeric text ("" when s contains no digits);
    None is returned unchanged.

    """
    if s is None:
        return s
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` in `block`.

    Parameters:
    block: parsed element to search in; any object exposing
        find(tag, class_=...) works (here: the BeautifulSoup tag of one product)
    tag (String): tag name of the element we want to fetch
    class_text (String): class of the element we want to fetch

    Returns:
    String: text of the matching element, or "" when nothing matches

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # find() returned None (no match) or block itself is None, so there
        # is no text to read.
        return ""

In [12]:
# Test with the first electronics product to ensure the functions are working.
# soup.find returns the first matching <li>; query it directly, because
# iterating over a tag walks its child nodes rather than the tag itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    rating_text = get_data(first_item, "span", "a-icon-alt")
    reviews_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of the rating text
    print(reviews_text)
    print(clean_num(reviews_text))
    print(price_text)
    print(clean_num(price_text))


SAMSUNG Galaxy A12 Dual SIM Smartphone - 64GB, 4GB RAM, LTE, Black (KSA Version)
3.0 out of 5 stars
3.0
2
2
SAR 539.00
539.00


In [13]:
# Show the raw markup of the fourth 'zg-item-immersion' list item (index 3).
soup.find_all("li", class_ = "zg-item-immersion")[3] 

<li class="zg-item-immersion" role="gridcell"><span class="a-list-item"><div class="a-section a-spacing-none aok-relative"><div class="a-row a-spacing-none aok-inline-block"><span class="a-size-small aok-float-left zg-badge-body zg-badge-color"><span class="zg-badge-text">#4</span></span><span class="aok-float-left zg-badge-triangle zg-badge-color"></span></div><span class="aok-inline-block zg-item"><a class="a-link-normal" href="/-/en/SAMSUNG-Galaxy-M12-Dual-Smartphone/dp/B08XY5X3J7?_encoding=UTF8&amp;psc=1"><span class="zg-text-center-align"><div class="a-section a-spacing-small"><img alt="SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB, 4GB RAM, 4G LTE, Black (KSA Version)" height="200" src="https://images-na.ssl-images-amazon.com/images/I/81DCMeVrbKS._AC_UL200_SR200,200_.jpg" width="200"/></div></span>
<div aria-hidden="true" class="p13n-sc-truncate p13n-sc-line-clamp-2" data-rows="2">
            SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB, 4GB RAM, 4G LTE, Black (KSA Version)
   

In [14]:
# Build one record per 'zg-item-immersion' list item: the raw scraped text
# plus a digits-only version of the rating, review count and price.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    rating_raw = get_data(item, "span", "a-icon-alt")
    reviews_raw = get_data(item, "a", "a-size-small a-link-normal")
    price_raw = get_data(item, "span", "p13n-sc-price")
    products_list.append({
        'pro_name': get_data(item, "a", "a-link-normal").strip(),
        'pro_rate': rating_raw,
        'pro_rate_clean': clean_num(rating_raw.split(" ")[0]),
        'review_num': reviews_raw,
        'review_num_clean': clean_num(reviews_raw),
        'pro_price': price_raw,
        'pro_price_clean': clean_num(price_raw),
    })

    

In [15]:
# Display the collected list of product dictionaries.
products_list

[{'pro_name': 'SAMSUNG Galaxy A12 Dual SIM Smartphone - 64GB, 4GB RAM, LTE, Black (KSA Version)',
  'pro_rate': '3.0 out of 5 stars',
  'pro_rate_clean': '3.0',
  'review_num': '2',
  'review_num_clean': '2',
  'pro_price': 'SAR 539.00',
  'pro_price_clean': '539.00'},
 {'pro_name': 'Samsung Galaxy A12 LTE Dual SIM Smartphone - 64GB Storage, 4GB RAM, Blue (KSA Version)',
  'pro_rate': '1.0 out of 5 stars',
  'pro_rate_clean': '1.0',
  'review_num': '2',
  'review_num_clean': '2',
  'pro_price': 'SAR 539.00',
  'pro_price_clean': '539.00'},
 {'pro_name': 'Apple 20W USB-C Power Adapter, White',
  'pro_rate': '4.5 out of 5 stars',
  'pro_rate_clean': '4.5',
  'review_num': '358',
  'review_num_clean': '358',
  'pro_price': 'SAR 79.00',
  'pro_price_clean': '79.00'},
 {'pro_name': 'SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB, 4GB RAM, 4G LTE, Black (KSA Version)',
  'pro_rate': '4.0 out of 5 stars',
  'pro_rate_clean': '4.0',
  'review_num': '38',
  'review_num_clean': '38',
  'pro_price

In [16]:
# Convert the list of dicts to a DataFrame; empty strings are replaced with NaN.
products_df = pd.DataFrame(products_list).replace("",np.nan)  #convert list of dict to df
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"SAMSUNG Galaxy A12 Dual SIM Smartphone - 64GB,...",3.0 out of 5 stars,3.0,2,2,SAR 539.00,539.0
1,Samsung Galaxy A12 LTE Dual SIM Smartphone - 6...,1.0 out of 5 stars,1.0,2,2,SAR 539.00,539.0
2,"Apple 20W USB-C Power Adapter, White",4.5 out of 5 stars,4.5,358,358,SAR 79.00,79.0
3,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.0 out of 5 stars,4.0,38,38,SAR 499.00,499.0
4,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.2 out of 5 stars,4.2,22,22,SAR 499.00,499.0
5,New Apple Airpods Pro,4.3 out of 5 stars,4.3,2350,2350,SAR 749.00,749.0
6,"Anker PowerCore Select 20000, 20000mAh Power B...",4.1 out of 5 stars,4.1,54,54,SAR 94.00,94.0
7,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.5 out of 5 stars,4.5,10,10,SAR 499.00,499.0
8,Apple AirPods with Charging Case,4.1 out of 5 stars,4.1,2774,2774,SAR 475.00,475.0
9,"Stylus Pen for iPad with Palm Rejection, Activ...",3.3 out of 5 stars,3.3,5545,5545,SAR 58.96,58.96


# Best Sellers in Electronics (Page 2)

In [18]:
# Electronics best-seller list, second page (pg=2 in the query string).
url = 'https://www.amazon.sa/-/en/gp/bestsellers/electronics/ref=zg_bs_pg_2?ie=UTF8&pg=2'

In [19]:
# Bound the wait with a timeout: without one, requests can block indefinitely
# when the server never answers.
response = requests.get(url, timeout=30)
response.status_code

200

In [20]:
# Preview the first 1000 characters of the response body.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [21]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [22]:
# Print the whole parsed document with BeautifulSoup's indented formatting.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [23]:
# Text of the second <a> element inside the second 'zg-item-immersion' list item.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n3.6 out of 5 stars\n'

In [24]:
def clean_num(s):
    """Keep only the numeric parts of scraped text.

    Every run of digits (optionally containing one decimal point) is
    extracted and the runs are concatenated, so labels and thousands
    separators disappear: "SAR 2,299.00" -> "2299.00".

    Parameters:
    s (String): raw text, or None

    Returns:
    String: concatenated numeric text ("" when s contains no digits);
    None is returned unchanged.

    """
    if s is None:
        return s
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` in `block`.

    Parameters:
    block: parsed element to search in; any object exposing
        find(tag, class_=...) works (here: the BeautifulSoup tag of one product)
    tag (String): tag name of the element we want to fetch
    class_text (String): class of the element we want to fetch

    Returns:
    String: text of the matching element, or "" when nothing matches

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # find() returned None (no match) or block itself is None, so there
        # is no text to read.
        return ""

In [25]:
# Smoke-test the helpers on the first item of this page.
# soup.find returns the first matching <li>; query it directly, because
# iterating over a tag walks its child nodes rather than the tag itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    rating_text = get_data(first_item, "span", "a-icon-alt")
    reviews_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of the rating text
    print(reviews_text)
    print(clean_num(reviews_text))
    print(price_text)
    print(clean_num(price_text))

Duracell Rechargeable AA 4 2500mAh
4.2 out of 5 stars
4.2
9
9
SAR 55.22
55.22


In [52]:
# Show the raw markup of the second 'zg-item-immersion' list item (index 1).
soup.find_all("li", class_ = "zg-item-immersion")[1] 

<li class="zg-item-immersion" role="gridcell"><span class="a-list-item"><div class="a-section a-spacing-none aok-relative"><div class="a-row a-spacing-none aok-inline-block"><span class="a-size-small aok-float-left zg-badge-body zg-badge-color"><span class="zg-badge-text">#2</span></span><span class="aok-float-left zg-badge-triangle zg-badge-color"></span></div><span class="aok-inline-block zg-item"><a class="a-link-normal" href="/%D9%84%D8%B9%D8%A8%D8%A9-%D9%81%D9%8A%D9%81%D8%A7-22-PlayStation-%D8%B3%D8%B9%D9%88%D8%AF%D9%8A%D8%A9/dp/B098KVCFM8?_encoding=UTF8&amp;psc=1"><span class="zg-text-center-align"><div class="a-section a-spacing-small"><img alt="لعبة فيفا 22 - PlayStation 4 (نسخة السعودية)" height="200" src="https://images-na.ssl-images-amazon.com/images/I/81OdjI%2Bq5-L._AC_UL200_SR200,200_.jpg" width="200"/></div></span>
<div aria-hidden="true" class="p13n-sc-truncate p13n-sc-line-clamp-1" data-rows="1">
            لعبة فيفا 22 - PlayStation 4 (نسخة السعودية)
        </div>
</

In [27]:
# Build one record per 'zg-item-immersion' list item: the raw scraped text
# plus a digits-only version of the rating, review count and price.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    rating_raw = get_data(item, "span", "a-icon-alt")
    reviews_raw = get_data(item, "a", "a-size-small a-link-normal")
    price_raw = get_data(item, "span", "p13n-sc-price")
    products_list.append({
        'pro_name': get_data(item, "a", "a-link-normal").strip(),
        'pro_rate': rating_raw,
        'pro_rate_clean': clean_num(rating_raw.split(" ")[0]),
        'review_num': reviews_raw,
        'review_num_clean': clean_num(reviews_raw),
        'pro_price': price_raw,
        'pro_price_clean': clean_num(price_raw),
    })

In [28]:
# Convert the list of dicts to a DataFrame; empty strings are replaced with NaN.
products_df = pd.DataFrame(products_list).replace("",np.nan)  #convert list of dict to df
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,Duracell Rechargeable AA 4 2500mAh,4.2 out of 5 stars,4.2,9,9,SAR 55.22,55.22
1,Samsung Galaxy S21 5G | Pro Grade Triple Camer...,3.6 out of 5 stars,3.6,61,61,"SAR 2,299.00",2299.0
2,"TP-Link AC1750 Wi-Fi Range Extender - RE450, W...",4.0 out of 5 stars,4.0,5915,5915,SAR 190.00,190.0
3,"Samsung Galaxy A02 Dual SIM Smartphone - 32GB,...",3.6 out of 5 stars,3.6,14,14,SAR 469.00,469.0
4,Glorious Model D Minus - Matte White,4.6 out of 5 stars,4.6,255,255,SAR 189.00,189.0
5,"SAMSUNG Galaxy Tab A7 Lite Tablet - 32GB, 3GB ...",3.2 out of 5 stars,3.2,10,10,SAR 595.00,595.0
6,Samsung Galaxy S21 Ultra 5G Android Smartphone...,3.8 out of 5 stars,3.8,125,125,"SAR 3,499.00",3499.0
7,Apple iPhone 12 Pro Max With FaceTime (128GB) ...,4.4 out of 5 stars,4.4,548,548,"SAR 4,399.00",4399.0
8,New Apple iPhone 13 with FaceTime (256GB) - Blue,4.3 out of 5 stars,4.3,39,39,"SAR 3,766.25",3766.25
9,Apple AirPods Pro,4.3 out of 5 stars,4.3,2350,2350,SAR 785.00,785.0


# Best Sellers in Camera & Photo Products (Page 1)

In [150]:
# Camera & Photo best-seller list, first page.
url = 'https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966385031/ref=zg_bs_nav_1_electronics'

In [151]:
# Bound the wait with a timeout: without one, requests can block indefinitely
# when the server never answers.
response = requests.get(url, timeout=30)
response.status_code

200

In [152]:
# Preview the first 1000 characters of the response body.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [153]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [154]:
# Print the whole parsed document with BeautifulSoup's indented formatting.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [155]:
# Text of the second <a> element inside the second 'zg-item-immersion' list item.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.3 out of 5 stars\n'

In [156]:
def clean_num(s):
    """Keep only the numeric parts of scraped text.

    Every run of digits (optionally containing one decimal point) is
    extracted and the runs are concatenated, so labels and thousands
    separators disappear: "SAR 2,299.00" -> "2299.00".

    Parameters:
    s (String): raw text, or None

    Returns:
    String: concatenated numeric text ("" when s contains no digits);
    None is returned unchanged.

    """
    if s is None:
        return s
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` in `block`.

    Parameters:
    block: parsed element to search in; any object exposing
        find(tag, class_=...) works (here: the BeautifulSoup tag of one product)
    tag (String): tag name of the element we want to fetch
    class_text (String): class of the element we want to fetch

    Returns:
    String: text of the matching element, or "" when nothing matches

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # find() returned None (no match) or block itself is None, so there
        # is no text to read.
        return ""

In [157]:
# Smoke-test the helpers on the first item of this page.
# soup.find returns the first matching <li>; query it directly, because
# iterating over a tag walks its child nodes rather than the tag itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    rating_text = get_data(first_item, "span", "a-icon-alt")
    reviews_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of the rating text
    print(reviews_text)
    print(clean_num(reviews_text))
    print(price_text)
    print(clean_num(price_text))

EZVIZ C6N Pan/Tilt Camera FHD Indoor,1080p WiFi Smart Home Security Camera IR Night Vision Motion Detection Auto Tracking Baby/Elder/Pet Cloud Storage/SD Slot 2-Way Audio Wi-Fi 2.4G iOS Android App
4.2 out of 5 stars
4.2
1,408
1408
SAR 98.00
98.00


In [158]:
# Show the raw markup of the second 'zg-item-immersion' list item (index 1).
soup.find_all("li", class_ = "zg-item-immersion")[1] 

<li class="zg-item-immersion" role="gridcell"><span class="a-list-item"><div class="a-section a-spacing-none aok-relative"><div class="a-row a-spacing-none aok-inline-block"><span class="a-size-small aok-float-left zg-badge-body zg-badge-color"><span class="zg-badge-text">#2</span></span><span class="aok-float-left zg-badge-triangle zg-badge-color"></span></div><span class="aok-inline-block zg-item"><a class="a-link-normal" href="/-/en/EZVIZ-Indoor-Security-Camera-Android/dp/B07ZC1Q6W6?_encoding=UTF8&amp;psc=1"><span class="zg-text-center-align"><div class="a-section a-spacing-small"><img alt="EZVIZ C6N FHD Indoor Security Camera WiFi Smart 2.4G with iOS and Android App" height="200" src="https://images-na.ssl-images-amazon.com/images/I/51bA8A2cU9L._AC_UL200_SR200,200_.jpg" width="200"/></div></span>
<div aria-hidden="true" class="p13n-sc-truncate p13n-sc-line-clamp-2" data-rows="2">
            EZVIZ C6N FHD Indoor Security Camera WiFi Smart 2.4G with iOS and Android App
        </div

In [159]:
# Build one record per 'zg-item-immersion' list item: the raw scraped text
# plus a digits-only version of the rating, review count and price.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    rating_raw = get_data(item, "span", "a-icon-alt")
    reviews_raw = get_data(item, "a", "a-size-small a-link-normal")
    price_raw = get_data(item, "span", "p13n-sc-price")
    products_list.append({
        'pro_name': get_data(item, "a", "a-link-normal").strip(),
        'pro_rate': rating_raw,
        'pro_rate_clean': clean_num(rating_raw.split(" ")[0]),
        'review_num': reviews_raw,
        'review_num_clean': clean_num(reviews_raw),
        'pro_price': price_raw,
        'pro_price_clean': clean_num(price_raw),
    })

In [160]:
# Convert the list of dicts to a DataFrame; empty strings are replaced with NaN.
products_df = pd.DataFrame(products_list).replace("",np.nan)  #convert list of dict to df
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"EZVIZ C6N Pan/Tilt Camera FHD Indoor,1080p WiF...",4.2 out of 5 stars,4.2,1408.0,1408.0,SAR 98.00,98.0
1,EZVIZ C6N FHD Indoor Security Camera WiFi Smar...,4.3 out of 5 stars,4.3,391.0,391.0,SAR 98.00,98.0
2,SanDisk Ultra Micro SD Card Android - Class 10...,4.4 out of 5 stars,4.4,116942.0,116942.0,SAR 35.00,35.0
3,SanDisk Extreme microSDXC UHS-I Card- 128GB,4.4 out of 5 stars,4.4,116942.0,116942.0,SAR 77.00,77.0
4,EZVIZ C2C 1080P Smart Home Security Wifi Camer...,,,,,SAR 69.00,69.0
5,"6 PCS Camera Cover,Sliding Webcam Cover,Ultra ...",4.4 out of 5 stars,4.4,45.0,45.0,SAR 9.00,9.0
6,SanDisk Extreme microSDXC UHS-I Card- 256GB,4.4 out of 5 stars,4.4,116942.0,116942.0,SAR 168.00,168.0
7,"Fujifilm Instax Mini, 10 sheet x 2 pack, White",4.6 out of 5 stars,4.6,74.0,74.0,SAR 69.00,69.0
8,Eufy Indoor Cam 2K Pan & Tilt Home Security Ca...,4.5 out of 5 stars,4.5,10.0,10.0,SAR 155.00,155.0
9,SanDisk Ultra microSDXC 128GB 100MB/s Class 10...,4.4 out of 5 stars,4.4,116942.0,116942.0,SAR 45.00,45.0


# Best Sellers in Camera & Photo Products (Page 2)

In [162]:
# Camera & Photo best-seller list, second page (pg=2 in the query string).
url ='https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966385031/ref=zg_bs_pg_2?ie=UTF8&pg=2'

In [163]:
# Bound the wait with a timeout: without one, requests can block indefinitely
# when the server never answers.
response = requests.get(url, timeout=30)
response.status_code

200

In [164]:
# Preview the first 1000 characters of the response body.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [174]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [166]:
# Print the whole parsed document with BeautifulSoup's indented formatting.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [167]:
# Text of the second <a> element inside the second 'zg-item-immersion' list item.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n5.0 out of 5 stars\n'

In [168]:
def clean_num(s):
    """Keep only the numeric parts of scraped text.

    Every run of digits (optionally containing one decimal point) is
    extracted and the runs are concatenated, so labels and thousands
    separators disappear: "SAR 2,299.00" -> "2299.00".

    Parameters:
    s (String): raw text, or None

    Returns:
    String: concatenated numeric text ("" when s contains no digits);
    None is returned unchanged.

    """
    if s is None:
        return s
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` in `block`.

    Parameters:
    block: parsed element to search in; any object exposing
        find(tag, class_=...) works (here: the BeautifulSoup tag of one product)
    tag (String): tag name of the element we want to fetch
    class_text (String): class of the element we want to fetch

    Returns:
    String: text of the matching element, or "" when nothing matches

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # find() returned None (no match) or block itself is None, so there
        # is no text to read.
        return ""

In [169]:
# Smoke-test the helpers on the first item of this page.
# soup.find returns the first matching <li>; query it directly, because
# iterating over a tag walks its child nodes rather than the tag itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    rating_text = get_data(first_item, "span", "a-icon-alt")
    reviews_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of the rating text
    print(reviews_text)
    print(clean_num(reviews_text))
    print(price_text)
    print(clean_num(price_text))

UGREEN 8K DisplayPort Cable Ultra HD Gold-Plated DisplayPort 1.4 Male to Male Nylon Braided Cable SPCC Shell, Support 7680x4320 Resolution 8K@60Hz, 4K@144Hz, 32.4Gbps HDP HDCP for HDTV Monitor-1Meter
4.1 out of 5 stars
4.1
80
80
SAR 56.80
56.80


In [170]:
# Build one record per 'zg-item-immersion' list item: the raw scraped text
# plus a digits-only version of the rating, review count and price.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    rating_raw = get_data(item, "span", "a-icon-alt")
    reviews_raw = get_data(item, "a", "a-size-small a-link-normal")
    price_raw = get_data(item, "span", "p13n-sc-price")
    products_list.append({
        'pro_name': get_data(item, "a", "a-link-normal").strip(),
        'pro_rate': rating_raw,
        'pro_rate_clean': clean_num(rating_raw.split(" ")[0]),
        'review_num': reviews_raw,
        'review_num_clean': clean_num(reviews_raw),
        'pro_price': price_raw,
        'pro_price_clean': clean_num(price_raw),
    })

In [171]:
# Convert the list of dicts to a DataFrame; empty strings are replaced with NaN.
products_df = pd.DataFrame(products_list).replace("",np.nan)  #convert list of dict to df
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,UGREEN 8K DisplayPort Cable Ultra HD Gold-Plat...,4.1 out of 5 stars,4.1,80.0,80.0,SAR 56.80,56.8
1,YSK-Replacement Remote control for Class pro s...,5.0 out of 5 stars,5.0,2.0,2.0,SAR 80.00,80.0
2,UGREEN 8K HDMI 2.1 Male to Male Flat Cable Sup...,4.6 out of 5 stars,4.6,57.0,57.0,SAR 113.00,113.0
3,"todayto 30W Universal AC/DC, 3V 4.5V 6V 9V 12V...",3.5 out of 5 stars,3.5,43.0,43.0,SAR 57.00,57.0
4,SanDisk Ultra 256GB SDXC Memory Card 120MB/s,4.7 out of 5 stars,4.7,19036.0,19036.0,SAR 140.22,140.22
5,"Canon EF 50mm f/1.8 STM Standard Lens,Black",4.8 out of 5 stars,4.8,14669.0,14669.0,SAR 499.00,499.0
6,"Flat Cable HDMI to HDMI 1080P 10 Meter, Black",3.6 out of 5 stars,3.6,38.0,38.0,SAR 19.95,19.95
7,EZVIZ C6W Wifi Smart Home Indoor Security Camera,4.1 out of 5 stars,4.1,62.0,62.0,SAR 199.00,199.0
8,AmazonBasics DisplayPort to HDMI Display Adapt...,4.1 out of 5 stars,4.1,1650.0,1650.0,SAR 29.00,29.0
9,UGREEN Micro HDMI to HDMI Adapter Cable Male t...,4.6 out of 5 stars,4.6,7113.0,7113.0,SAR 42.40,42.4


# Best Sellers in Binoculars, Telescopes & Optics (Page 1) 

In [193]:
# Binoculars, Telescopes & Optics best-seller list, first page.
url ="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966400031/ref=zg_bs_nav_2_16966385031"


In [194]:
# Bound the wait with a timeout: without one, requests can block indefinitely
# when the server never answers.
response = requests.get(url, timeout=30)
response.status_code

200

In [195]:
# Preview the first 1000 characters of the response body.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [196]:
# Parse the response body into a BeautifulSoup tree using the lxml parser.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [197]:
# Print the whole parsed document with BeautifulSoup's indented formatting.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [198]:
# Text of the second <a> element inside the second 'zg-item-immersion' list item.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.3 out of 5 stars\n'

In [199]:
def clean_num(s):
    """Keep only the numeric parts of scraped text.

    Every run of digits (optionally containing one decimal point) is
    extracted and the runs are concatenated, so labels and thousands
    separators disappear: "SAR 2,299.00" -> "2299.00".

    Parameters:
    s (String): raw text, or None

    Returns:
    String: concatenated numeric text ("" when s contains no digits);
    None is returned unchanged.

    """
    if s is None:
        return s
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text``.

    Parameters:
    block: parsed element to search in; anything exposing
        ``find(tag, class_=...)`` (the notebook passes BeautifulSoup items)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup fails with
    an AttributeError (e.g. ``find`` returned None because nothing matched)

    """
    try:
        match = block.find(tag, class_=class_text)
        return match.text
    except AttributeError:
        # No matching element (or ``block`` itself has no ``find``):
        # report the field as missing instead of aborting the scrape.
        return ""

In [200]:
# Preview the fields of the first best-seller item only. limit=1 restricts the
# loop to that single <li>; looping over the result of soup.find(...) would
# walk the item's child nodes instead, and would fail if no item was found
# (find returns None). With no items this loop simply prints nothing.
for item in soup.find_all("li", class_="zg-item-immersion", limit=1):
    # Look each field up once and reuse the raw text for the cleaned value.
    rating_text = get_data(item, "span", "a-icon-alt")
    review_text = get_data(item, "a", "a-size-small a-link-normal")
    price_text = get_data(item, "span", "p13n-sc-price")

    print(get_data(item, "a", "a-link-normal").strip())
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading token, e.g. "3.1"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

40X60 HD Mini Day and Night Vision Monocular Telescope with Tripod Phone Clip Handheld Optical Monocular Outdoor Camping
3.1 out of 5 stars
3.1
9
9
SAR 87.99
87.99


In [201]:
# Column names of one scraped record: raw text plus its cleaned numeric text.
# Defined once here instead of being rebuilt for every item.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Each field is looked up once; the cleaned variant reuses the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    pro_price = get_data(item, "span", "p13n-sc-price")

    values = [pro_name,
              pro_rate,
              clean_num(pro_rate.split(" ")[0]),  # leading token of the rating text
              review_num,
              clean_num(review_num),
              pro_price,
              clean_num(pro_price)]
    products_list.append(dict(zip(headers, values)))

In [202]:
# Turn the list of per-product dicts into a table; empty strings are replaced
# with NaN so fields that were not scraped show up as missing values.
products_df = (
    pd.DataFrame(products_list)
    .replace("", np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,40X60 HD Mini Day and Night Vision Monocular T...,3.1 out of 5 stars,3.1,9.0,9.0,SAR 87.99,87.99
1,Celestron Portable Telescope Travel Scope 70,4.3 out of 5 stars,4.3,9013.0,9013.0,SAR 610.00,610.0
2,Celestron - 70mm Travel Scope DX - Portable Re...,4.2 out of 5 stars,4.2,1355.0,1355.0,SAR 529.82,529.82
3,ComCreate 8X Folding High Powered Binoculars W...,1.0 out of 5 stars,1.0,1.0,1.0,SAR 100.00,100.0
4,Celestron 93230 8 to 24mm 1.25 Zoom Eyepiece,4.7 out of 5 stars,4.7,1291.0,1291.0,SAR 399.64,399.64
5,Celestron 93529 X-Cel LX 1.25-Inch 2x Barlow L...,4.7 out of 5 stars,4.7,264.0,264.0,SAR 417.84,417.84
6,Celestron 21039 PowerSeeker 50 Telescope,4.1 out of 5 stars,4.1,2221.0,2221.0,SAR 244.00,244.0
7,Apple Pencil Tips - 4 pack,4.8 out of 5 stars,4.8,8.0,8.0,SAR 98.00,98.0
8,10X25 Small Compact Lightweight Binoculars for...,3.3 out of 5 stars,3.3,11.0,11.0,SAR 75.00,75.0
9,Nikon BAA807SB Fernglas Aculon A30 8x25 Binocu...,4.5 out of 5 stars,4.5,68.0,68.0,SAR 282.83,282.83


# Best Sellers in Binoculars, Telescopes & Optics (Page 2)

In [204]:
# Page 2 (pg=2) of the best-sellers listing for category 16966400031.
url = (
    "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966400031/"
    "ref=zg_bs_pg_2?ie=UTF8&pg=2"
)

In [205]:
# Fetch the page. requests applies no timeout unless one is given, so without
# it this cell could hang indefinitely; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
response.status_code

200

In [206]:
# Show only the first 1000 characters of the response body as a quick look at what came back.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [207]:
# Keep the raw HTML in `page` and parse it with the lxml parser so the
# document can be searched by tag name and CSS class.
page = response.text
soup = BeautifulSoup(markup=page, features='lxml')

In [208]:
# Print the parsed document re-indented by prettify(); this is the whole page,
# so the output is very long.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [209]:
# Text of the second <a> inside the second 'zg-item-immersion' list item
# (index 1 in both lookups); in the recorded run this is the star-rating text.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n2.7 out of 5 stars\n'

In [None]:
def clean_num(s):
    """Strip everything except numeric characters from scraped text.

    Every run of digits (with an optional decimal point) is pulled out of
    ``s`` and the runs are concatenated, so thousands separators and
    currency labels disappear: ``"SAR 3,499.00"`` becomes ``"3499.00"``.
    Separate numbers in the same text are glued together as well, so pass
    only the fragment that holds the number of interest.

    Parameters:
    s (String): raw text, or None

    Returns:
    String: cleaned digit text ("" when s has no digits); None when s is None

    """
    if s is None:
        return s
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text``.

    Parameters:
    block: parsed element to search in; anything exposing
        ``find(tag, class_=...)`` (the notebook passes BeautifulSoup items)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup fails with
    an AttributeError (e.g. ``find`` returned None because nothing matched)

    """
    try:
        match = block.find(tag, class_=class_text)
        return match.text
    except AttributeError:
        # No matching element (or ``block`` itself has no ``find``):
        # report the field as missing instead of aborting the scrape.
        return ""

In [211]:
# Preview the fields of the first best-seller item only. limit=1 restricts the
# loop to that single <li>; looping over the result of soup.find(...) would
# walk the item's child nodes instead, and would fail if no item was found
# (find returns None). With no items this loop simply prints nothing.
for item in soup.find_all("li", class_="zg-item-immersion", limit=1):
    # Look each field up once and reuse the raw text for the cleaned value.
    rating_text = get_data(item, "span", "a-icon-alt")
    review_text = get_data(item, "a", "a-size-small a-link-normal")
    price_text = get_data(item, "span", "p13n-sc-price")

    print(get_data(item, "a", "a-link-normal").strip())
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading token of the rating text
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

2021 Newest Telescope, 60mm Aperture 500mm AZ Mount Astronomical Refracting Telescope Adjustable Portable Telescopes with Adjustable Tripod, Phone Adapter, Nylon Bag…




SAR 527.46
527.46


In [212]:
# Column names of one scraped record: raw text plus its cleaned numeric text.
# Defined once here instead of being rebuilt for every item.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Each field is looked up once; the cleaned variant reuses the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    pro_price = get_data(item, "span", "p13n-sc-price")

    values = [pro_name,
              pro_rate,
              clean_num(pro_rate.split(" ")[0]),  # leading token of the rating text
              review_num,
              clean_num(review_num),
              pro_price,
              clean_num(pro_price)]
    products_list.append(dict(zip(headers, values)))

In [213]:
# Turn the list of per-product dicts into a table; empty strings are replaced
# with NaN so fields that were not scraped show up as missing values.
products_df = (
    pd.DataFrame(products_list)
    .replace("", np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"2021 Newest Telescope, 60mm Aperture 500mm AZ ...",,,,,SAR 527.46,527.46
1,10X22 Binocular Telescope Night Version Foldab...,2.7 out of 5 stars,2.7,14.0,14.0,SAR 85.00,85.0
2,SCOKC 10x-20x Zoom Binoculars Monocular HD Pow...,4.3 out of 5 stars,4.3,114.0,114.0,SAR 299.00,299.0
3,Celestron - NexStar 127SLT Computerized Telesc...,4.3 out of 5 stars,4.3,892.0,892.0,"SAR 3,499.00",3499.0
4,Celestron Upclose 20x50 Binoculars Porro Prism...,4.0 out of 5 stars,4.0,959.0,959.0,SAR 197.97,197.97
5,Celestron – Outland X 10x50 Binoculars – Water...,4.5 out of 5 stars,4.5,3196.0,3196.0,SAR 474.46,474.46
6,National Geographic 10x50 Porro Binocular,4.2 out of 5 stars,4.2,125.0,125.0,,
7,Carson MicroFlip 100x-250x LED Lighted Pocket ...,4.1 out of 5 stars,4.1,3891.0,3891.0,SAR 89.65,89.65
8,"Opticron BGA 10x42 Monocular, black",4.7 out of 5 stars,4.7,103.0,103.0,SAR 906.45,906.45
9,"Telescope Phone Mount, Universal Smart Phone A...",3.5 out of 5 stars,3.5,22.0,22.0,SAR 53.90,53.9


# Best Sellers in Computers, Components & Accessories (Page 1)

In [216]:
# First page of the best-sellers listing for category 16966388031.
# The URL must start at "https" (no leading whitespace inside the literal).
url = (
    "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966388031/"
    "ref=zg_bs_nav_1_electronics"
)

In [217]:
# Fetch the page. requests applies no timeout unless one is given, so without
# it this cell could hang indefinitely; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
response.status_code

200

In [218]:
# Show only the first 1000 characters of the response body as a quick look at what came back.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [219]:
# Keep the raw HTML in `page` and parse it with the lxml parser so the
# document can be searched by tag name and CSS class.
page = response.text
soup = BeautifulSoup(markup=page, features='lxml')

In [220]:
# Print the parsed document re-indented by prettify(); this is the whole page,
# so the output is very long.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [221]:
# Text of the second <a> inside the second 'zg-item-immersion' list item
# (index 1 in both lookups); in the recorded run this is the star-rating text.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.1 out of 5 stars\n'

In [223]:
def clean_num(s):
    """Strip everything except numeric characters from scraped text.

    Every run of digits (with an optional decimal point) is pulled out of
    ``s`` and the runs are concatenated, so thousands separators and
    currency labels disappear: ``"SAR 3,499.00"`` becomes ``"3499.00"``.
    Separate numbers in the same text are glued together as well, so pass
    only the fragment that holds the number of interest.

    Parameters:
    s (String): raw text, or None

    Returns:
    String: cleaned digit text ("" when s has no digits); None when s is None

    """
    if s is None:
        return s
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text``.

    Parameters:
    block: parsed element to search in; anything exposing
        ``find(tag, class_=...)`` (the notebook passes BeautifulSoup items)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup fails with
    an AttributeError (e.g. ``find`` returned None because nothing matched)

    """
    try:
        match = block.find(tag, class_=class_text)
        return match.text
    except AttributeError:
        # No matching element (or ``block`` itself has no ``find``):
        # report the field as missing instead of aborting the scrape.
        return ""

In [224]:
# Preview the fields of the first best-seller item only. limit=1 restricts the
# loop to that single <li>; looping over the result of soup.find(...) would
# walk the item's child nodes instead, and would fail if no item was found
# (find returns None). With no items this loop simply prints nothing.
for item in soup.find_all("li", class_="zg-item-immersion", limit=1):
    # Look each field up once and reuse the raw text for the cleaned value.
    rating_text = get_data(item, "span", "a-icon-alt")
    review_text = get_data(item, "a", "a-size-small a-link-normal")
    price_text = get_data(item, "span", "p13n-sc-price")

    print(get_data(item, "a", "a-link-normal").strip())
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading token of the rating text
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Stylus Pen for iPad with Palm Rejection, Active Pencil Compatible with (2018-2021) iPad Pro 11 & 12.9 inch, iPad 9th/8th/7th/6th Gen, iPad Air 4th/3rd Gen,iPad Mini 6th/5th Gen
3.3 out of 5 stars
3.3
5,545
5545
SAR 58.93
58.93


In [225]:
# Column names of one scraped record: raw text plus its cleaned numeric text.
# Defined once here instead of being rebuilt for every item.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Each field is looked up once; the cleaned variant reuses the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    pro_price = get_data(item, "span", "p13n-sc-price")

    values = [pro_name,
              pro_rate,
              clean_num(pro_rate.split(" ")[0]),  # leading token of the rating text
              review_num,
              clean_num(review_num),
              pro_price,
              clean_num(pro_price)]
    products_list.append(dict(zip(headers, values)))

In [226]:
# Turn the list of per-product dicts into a table; empty strings are replaced
# with NaN so fields that were not scraped show up as missing values.
products_df = (
    pd.DataFrame(products_list)
    .replace("", np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"Stylus Pen for iPad with Palm Rejection, Activ...",3.3 out of 5 stars,3.3,5545,5545,SAR 58.93,58.93
1,530pcs/set Heat Shrink Tubing Insulation Shrin...,4.1 out of 5 stars,4.1,117,117,SAR 20.22,20.22
2,"SAMSUNG 870 QVO SATA III 2.5"" SSD 1TB (MZ-77Q1...",4.8 out of 5 stars,4.8,8200,8200,SAR 427.50,427.5
3,"Logitech M171 Wireless Mouse, 2.4 GHz with USB...",4.4 out of 5 stars,4.4,3838,3838,SAR 80.00,80.0
4,SAMSUNG 980 PRO 500GB PCIe NVMe Gen4 Internal ...,4.8 out of 5 stars,4.8,2578,2578,SAR 311.63,311.63
5,"Anker USB C to Lightning Cable (6 ft), Powerli...",4.0 out of 5 stars,4.0,9,9,SAR 46.95,46.95
6,Kingston A400 SATA SSD Solid State Drive 2.5 I...,4.5 out of 5 stars,4.5,31649,31649,SAR 209.00,209.0
7,Anker PowerLine III USB-C to USB-C Cable USB-C...,5.0 out of 5 stars,5.0,5,5,SAR 26.00,26.0
8,Glorious Large Gaming Mouse Pad 11''x13'' - Black,4.7 out of 5 stars,4.7,7990,7990,SAR 37.00,37.0
9,"TP-Link AC1750 Wi-Fi Range Extender - RE450, W...",4.0 out of 5 stars,4.0,5915,5915,SAR 190.00,190.0


# Best Sellers in Computers, Components & Accessories (Page 2)

In [228]:
# Page 2 (pg=2) of the best-sellers listing for category 16966388031.
url = (
    "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966388031/"
    "ref=zg_bs_pg_2?ie=UTF8&pg=2"
)

In [229]:
# Fetch the page. requests applies no timeout unless one is given, so without
# it this cell could hang indefinitely; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
response.status_code

200

In [230]:
# Show only the first 1000 characters of the response body as a quick look at what came back.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [231]:
# Keep the raw HTML in `page` and parse it with the lxml parser so the
# document can be searched by tag name and CSS class.
page = response.text
soup = BeautifulSoup(markup=page, features='lxml')

In [232]:
# Print the parsed document re-indented by prettify(); this is the whole page,
# so the output is very long.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [233]:
# Text of the second <a> inside the second 'zg-item-immersion' list item
# (index 1 in both lookups); in the recorded run this is the star-rating text.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.1 out of 5 stars\n'

In [234]:
def clean_num(s):
    """Strip everything except numeric characters from scraped text.

    Every run of digits (with an optional decimal point) is pulled out of
    ``s`` and the runs are concatenated, so thousands separators and
    currency labels disappear: ``"SAR 3,499.00"`` becomes ``"3499.00"``.
    Separate numbers in the same text are glued together as well, so pass
    only the fragment that holds the number of interest.

    Parameters:
    s (String): raw text, or None

    Returns:
    String: cleaned digit text ("" when s has no digits); None when s is None

    """
    if s is None:
        return s
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text``.

    Parameters:
    block: parsed element to search in; anything exposing
        ``find(tag, class_=...)`` (the notebook passes BeautifulSoup items)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup fails with
    an AttributeError (e.g. ``find`` returned None because nothing matched)

    """
    try:
        match = block.find(tag, class_=class_text)
        return match.text
    except AttributeError:
        # No matching element (or ``block`` itself has no ``find``):
        # report the field as missing instead of aborting the scrape.
        return ""

In [235]:
# Preview the fields of the first best-seller item only. limit=1 restricts the
# loop to that single <li>; looping over the result of soup.find(...) would
# walk the item's child nodes instead, and would fail if no item was found
# (find returns None). With no items this loop simply prints nothing.
for item in soup.find_all("li", class_="zg-item-immersion", limit=1):
    # Look each field up once and reuse the raw text for the cleaned value.
    rating_text = get_data(item, "span", "a-icon-alt")
    review_text = get_data(item, "a", "a-size-small a-link-normal")
    price_text = get_data(item, "span", "p13n-sc-price")

    print(get_data(item, "a", "a-link-normal").strip())
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading token of the rating text
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Laptop Stand, BiuLing Adjustable Portable Laptop Holder for Desk, Aluminum Foldable Laptop Riser with 6 Levels of Height Adjustment, Compatible with MacBook Air Pro, Dell, HP, Lenovo,10-15.6” Laptops
3.6 out of 5 stars
3.6
42
42
SAR 45.99
45.99


In [236]:
# Column names of one scraped record: raw text plus its cleaned numeric text.
# Defined once here instead of being rebuilt for every item.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Each field is looked up once; the cleaned variant reuses the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    pro_price = get_data(item, "span", "p13n-sc-price")

    values = [pro_name,
              pro_rate,
              clean_num(pro_rate.split(" ")[0]),  # leading token of the rating text
              review_num,
              clean_num(review_num),
              pro_price,
              clean_num(pro_price)]
    products_list.append(dict(zip(headers, values)))

In [237]:
# Turn the list of per-product dicts into a table; empty strings are replaced
# with NaN so fields that were not scraped show up as missing values.
products_df = (
    pd.DataFrame(products_list)
    .replace("", np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"Laptop Stand, BiuLing Adjustable Portable Lapt...",3.6 out of 5 stars,3.6,42.0,42.0,SAR 45.99,45.99
1,iPhone Charger Lightning Cable 10ft - by TalkW...,4.1 out of 5 stars,4.1,7091.0,7091.0,SAR 35.00,35.0
2,Logitech Z150 Compact Multimedia Stereo Speake...,4.4 out of 5 stars,4.4,1486.0,1486.0,SAR 84.00,84.0
3,SanDisk Ultra Dual Drive Luxe USB Type-C 128GB...,4.4 out of 5 stars,4.4,3470.0,3470.0,SAR 65.00,65.0
4,ProCase iPad Air 4 Screen Protector 10.9 2020/...,4.6 out of 5 stars,4.6,1984.0,1984.0,SAR 32.70,32.7
5,New Office 2021 Pro For Windows | Lifetime Act...,5.0 out of 5 stars,5.0,2.0,2.0,SAR 30.00,30.0
6,[2 Pack] ProCase Galaxy Tab A7 10.4 2020 Scree...,4.3 out of 5 stars,4.3,1718.0,1718.0,SAR 30.98,30.98
7,Samsung Electronics (MZ-V8V1T0B/AM) 980 SSD 1T...,4.8 out of 5 stars,4.8,1567.0,1567.0,SAR 559.00,559.0
8,"2021 Apple iPad Pro (11-inch, Wi-Fi, 128GB) - ...",4.6 out of 5 stars,4.6,54.0,54.0,"SAR 3,449.00",3449.0
9,UGREEN USB C Cable USB A to Type C Data Cable ...,4.4 out of 5 stars,4.4,118.0,118.0,SAR 36.00,36.0


# Best Sellers in Headphones, Earbuds & Accessories (Page 1)

In [238]:
# Page 1 (pg=1) of the best-sellers listing for category 16966390031.
url = (
    "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966390031/"
    "ref=zg_bs_pg_1?ie=UTF8&pg=1"
)

In [239]:
# Fetch the page. requests applies no timeout unless one is given, so without
# it this cell could hang indefinitely; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
response.status_code

200

In [240]:
# Show only the first 1000 characters of the response body as a quick look at what came back.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [241]:
# Keep the raw HTML in `page` and parse it with the lxml parser so the
# document can be searched by tag name and CSS class.
page = response.text
soup = BeautifulSoup(markup=page, features='lxml')

In [242]:
# Print the parsed document re-indented by prettify(); this is the whole page,
# so the output is very long.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [243]:
# Text of the second <a> inside the second 'zg-item-immersion' list item
# (index 1 in both lookups); in the recorded run this is the star-rating text.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.1 out of 5 stars\n'

In [244]:
def clean_num(s):
    """Strip everything except numeric characters from scraped text.

    Every run of digits (with an optional decimal point) is pulled out of
    ``s`` and the runs are concatenated, so thousands separators and
    currency labels disappear: ``"SAR 3,499.00"`` becomes ``"3499.00"``.
    Separate numbers in the same text are glued together as well, so pass
    only the fragment that holds the number of interest.

    Parameters:
    s (String): raw text, or None

    Returns:
    String: cleaned digit text ("" when s has no digits); None when s is None

    """
    if s is None:
        return s
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text``.

    Parameters:
    block: parsed element to search in; anything exposing
        ``find(tag, class_=...)`` (the notebook passes BeautifulSoup items)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup fails with
    an AttributeError (e.g. ``find`` returned None because nothing matched)

    """
    try:
        match = block.find(tag, class_=class_text)
        return match.text
    except AttributeError:
        # No matching element (or ``block`` itself has no ``find``):
        # report the field as missing instead of aborting the scrape.
        return ""

In [245]:
# Preview the fields of the first best-seller item only. limit=1 restricts the
# loop to that single <li>; looping over the result of soup.find(...) would
# walk the item's child nodes instead, and would fail if no item was found
# (find returns None). With no items this loop simply prints nothing.
for item in soup.find_all("li", class_="zg-item-immersion", limit=1):
    # Look each field up once and reuse the raw text for the cleaned value.
    rating_text = get_data(item, "span", "a-icon-alt")
    review_text = get_data(item, "a", "a-size-small a-link-normal")
    price_text = get_data(item, "span", "p13n-sc-price")

    print(get_data(item, "a", "a-link-normal").strip())
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading token of the rating text
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

New Apple Airpods Pro
4.3 out of 5 stars
4.3
2,350
2350
SAR 749.00
749.00


In [246]:
# Column names of one scraped record: raw text plus its cleaned numeric text.
# Defined once here instead of being rebuilt for every item.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Each field is looked up once; the cleaned variant reuses the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    pro_price = get_data(item, "span", "p13n-sc-price")

    values = [pro_name,
              pro_rate,
              clean_num(pro_rate.split(" ")[0]),  # leading token of the rating text
              review_num,
              clean_num(review_num),
              pro_price,
              clean_num(pro_price)]
    products_list.append(dict(zip(headers, values)))

In [247]:
# Turn the list of per-product dicts into a table; empty strings are replaced
# with NaN so fields that were not scraped show up as missing values.
products_df = (
    pd.DataFrame(products_list)
    .replace("", np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,New Apple Airpods Pro,4.3 out of 5 stars,4.3,2350,2350,SAR 749.00,749.0
1,Apple AirPods with Charging Case,4.1 out of 5 stars,4.1,2774,2774,SAR 475.00,475.0
2,Sony WH-1000XM4 Wireless Noise Cancelling Blue...,4.4 out of 5 stars,4.4,63,63,SAR 990.00,990.0
3,SoundPEATS Air3 Wireless Earbuds Mini Bluetoot...,4.0 out of 5 stars,4.0,299,299,SAR 143.20,143.2
4,"JBL In-Ear Headphones, Black, T110",4.2 out of 5 stars,4.2,17957,17957,SAR 28.15,28.15
5,Wireless Earbuds SoundPEATS TrueAir2 Bluetooth...,3.8 out of 5 stars,3.8,2094,2094,SAR 127.20,127.2
6,"Anker Soundcore Life Q30 Bluetooth Headphones,...",4.6 out of 5 stars,4.6,11968,11968,SAR 249.00,249.0
7,HyperX Cloud II Gaming Headset for PC & PS4 & ...,4.6 out of 5 stars,4.6,36153,36153,SAR 419.00,419.0
8,New Apple AirPods (3rd generation),4.3 out of 5 stars,4.3,25,25,SAR 829.00,829.0
9,Sony WI-C200 Wireless In-ear Bluetooth Headpho...,3.7 out of 5 stars,3.7,260,260,SAR 93.00,93.0


# Best Sellers in Headphones, Earbuds & Accessories (Page 2)

In [248]:
# Page 2 (pg=2) of the best-sellers listing for category 16966390031.
url = (
    "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966390031/"
    "ref=zg_bs_pg_2?ie=UTF8&pg=2"
)

In [249]:
# Fetch the page. requests applies no timeout unless one is given, so without
# it this cell could hang indefinitely; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
response.status_code

200

In [250]:
# Show only the first 1000 characters of the response body as a quick look at what came back.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [251]:
# Keep the raw HTML in `page` and parse it with the lxml parser so the
# document can be searched by tag name and CSS class.
page = response.text
soup = BeautifulSoup(markup=page, features='lxml')

In [252]:
# Print the parsed document re-indented by prettify(); this is the whole page,
# so the output is very long.
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [253]:
# Text of the second <a> inside the second 'zg-item-immersion' list item
# (index 1 in both lookups); in the recorded run this is the star-rating text.
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.0 out of 5 stars\n'

In [254]:
def clean_num(s):
    """Strip everything except numeric characters from scraped text.

    Every run of digits (with an optional decimal point) is pulled out of
    ``s`` and the runs are concatenated, so thousands separators and
    currency labels disappear: ``"SAR 3,499.00"`` becomes ``"3499.00"``.
    Separate numbers in the same text are glued together as well, so pass
    only the fragment that holds the number of interest.

    Parameters:
    s (String): raw text, or None

    Returns:
    String: cleaned digit text ("" when s has no digits); None when s is None

    """
    if s is None:
        return s
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text`` inside ``block``.

    Parameters:
    block (Tag-like): parsed element to search in; anything exposing ``find(tag, class_=...)``
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the first match, or "" when nothing matches or
    ``block`` cannot be searched (None, a plain text node, ...)

    """
    try:
        return block.find(tag, class_=class_text).text
    except (AttributeError, TypeError):
        # AttributeError: no match (find returned None) or block has no find().
        # TypeError: block is string-like, and str.find() rejects the class_ keyword.
        return ""

In [255]:
# Preview the raw and cleaned fields of the first best-seller entry.
# The entry is searched directly: iterating over it walks its child nodes,
# and fails outright when no entry is found (None is not iterable).
first_item = soup.find("li", class_="zg-item-immersion")
if first_item is None:
    print("no 'zg-item-immersion' entry found")
else:
    rate_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")
    print(get_data(first_item, "a", "a-link-normal").strip())
    print(rate_text)
    print(clean_num(rate_text.split(" ")[0]))  # leading token is the score itself
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Huawei Honor xsport Bluetooth Headset am61 IPX5 Waterproof BT4.1 Music Mic Control Wireless Earphone For android
2.7 out of 5 stars
2.7
3
3
SAR 115.00
115.00


In [256]:
# Column names, in the order the values are assembled below.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean', 'review_num',
           'review_num_clean', 'pro_price', 'pro_price_clean']

# One record per best-seller entry on the page.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Look each raw field up once and derive its cleaned variant from it.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading token is the score itself
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [257]:
# Turn the list of per-product dicts into a table, then mark empty
# strings (fields that were not found) as NaN.
products_df = pd.DataFrame(products_list)
products_df = products_df.replace("", np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,Huawei Honor xsport Bluetooth Headset am61 IPX...,2.7 out of 5 stars,2.7,3.0,3.0,SAR 115.00,115.0
1,JBL TUNE 700BT WIRELESS OVER-EAR HEADPHONE - B...,4.0 out of 5 stars,4.0,1621.0,1621.0,SAR 213.99,213.99
2,"True Wireless Earphones, Bluetooth 5 earbuds w...",4.3 out of 5 stars,4.3,36.0,36.0,SAR 52.00,52.0
3,Nylon Case For Apple Air Pods Protective Bluet...,2.3 out of 5 stars,2.3,9.0,9.0,SAR 48.00,48.0
4,beats Flex – All-Day Wireless Earphones – beat...,3.4 out of 5 stars,3.4,29.0,29.0,SAR 169.00,169.0
5,JBL T110 Wired Universal In-Ear Headphone with...,4.3 out of 5 stars,4.3,20787.0,20787.0,SAR 28.15,28.15
6,SKULLCANDY Dime True Wireless In-Ear Earbuds W...,4.1 out of 5 stars,4.1,3418.0,3418.0,SAR 99.00,99.0
7,Sony WH-CH510 Wireless Bluetooth On-Ear with M...,3.6 out of 5 stars,3.6,112.0,112.0,SAR 142.00,142.0
8,HUAWEI FreeBuds 4 Wireless Bluetooth Open-fit ...,4.0 out of 5 stars,4.0,26.0,26.0,SAR 544.75,544.75
9,Sony WI-XB400 Wireless Extra Bass in-Ear Headp...,3.7 out of 5 stars,3.7,204.0,204.0,SAR 159.00,159.0


# Best Sellers in Mobile Phones & Communication Products (Page 1)

In [3]:
# Best-sellers listing scraped in this section.
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966387031/ref=zg_bs_nav_1_electronics"

In [4]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.status_code

200

In [5]:
# Peek at the first 1000 characters of the raw HTML.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [13]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag and class.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [7]:
# Show only the start of the pretty-printed markup; printing the whole page floods the cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [8]:
# Probe the second entry on the page: the text of its second anchor is the star-rating label.
(
    soup.find_all('li', class_='zg-item-immersion')[1]
    .find_all('a')[1]
    .text
)

'\n1.0 out of 5 stars\n'

In [9]:
def clean_num(s):
    """Keep only the numeric pieces of a scraped text.

    Every run of digits (optionally containing a decimal point) found in
    ``s`` is concatenated, so currency labels and thousands separators
    drop out: ``"SAR 1,049.00"`` becomes ``"1049.00"``.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated numeric pieces ("" when there are none),
    or None when ``s`` is None

    """
    # NOTE(review): this helper is re-declared in several cells of this notebook;
    # a single shared definition would be enough.
    if s is None:
        return s
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text`` inside ``block``.

    Parameters:
    block (Tag-like): parsed element to search in; anything exposing ``find(tag, class_=...)``
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the first match, or "" when nothing matches or
    ``block`` cannot be searched (None, a plain text node, ...)

    """
    try:
        return block.find(tag, class_=class_text).text
    except (AttributeError, TypeError):
        # AttributeError: no match (find returned None) or block has no find().
        # TypeError: block is string-like, and str.find() rejects the class_ keyword.
        return ""

In [10]:
# Preview the raw and cleaned fields of the first best-seller entry.
# The entry is searched directly: iterating over it walks its child nodes,
# and fails outright when no entry is found (None is not iterable).
first_item = soup.find("li", class_="zg-item-immersion")
if first_item is None:
    print("no 'zg-item-immersion' entry found")
else:
    rate_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")
    print(get_data(first_item, "a", "a-link-normal").strip())
    print(rate_text)
    print(clean_num(rate_text.split(" ")[0]))  # leading token is the score itself
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

SAMSUNG Galaxy A12 Dual SIM Smartphone - 64GB, 4GB RAM, LTE, Black (KSA Version)
3.0 out of 5 stars
3.0
2
2
SAR 539.00
539.00


In [11]:
# Column names, in the order the values are assembled below.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean', 'review_num',
           'review_num_clean', 'pro_price', 'pro_price_clean']

# One record per best-seller entry on the page.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Look each raw field up once and derive its cleaned variant from it.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading token is the score itself
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [12]:
# Turn the list of per-product dicts into a table, then mark empty
# strings (fields that were not found) as NaN.
products_df = pd.DataFrame(products_list)
products_df = products_df.replace("", np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"SAMSUNG Galaxy A12 Dual SIM Smartphone - 64GB,...",3.0 out of 5 stars,3.0,2.0,2.0,SAR 539.00,539.0
1,Samsung Galaxy A12 LTE Dual SIM Smartphone - 6...,1.0 out of 5 stars,1.0,2.0,2.0,SAR 539.00,539.0
2,"Apple 20W USB-C Power Adapter, White",4.5 out of 5 stars,4.5,358.0,358.0,SAR 79.00,79.0
3,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.0 out of 5 stars,4.0,38.0,38.0,SAR 499.00,499.0
4,"Anker PowerCore Select 20000, 20000mAh Power B...",4.1 out of 5 stars,4.1,54.0,54.0,SAR 94.00,94.0
5,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.2 out of 5 stars,4.2,22.0,22.0,SAR 499.00,499.0
6,New Apple Airpods Pro,4.3 out of 5 stars,4.3,2350.0,2350.0,SAR 749.00,749.0
7,"SAMSUNG Galaxy M12 Dual SIM Smartphone - 64GB,...",4.5 out of 5 stars,4.5,10.0,10.0,SAR 499.00,499.0
8,Apple AirPods with Charging Case,4.1 out of 5 stars,4.1,2775.0,2775.0,SAR 475.00,475.0
9,JBL Flip 5 Portable Speaker Waterproof Wireles...,4.6 out of 5 stars,4.6,23515.0,23515.0,SAR 249.00,249.0


# Best Sellers in Mobile Phones & Communication Products (Page 2)

In [14]:
# Best-sellers listing scraped in this section (second page, pg=2).
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966387031/ref=zg_bs_pg_2?ie=UTF8&pg=2"

In [15]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.status_code

200

In [16]:
# Peek at the first 1000 characters of the raw HTML.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [17]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag and class.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [18]:
# Show only the start of the pretty-printed markup; printing the whole page floods the cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [19]:
# Probe the second entry on the page: the text of its second anchor is the star-rating label.
(
    soup.find_all('li', class_='zg-item-immersion')[1]
    .find_all('a')[1]
    .text
)

'\n4.3 out of 5 stars\n'

In [20]:
def clean_num(s):
    """Keep only the numeric pieces of a scraped text.

    Every run of digits (optionally containing a decimal point) found in
    ``s`` is concatenated, so currency labels and thousands separators
    drop out: ``"SAR 1,049.00"`` becomes ``"1049.00"``.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated numeric pieces ("" when there are none),
    or None when ``s`` is None

    """
    # NOTE(review): this helper is re-declared in several cells of this notebook;
    # a single shared definition would be enough.
    if s is None:
        return s
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text`` inside ``block``.

    Parameters:
    block (Tag-like): parsed element to search in; anything exposing ``find(tag, class_=...)``
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the first match, or "" when nothing matches or
    ``block`` cannot be searched (None, a plain text node, ...)

    """
    try:
        return block.find(tag, class_=class_text).text
    except (AttributeError, TypeError):
        # AttributeError: no match (find returned None) or block has no find().
        # TypeError: block is string-like, and str.find() rejects the class_ keyword.
        return ""

In [21]:
# Preview the raw and cleaned fields of the first best-seller entry.
# The entry is searched directly: iterating over it walks its child nodes,
# and fails outright when no entry is found (None is not iterable).
first_item = soup.find("li", class_="zg-item-immersion")
if first_item is None:
    print("no 'zg-item-immersion' entry found")
else:
    rate_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")
    print(get_data(first_item, "a", "a-link-normal").strip())
    print(rate_text)
    print(clean_num(rate_text.split(" ")[0]))  # leading token is the score itself
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

UGREEN iPhone Protective Case Compatible for iPhone 13 Pro Clear Case with Shock Absorption Anti Scratch TPU Precise Cutouts and Slim Fit Case Easy to Install iPhone 13 Pro Transparent Cover 6.1 inch
4.5 out of 5 stars
4.5
56
56
SAR 31.20
31.20


In [22]:
# Column names, in the order the values are assembled below.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean', 'review_num',
           'review_num_clean', 'pro_price', 'pro_price_clean']

# One record per best-seller entry on the page.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Look each raw field up once and derive its cleaned variant from it.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading token is the score itself
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [23]:
# Turn the list of per-product dicts into a table, then mark empty
# strings (fields that were not found) as NaN.
products_df = pd.DataFrame(products_list)
products_df = products_df.replace("", np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,UGREEN iPhone Protective Case Compatible for i...,4.5 out of 5 stars,4.5,56,56,SAR 31.20,31.2
1,"Anker Powerline Micro USB - Charging Cable, wi...",4.3 out of 5 stars,4.3,43,43,SAR 19.00,19.0
2,"Anker Nano iPhone Charger, 20W PIQ 3.0 Durable...",4.2 out of 5 stars,4.2,135,135,SAR 44.00,44.0
3,"Moto G9 Power Smartphone ,20W Super Charge, 60...",4.0 out of 5 stars,4.0,54,54,SAR 549.00,549.0
4,"Wireless Earbuds, SOUNDPEATS S5 Over-Ear Hooks...",4.4 out of 5 stars,4.4,190,190,SAR 103.20,103.2
5,Sony WF-1000XM4 Industry Leading Noise Cancell...,3.9 out of 5 stars,3.9,28,28,"SAR 1,049.00",1049.0
6,Apple iPhone 12 With FaceTime (128GB) - Purple,4.4 out of 5 stars,4.4,353,353,"SAR 3,149.00",3149.0
7,Compatible with Apple Watch Case Series 4 Seri...,3.9 out of 5 stars,3.9,204,204,SAR 12.77,12.77
8,Apple iPhone 12 With FaceTime (128GB) - Black,4.4 out of 5 stars,4.4,353,353,"SAR 3,299.00",3299.0
9,SoundPEATS Smart Watch New Upgraded 13 Sports ...,4.2 out of 5 stars,4.2,490,490,SAR 159.00,159.0


# Best Sellers in Wearable Technology (Page 1)

In [7]:
# Best-sellers listing scraped in this section (first page, pg=1).
url= "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966397031/ref=zg_bs_pg_1?ie=UTF8&pg=1"

In [8]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.status_code

200

In [9]:
# Peek at the first 1000 characters of the raw HTML.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [10]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag and class.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [11]:
# Show only the start of the pretty-printed markup; printing the whole page floods the cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [12]:
# Probe the second entry on the page: the text of its second anchor is the star-rating label.
(
    soup.find_all('li', class_='zg-item-immersion')[1]
    .find_all('a')[1]
    .text
)

'\n4.6 out of 5 stars\n'

In [13]:
def clean_num(s):
    """Keep only the numeric pieces of a scraped text.

    Every run of digits (optionally containing a decimal point) found in
    ``s`` is concatenated, so currency labels and thousands separators
    drop out: ``"SAR 1,049.00"`` becomes ``"1049.00"``.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated numeric pieces ("" when there are none),
    or None when ``s`` is None

    """
    # NOTE(review): this helper is re-declared in several cells of this notebook;
    # a single shared definition would be enough.
    if s is None:
        return s
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text`` inside ``block``.

    Parameters:
    block (Tag-like): parsed element to search in; anything exposing ``find(tag, class_=...)``
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the first match, or "" when nothing matches or
    ``block`` cannot be searched (None, a plain text node, ...)

    """
    try:
        return block.find(tag, class_=class_text).text
    except (AttributeError, TypeError):
        # AttributeError: no match (find returned None) or block has no find().
        # TypeError: block is string-like, and str.find() rejects the class_ keyword.
        return ""

In [14]:
# Preview the raw and cleaned fields of the first best-seller entry.
# The entry is searched directly: iterating over it walks its child nodes,
# and fails outright when no entry is found (None is not iterable).
first_item = soup.find("li", class_="zg-item-immersion")
if first_item is None:
    print("no 'zg-item-immersion' entry found")
else:
    rate_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")
    print(get_data(first_item, "a", "a-link-normal").strip())
    print(rate_text)
    print(clean_num(rate_text.split(" ")[0]))  # leading token is the score itself
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Apple Watch Series 6 (GPS, 44mm) - Space Grey Aluminium Case with Black Sport Band
4.6 out of 5 stars
4.6
306
306
SAR 1,349.00
1349.00


In [15]:
# Column names, in the order the values are assembled below.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean', 'review_num',
           'review_num_clean', 'pro_price', 'pro_price_clean']

# One record per best-seller entry on the page.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Look each raw field up once and derive its cleaned variant from it.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading token is the score itself
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [16]:
# Turn the list of per-product dicts into a table, then mark empty
# strings (fields that were not found) as NaN.
products_df = pd.DataFrame(products_list)
products_df = products_df.replace("", np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"Apple Watch Series 6 (GPS, 44mm) - Space Grey ...",4.6 out of 5 stars,4.6,306.0,306.0,"SAR 1,349.00",1349.0
1,"Apple Watch Series 6 (GPS, 40mm) - Gold Alumin...",4.6 out of 5 stars,4.6,306.0,306.0,"SAR 1,199.00",1199.0
2,"HUAWEI Band 6, All-day SpO2 Monitoring, 1.47"" ...",4.4 out of 5 stars,4.4,814.0,814.0,SAR 188.95,188.95
3,"Apple Watch Series 6 (GPS, 40mm) - Blue Alumin...",4.6 out of 5 stars,4.6,306.0,306.0,"SAR 1,199.00",1199.0
4,"Apple Watch Series 6 (GPS, 40mm) - Space Grey ...",4.6 out of 5 stars,4.6,306.0,306.0,"SAR 1,199.00",1199.0
5,Compatible with Apple Watch Case Series 4 Seri...,3.9 out of 5 stars,3.9,204.0,204.0,SAR 12.77,12.77
6,"Apple Watch Series 6 (GPS, 44mm) - Blue Alumin...",4.6 out of 5 stars,4.6,306.0,306.0,"SAR 1,349.00",1349.0
7,"ibsun Wireless Charger, 3 in 1 Wireless Chargi...",4.1 out of 5 stars,4.1,184.0,184.0,SAR 100.00,100.0
8,"Fitbit-Inspire 2, Black/Black",4.5 out of 5 stars,4.5,18382.0,18382.0,SAR 259.00,259.0
9,Milanese Loop Bracelet Stainless Steel band Fo...,4.3 out of 5 stars,4.3,29.0,29.0,SAR 25.99,25.99


# Best Sellers in Wearable Technology (Page 2)

In [17]:
# Best-sellers listing scraped in this section (second page, pg=2).
url = "https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966397031/ref=zg_bs_pg_2?ie=UTF8&pg=2"

In [18]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.status_code

200

In [19]:
# Peek at the first 1000 characters of the raw HTML.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [20]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag and class.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [21]:
# Show only the start of the pretty-printed markup; printing the whole page floods the cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [22]:
# Probe the second entry on the page: the text of its second anchor is the star-rating label.
(
    soup.find_all('li', class_='zg-item-immersion')[1]
    .find_all('a')[1]
    .text
)

'\n3.6 out of 5 stars\n'

In [23]:
def clean_num(s):
    """Keep only the numeric pieces of a scraped text.

    Every run of digits (optionally containing a decimal point) found in
    ``s`` is concatenated, so currency labels and thousands separators
    drop out: ``"SAR 1,049.00"`` becomes ``"1049.00"``.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated numeric pieces ("" when there are none),
    or None when ``s`` is None

    """
    # NOTE(review): this helper is re-declared in several cells of this notebook;
    # a single shared definition would be enough.
    if s is None:
        return s
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first ``tag`` element with class ``class_text`` inside ``block``.

    Parameters:
    block (Tag-like): parsed element to search in; anything exposing ``find(tag, class_=...)``
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the first match, or "" when nothing matches or
    ``block`` cannot be searched (None, a plain text node, ...)

    """
    try:
        return block.find(tag, class_=class_text).text
    except (AttributeError, TypeError):
        # AttributeError: no match (find returned None) or block has no find().
        # TypeError: block is string-like, and str.find() rejects the class_ keyword.
        return ""

In [24]:
# Preview the raw and cleaned fields of the first best-seller entry.
# The entry is searched directly: iterating over it walks its child nodes,
# and fails outright when no entry is found (None is not iterable).
first_item = soup.find("li", class_="zg-item-immersion")
if first_item is None:
    print("no 'zg-item-immersion' entry found")
else:
    rate_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")
    print(get_data(first_item, "a", "a-link-normal").strip())
    print(rate_text)
    print(clean_num(rate_text.split(" ")[0]))  # leading token is the score itself
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

INFOSUN Compatible with Apple Watch Bands 42mm/44mm/45mm Women Men,Braided Solo Loop Replacement Band Strap Stretchable Elastic Sport Wristbandfor Series 7/6/SE/5/4/3/2/1 (42mm/44mm/45mm-L, Black)
3.5 out of 5 stars
3.5
33
33
SAR 37.13
37.13


In [25]:
# Column names, in the order the values are assembled below.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean', 'review_num',
           'review_num_clean', 'pro_price', 'pro_price_clean']

# One record per best-seller entry on the page.
products_list = []
for item in soup.find_all("li", class_="zg-item-immersion"):
    # Look each raw field up once and derive its cleaned variant from it.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading token is the score itself
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [26]:
# Turn the list of per-product dicts into a table, then mark empty
# strings (fields that were not found) as NaN.
products_df = pd.DataFrame(products_list)
products_df = products_df.replace("", np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,INFOSUN Compatible with Apple Watch Bands 42mm...,3.5 out of 5 stars,3.5,33.0,33.0,SAR 37.13,37.13
1,"Smart Watch, AGPTEK 1.69""(43mm) Smartwatch for...",3.6 out of 5 stars,3.6,157.0,157.0,SAR 203.06,203.06
2,Compatible with HUAWEI Samsung Watch Silicone ...,3.7 out of 5 stars,3.7,4.0,4.0,SAR 14.99,14.99
3,6 Pack Sport Bands Compatible with Apple Watch...,,,,,SAR 71.42,71.42
4,Rekletier Replacement Bands Compatible with Fi...,,,,,SAR 42.00,42.0
5,GEEAN Stainless Steel Watch Band for Apple Wat...,3.7 out of 5 stars,3.7,8.0,8.0,SAR 41.65,41.65
6,"Apple Watch Series 7 (GPS, 45mm) - Blue Alumin...",,,,,"SAR 1,929.00",1929.0
7,"OMIRA Smart Watch, Fitness Tracker with Heart ...",3.8 out of 5 stars,3.8,34.0,34.0,SAR 119.99,119.99
8,"Compatible with Apple Watch Bands 44mm 42mm, A...",3.8 out of 5 stars,3.8,41.0,41.0,SAR 25.99,25.99
9,Replacement Bands Compatible for Fitbit Charge...,2.5 out of 5 stars,2.5,3.0,3.0,SAR 30.77,30.77


# Best Sellers in Car & Vehicle Electronics (Page 1)

In [27]:
# Best-sellers listing scraped in this section.
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966386031/ref=zg_bs_nav_1_electronics"

In [28]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.status_code

200

In [29]:
# Peek at the first 1000 characters of the raw HTML.
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [30]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag and class.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [31]:
# Show only the start of the pretty-printed markup; printing the whole page floods the cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [32]:
# Probe the second entry on the page: the text of its second anchor is the star-rating label.
(
    soup.find_all('li', class_='zg-item-immersion')[1]
    .find_all('a')[1]
    .text
)

'\n4.3 out of 5 stars\n'

In [33]:
def clean_num(s):
    """Keep only the numeric pieces of a scraped text.

    Every run of digits (optionally containing a decimal point) found in
    ``s`` is concatenated, so currency labels and thousands separators
    drop out: ``"SAR 1,049.00"`` becomes ``"1049.00"``.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated numeric pieces ("" when there are none),
    or None when ``s`` is None

    """
    # NOTE(review): this helper is re-declared in several cells of this notebook;
    # a single shared definition would be enough.
    if s is None:
        return s
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` inside `block`.

    Parameters:
    block: parsed element to search in; anything exposing find(tag, class_=...)
        (the notebook passes BeautifulSoup elements)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup raises
    AttributeError (for example because nothing matched and find() gave None)

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # No matching element (or `block` has no usable find/text): report
        # the field as empty instead of aborting the scrape.
        return ""

In [34]:
# Preview the fields scraped from the first best-seller item on the page.
# The <li> element is queried directly; iterating over a tag would visit its
# child nodes instead of the item itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    # Fetch each raw field once and derive the cleaned value from it.
    rating_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of "N out of 5 stars"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

UGREEN Bluetooth Aux Adapter, Bluetooth 5.0 Audio Receiver for Wireless Music Stream with Hand-free Call, aptX LL, 10H Play Time, Auto-Repair for Car Speaker, Headphone, Audio Sound System, etc
4.5 out of 5 stars
4.5
218
218
SAR 93.60
93.60


In [35]:
# Column names for the per-item records; built once rather than on every iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for i in soup.find_all("li", class_ = "zg-item-immersion"):
    # Scrape each raw field once, then derive its cleaned counterpart from it.
    pro_name = str.strip(get_data(i, "a", "a-link-normal"))
    pro_rate = get_data(i, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading number of "N out of 5 stars"
    review_num = get_data(i, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(i, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [36]:
# Assemble the scraped records into a table; fields that came back as empty
# strings become NaN so pandas treats them as missing values.
products_df = pd.DataFrame(data=products_list).replace(to_replace="", value=np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"UGREEN Bluetooth Aux Adapter, Bluetooth 5.0 Au...",4.5 out of 5 stars,4.5,218.0,218.0,SAR 93.60,93.6
1,UGREEN Fast Car Charger Adapter 42.5W Dual USB...,4.3 out of 5 stars,4.3,2190.0,2190.0,SAR 63.20,63.2
2,UGREEN PD 20W Car Charger Fast Charging for iP...,4.3 out of 5 stars,4.3,2190.0,2190.0,SAR 44.00,44.0
3,UGREEN Car Phone Mount Dashboard Car Holder Wi...,4.4 out of 5 stars,4.4,1387.0,1387.0,SAR 76.00,76.0
4,Anker Power Drive 2 24W 2 Port Car Charger For...,4.2 out of 5 stars,4.2,30.0,30.0,SAR 28.00,28.0
5,"Magnetic Phone Car Mount, WORLDMOM Strong Magn...",,,,,SAR 25.50,25.5
6,UGREEN Magnetic Car Phone Holder Air Vent Moun...,4.0 out of 5 stars,4.0,1771.0,1771.0,SAR 69.60,69.6
7,UGREEN Car Phone Holder Magnetic Dashboard Mob...,4.2 out of 5 stars,4.2,3135.0,3135.0,SAR 52.00,52.0
8,"Muson Car Phone Mount, Dashboard/Air Vent/Wind...",3.7 out of 5 stars,3.7,9.0,9.0,SAR 60.00,60.0
9,UGREEN Car Air Vent Mount Cell Phone Holder Gr...,4.4 out of 5 stars,4.4,19681.0,19681.0,SAR 58.40,58.4


# Best Sellers in Car & Vehicle Electronics (Page 2)

In [37]:
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966386031/ref=zg_bs_pg_2?ie=UTF8&pg=2"

In [38]:
# Fetch the listing page. Without a timeout a stalled connection would block
# the notebook indefinitely, so give up after 30 seconds.
response = requests.get(url, timeout=30)
response.status_code

200

In [39]:
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [40]:
# Parse the downloaded HTML with the lxml parser; `soup` is the tree the following cells query.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [41]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [42]:
# Spot check: text of the second link inside the second 'zg-item-immersion' item (indexes are 0-based).
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.5 out of 5 stars\n'

In [43]:
def clean_num(s):
    """Strip everything except the numeric parts of a scraped string.

    Every run of digits (optionally containing a decimal point) is extracted
    and the pieces are concatenated, so thousands separators and currency
    labels disappear: 'SAR 2,449.00' becomes '2449.00'.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated digit text ('' when s holds no digits),
    or None when s is None

    """
    if s is None:
        return s
    # Raw string literal: in a plain literal '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` inside `block`.

    Parameters:
    block: parsed element to search in; anything exposing find(tag, class_=...)
        (the notebook passes BeautifulSoup elements)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup raises
    AttributeError (for example because nothing matched and find() gave None)

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # No matching element (or `block` has no usable find/text): report
        # the field as empty instead of aborting the scrape.
        return ""

In [44]:
# Preview the fields scraped from the first best-seller item on the page.
# The <li> element is queried directly; iterating over a tag would visit its
# child nodes instead of the item itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    # Fetch each raw field once and derive the cleaned value from it.
    rating_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of "N out of 5 stars"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Hide Car Charger for Routers 1.5 Meter Pin: 2.5
2.9 out of 5 stars
2.9
8
8




In [45]:
# Column names for the per-item records; built once rather than on every iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for i in soup.find_all("li", class_ = "zg-item-immersion"):
    # Scrape each raw field once, then derive its cleaned counterpart from it.
    pro_name = str.strip(get_data(i, "a", "a-link-normal"))
    pro_rate = get_data(i, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading number of "N out of 5 stars"
    review_num = get_data(i, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(i, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [46]:
# Assemble the scraped records into a table; fields that came back as empty
# strings become NaN so pandas treats them as missing values.
products_df = pd.DataFrame(data=products_list).replace(to_replace="", value=np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,Hide Car Charger for Routers 1.5 Meter Pin: 2.5,2.9 out of 5 stars,2.9,8.0,8.0,,
1,"Tekpatt Gravity Car Phone Holder, Universal Ai...",4.5 out of 5 stars,4.5,8.0,8.0,SAR 37.90,37.9
2,Universal Dashboard Car Mount Holder Rearview ...,3.0 out of 5 stars,3.0,36.0,36.0,SAR 19.63,19.63
3,"Tekpatt Car Phone Holder, Universal Air Vent C...",3.5 out of 5 stars,3.5,8.0,8.0,SAR 29.90,29.9
4,Decdeal 95pcs 5050 RGB LED-Strip Connector Kit...,,,,,SAR 75.59,75.59
5,"LE LED Strip Lights for TV, 6.56Ft RGB Color C...",3.4 out of 5 stars,3.4,3.0,3.0,SAR 178.00,178.0
6,AMH Universal Magnetic Mobile Phone Holder,3.5 out of 5 stars,3.5,9.0,9.0,SAR 25.00,25.0
7,"Car Phone Mount, PHOCAR Universal Phone Holder...",2.8 out of 5 stars,2.8,6.0,6.0,SAR 29.99,29.99
8,Anker Mini 24W 4.8A Metal Dual USB Car Charger...,,,,,SAR 49.00,49.0
9,GoolRC 700W Adjustable Speed Car Waxing Polish...,1.0 out of 5 stars,1.0,1.0,1.0,SAR 121.89,121.89


# Best Sellers in Electrical Power Accessories (Page 1)

In [47]:
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966395031/ref=zg_bs_nav_1_electronics"

In [48]:
# Fetch the listing page. Without a timeout a stalled connection would block
# the notebook indefinitely, so give up after 30 seconds.
response = requests.get(url, timeout=30)
response.status_code

200

In [49]:
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [50]:
# Parse the downloaded HTML with the lxml parser; `soup` is the tree the following cells query.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [51]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [52]:
# Spot check: text of the second link inside the second 'zg-item-immersion' item (indexes are 0-based).
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n3.9 out of 5 stars\n'

In [53]:
def clean_num(s):
    """Strip everything except the numeric parts of a scraped string.

    Every run of digits (optionally containing a decimal point) is extracted
    and the pieces are concatenated, so thousands separators and currency
    labels disappear: 'SAR 2,449.00' becomes '2449.00'.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated digit text ('' when s holds no digits),
    or None when s is None

    """
    if s is None:
        return s
    # Raw string literal: in a plain literal '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` inside `block`.

    Parameters:
    block: parsed element to search in; anything exposing find(tag, class_=...)
        (the notebook passes BeautifulSoup elements)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup raises
    AttributeError (for example because nothing matched and find() gave None)

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # No matching element (or `block` has no usable find/text): report
        # the field as empty instead of aborting the scrape.
        return ""

In [54]:
# Preview the fields scraped from the first best-seller item on the page.
# The <li> element is queried directly; iterating over a tag would visit its
# child nodes instead of the item itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    # Fetch each raw field once and derive the cleaned value from it.
    rating_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of "N out of 5 stars"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Apple 20W USB-C Power Adapter, White
4.5 out of 5 stars
4.5
359
359
SAR 79.00
79.00


In [55]:
# Column names for the per-item records; built once rather than on every iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for i in soup.find_all("li", class_ = "zg-item-immersion"):
    # Scrape each raw field once, then derive its cleaned counterpart from it.
    pro_name = str.strip(get_data(i, "a", "a-link-normal"))
    pro_rate = get_data(i, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading number of "N out of 5 stars"
    review_num = get_data(i, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(i, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [56]:
# Assemble the scraped records into a table; fields that came back as empty
# strings become NaN so pandas treats them as missing values.
products_df = pd.DataFrame(data=products_list).replace(to_replace="", value=np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"Apple 20W USB-C Power Adapter, White",4.5 out of 5 stars,4.5,359.0,359.0,SAR 79.00,79.0
1,Extension Cord with 3 Power Socket and 3 USB S...,3.9 out of 5 stars,3.9,41.0,41.0,SAR 59.99,59.99
2,"LDNIO SC3604 Socket extension, 6USB Ports 3.4A...",3.9 out of 5 stars,3.9,99.0,99.0,SAR 61.66,61.66
3,Multi Extension Socket Plug Adapter with 3 USB...,4.6 out of 5 stars,4.6,32.0,32.0,SAR 72.79,72.79
4,Universal AU US EU to UK AC Power Plug Travel ...,3.6 out of 5 stars,3.6,21.0,21.0,SAR 12.99,12.99
5,"FLIZIL Fast Charging Universal Travel Adapter,...",4.5 out of 5 stars,4.5,24.0,24.0,SAR 74.88,74.88
6,"Travel Adapter for KSA/UAE/UK, Plug for US/EU/...",4.4 out of 5 stars,4.4,53.0,53.0,SAR 80.00,80.0
7,FLIZIL USB C Cable 3A Fast Charging Cable Nylo...,4.5 out of 5 stars,4.5,24.0,24.0,SAR 22.88,22.88
8,Belkin BSV804ar2M 8-Outlet Surge Protection Ex...,4.5 out of 5 stars,4.5,98.0,98.0,SAR 128.99,128.99
9,"FLIZIL Lightning Cable, Apple MFi Certified Li...",4.5 out of 5 stars,4.5,24.0,24.0,SAR 23.77,23.77


# Best Sellers in Electrical Power Accessories (Page 2)

In [57]:
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966395031/ref=zg_bs_pg_2?ie=UTF8&pg=2"

In [58]:
# Fetch the listing page. Without a timeout a stalled connection would block
# the notebook indefinitely, so give up after 30 seconds.
response = requests.get(url, timeout=30)
response.status_code

200

In [59]:
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [60]:
# Parse the downloaded HTML with the lxml parser; `soup` is the tree the following cells query.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [61]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [62]:
# Spot check: text of the second link inside the second 'zg-item-immersion' item (indexes are 0-based).
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n3.7 out of 5 stars\n'

In [63]:
def clean_num(s):
    """Strip everything except the numeric parts of a scraped string.

    Every run of digits (optionally containing a decimal point) is extracted
    and the pieces are concatenated, so thousands separators and currency
    labels disappear: 'SAR 2,449.00' becomes '2449.00'.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated digit text ('' when s holds no digits),
    or None when s is None

    """
    if s is None:
        return s
    # Raw string literal: in a plain literal '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` inside `block`.

    Parameters:
    block: parsed element to search in; anything exposing find(tag, class_=...)
        (the notebook passes BeautifulSoup elements)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup raises
    AttributeError (for example because nothing matched and find() gave None)

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # No matching element (or `block` has no usable find/text): report
        # the field as empty instead of aborting the scrape.
        return ""

In [64]:
# Preview the fields scraped from the first best-seller item on the page.
# The <li> element is queried directly; iterating over a tag would visit its
# child nodes instead of the item itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    # Fetch each raw field once and derive the cleaned value from it.
    rating_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of "N out of 5 stars"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

Power connection of 2500 watts multi-use
4.0 out of 5 stars
4.0
35
35
SAR 85.60
85.60


In [65]:
# Column names for the per-item records; built once rather than on every iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for i in soup.find_all("li", class_ = "zg-item-immersion"):
    # Scrape each raw field once, then derive its cleaned counterpart from it.
    pro_name = str.strip(get_data(i, "a", "a-link-normal"))
    pro_rate = get_data(i, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading number of "N out of 5 stars"
    review_num = get_data(i, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(i, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [66]:
# Assemble the scraped records into a table; fields that came back as empty
# strings become NaN so pandas treats them as missing values.
products_df = pd.DataFrame(data=products_list).replace(to_replace="", value=np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,Power connection of 2500 watts multi-use,4.0 out of 5 stars,4.0,35.0,35.0,SAR 85.60,85.6
1,2-Socket Extension Cord with on/off Switch wit...,3.7 out of 5 stars,3.7,22.0,22.0,SAR 41.21,41.21
2,"Extension Cord, 5 Meters, 4 Ports, German",4.9 out of 5 stars,4.9,19.0,19.0,SAR 134.00,134.0
3,Extension Cord 4 Way Power Strip White Extensi...,4.6 out of 5 stars,4.6,224.0,224.0,SAR 79.00,79.0
4,Narken E-Series Electric Universal Extension C...,,,,,SAR 69.30,69.3
5,Philips 5 Way Heavy duty Indivisual Switch Soc...,5.0 out of 5 stars,5.0,2.0,2.0,SAR 92.93,92.93
6,Haylink CE ROHS سبائك الألومنيوم المشبك سطح ال...,,,,,SAR 189.88,189.88
7,LENCENT 2Pcs UK to EU Euro Europe Plug Adapter...,,,,,SAR 45.99,45.99
8,Mark 3864 High Quality Power socket Extension 10m,2.6 out of 5 stars,2.6,6.0,6.0,SAR 82.17,82.17
9,Black Extension Lead 4way Power Socket 1.8m Ex...,5.0 out of 5 stars,5.0,6.0,6.0,SAR 55.99,55.99


# Best Sellers in Computer Tablets (Page 1)

In [68]:
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966433031/ref=zg_bs_nav_1_electronics"

In [69]:
# Fetch the listing page. Without a timeout a stalled connection would block
# the notebook indefinitely, so give up after 30 seconds.
response = requests.get(url, timeout=30)
response.status_code

200

In [70]:
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [71]:
# Parse the downloaded HTML with the lxml parser; `soup` is the tree the following cells query.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [72]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [73]:
# Spot check: text of the second link inside the second 'zg-item-immersion' item (indexes are 0-based).
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n4.1 out of 5 stars\n'

In [74]:
def clean_num(s):
    """Strip everything except the numeric parts of a scraped string.

    Every run of digits (optionally containing a decimal point) is extracted
    and the pieces are concatenated, so thousands separators and currency
    labels disappear: 'SAR 2,449.00' becomes '2449.00'.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated digit text ('' when s holds no digits),
    or None when s is None

    """
    if s is None:
        return s
    # Raw string literal: in a plain literal '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """Return the text of the first `tag` element with class `class_text` inside `block`.

    Parameters:
    block: parsed element to search in; anything exposing find(tag, class_=...)
        (the notebook passes BeautifulSoup elements)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when the lookup raises
    AttributeError (for example because nothing matched and find() gave None)

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # No matching element (or `block` has no usable find/text): report
        # the field as empty instead of aborting the scrape.
        return ""

In [75]:
# Preview the fields scraped from the first best-seller item on the page.
# The <li> element is queried directly; iterating over a tag would visit its
# child nodes instead of the item itself.
first_item = soup.find("li", "zg-item-immersion")
if first_item is None:
    print("No 'zg-item-immersion' item found on this page")
else:
    # Fetch each raw field once and derive the cleaned value from it.
    rating_text = get_data(first_item, "span", "a-icon-alt")
    review_text = get_data(first_item, "a", "a-size-small a-link-normal")
    price_text = get_data(first_item, "span", "p13n-sc-price")

    print(str.strip(get_data(first_item, "a", "a-link-normal")))
    print(rating_text)
    print(clean_num(rating_text.split(" ")[0]))  # leading number of "N out of 5 stars"
    print(review_text)
    print(clean_num(review_text))
    print(price_text)
    print(clean_num(price_text))

2020 Apple iPad Air (10.9-inch, Wi-Fi, 64GB) - Space Grey (4th Generation)
4.4 out of 5 stars
4.4
157
157
SAR 2,449.00
2449.00


In [76]:
# Column names for the per-item records; built once rather than on every iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for i in soup.find_all("li", class_ = "zg-item-immersion"):
    # Scrape each raw field once, then derive its cleaned counterpart from it.
    pro_name = str.strip(get_data(i, "a", "a-link-normal"))
    pro_rate = get_data(i, "span", "a-icon-alt")
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])  # leading number of "N out of 5 stars"
    review_num = get_data(i, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(i, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    products_dict = dict(zip(headers, [pro_name, pro_rate, pro_rate_clean,
                                       review_num, review_num_clean,
                                       pro_price, pro_price_clean]))
    products_list.append(products_dict)

In [77]:
# Assemble the scraped records into a table; fields that came back as empty
# strings become NaN so pandas treats them as missing values.
products_df = pd.DataFrame(data=products_list).replace(to_replace="", value=np.nan)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,"2020 Apple iPad Air (10.9-inch, Wi-Fi, 64GB) -...",4.4 out of 5 stars,4.4,157.0,157.0,"SAR 2,449.00",2449.0
1,Samsung Electronics A7 Tablet 10.4 Wi-Fi 64GB ...,4.1 out of 5 stars,4.1,22.0,22.0,SAR 793.32,793.32
2,"SAMSUNG Galaxy Tab A7 Lite Tablet - 32GB, 3GB ...",3.2 out of 5 stars,3.2,10.0,10.0,SAR 595.00,595.0
3,"2021 Apple 10.2-inch iPad (Wi-Fi, 64GB) - Spac...",4.2 out of 5 stars,4.2,36.0,36.0,"SAR 1,549.00",1549.0
4,"2021 Apple iPad Pro (11-inch, Wi-Fi, 128GB) - ...",4.6 out of 5 stars,4.6,54.0,54.0,"SAR 3,499.00",3499.0
5,"Lenovo Smart Tab M10 Plus, 10.3"" Android Table...",4.2 out of 5 stars,4.2,982.0,982.0,SAR 881.55,881.55
6,"Samsung Galaxy Tab S7+ Wi-Fi, Mystic Black - 2...",4.7 out of 5 stars,4.7,1307.0,1307.0,"SAR 3,152.63",3152.63
7,"SAMSUNG Galaxy Tab A7 Lite Tablet - 32GB, 3GB ...",3.2 out of 5 stars,3.2,10.0,10.0,SAR 595.00,595.0
8,"2020 Apple iPad Air (10.9-inch, Wi-Fi, 256GB) ...",4.4 out of 5 stars,4.4,157.0,157.0,"SAR 3,399.00",3399.0
9,"SAMSUNG Galaxy Tab A7 Lite Tablet - 32GB, 3GB ...",3.2 out of 5 stars,3.2,10.0,10.0,SAR 577.62,577.62


# Best Sellers in Computer Tablets (Page 2)

In [78]:
url="https://www.amazon.sa/-/en/gp/bestsellers/electronics/16966433031/ref=zg_bs_pg_2?ie=UTF8&pg=2"

In [79]:
# Fetch the listing page. Without a timeout a stalled connection would block
# the notebook indefinitely, so give up after 30 seconds.
response = requests.get(url, timeout=30)
response.status_code

200

In [80]:
response.text[:1000]

'<!doctype html><html lang="en-ae" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL.cs

In [81]:
# Parse the downloaded HTML with the lxml parser; `soup` is the tree the following cells query.
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [82]:
print(soup.prettify())

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-ae">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
  <link href="https://m.media-amazon.com" rel="dns-prefetch"/>
  <link href="https://completion.amazon.com" rel="dns-prefetch"/>
  <!-- sp:end-feature:cs-optimization -->
  <!-- sp:feature:aui-assets -->
  <link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41JZEtDv4tL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11TIuySqr6L.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-L.css,114y0SIP+yL.css,21aPhFy+riL

In [83]:
# Spot check: text of the second link inside the second 'zg-item-immersion' item (indexes are 0-based).
soup.find_all('li', {'class':'zg-item-immersion'})[1].find_all('a')[1].text

'\n3.8 out of 5 stars\n'

In [84]:
def clean_num(s):
    """Strip everything except the numeric parts of a scraped string.

    Every run of digits (optionally containing a decimal point) is extracted
    and the pieces are concatenated, so thousands separators and currency
    labels disappear: 'SAR 2,449.00' becomes '2449.00'.

    Parameters:
    s (String or None): raw text

    Returns:
    String: the concatenated digit text ('' when s holds no digits),
    or None when s is None

    """
    if s is None:
        return s
    # Raw string literal: in a plain literal '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return "".join(re.findall(r'\d*\.?\d+', s))

def get_data(block, tag, class_text):
    """fetch the text of the first `tag` element carrying class `class_text`

    Parameters:
    block: parsed HTML element to search in; any object exposing
        find(tag, class_=...) works (here, one scraped product item)
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: text of the matching element, or "" when nothing matches

    """
    try:
        return block.find(tag, class_=class_text).text
    except AttributeError:
        # find() returned None (no matching element), or block itself is None
        return ""

In [85]:
# Preview every scraped field (raw text followed by its cleaned form) for the
# first best-seller item. The item is queried directly instead of looping over
# its children: each lookup already searches all descendants of the item, and
# a child may be a bare text node rather than a searchable element
# (NOTE(review): per the Beautiful Soup docs text nodes are plain strings).
first_item = soup.find("li", "zg-item-immersion")

# Fetch each raw field once; the cleaned value is derived from it.
name_text = get_data(first_item, "a", "a-link-normal")
rating_text = get_data(first_item, "span", "a-icon-alt")
reviews_text = get_data(first_item, "a", "a-size-small a-link-normal")
price_text = get_data(first_item, "span", "p13n-sc-price")

print(name_text.strip())
print(rating_text)
# Keep only the leading token of "4.7 out of 5 stars"; otherwise clean_num
# would append the trailing "5" to the rating.
print(clean_num(rating_text.split(" ")[0]))
print(reviews_text)
print(clean_num(reviews_text))
print(price_text)
print(clean_num(price_text))

SAMSUNG Galaxy Tab S7 11-inch Android Tablet 128GB Wi-Fi Bluetooth S Pen Fast Charging USB-C Port, Mystic Silver
4.7 out of 5 stars
4.7
1,307
1307
SAR 2,162.42
2162.42


In [86]:
# Column names of one scraped record. They do not change between items,
# so the list is built once instead of on every loop iteration.
headers = ['pro_name', 'pro_rate', 'pro_rate_clean','review_num',
          'review_num_clean', 'pro_price','pro_price_clean']

products_list = []
for item in soup.find_all("li", class_ = "zg-item-immersion"):
    # Look each element up once; the cleaned value is derived from the raw text.
    pro_name = get_data(item, "a", "a-link-normal").strip()
    pro_rate = get_data(item, "span", "a-icon-alt")
    # Keep only the leading token of "4.7 out of 5 stars" so the trailing "5"
    # is not glued onto the rating by clean_num.
    pro_rate_clean = clean_num(pro_rate.split(" ")[0])
    review_num = get_data(item, "a", "a-size-small a-link-normal")
    review_num_clean = clean_num(review_num)
    pro_price = get_data(item, "span", "p13n-sc-price")
    pro_price_clean = clean_num(pro_price)

    # One dict per product; missing fields stay as "" at this stage.
    products_dict = dict(zip(headers, [pro_name,
                                       pro_rate,
                                       pro_rate_clean,
                                       review_num,
                                       review_num_clean,
                                       pro_price,
                                       pro_price_clean]))
    products_list.append(products_dict)

In [87]:
# Turn the list of per-product dicts into a DataFrame, then mark every field
# that came back empty from the scrape as missing (NaN).
products_df = (
    pd.DataFrame(products_list)
    .replace(to_replace="", value=np.nan)
)
products_df

Unnamed: 0,pro_name,pro_rate,pro_rate_clean,review_num,review_num_clean,pro_price,pro_price_clean
0,SAMSUNG Galaxy Tab S7 11-inch Android Tablet 1...,4.7 out of 5 stars,4.7,1307.0,1307.0,"SAR 2,162.42",2162.42
1,"Lenovo Tab M10 (TB-X505X), 10.1 Inch Tablet, 1...",3.8 out of 5 stars,3.8,21.0,21.0,SAR 572.00,572.0
2,"Lenovo Tab M7 (TB-7305X) 7 inch Tablet, Wi-Fi ...",3.2 out of 5 stars,3.2,23.0,23.0,SAR 425.00,425.0
3,SAMSUNG Galaxy Tab S7 11-inch Android Tablet 1...,4.7 out of 5 stars,4.7,1307.0,1307.0,"SAR 2,159.79",2159.79
4,"Datazone Kids tablet, Childern tablet 7.0 inch...",1.0 out of 5 stars,1.0,4.0,4.0,SAR 159.00,159.0
5,"Xiaomi Pad 5 11"" Tablet, 256GB, 6GB RAM, Wi-Fi...",5.0 out of 5 stars,5.0,1.0,1.0,,
6,"2020 Apple iPad Air (10.9-inch, Wi-Fi, 256GB) ...",4.4 out of 5 stars,4.4,157.0,157.0,"SAR 3,399.00",3399.0
7,"HUAWEI MatePad T8 Tablet, 8 inch, 32GB 2GB RAM...",,,,,SAR 529.00,529.0
8,"Microsoft Surface Pro 7 Tablet - 12.3 Inch, 10...",4.5 out of 5 stars,4.5,89.0,89.0,"SAR 3,999.00",3999.0
9,"Samsung Galaxy Tab S7 FE Tablet - 64GB, 4GB RA...",2.0 out of 5 stars,2.0,1.0,1.0,"SAR 1,999.00",1999.0
