In [1]:
import os
import re
from pprint import pprint
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from time import sleep

In [2]:
# Locate the geckodriver binary under the parent of the current working directory.
cwd = os.getcwd()
BASE_DIR = os.path.split(cwd)[0]
GECKODRIVER_DIR = os.path.join(BASE_DIR, '.geckodriver-firefox')
GECKODRIVER_PATH = os.path.join(BASE_DIR, '.geckodriver-firefox', 'geckodriver')

In [3]:
# Start Firefox with the --headless argument, driven by the geckodriver binary at GECKODRIVER_PATH.
# NOTE(review): no driver.quit() is visible in this section — confirm the browser is closed later.
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH, options=options)

In [4]:
# Amazon India best-seller listing pages that can be used as crawl starting points.
categories = [
    'https://www.amazon.in/gp/bestsellers/gift-cards/',
    'https://www.amazon.in/gp/bestsellers/videogames/',
    'https://www.amazon.in/gp/bestsellers/electronics/',
]

In [5]:
# Index 1 selects the video games listing (the second entry), despite the name `first_url`.
first_url = categories[1]

In [6]:
# Load the selected listing page in the browser session.
driver.get(first_url)

In [7]:
# Read the <body> markup from the browser's current DOM (innerHTML) as a string.
body_element = driver.find_element_by_css_selector('body')
html_txt = body_element.get_attribute('innerHTML')

In [8]:
# Wrap the markup in a requests_html HTML object so links and CSS selectors can be queried.
html_obj = HTML(html=html_txt)

In [9]:
# Keep only site-relative hrefs, then prefix them with the amazon.in origin.
new_links = list(filter(lambda href: href.startswith('/'), html_obj.links))
page_links = [f"https://www.amazon.in{x}" for x in new_links]
pprint(page_links)

['https://www.amazon.in/product-reviews/B07K6VR134/ref=zg_bs_videogames_cr_7/260-4607565-2776363?ie=UTF8&refRID=V7ASCKPSBJQH4TGGKHS7',
 'https://www.amazon.in/Minecraft-Java-Download-Code-Only/dp/B07VMXV8KS/ref=zg_bs_videogames_16/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
 'https://www.amazon.in/Xbox-One/b/?ie=UTF8&node=2785596031&ref_=sv_v_1',
 'https://www.amazon.in/Car-Motorbike-Store/b/?ie=UTF8&node=4772060031&ref_=nav_cs_automotive_75137ca568f6495387a781885869ac30',
 'https://www.amazon.in/Oculus-Quest-Advanced-All-One/dp/B08F7PTF53/ref=zg_bs_videogames_44/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
 'https://www.amazon.in/Square-Enix-Marvels-Avengers-PS4/dp/B07YSTD51Q/ref=zg_bs_videogames_47/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
 'https://www.amazon.in/Spirili-Rechargable-Console-Screen-Classic/dp/B07VC4N1LM/ref=zg_bs_videogames_9/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4T

In [10]:
# https://www.amazon.in/Cosmic-Byte-GS410-Headphones-Grey/dp/B07K7XRJTZ/
# https://www.amazon.in/Sony-Uncharted-Collection-Hits-PS4/dp/B07M5V17R9/

# <base_url>/<slug>/dp/<product_id>/

In [11]:
# Pattern for product URLs shaped like <base_url>/<slug>/dp/<product_id>/.
# The dots in the host name are escaped so they match a literal '.' instead of any character.
my_regex_pattern = r"https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"

# [\w-] = matches any word character (letters, digits and '_') or a '-'.
# [\w-]+ = '+' repeats the preceding character class one or more times.

my_url = 'https://www.amazon.in/Sony-Uncharted-Collection-Hits-PS4/dp/B07M5V17R9/'

In [12]:
# Compile the pattern into a reusable pattern object.
regex = re.compile(my_regex_pattern)

In [13]:
# match() only matches at the start of the string and returns None when the URL does not fit.
my_match = regex.match(my_url)

In [14]:
# Named groups are read from the match object by their group name.
print(my_match['slug'])
print(my_match['product_id'])

Sony-Uncharted-Collection-Hits-PS4
B07M5V17R9


In [15]:
# URL shapes that carry a product id, tried in this order by the extraction helper.
# Dots in the host name are escaped so they match a literal '.' instead of any character.
regex_options = [
    r"https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

In [16]:
def extract_product_id_from_url(url, patterns=None):
    """Return the product id embedded in ``url``, or ``None`` if no pattern fits.

    Args:
        url: Absolute product URL to inspect. Anything that is not a ``str``
            yields ``None`` instead of raising.
        patterns: Optional iterable of regex strings (or compiled patterns),
            each expected to define a ``product_id`` named group. Defaults to
            the module-level ``regex_options``.

    Returns:
        The ``product_id`` group of the first pattern that matches at the
        start of ``url`` and captures one, otherwise ``None``.
    """
    if not isinstance(url, str):
        return None
    if patterns is None:
        patterns = regex_options
    for pattern in patterns:
        regex_match = re.match(pattern, url)
        if regex_match is None:
            continue
        try:
            product_id = regex_match['product_id']
        except IndexError:
            # This pattern matched but defines no `product_id` group; try the next one.
            continue
        if product_id is not None:
            # Stop at the first hit so a later, looser pattern cannot overwrite it.
            return product_id
    return None

In [17]:
# final_page_links = [link for link in page_links if extract_product_id_from_url(link) is not None]
# Note: len(cleaned_links) will be smaller than len(page_links), because links without a product id are dropped.

def clean_page_links(all_links=None):
    """Keep only the links that carry a product id.

    Args:
        all_links: Iterable of URL strings. ``None`` (the default) is treated
            as an empty collection; the default is ``None`` rather than a
            shared mutable list.

    Returns:
        A list of ``{'product_id': ..., 'url': ...}`` dicts, one per link for
        which ``extract_product_id_from_url`` returned a value, in input order.
    """
    if all_links is None:
        return []
    final_page_links = []
    for link in all_links:
        product_id = extract_product_id_from_url(link)
        if product_id is not None:
            final_page_links.append({'product_id': product_id, 'url': link})
    return final_page_links

# Reduce the page's links to product links, each paired with its product id.
cleaned_links = clean_page_links(all_links=page_links)

In [18]:
# Number of site-relative links collected from the listing page.
len(page_links)

165

In [19]:
# Number of those links from which a product id could be extracted.
len(cleaned_links)

49

In [20]:
def scrape_product_page(url, title_lookup='#productTitle', price_lookup='#priceblock_ourprice', wait_seconds=1):
    """Load a product page in the shared ``driver`` and read its title and price.

    Args:
        url: Product page URL to open.
        title_lookup: CSS selector for the title element.
        price_lookup: CSS selector for the price element.
        wait_seconds: Pause after navigation before the DOM is read
            (presumably to let the page finish rendering — TODO confirm).

    Returns:
        ``(product_title, product_price)`` as text. Either item is ``None``
        when its selector matches nothing on the page, so a missing element no
        longer surfaces as an ``AttributeError`` on ``None.text``.
    """
    driver.get(url)
    sleep(wait_seconds)
    body_element = driver.find_element_by_css_selector('body')
    html_txt = body_element.get_attribute('innerHTML')
    html_obj = HTML(html=html_txt)
    # find(..., first=True) yields None rather than an element when nothing matches.
    title_element = html_obj.find(title_lookup, first=True)
    price_element = html_obj.find(price_lookup, first=True)
    product_title = title_element.text if title_element is not None else None
    product_price = price_element.text if price_element is not None else None
    return product_title, product_price

In [21]:
def scrape_product_data(items=None):
    """Scrape title and price for each product link and collect the results.

    Args:
        items: Iterable of dicts with ``'url'`` and ``'product_id'`` keys.
            ``None`` (the default) is treated as an empty collection; the
            default is ``None`` rather than a shared mutable list.

    Returns:
        A list of dicts with ``product_id``, ``title``, ``price`` and ``link``
        keys, one per page where both title and price were found. Pages that
        fail to scrape or lack either value are skipped.
    """
    data_extracted = []
    if items is None:
        return data_extracted

    for item in items:
        link = item['url']
        product_id = item['product_id']
        try:
            title, price = scrape_product_page(url=link)
        except Exception as exc:
            # Best-effort crawl: report the failure and move on. Catching Exception
            # (not a bare except) lets KeyboardInterrupt still stop the loop.
            print('Skipped: ', link, '-', type(exc).__name__, '\n')
            continue

        if title is None or price is None:
            continue

        print('Title: ', title)
        print('Price: ', price)
        print('Link: ', link, '\n')

        data_extracted.append({
            'product_id': product_id,
            'title': title,
            'price': price,
            'link': link,
        })
    return data_extracted

In [22]:
# Visit every product link and collect title/price records (one page load per link).
extracted_data = scrape_product_data(items=cleaned_links)

Title:  Minecraft Java Edition PC Download Code Only (No CD/DVD)
Price:  ₹ 2,085.00
Link:  https://www.amazon.in/Minecraft-Java-Download-Code-Only/dp/B07VMXV8KS/ref=zg_bs_videogames_16/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

Title:  Oculus Quest 2 — Advanced All-In-One Virtual Reality Headset (64GB)
Price:  ₹ 41,698.00
Link:  https://www.amazon.in/Oculus-Quest-Advanced-All-One/dp/B08F7PTF53/ref=zg_bs_videogames_44/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

Title:  Marvel's Avengers (Free PS5 Upgrade)
Price:  ₹ 1,937.00
Link:  https://www.amazon.in/Square-Enix-Marvels-Avengers-PS4/dp/B07YSTD51Q/ref=zg_bs_videogames_47/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

Title:  Rewy 400 in 1 Sup Game Box Rechargable Console/Led Screen/Retro Classic Gaming Console (Random Color)
Price:  ₹ 625.00
Link:  https://www.amazon.in/Spirili-Rechargable-Console-Screen-Classic/dp/B07VC4N1LM/ref=zg_bs_videogames_9/260-46075

Title:  PowerA Wired Officially Licensed Controller for Xbox One, Xbox One S, Xbox One X & Windows 10 - Black
Price:  ₹ 2,199.00
Link:  https://www.amazon.in/PowerA-Officially-Licensed-Controller-Windows/dp/B07NPY1YT5/ref=zg_bs_videogames_20/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

Title:  Cosmic Byte GS410 Headphones with Mic and for PS4, Xbox One, Laptop, PC, iPhone and Android Phones (Black/Green)
Price:  ₹ 1,099.00
Link:  https://www.amazon.in/Cosmic-Byte-GS410-Headphones-Green/dp/B07K7YR8SP/ref=zg_bs_videogames_31/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

Title:  SpinBot BattleMods X2 Gaming Grip Handle with Conductive Triggers Combo for COD Mobile/Garena Free Fire/etc-Supports for Most Android and iOS Phones-(Jet Black)
Price:  ₹ 649.00
Link:  https://www.amazon.in/SpinBot-BattleMods-Conductive-Triggers-etc-Supports/dp/B0823H4Y4X/ref=zg_bs_videogames_37/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7 

In [23]:
# Pretty-print the collected product records.
pprint(extracted_data)

[{'link': 'https://www.amazon.in/Minecraft-Java-Download-Code-Only/dp/B07VMXV8KS/ref=zg_bs_videogames_16/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
  'price': '₹\xa02,085.00',
  'product_id': 'B07VMXV8KS',
  'title': 'Minecraft Java Edition PC Download Code Only (No CD/DVD)'},
 {'link': 'https://www.amazon.in/Oculus-Quest-Advanced-All-One/dp/B08F7PTF53/ref=zg_bs_videogames_44/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
  'price': '₹\xa041,698.00',
  'product_id': 'B08F7PTF53',
  'title': 'Oculus Quest 2 — Advanced All-In-One Virtual Reality Headset '
           '(64GB)'},
 {'link': 'https://www.amazon.in/Square-Enix-Marvels-Avengers-PS4/dp/B07YSTD51Q/ref=zg_bs_videogames_47/260-4607565-2776363?_encoding=UTF8&psc=1&refRID=V7ASCKPSBJQH4TGGKHS7',
  'price': '₹\xa01,937.00',
  'product_id': 'B07YSTD51Q',
  'title': "Marvel's Avengers (Free PS5 Upgrade)"},
 {'link': 'https://www.amazon.in/Spirili-Rechargable-Console-Screen-Classic/dp/B0