In [1]:
import pandas as pd
import re
from datetime import datetime
from pathlib import Path  # using instead of os
from pprint import pprint
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

In [2]:
# Filesystem layout used by the rest of the notebook.
# The driver binary is looked up in a hidden ".chromedriver" folder that sits
# next to (not inside) the current working directory.
BASE_DIR = Path.cwd()
parent_dir = BASE_DIR.parent
CHROMEDRIVER_DIR = parent_dir / '.chromedriver'
CHROMEDRIVER_PATH = CHROMEDRIVER_DIR / 'chromedriver'

# All CSV outputs are written below ./data; create it on first run.
DATA_DIR = BASE_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)

# Output files: one for the category -> product links, one for scraped products.
product_category_links_output = DATA_DIR / "category-products.csv"
product_output = DATA_DIR / "products.csv"

In [3]:
# Start a headless browser session shared by every scraping function below.
# `Options` comes from selenium.webdriver.chrome.options and the binary is a
# chromedriver, so the matching driver class is Chrome (not Firefox).
options = Options()
options.add_argument('--headless')
# executable_path is handed to the OS as a command, so pass a plain string.
driver = webdriver.Chrome(executable_path=str(CHROMEDRIVER_PATH), options=options)

In [4]:
# Best-seller listing pages to crawl: each entry pairs a short category
# name with the URL of that category's listing page.
categories = [
    dict(name='gift-cards', url='https://www.amazon.in/gp/bestsellers/gift-cards/'),
    dict(name='videogames', url='https://www.amazon.in/gp/bestsellers/videogames/'),
    dict(name='electronics', url='https://www.amazon.in/gp/bestsellers/electronics/'),
]

In [5]:
# Product detail URLs look like https://www.amazon.in/<slug>/dp/<product_id>/...
# The dots are escaped so that only the literal host "www.amazon.in" matches.
regex_str = r"https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/"

def extract_product_id_from_url(url):
    """Return the product id embedded in an amazon.in product URL.

    Args:
        url: Absolute URL to inspect. The pattern is anchored at the start of
            the string and requires a "/" after the product id.

    Returns:
        The ``product_id`` path segment as a string, or ``None`` when ``url``
        is not a string or does not look like a product detail URL.
    """
    if not isinstance(url, str):
        return None
    regex_match = re.match(regex_str, url)
    if regex_match is None:
        return None
    # The named group is mandatory in the pattern, so it is always present
    # once the match succeeded.
    return regex_match['product_id']

In [6]:
def clean_page_links(all_links=None, category=None):
    """Keep only the links that point at a product page.

    Args:
        all_links: Iterable of absolute URLs. ``None`` (the default) is
            treated as "no links".
        category: Value stored under the ``'category'`` key of every record.

    Returns:
        A list of dicts with the keys ``'product_id'``, ``'category'`` and
        ``'url'``, one per link from which a product id could be extracted,
        in the order the links were supplied.
    """
    # A None default avoids sharing one mutable list object across calls.
    if all_links is None:
        all_links = ()
    final_page_links = []
    for link in all_links:
        product_id = extract_product_id_from_url(link)
        if product_id is not None:
            final_page_links.append({'product_id': product_id, 'category': category, 'url': link})
    return final_page_links

In [7]:
def scrape_category_product_links(categories=[]):
    """Visit every category page and gather the product links found on it.

    Each entry of ``categories`` is read for its ``'url'`` (page to load in
    the shared ``driver``) and its ``'name'`` (stored with every link).
    Site-relative links on the page are made absolute and then filtered by
    ``clean_page_links``.

    Returns:
        One flat list holding the records produced by ``clean_page_links``
        for all categories, in category order.
    """
    collected = []
    for entry in categories:
        # Pause for a second before each page load.
        sleep(1)
        driver.get(entry.get('url'))
        page_body = driver.find_element_by_css_selector('body')
        parsed = HTML(html=page_body.get_attribute('innerHTML'))
        absolute_links = []
        for href in parsed.links:
            # Only site-relative hrefs are kept.
            if href.startswith('/'):
                absolute_links.append(f"https://www.amazon.in{href}")
        collected.extend(clean_page_links(all_links=absolute_links, category=entry['name']))
    return collected

In [8]:
def extract_categories_and_save(categories=[]):
    """Scrape the product links of ``categories`` and write them to CSV.

    The records returned by ``scrape_category_product_links`` are turned into
    a DataFrame and saved, without the index column, to
    ``product_category_links_output``.
    """
    link_records = scrape_category_product_links(categories)
    pd.DataFrame(link_records).to_csv(product_category_links_output, index=False)

In [9]:
# Crawl every configured category and write the product links to
# product_category_links_output.
extract_categories_and_save(categories=categories)

In [10]:
def scrape_product_page(url, title_lookup = '#productTitle', price_lookup = '#priceblock_ourprice'):
    """Load a product page in the shared ``driver`` and read title and price.

    Args:
        url: Product page to open.
        title_lookup: CSS selector of the element holding the product title.
        price_lookup: CSS selector of the element holding the price.

    Returns:
        A ``(title, price)`` tuple of strings. Either item is ``None`` when
        its selector matches nothing, so a page without a matching price
        element still yields its title.
    """
    driver.get(url)
    # Wait a second after navigation before reading the page.
    sleep(1)
    body_element = driver.find_element_by_css_selector('body')
    html_obj = HTML(html=body_element.get_attribute('innerHTML'))

    def _text_or_none(selector):
        # find(..., first=True) gives None when the selector matches nothing;
        # guard it instead of failing on `.text`.
        element = html_obj.find(selector, first=True)
        return element.text if element is not None else None

    return _text_or_none(title_lookup), _text_or_none(price_lookup)

In [11]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one product row unless it has already been scraped.

    Intended for ``DataFrame.apply(..., axis=1)``; extra positional and
    keyword arguments are accepted and ignored.

    Args:
        row: Mapping-like record with a ``'url'`` entry and, optionally, a
            ``'scraped'`` entry.

    Returns:
        ``row`` itself when its ``'scraped'`` value is ``1`` or ``'1'``.
        Otherwise a copy of ``row`` with ``'title'``, ``'price'``,
        ``'scraped'`` (set to 1) and ``'timestamp'`` (POSIX seconds) filled in.

    Note:
        Scraping is best-effort: if ``scrape_product_page`` raises, title and
        price stay ``None`` and the row is still marked as scraped.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # No 'scraped' entry yet: treat the row as not scraped.
        scraped = 0

    if scraped == 1 or scraped == '1':
        return row

    title, price = (None, None)
    try:
        title, price = scrape_product_page(url=link)
    except Exception:
        # Keep going on scraping errors, but (unlike a bare `except:`) let
        # KeyboardInterrupt through so a long run can be stopped.
        pass

    # Work on a copy: mutating the object handed in by DataFrame.apply is
    # not supported by pandas.
    row = row.copy()
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1  # True
    row['timestamp'] = datetime.now().timestamp()
    return row

In [12]:
# Reload the category links CSV written by extract_categories_and_save and
# preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,product_id,category,url
0,B07C8Y15V5,gift-cards,https://www.amazon.in/BigBasket-2_model-eGift-...
1,B07BTPJ1Z3,gift-cards,https://www.amazon.in/Congratulations-Great-li...
2,B0797M1M18,gift-cards,https://www.amazon.in/Together-forever-mail-Am...
3,B018TV9G2E,gift-cards,https://www.amazon.in/Thank-You-Post-its-mail-...
4,B00K5ZQ6D0,gift-cards,https://www.amazon.in/Congratulations-Flowers-...


In [13]:
# (rows, columns) of the loaded links table.
df.shape

(149, 3)

In [14]:
# Scrape only the last 20 links so the run finishes quickly; switch to the
# commented-out line to process every row.
df_sub = df.tail(20)  # for fast output
# df_sub = df.copy()

In [None]:
# Scrape each row's product page (one page load per row). Rows whose
# 'scraped' value is already 1 are returned unchanged by row_scrape_event.
df_sub = df_sub.apply(row_scrape_event, axis=1)

In [None]:
# Save the scraped rows to product_output, without the index column.
df_sub.to_csv(product_output, index=False)

In [None]:
# Read the saved products back from disk and preview the first rows.
products_df = pd.read_csv(product_output)
products_df.head()

In [None]:
# final_df = pd.concat([products_df, df_sub])
# final_df.to_csv(product_output, index=False)

In [None]:
# final_df.head()