In [1]:
import logging
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up logging: INFO level, each record rendered as "<timestamp> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
# getLogger() without a name returns the root logger, so every function in
# this cell logs through the handler configured by basicConfig() above.
logger = logging.getLogger()

def current_time_slot():
    """Return the current local time as a 'YYYY-MM-DD HH:MM' string."""
    return datetime.now().strftime('%Y-%m-%d %H:%M')

def parse_rank_numbers(rank_text):
    """Condense Best Sellers Rank text into ``primary(secondary)`` form.

    The first whitespace-separated token of each non-blank line is taken as a
    rank; a leading ``#`` and thousands separators are stripped from it. Only
    the first two rank lines are used.

    Args:
        rank_text: Raw text of the rank cell, one rank per line.

    Returns:
        ``"primary(secondary)"`` when at least two rank lines exist, just
        ``"primary"`` when there is one (or the second token is empty after
        cleaning), and ``""`` when the text holds no rank line at all.
    """
    def _leading_rank(line):
        # "#1,234 in Toys & Games" -> "1234"
        return line.split()[0].lstrip('#').replace(',', '')

    # Blank lines have no first token; dropping them avoids the IndexError the
    # naive ``split()[0]`` raises on empty text or on empty separator lines.
    rank_lines = [line for line in rank_text.split('\n') if line.strip()]
    if not rank_lines:
        return ''

    primary_rank = _leading_rank(rank_lines[0])
    secondary_rank = _leading_rank(rank_lines[1]) if len(rank_lines) > 1 else ''
    return f"{primary_rank}({secondary_rank})" if secondary_rank else primary_rank

def fetch_rank_for_asin(driver, asin):
    """Read the rank badge shown for ``asin`` on the page loaded in ``driver``.

    The badge is the first ``span`` whose class contains ``zg-bdg-text`` inside
    the ``div`` carrying ``data-asin=<asin>``.

    Returns:
        The badge text with surrounding whitespace and every ``#`` removed, or
        ``None`` when the lookup raises for any reason (a warning is logged).
    """
    badge_xpath = f"//div[@data-asin='{asin}']//span[contains(@class, 'zg-bdg-text')]"
    try:
        badge = driver.find_element(By.XPATH, badge_xpath)
        rank_text = badge.text.strip().replace('#', '')
    except Exception as e:
        logger.warning(f"Could not fetch rank for ASIN {asin}: {e}")
        return None

    logger.info(f"Rank for ASIN {asin}: {rank_text}")
    return rank_text

def fetch_rank_in_subcategory(driver, subcategory_url, subcategory_name, asin):
    """Open a subcategory page and return the rank badge text for ``asin``.

    Navigates ``driver`` to ``subcategory_url``, waits up to 20 seconds for a
    ``div`` with ``data-asin=<asin>`` to be present, then delegates to
    ``fetch_rank_for_asin``.

    Returns:
        The rank text, or ``None`` when the ASIN does not appear in time, the
        rank is empty/unavailable, or any other error occurs (logged as a
        warning). ``subcategory_name`` is used for log messages only.
    """
    try:
        driver.get(subcategory_url)
        logger.info(f"Opening Subcategory URL: {subcategory_url} for {subcategory_name}")

        product_locator = (By.XPATH, f"//div[@data-asin='{asin}']")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(product_locator))
        logger.info(f"ASIN {asin} found on subcategory page: {subcategory_name}")

        found_rank = fetch_rank_for_asin(driver, asin)
        # An empty string counts as "no rank", exactly like None.
        return f"{found_rank}" if found_rank else None
    except Exception as e:
        logger.warning(f"ASIN {asin} not found or rank unavailable in subcategory {subcategory_name}: {e}")
        return None

def fetch_subcategories_and_ranks(driver, max_subcategories=5):
    """Collect ``(name, link)`` pairs for subcategory links on the current page.

    Candidate elements are ``div`` nodes with ``role='treeitem'`` and anchors
    whose ``href`` contains ``/zgbs/``. An element is skipped when it has no
    ``href`` or when its text contains "Any Department" or "Toys & Games".
    Despite the function name, no ranks are fetched here.

    Args:
        driver: WebDriver already showing the category page.
        max_subcategories: Maximum number of pairs to return. The default of 5
            is the limit the code has always applied (the previous docstring
            said "three", which did not match the implementation).

    Returns:
        A list of ``(subcategory_name, subcategory_link)`` tuples in page
        order; ``[]`` when the element lookup itself fails.
    """
    skipped_names = ("Any Department", "Toys & Games")

    try:
        subcategory_elements = driver.find_elements(
            By.XPATH, "//div[@role='treeitem'] | //a[contains(@href, '/zgbs/')]"
        )
    except Exception as e:
        logger.error(f"Failed to fetch subcategories: {e}")
        return []

    subcategories = []
    for element in subcategory_elements:
        if len(subcategories) >= max_subcategories:
            break
        try:
            subcategory_name = element.text.strip()
            subcategory_link = element.get_attribute("href")
        except Exception as e:
            # One unreadable element must not abort the whole scan.
            logger.warning(f"Error processing subcategory element: {e}")
            continue

        if not subcategory_link or any(skip in subcategory_name for skip in skipped_names):
            continue

        subcategories.append((subcategory_name, subcategory_link))
        logger.info(f"Found subcategory: {subcategory_name} - {subcategory_link}")

    return subcategories

def check_asins_in_category_page(driver, category_url, asin_list):
    """Collect category and subcategory ranks for each ASIN in ``asin_list``.

    Opens ``category_url``, discovers subcategory links on it, then looks up
    every ASIN on the category page and on each subcategory page.

    Args:
        driver: WebDriver used for all navigation.
        category_url: Best-sellers category page to start from.
        asin_list: ASINs to look up.

    Returns:
        ``(found_asins, subcategories)`` where ``found_asins`` is a list of
        ``(asin, rank_string)`` and ``rank_string`` is the space-separated
        ``"(rank)"`` groups ordered last subcategory first, main category
        last. Returns ``([], [])`` on any unexpected error.
    """
    try:
        driver.get(category_url)
        logger.info(f"Opening Category URL: {category_url}")

        subcategories = fetch_subcategories_and_ranks(driver)

        # Read every main-category rank NOW, while the category page is still
        # loaded. The subcategory lookups below navigate the driver away, so
        # reading the main rank inside the per-ASIN loop would query a
        # subcategory page for every ASIN after the first.
        main_ranks = {asin: fetch_rank_for_asin(driver, asin) for asin in asin_list}

        found_asins = []
        for asin in asin_list:
            sub_ranks = []
            for subcategory_name, subcategory_link in subcategories:
                if not subcategory_link:
                    continue
                sub_rank = fetch_rank_in_subcategory(driver, subcategory_link, subcategory_name, asin)
                if sub_rank:
                    sub_ranks.append(sub_rank)

            # Output order: most recently visited subcategory first, main rank last.
            ranks = sub_ranks[::-1]
            if main_ranks[asin]:
                ranks.append(main_ranks[asin])

            final_rank = " ".join(f"({rank})" for rank in ranks)
            found_asins.append((asin, final_rank))

        return found_asins, subcategories
    except Exception as e:
        logger.error(f"An error occurred while checking ASINs: {e}")
        return [], []

# def scrape_best_sellers_rank(driver, product, url, asin_list):
#     try:
#         driver.get(url)
#         rank_section = WebDriverWait(driver, 80).until(
#             EC.visibility_of_element_located((By.XPATH, "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td"))
#         )
#         rank_text = rank_section.text.strip()
#         best_seller_rank = parse_rank_numbers(rank_text)
#         logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

#         category_links = rank_section.find_elements(By.XPATH, ".//a")
#         if len(category_links) > 1:
#             second_category_url = category_links[1].get_attribute("href")
#             found_asins_with_ranks, subcategories = check_asins_in_category_page(driver, second_category_url, asin_list)

#             # Add the best seller rank to the final output
#             for asin, ranks in found_asins_with_ranks:
#                 final_output = f"{best_seller_rank} {ranks}"
#                 logger.info(f"Final rank output for ASIN {asin}: {final_output}")
#         else:
#             logger.warning(f"No second category link found for {product}.")
#     except Exception as e:
#         logger.error(f"Failed to scrape {product} ({url}): {e}")


import re

def scrape_best_sellers_rank(driver, product, url, asin_list):
    """Log the Best Sellers Rank of a product page plus category ranks per ASIN.

    Loads ``url``, waits up to 20 seconds for the table cell following the
    "Best Sellers Rank" header, parses it with ``parse_rank_numbers`` and, when
    the cell contains at least two links, follows the second one through
    ``check_asins_in_category_page``. Results are only logged; nothing is
    returned. Any exception is caught and logged as an error.
    """
    rank_cell_xpath = "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td"
    try:
        driver.get(url)
        rank_section = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, rank_cell_xpath))
        )
        best_seller_rank = parse_rank_numbers(rank_section.text.strip())
        logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

        category_links = rank_section.find_elements(By.XPATH, ".//a")
        if len(category_links) <= 1:
            logger.warning(f"No second category link found for {product}.")
            return

        second_category_url = category_links[1].get_attribute("href")
        found_asins_with_ranks, _ = check_asins_in_category_page(driver, second_category_url, asin_list)

        for asin, ranks in found_asins_with_ranks:
            logger.debug(f"Raw ranks for ASIN {asin}: {ranks}")
            # Prefix the page-level rank, then keep at most three rank groups.
            first_three_numbers = extract_first_three_numbers(f"{best_seller_rank} {ranks}")
            logger.info(f"Final rank output for ASIN {asin}: {first_three_numbers}")
    except Exception as e:
        logger.error(f"Failed to scrape {product} ({url}): {e}")

def extract_first_three_numbers(rank_output):
    """Return the first three rank groups of ``rank_output``, concatenated.

    A rank group is one of ``N``, ``N(M)`` or ``(M)`` where N and M are digit
    runs. The groups are joined without a separator.

    Args:
        rank_output: Rank string such as ``"41(1) (5) (3)"``.

    Returns:
        The first three groups joined together, e.g. ``"41(1)(5)(3)"``;
        ``""`` when no group is found.
    """
    # The leading alternative also accepts a bare number. The previous pattern
    # required a "(n)" suffix, so a primary rank without a secondary rank
    # (e.g. "41 (5)") silently lost its "41".
    matches = re.findall(r'\d+(?:\(\d+\))?|\(\d+\)', rank_output)
    logger.debug(f"Extracted matches: {matches}")
    return "".join(matches[:3])


def run_scrape_single_url(product, url, asin_list):
    """Scrape one product URL in a fresh Chrome session and always close it.

    Args:
        product: Product label used in log messages.
        url: Product page URL passed to ``scrape_best_sellers_rank``.
        asin_list: ASINs to look up on the category pages.
    """
    options = Options()
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--log-level=3"):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)
    try:
        scrape_best_sellers_rank(driver, product, url, asin_list)
    finally:
        # quit() has to run even when the scrape is aborted (for example by a
        # KeyboardInterrupt); otherwise the browser process is left running.
        driver.quit()

if __name__ == "__main__":
    # Ad-hoc run against a single hard-coded product page. `product_name` is
    # only a label for log messages; `asin_list` holds the ASIN(s) looked up
    # on the category and subcategory pages.
    product_name = "Sample Product"
    product_url = "https://www.amazon.com/Skillmatics-Art-Craft-Activity-Princesses/dp/B0BV2YFF5K/ref=zg_bs_g_166057011_d_sccl_7/138-9322492-1070323?th=1"
    asin_list = ["B0BV2YFF5K"]

    logger.info("Running scraper for a single URL...")
    run_scrape_single_url(product_name, product_url, asin_list)

2025-01-29 15:24:48,460 - Running scraper for a single URL...

KeyboardInterrupt



__Testing on a different URL__

In [12]:
import logging
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up logging: INFO level, records formatted as "<timestamp> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
# The unnamed logger is the root logger; all functions below log through it.
logger = logging.getLogger()

def current_time_slot():
    """Format the current local time to minute precision ('YYYY-MM-DD HH:MM')."""
    return f"{datetime.now():%Y-%m-%d %H:%M}"

def parse_rank_numbers(rank_text):
    """Reduce multi-line Best Sellers Rank text to ``primary(secondary)``.

    Args:
        rank_text: Raw rank text, one rank per line, each line starting with a
            token such as ``#1,234``.

    Returns:
        ``"primary(secondary)"`` built from the first two rank lines,
        ``"primary"`` when only one usable rank exists, or ``""`` when the
        text contains no non-blank line.
    """
    # First token of every non-blank line, without '#' and thousands commas.
    # Filtering blank lines first prevents the IndexError that `split()[0]`
    # raises for empty input or for empty lines between two ranks.
    rank_tokens = [
        line.split()[0].lstrip('#').replace(',', '')
        for line in rank_text.split('\n')
        if line.strip()
    ]
    if not rank_tokens:
        return ''

    primary_rank = rank_tokens[0]
    secondary_rank = rank_tokens[1] if len(rank_tokens) > 1 else ''
    return f"{primary_rank}({secondary_rank})" if secondary_rank else primary_rank

def fetch_rank_for_asin(driver, asin):
    """Return the rank badge text for ``asin`` on the current page, or ``None``.

    Looks for a ``span`` whose class contains ``zg-bdg-text`` under the ``div``
    tagged with ``data-asin=<asin>``; the text is stripped and ``#`` removed.
    Any exception is logged as a warning and turned into ``None``.
    """
    try:
        selector = f"//div[@data-asin='{asin}']//span[contains(@class, 'zg-bdg-text')]"
        raw_badge_text = driver.find_element(By.XPATH, selector).text
        rank_text = raw_badge_text.strip().replace('#', '')
        logger.info(f"Rank for ASIN {asin}: {rank_text}")
    except Exception as e:
        logger.warning(f"Could not fetch rank for ASIN {asin}: {e}")
        return None
    return rank_text

def fetch_rank_in_subcategory(driver, subcategory_url, subcategory_name, asin):
    """Load ``subcategory_url`` and return the rank of ``asin`` shown there.

    Waits up to 20 seconds for the ``div`` with ``data-asin=<asin>`` before
    reading the badge through ``fetch_rank_for_asin``. Returns ``None`` when
    the ASIN is missing, the rank text is empty, or anything raises (a warning
    naming ``subcategory_name`` is logged in that case).
    """
    try:
        driver.get(subcategory_url)
        logger.info(f"Opening Subcategory URL: {subcategory_url} for {subcategory_name}")

        asin_present = EC.presence_of_element_located((By.XPATH, f"//div[@data-asin='{asin}']"))
        WebDriverWait(driver, 20).until(asin_present)
        logger.info(f"ASIN {asin} found on subcategory page: {subcategory_name}")

        rank = fetch_rank_for_asin(driver, asin)
        if not rank:
            return None
        return f"{rank}"
    except Exception as e:
        logger.warning(f"ASIN {asin} not found or rank unavailable in subcategory {subcategory_name}: {e}")
        return None

def fetch_subcategories_and_ranks(driver, max_subcategories=5):
    """Return up to ``max_subcategories`` ``(name, link)`` pairs from the page.

    Elements considered: ``div[@role='treeitem']`` and anchors whose ``href``
    contains ``/zgbs/``. Entries without an ``href`` and entries whose text
    contains "Any Department" or "Toys & Games" are ignored. No ranks are
    fetched here, despite the function name.

    Args:
        driver: WebDriver positioned on a category page.
        max_subcategories: Upper bound on returned pairs. Defaults to 5, the
            limit the implementation actually enforced (the old docstring and
            comment claimed three).

    Returns:
        List of ``(subcategory_name, subcategory_link)`` tuples, or ``[]`` if
        the element lookup fails.
    """
    excluded = ("Any Department", "Toys & Games")

    try:
        candidates = driver.find_elements(
            By.XPATH, "//div[@role='treeitem'] | //a[contains(@href, '/zgbs/')]"
        )
    except Exception as e:
        logger.error(f"Failed to fetch subcategories: {e}")
        return []

    subcategories = []
    for element in candidates:
        if len(subcategories) >= max_subcategories:
            break
        try:
            subcategory_name = element.text.strip()
            subcategory_link = element.get_attribute("href")
        except Exception as e:
            # Skip just this element; the remaining ones may still be readable.
            logger.warning(f"Error processing subcategory element: {e}")
            continue

        is_excluded = any(skip in subcategory_name for skip in excluded)
        if subcategory_link and not is_excluded:
            subcategories.append((subcategory_name, subcategory_link))
            logger.info(f"Found subcategory: {subcategory_name} - {subcategory_link}")

    return subcategories

def check_asins_in_category_page(driver, category_url, asin_list):
    """Gather the category rank and subcategory ranks of every ASIN.

    Args:
        driver: WebDriver used for navigation.
        category_url: Category page to open first.
        asin_list: ASINs to look up.

    Returns:
        ``(found_asins, subcategories)``. ``found_asins`` is a list of
        ``(asin, rank_string)`` tuples; ``rank_string`` holds ``"(rank)"``
        groups separated by spaces, last-visited subcategory first and the
        main category rank last. ``([], [])`` is returned on unexpected errors.
    """
    try:
        driver.get(category_url)
        logger.info(f"Opening Category URL: {category_url}")

        subcategories = fetch_subcategories_and_ranks(driver)

        # Main ranks must be captured before any subcategory navigation:
        # fetch_rank_in_subcategory() loads other pages, so a main-rank lookup
        # performed later (as happened for the 2nd+ ASIN) would inspect the
        # wrong page.
        main_rank_by_asin = {asin: fetch_rank_for_asin(driver, asin) for asin in asin_list}

        found_asins = []
        for asin in asin_list:
            ranks = []
            main_rank = main_rank_by_asin[asin]
            if main_rank:
                ranks.append(main_rank)  # stays last in the output

            for subcategory_name, subcategory_link in subcategories:
                if subcategory_link:
                    sub_rank = fetch_rank_in_subcategory(driver, subcategory_link, subcategory_name, asin)
                    if sub_rank:
                        ranks.insert(0, sub_rank)  # newest subcategory rank goes first

            final_rank = " ".join(f"({rank})" for rank in ranks)
            found_asins.append((asin, final_rank))

        return found_asins, subcategories
    except Exception as e:
        logger.error(f"An error occurred while checking ASINs: {e}")
        return [], []

# def scrape_best_sellers_rank(driver, product, url, asin_list):
#     try:
#         driver.get(url)
#         rank_section = WebDriverWait(driver, 80).until(
#             EC.visibility_of_element_located((By.XPATH, "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td"))
#         )
#         rank_text = rank_section.text.strip()
#         best_seller_rank = parse_rank_numbers(rank_text)
#         logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

#         category_links = rank_section.find_elements(By.XPATH, ".//a")
#         if len(category_links) > 1:
#             second_category_url = category_links[1].get_attribute("href")
#             found_asins_with_ranks, subcategories = check_asins_in_category_page(driver, second_category_url, asin_list)

#             # Add the best seller rank to the final output
#             for asin, ranks in found_asins_with_ranks:
#                 final_output = f"{best_seller_rank} {ranks}"
#                 logger.info(f"Final rank output for ASIN {asin}: {final_output}")
#         else:
#             logger.warning(f"No second category link found for {product}.")
#     except Exception as e:
#         logger.error(f"Failed to scrape {product} ({url}): {e}")


import re

def scrape_best_sellers_rank(driver, product, url, asin_list):
    """Scrape and log the Best Sellers Rank for ``product`` and its ASINs.

    Opens ``url`` and waits up to 80 seconds for the cell next to the
    "Best Sellers Rank" header. When that cell holds two or more links, the
    second link is passed to ``check_asins_in_category_page`` and one combined
    rank line is logged per ASIN. The function returns nothing; failures are
    logged as errors.
    """
    try:
        driver.get(url)
        rank_cell_locator = (
            By.XPATH,
            "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td",
        )
        rank_section = WebDriverWait(driver, 80).until(
            EC.visibility_of_element_located(rank_cell_locator)
        )
        rank_text = rank_section.text.strip()
        best_seller_rank = parse_rank_numbers(rank_text)
        logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

        category_links = rank_section.find_elements(By.XPATH, ".//a")
        has_second_link = len(category_links) > 1
        if not has_second_link:
            logger.warning(f"No second category link found for {product}.")
        else:
            second_category_url = category_links[1].get_attribute("href")
            found_asins_with_ranks, _subcategories = check_asins_in_category_page(
                driver, second_category_url, asin_list
            )
            for asin, ranks in found_asins_with_ranks:
                logger.debug(f"Raw ranks for ASIN {asin}: {ranks}")
                combined = f"{best_seller_rank} {ranks}"
                first_three_numbers = extract_first_three_numbers(combined)
                logger.info(f"Final rank output for ASIN {asin}: {first_three_numbers}")
    except Exception as e:
        logger.error(f"Failed to scrape {product} ({url}): {e}")

def extract_first_three_numbers(rank_output):
    """Keep the first three rank groups found in ``rank_output``.

    Recognised groups are ``N``, ``N(M)`` and ``(M)`` (N, M = digits). The kept
    groups are concatenated with no separator.

    Args:
        rank_output: Combined rank string, e.g. ``"41(1) (5) (3)"``.

    Returns:
        For the example above ``"41(1)(5)(3)"``; ``""`` if nothing matches.
    """
    # `\d+(?:\(\d+\))?` makes the "(n)" suffix optional so that a standalone
    # primary rank is kept. The earlier pattern only matched "N(M)" or "(M)"
    # and therefore dropped the leading number in inputs like "41 (5)".
    matches = re.findall(r'\d+(?:\(\d+\))?|\(\d+\)', rank_output)
    logger.debug(f"Extracted matches: {matches}")
    return "".join(matches[:3])


def run_scrape_single_url(product, url, asin_list):
    """Scrape one product URL in a new Chrome session that is always closed.

    Args:
        product: Product label used in log messages.
        url: Product page URL handed to ``scrape_best_sellers_rank``.
        asin_list: ASINs to look up on the category pages.
    """
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--log-level=3")

    # Disabled experiment, kept for reference (extra flags, a fixed user-agent,
    # a navigator.webdriver override and a random delay before the request):
    # options.add_argument("--disable-blink-features=AutomationControlled")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    # options.add_experimental_option("excludeSwitches", ["enable-automation"])
    # options.add_experimental_option("useAutomationExtension", False)
    # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    # import random, time
    # sleep_time = random.uniform(5, 15)
    # logger.info(f"Sleeping for {sleep_time:.2f} seconds before accessing {url}")
    # time.sleep(sleep_time)
    # driver.get(url)

    driver = webdriver.Chrome(options=options)
    try:
        scrape_best_sellers_rank(driver, product, url, asin_list)
    finally:
        # Close the browser even if the scrape is interrupted or raises;
        # without this the Chrome process is leaked.
        driver.quit()

if __name__ == "__main__":
    # Ad-hoc run against a single hard-coded product page.
    # NOTE(review): the ASIN embedded in product_url (B0CXTJ9JHK) differs from
    # the one in asin_list (B0CZP42TND) - confirm this pairing is intentional.
    product_name = "Sample Product"
    product_url = "https://www.amazon.com/Skillmatics-Art-Craft-Activity-Poke/dp/B0CXTJ9JHK?ref_=ast_sto_dp&th=1"
    asin_list = ["B0CZP42TND"]

    logger.info("Running scraper for a single URL...")
    run_scrape_single_url(product_name, product_url, asin_list)

2025-01-29 18:52:35,072 - Running scraper for a single URL...
2025-01-29 18:52:59,794 - Best Sellers Rank for Sample Product: 41(1)
2025-01-29 18:53:00,198 - Opening Category URL: https://www.amazon.com/gp/bestsellers/toys-and-games/166078011/ref=pd_zg_hrsr_toys-and-games
2025-01-29 18:53:00,241 - Could not fetch rank for ASIN B0CZP42TND: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@data-asin='B0CZP42TND']//span[contains(@class, 'zg-bdg-text')]"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7491480D5+2992373]
	(No symbol) [0x00007FF748DDBFD0]
	(No symbol) [0x00007FF748C7590A]
	(No symbol) [0x00007FF748CC926E]
	(No symbol) [0x00007FF748CC955C]
	(No symbol) [0x00007FF748D127D7]
	(No symbol) [0x00007FF748CEF3AF]
	(No symbol) [0x00007FF748D0F584]
	(No symbol) [0x00007

In [1]:
import logging
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gspread
from google.oauth2.service_account import Credentials
import pandas as pd
import re

# Set up logging: INFO level, records formatted as "<timestamp> - <message>".
# `logger` is the root logger (getLogger() without a name).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Path to your service account key file (relative, so it is resolved against
# the current working directory).
SERVICE_ACCOUNT_FILE = 'ranking-436314-4daf4b7d4292.json'

# OAuth scopes requested for the service-account credentials.
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]

# Authenticate using the service account file. This and the sheet lookups
# below run when the cell/module is executed, not lazily on first use.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=scope)
client = gspread.authorize(creds)

# Open the Google Sheet.
# url_sheet:  read by run_scrape_all() (columns 'Products', 'URL', 'ASIN').
# rank_sheet: rewritten by update_google_sheet() (rows keyed by 'Product').
workbook = client.open("Skillmatics Rank Sheet Streamlit")
url_sheet = workbook.worksheet("Node-URL")
rank_sheet = workbook.worksheet("Node-Rank")

def current_time_slot():
    """Return the current local time, to the minute, as the scrape-session label."""
    slot_format = '%Y-%m-%d %H:%M'
    return datetime.now().strftime(slot_format)

def parse_rank_numbers(rank_text):
    """Parse rank text into ``primary(secondary)``.

    The first token of each non-blank line is treated as a rank; ``#`` prefixes
    and thousands separators are removed. Only the first two ranks are used.

    Args:
        rank_text: Raw rank text, one rank per line.

    Returns:
        ``"primary(secondary)"``, ``"primary"`` when no secondary rank exists,
        or ``""`` when the text has no non-blank line. The empty result lets a
        caller's ``if not best_seller_rank`` check detect a missing rank.
    """
    cleaned = []
    for line in rank_text.split('\n'):
        tokens = line.split()
        if not tokens:
            # Blank line: indexing tokens[0] here used to raise IndexError.
            continue
        cleaned.append(tokens[0].lstrip('#').replace(',', ''))
        if len(cleaned) == 2:
            break  # later lines are never used

    if not cleaned:
        return ''
    primary_rank = cleaned[0]
    secondary_rank = cleaned[1] if len(cleaned) > 1 else ''
    return f"{primary_rank}({secondary_rank})" if secondary_rank else primary_rank



import threading

# Serialises the read-merge-rewrite cycle in update_google_sheet(). The scraper
# calls it from several worker threads; without the lock two threads can read
# the same snapshot and the later write discards the earlier thread's value.
_RANK_SHEET_LOCK = threading.Lock()


def update_google_sheet(product, current_time, rank_value):
    """Write ``rank_value`` for ``product`` into the ``current_time`` column.

    The whole rank sheet is read, merged with the new value and written back
    in one batch. Values already present in the sheet win over the new value
    for the same product/column (``combine_first`` semantics).

    Args:
        product: Row key, matched against the sheet's 'Product' column.
        current_time: Column header for this scraping session.
        rank_value: Cell value to store.

    Errors are logged and swallowed; nothing is returned.
    """
    try:
        with _RANK_SHEET_LOCK:
            existing_df = pd.DataFrame(rank_sheet.get_all_records())
            update_data = pd.DataFrame({current_time: [rank_value]}, index=[product])

            if 'Product' in existing_df.columns:
                merged_data = existing_df.set_index('Product').combine_first(update_data)
            else:
                # Empty sheet (no header row yet): start from the new row
                # instead of failing on set_index('Product').
                merged_data = update_data

            # Cells missing after the merge are NaN, which cannot be sent as
            # JSON; write them as empty strings instead.
            merged_data = merged_data.astype(object).where(merged_data.notna(), '')

            headers = ['Product'] + [col for col in merged_data.columns if col != 'Product']
            data = merged_data.reset_index().values.tolist()

            # Clear and rewrite in one batch. Keyword arguments keep the call
            # independent of the positional order of update()'s parameters.
            rank_sheet.clear()
            rank_sheet.update(range_name='A1', values=[headers] + data)

        logger.info(f"Updated Google Sheet: {product} -> {rank_value} at {current_time}")
    except Exception as e:
        logger.error(f"Error updating Google Sheet: {e}")



def fetch_rank_for_asin(driver, asin):
    """Return the rank badge text of ``asin`` on the loaded page, else ``None``.

    The badge is located with an XPath for a ``zg-bdg-text`` span beneath the
    ``div`` whose ``data-asin`` equals ``asin``. Whitespace is trimmed and all
    ``#`` characters are removed. Lookup failures are logged as warnings.
    """
    try:
        rank_element = driver.find_element(
            By.XPATH,
            f"//div[@data-asin='{asin}']//span[contains(@class, 'zg-bdg-text')]",
        )
        rank_text = rank_element.text.strip().replace('#', '')
    except Exception as e:
        logger.warning(f"Could not fetch rank for ASIN {asin}: {e}")
        return None
    else:
        logger.info(f"Rank for ASIN {asin}: {rank_text}")
        return rank_text

def fetch_rank_in_subcategory(driver, subcategory_url, subcategory_name, asin):
    """Navigate to ``subcategory_url`` and return the rank of ``asin`` there.

    The page is given up to 30 seconds to contain a ``div`` with
    ``data-asin=<asin>``; the rank itself comes from ``fetch_rank_for_asin``.

    Returns:
        The rank text, or ``None`` if the ASIN never appears, no rank text is
        available, or any exception is raised (logged as a warning that names
        ``subcategory_name``).
    """
    asin_xpath = f"//div[@data-asin='{asin}']"
    try:
        driver.get(subcategory_url)
        logger.info(f"Opening Subcategory URL: {subcategory_url} for {subcategory_name}")

        waiter = WebDriverWait(driver, 30)
        waiter.until(EC.presence_of_element_located((By.XPATH, asin_xpath)))
        logger.info(f"ASIN {asin} found on subcategory page: {subcategory_name}")

        rank = fetch_rank_for_asin(driver, asin)
        return f"{rank}" if rank else None
    except Exception as e:
        logger.warning(f"ASIN {asin} not found or rank unavailable in subcategory {subcategory_name}: {e}")
        return None

# def fetch_subcategories_and_ranks(driver):
#     """Fetches the next three subcategories, opens their links, and retrieves rankings."""
#     try:
#         subcategory_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/zgbs/')]")
#         subcategories = []
#         count = 0

#         for element in subcategory_elements:
#             try:
#                 subcategory_name = element.text.strip()
#                 subcategory_link = element.get_attribute("href")

#                 # Skip irrelevant categories
#                 if all(skip not in subcategory_name for skip in ["Any Department", "Toys & Games"]) and subcategory_link:
#                     subcategories.append((subcategory_name, subcategory_link))
#                     logger.info(f"Found subcategory: {subcategory_name} - {subcategory_link}")
#                     count += 1
#                     if count >= 5:  # Limit to 3 subcategories
#                         break
#             except Exception as e:
#                 logger.warning(f"Error processing subcategory element: {e}")

#         return subcategories
#     except Exception as e:
#         logger.error(f"Failed to fetch subcategories: {e}")
#         return []

def fetch_subcategories_and_ranks(driver, max_subcategories=5):
    """List ``(name, link)`` pairs of subcategory links on the current page.

    Looks at ``div[@role='treeitem']`` elements and anchors whose ``href``
    contains ``/zgbs/``. Elements lacking an ``href``, or whose text contains
    "Any Department" or "Toys & Games", are skipped. Only names and links are
    collected; no ranks are fetched despite the function name.

    Args:
        driver: WebDriver showing the category page.
        max_subcategories: Cap on the number of pairs returned. Default 5,
            matching the limit the code always used (earlier documentation
            incorrectly said three).

    Returns:
        List of ``(subcategory_name, subcategory_link)`` tuples in page order;
        ``[]`` when the element lookup raises.
    """
    names_to_skip = ("Any Department", "Toys & Games")

    try:
        subcategory_elements = driver.find_elements(
            By.XPATH, "//div[@role='treeitem'] | //a[contains(@href, '/zgbs/')]"
        )
    except Exception as e:
        logger.error(f"Failed to fetch subcategories: {e}")
        return []

    subcategories = []
    for element in subcategory_elements:
        if len(subcategories) >= max_subcategories:
            break
        try:
            subcategory_name = element.text.strip()
            subcategory_link = element.get_attribute("href")
        except Exception as e:
            # A single unreadable element should not end the scan.
            logger.warning(f"Error processing subcategory element: {e}")
            continue

        if not subcategory_link:
            continue
        if any(skip in subcategory_name for skip in names_to_skip):
            continue

        subcategories.append((subcategory_name, subcategory_link))
        logger.info(f"Found subcategory: {subcategory_name} - {subcategory_link}")

    return subcategories

def check_asins_in_category_page(driver, category_url, asin_list):
    """Collect category and subcategory ranks for each ASIN.

    Args:
        driver: WebDriver used for navigation.
        category_url: Category page opened first.
        asin_list: ASINs to look up.

    Returns:
        ``(found_asins, subcategories)``. Each entry of ``found_asins`` is
        ``(asin, rank_string)`` where ``rank_string`` concatenates ``"(rank)"``
        groups without separators: last-visited subcategory first, main
        category rank last. ``([], [])`` on unexpected errors.
    """
    try:
        driver.get(category_url)
        logger.info(f"Opening Category URL: {category_url}")

        subcategories = fetch_subcategories_and_ranks(driver)

        # Capture all main-category ranks before visiting any subcategory.
        # The subcategory lookups navigate away from the category page, so a
        # main-rank lookup done afterwards would read a subcategory page.
        main_ranks = {asin: fetch_rank_for_asin(driver, asin) for asin in asin_list}

        found_asins = []
        for asin in asin_list:
            sub_ranks = [
                sub_rank
                for subcategory_name, subcategory_link in subcategories
                if subcategory_link
                and (sub_rank := fetch_rank_in_subcategory(driver, subcategory_link, subcategory_name, asin))
            ]
            ranks = list(reversed(sub_ranks))
            if main_ranks[asin]:
                ranks.append(main_ranks[asin])

            final_rank = "".join(f"({rank})" for rank in ranks)
            found_asins.append((asin, final_rank))

        return found_asins, subcategories
    except Exception as e:
        logger.error(f"An error occurred while checking ASINs: {e}")
        return [], []

def extract_first_three_numbers(rank_output):
    """Extract the first three rank groups from ``rank_output``.

    A group is ``N``, ``N(M)`` or ``(M)`` with N and M being digit runs; the
    selected groups are concatenated without a separator.

    Args:
        rank_output: Rank string such as ``"66(1)(12)(3)"``.

    Returns:
        The first three groups joined together, or ``""`` if none is found.
    """
    # The "(n)" suffix is optional so a primary rank without a secondary rank
    # (e.g. "66") is preserved; the former pattern returned "" for it, which
    # then ended up as an empty cell in the sheet.
    matches = re.findall(r'\d+(?:\(\d+\))?|\(\d+\)', rank_output)
    logger.debug(f"Extracted matches: {matches}")
    return "".join(matches[:3])

# def scrape_best_sellers_rank(driver, product, url, asin_list, current_time):
#     """Scrapes the Best Sellers Rank and category ranks for a given product URL and ASINs."""
#     try:
#         driver.get(url)
#         rank_section = WebDriverWait(driver, 80).until(
#             EC.visibility_of_element_located((By.XPATH, "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td"))
#         )
#         rank_text = rank_section.text.strip()
#         best_seller_rank = parse_rank_numbers(rank_text)
#         logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

#         if best_seller_rank.endswith("(1)"):
#             category_links = rank_section.find_elements(By.XPATH, ".//a")
#             if len(category_links) > 1:
#                 second_category_url = category_links[1].get_attribute("href")
#                 found_asins_with_ranks, subcategories = check_asins_in_category_page(driver, second_category_url, asin_list)

#                 category_ranks = [ranks for asin, ranks in found_asins_with_ranks]
#                 category_ranks_str = f"({'})('.join(category_ranks[:3])})" if category_ranks else ""
#                 final_rank = f"{best_seller_rank}{category_ranks_str}"

#                 formatted_rank = extract_first_three_numbers(final_rank)
#                 logger.info(f"Final rank for {product} (ASIN: {asin_list[0]}): {formatted_rank}")
#                 update_google_sheet(product, current_time, formatted_rank)
#             else:
#                 logger.warning(f"No second category link found for {product}.")
#         else:
#             logger.info(f"Skipping category ranking for {product}, as Best Seller Rank is not (1).")
#             update_google_sheet(product, current_time, best_seller_rank)
#     except Exception as e:
#         logger.error(f"Failed to scrape {product} ({url}): {e}")



def scrape_best_sellers_rank(driver, product, url, asin_list, current_time):
    """Scrape the Best Sellers Rank of ``url`` and store it in the rank sheet.

    Up to three attempts are made. Each attempt loads (or refreshes) the
    product page, parses the "Best Sellers Rank" cell, then visits at most the
    first three category links of that cell to read the rank of
    ``asin_list[0]`` on each. The combined value is written with
    ``update_google_sheet``; after the last failed attempt "Failed to Fetch"
    is written instead.

    Args:
        driver: WebDriver used for all navigation.
        product: Product label (log messages and sheet row key).
        url: Product page URL.
        asin_list: ASINs for the product; only the first one is looked up.
        current_time: Column label passed through to ``update_google_sheet``.
    """
    max_retries = 3  # Maximum attempts before the product is marked as failed
    rank_cell_xpath = "//th[contains(text(), 'Best Sellers Rank')]//following-sibling::td"

    for attempt in range(1, max_retries + 1):
        try:
            # If the page is already open, refresh instead of loading it again.
            if driver.current_url != url:
                driver.get(url)
            else:
                logger.info(f"Refreshing already opened page for {product}")
                driver.refresh()

            logger.info(f"Fetching {product}: Attempt {attempt}")

            rank_section = WebDriverWait(driver, 30).until(
                EC.visibility_of_element_located((By.XPATH, rank_cell_xpath))
            )
            rank_text = rank_section.text.strip()
            best_seller_rank = parse_rank_numbers(rank_text)
            logger.info(f"Best Sellers Rank for {product}: {best_seller_rank}")

            # If no rank is found or throttling is reported, retry.
            if not best_seller_rank or "throttle" in rank_text.lower():
                raise Exception("Throttle detected, retrying with refresh...")

            # Snapshot link text and href BEFORE any navigation. The lookups
            # below load other pages, which invalidates these elements;
            # touching them afterwards raises a stale-element error and made
            # every product with more than one category link fail.
            category_targets = [
                (link.text.strip(), link.get_attribute("href"))
                for link in rank_section.find_elements(By.XPATH, ".//a")[:3]
            ]

            category_ranks = []
            for category_name, category_url in category_targets:
                logger.info(f"Checking category: {category_name} -> {category_url}")
                if not category_url:
                    continue
                category_rank = fetch_rank_in_subcategory(driver, category_url, category_name, asin_list[0])
                if category_rank:
                    category_ranks.append(f"{category_rank}")

            # One "(rank)" group per category. The previous join separator
            # contained a stray "}" that produced groups like "(12})", which
            # the extraction step then discarded.
            category_ranks_str = "".join(f"({rank})" for rank in category_ranks)
            final_rank = f"{best_seller_rank}{category_ranks_str}"

            formatted_rank = extract_first_three_numbers(final_rank)
            logger.info(f"Final rank for {product} (ASIN: {asin_list[0]}): {formatted_rank}")

            update_google_sheet(product, current_time, formatted_rank)
            return  # Successful scrape

        except Exception as e:
            logger.warning(f"Attempt {attempt} failed for {product}: {e}")
            if attempt == max_retries:
                break

            logger.info(f"Refreshing {product} page and retrying...")
            try:
                driver.refresh()
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
            except Exception as refresh_error:
                # A failed recovery refresh must not abort the remaining
                # attempts or skip the failure marker below.
                logger.warning(f"Refresh failed for {product}: {refresh_error}")

    logger.error(f"Failed to scrape {product} after {max_retries} retries.")
    update_google_sheet(product, current_time, "Failed to Fetch")  # Mark failure in the sheet


from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_wrapper(product, url, asin, options, current_time):
    """Run one scrape in its own Chrome session; used as the thread-pool task.

    Args:
        product: Product label.
        url: Product page URL.
        asin: Single ASIN for the product (wrapped into a list for the scraper).
        options: Chrome ``Options`` used to start the browser.
        current_time: Column label for this scraping session.

    Exceptions are logged, never raised, so one product cannot break the pool.
    """
    driver = None
    try:
        driver = webdriver.Chrome(options=options)  # Initialize WebDriver inside thread
        scrape_best_sellers_rank(driver, product, url, [asin], current_time)
    except Exception as e:
        logger.error(f"Error in scrape_wrapper for {product}: {e}")
    finally:
        # Always release the browser; previously quit() was skipped whenever
        # the scrape raised, leaving a Chrome process behind per failure.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                logger.error(f"Error in scrape_wrapper for {product}: {e}")

def run_scrape_all():
    """Scrape every product listed in the URL sheet using a thread pool.

    Reads the 'Products', 'URL' and 'ASIN' columns of ``url_sheet`` (header
    names are stripped of surrounding whitespace) and submits one
    ``scrape_wrapper`` task per row, with at most five worker threads. All
    rows share one time-slot label taken at the start of the run.
    """
    url_data = url_sheet.get_all_records()
    if not url_data:
        # Nothing to do. This also avoids two failures on an empty sheet:
        # `.str` on the empty column index and ThreadPoolExecutor rejecting
        # max_workers=0.
        logger.warning("No rows found in the URL sheet; nothing to scrape.")
        return

    df = pd.DataFrame(url_data)
    df.columns = df.columns.str.strip()

    product_names = df['Products'].tolist()
    url_list = df['URL'].tolist()
    asin_list = df['ASIN'].tolist()
    current_time = current_time_slot()

    options = Options()
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--log-level=3")

    num_threads = min(5, len(product_names))  # Prevent excessive threads for small data

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {
            executor.submit(scrape_wrapper, product, url, asin, options, current_time): product
            for product, url, asin in zip(product_names, url_list, asin_list)
        }

        for future in as_completed(futures):
            try:
                future.result()  # Raise exceptions if any
            except Exception as e:
                logger.error(f"Scraping failed for {futures[future]}: {e}")

if __name__ == "__main__":
    # Entry point: scrape every row of the URL sheet once.
    logger.info("Starting the scraper for all products...")
    run_scrape_all()

2025-02-03 09:47:05,126 - Starting the scraper for all products...
2025-02-03 09:47:14,137 - Fetching Poke In Art: Attempt 1
2025-02-03 09:47:33,424 - Best Sellers Rank for Poke In Art: 66(1)
2025-02-03 09:47:33,535 - Checking category: See Top 100 in Toys & Games -> https://www.amazon.com/gp/bestsellers/toys-and-games/ref=pd_zg_ts_toys-and-games
2025-02-03 09:47:36,897 - Opening Subcategory URL: https://www.amazon.com/gp/bestsellers/toys-and-games/ref=pd_zg_ts_toys-and-games for See Top 100 in Toys & Games
2025-02-03 09:48:07,049 - ASIN B0CZP42TND not found or rank unavailable in subcategory See Top 100 in Toys & Games: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF63BC102F5+28725]
	(No symbol) [0x00007FF63BB72AE0]
	(No symbol) [0x00007FF63BA0510A]
	(No symbol) [0x00007FF63BA593D2]
	(No symbol) [0x00007FF63BA595FC]
	(No symbol) [0x00007FF63BAA3407]
	(No symbol) [0x00007FF63BA7FFEF]
	(No symbol) [0x00007FF63BAA0181]
	(No symbol) [0x00007FF63BA7FD53]
	(No symbol) [0x00007FF63BA4A0E