In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor
import time
import threading

# Scrape targets, one entry per storefront. Each entry carries the site's
# display name, its root URL, the listing pages to crawl, and the substring
# that identifies a product link on that site.
website_configs = [
    {
        "name": site_name,
        "url": root_url,
        "collections": [listing_url],
        "product_pattern": link_marker,
    }
    for site_name, root_url, listing_url, link_marker in (
        (
            "Virgio",
            "https://www.virgio.com",
            "https://www.virgio.com/collections/all",
            "/products/",
        ),
        (
            "NykaaFashion",
            "https://www.nykaafashion.com",
            "https://www.nykaafashion.com/women/c/6557",
            "/p/",
        ),
        (
            "TataCliq",
            "https://www.tatacliq.com",
            "https://www.tatacliq.com/mens-clothing/c-msh11",
            "/p-",
        ),
        (
            "Westside",
            "https://www.westside.com",
            "https://www.westside.com/collections/new-in-western-wear-for-women",
            "/products/",
        ),
    )
]

# Shared by all scraper threads so their console messages do not interleave
print_lock = threading.Lock()

def create_driver():
    """Build and return a Chrome WebDriver configured for scraping.

    The browser is started with a fixed set of command-line flags (including
    a custom user-agent string) and is told to accept insecure certificates.
    The chromedriver binary path is obtained from ``ChromeDriverManager``.
    """
    chrome_flags = (
        '--disable-blink-features=AutomationControlled',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        "start-maximized",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    # NOTE: append '--headless' to chrome_flags to run without a visible window
    chrome_options.accept_insecure_certs = True

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

def _extract_nykaa_links(driver, base_url):
    """Return the NykaaFashion product URLs present in the current page.

    Runs JavaScript in the page and gathers links from three places: anchor
    ``href`` values, ``onclick`` handlers that assign ``location.href``, and
    ``data-href`` attributes. Non-anchor values are resolved against
    ``base_url`` with the browser's ``URL`` constructor, so relative paths and
    already-absolute URLs both yield a valid absolute URL.
    """
    found = driver.execute_script("""
        const base = arguments[0];
        const links = new Set();

        // Resolve against the site root; skip values that are not parseable URLs
        const addResolved = (value) => {
            try {
                links.add(new URL(value, base).href);
            } catch (err) {
                // not a usable URL - ignore
            }
        };

        // Standard anchor tags with href
        document.querySelectorAll('a[href*="/p/"]').forEach(a => {
            links.add(a.href);
        });

        // Some product cards may use onclick with URLs
        document.querySelectorAll('[onclick]').forEach(el => {
            const onclick = el.getAttribute('onclick');
            const match = onclick && onclick.match(/location.href='(\\/p\\/[^']+)'/);
            if (match) {
                addResolved(match[1]);
            }
        });

        // hrefs stored in data attributes (like data-href)
        document.querySelectorAll('[data-href]').forEach(el => {
            const val = el.getAttribute('data-href');
            if (val && val.includes('/p/')) {
                addResolved(val);
            }
        });

        return Array.from(links);
    """, base_url)
    return found or []


def _collect_product_links(driver, config):
    """Return the set of product URLs currently present in the loaded page.

    Extraction problems are reported through ``print`` and never raised, so a
    single bad round does not abort the scroll loop in ``scrape_website``.
    """
    # === Custom logic for NykaaFashion ===
    if config["name"] == "NykaaFashion":
        try:
            return set(_extract_nykaa_links(driver, config["url"]))
        except Exception as e:
            with print_lock:
                print(f"[{config['name']}] JS extraction failed: {e}")
            return set()

    # === Generic scraping logic ===
    found = set()
    try:
        for anchor in driver.find_elements(By.TAG_NAME, "a"):
            try:
                url = anchor.get_attribute("href")
            except Exception:
                # Best effort: skip anchors that cannot be read
                # (presumably elements that went stale while the page updates)
                continue
            if url and config["product_pattern"] in url:
                found.add(url)
    except Exception as e:
        with print_lock:
            print(f"[{config['name']}] Warning during link extraction: {e}")
    return found


def _save_links(config, product_links):
    """Write the links, one per line and sorted, to ``<name>_product_links.csv``."""
    filename = f"{config['name'].replace(' ', '_').lower()}_product_links.csv"
    try:
        with open(filename, "w", encoding="utf-8") as f:
            # Sorted so repeated runs produce a stable, diff-friendly file
            f.writelines(link + "\n" for link in sorted(product_links))
    except (OSError, UnicodeError) as e:
        with print_lock:
            print(f"[{config['name']}] Error: {e}")
        return

    with print_lock:
        print(f"[{config['name']}] Saved {len(product_links)} links to {filename}")


def scrape_website(config):
    """Scrape every collection page of one site and save its product links.

    For each URL in ``config["collections"]`` the page is opened and scrolled
    repeatedly to trigger lazy loading. After each scroll round the product
    links present in the page are collected. A collection is finished once
    five consecutive rounds add no new link.

    Errors raised while scraping are printed, not raised. Links gathered
    before such an error are still written to disk; if an error occurs before
    any link was found, no file is written (an existing file is left alone).

    Args:
        config: Site description with the keys ``name``, ``url``,
            ``collections`` and ``product_pattern``.
    """
    max_idle_rounds = 5

    driver = create_driver()
    product_links = set()
    scrape_failed = False
    try:
        for collection_url in config["collections"]:
            with print_lock:
                print(f"\n[{config['name']}] Scraping: {collection_url}")

            driver.get(collection_url)
            time.sleep(5)

            idle_rounds = 0
            while idle_rounds < max_idle_rounds:
                # Scroll slowly and repeatedly to trigger lazy loading
                for _ in range(10):
                    driver.execute_script("window.scrollBy(0, 500);")
                    time.sleep(1)

                time.sleep(3)  # Wait after scroll for new products to load

                prev_count = len(product_links)
                product_links.update(_collect_product_links(driver, config))

                with print_lock:
                    print(f"[{config['name']}] Found {len(product_links)} product links so far...")

                if len(product_links) == prev_count:
                    idle_rounds += 1
                else:
                    idle_rounds = 0

                # Look up <body> again every round: a reference taken before
                # scrolling can go stale when the page re-renders.
                body = driver.find_element(By.TAG_NAME, "body")
                for _ in range(4):
                    body.send_keys(Keys.PAGE_UP)
                    time.sleep(1)

    except Exception as e:
        scrape_failed = True
        with print_lock:
            print(f"[{config['name']}] Error: {e}")

    finally:
        driver.quit()

    # Keep partial results after a failure, but do not overwrite a previous
    # output file with an empty one when nothing was collected.
    if product_links or not scrape_failed:
        _save_links(config, product_links)

# Run all scrapers in parallel
if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as executor:
        # submit() keeps one future per site, so a failure can be reported
        # against the site's name; the unconsumed iterator of executor.map()
        # would silently drop any exception raised inside a worker.
        pending = [
            (site["name"], executor.submit(scrape_website, site))
            for site in website_configs
        ]

    # Leaving the `with` block has already waited for every worker to finish
    for site_name, future in pending:
        error = future.exception()
        if error is not None:
            print(f"[{site_name}] Scraper crashed: {error}")


[TataCliq] Scraping: https://www.tatacliq.com/mens-clothing/c-msh11

[Westside] Scraping: https://www.westside.com/collections/new-in-western-wear-for-women

[Virgio] Scraping: https://www.virgio.com/collections/all

[NykaaFashion] Scraping: https://www.nykaafashion.com/women/c/6557
[TataCliq] Error: Message: unknown error: cannot determine loading status
from target frame detached
  (Session info: chrome=135.0.7049.42)
Stacktrace:
	GetHandleVerifier [0x00A680E3+60707]
	GetHandleVerifier [0x00A68124+60772]
	(No symbol) [0x008904FE]
	(No symbol) [0x00880F10]
	(No symbol) [0x0087EF30]
	(No symbol) [0x0087F9ED]
	(No symbol) [0x0088C39B]
	(No symbol) [0x0089D6E5]
	(No symbol) [0x008A2F86]
	(No symbol) [0x0087FFFD]
	(No symbol) [0x0089CF64]
	(No symbol) [0x0091EB40]
	(No symbol) [0x008FCE46]
	(No symbol) [0x008CC5D3]
	(No symbol) [0x008CD424]
	GetHandleVerifier [0x00CABBC3+2435075]
	GetHandleVerifier [0x00CA7163+2416035]
	GetHandleVerifier [0x00CC350C+2531660]
	GetHandleVerifier [0x00A7F1B