In [1]:
!pip install selenium




In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--headless")  # Run without opening browser
driver = webdriver.Chrome()

BASE_URL = "https://www.nykaa.com"
SKINCARE_URL = "https://www.nykaa.com/sp/skin-native-desktop/skin"

def get_category_links(): 
    """Extracts links of categories (Cleansers, Serums, etc.)."""
    driver.get(SKINCARE_URL)
    time.sleep(5)

    category_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/skin']")
    category_links = {cat.text: cat.get_attribute("href") for cat in category_elements if cat.text.strip()}

    return category_links

def get_product_links(category_url):
    """Extracts product links from a category page."""
    driver.get(category_url)
    time.sleep(5)

    product_links = []
    while True:
        # Find all product links
        products = driver.find_elements(By.CSS_SELECTOR, "a[href*='/p/']")
        for product in products:
            product_links.append(product.get_attribute("href"))

        # Check if "Next" button exists and click it
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, ".css-1zi560")  # Adjust selector if necessary
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)
        except:
            break  # No next button, stop pagination

    return list(set(product_links))  # Remove duplicates

def scrape_product_details(product_url):
    """Scrapes title, price, rating, reviews, description, and ingredients from a product page."""
    driver.get(product_url)
    time.sleep(5)

    try:
        title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
        price = driver.find_element(By.CSS_SELECTOR, ".css-111z9ua").text.strip()
        star_rating = driver.find_element(By.CSS_SELECTOR, ".css-wskh5y").text.strip() if driver.find_elements(By.CSS_SELECTOR, ".css-wskh5y") else "No rating"


        overall_rating = driver.find_element(By.CSS_SELECTOR, ".css-m6n3ou").text.strip() if driver.find_elements(By.CSS_SELECTOR, ".css-m6n3ou") else "No reviews"
        description = driver.find_element(By.CSS_SELECTOR, ".css-1l34a92").text.strip() if driver.find_elements(By.CSS_SELECTOR, ".css-1l34a92") else "No description"
        ingredients = driver.find_element(By.CSS_SELECTOR, ".css-1yuhvjn").text.strip() if driver.find_elements(By.CSS_SELECTOR, ".css-1yuhvjn") else "No ingredients"

        return {
            "Title": title,
            "Price": price,
            "Rating": rating,
            "Reviews": reviews,
            "Description": description,
            "Ingredients": ingredients,
            "URL": product_url
        }
    except Exception as e:
        print(f"Error scraping {product_url}: {e}")
        return None

def main():
    """Main function to scrape all skincare products from Nykaa."""
    category_links = get_category_links()
    all_products = []

    for category, link in category_links.items():
        print(f"Scraping category: {category}")
        product_links = get_product_links(link)

        for product_url in product_links[:200]:  # Adjust limit to avoid excessive requests
            print(f"Scraping product: {product_url}")
            product_data = scrape_product_details(product_url)
            if product_data:
                all_products.append(product_data)
            time.sleep(2)  # Prevent being blocked

    # Save data to CSV
    df = pd.DataFrame(all_products)
    df.to_csv("nykaa_skincare_products.csv", index=False)
    print("Data saved to nykaa_skincare_products.csv")

    driver.quit()  # Close browser

if __name__ == "__main__":
    main()


Scraping category: Skin
Data saved to nykaa_skincare_products.csv


In [None]:
category_links = {

"SERUM": "https://www.nykaa.com/skin/moisturizers/serums-essence/c/8397?transaction_id=30c6d4d33475ff089bb0e3d5a55f62b0",
"CLEANSERS": "https://www.nykaa.com/skin/cleansers/c/8378?desktop&category_filter=8380,8379&transaction_id=277e576ff773ed9dbe654081fbeb525e",
#MOISTURIZER: 'https://www.nykaa.com/skin/moisturizers/face-moisturizer-day-cream/c/8394?transaction_id=05d46f9acc7f15fa39709dc25663f59b'
"TONERS": "https://www.nykaa.com/skin/moisturizers/face-moisturizer-day-cream/c/8394?transaction_id=05d46f9acc7f15fa39709dc25663f59b",
"UNDER_EYE_CREAM" : "https://www.nykaa.com/skin/eye-care/under-eye-cream-serums/c/8403?transaction_id=147658577dce55491e3e43f4c458a7e5",
"FACE_OIL":"https://www.nykaa.com/skin/moisturizers/face-oils/c/8396",
"MOISTURIZER":"https://www.nykaa.com/skin/moisturizers/c/8393",
"SUNSCREEN" : "https://www.nykaa.com/skin/sun-care/c/8428?transaction_id=e80fa585841e81ffb26274f509d1b28e"}



In [3]:
category_links

{'SERUM': 'https://www.nykaa.com/skin/moisturizers/serums-essence/c/8397?transaction_id=30c6d4d33475ff089bb0e3d5a55f62b0',
 'CLEANSERS': 'https://www.nykaa.com/skin/cleansers/c/8378?desktop&category_filter=8380,8379&transaction_id=277e576ff773ed9dbe654081fbeb525e',
 'TONERS': 'https://www.nykaa.com/skin/moisturizers/face-moisturizer-day-cream/c/8394?transaction_id=05d46f9acc7f15fa39709dc25663f59b',
 'UNDER_EYE_CREAM': 'https://www.nykaa.com/skin/eye-care/under-eye-cream-serums/c/8403?transaction_id=147658577dce55491e3e43f4c458a7e5',
 'FACE_OIL': 'https://www.nykaa.com/skin/moisturizers/face-oils/c/8396',
 'MOISTURIZER': 'https://www.nykaa.com/skin/moisturizers/c/8393',
 'SUNSCREEN': 'https://www.nykaa.com/skin/sun-care/c/8428?transaction_id=e80fa585841e81ffb26274f509d1b28e'}

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import re
from selenium.common.exceptions import WebDriverException, TimeoutException

# Set up WebDriver options
options = webdriver.ChromeOptions()

def start_driver():
    """Starts a new WebDriver session with error handling."""
    try:
        driver = webdriver.Chrome(options=options)
        wait = WebDriverWait(driver, 15)
        return driver, wait
    except WebDriverException as e:
        print(f"❌ WebDriver failed to start: {e}")
        time.sleep(5)
        return start_driver()

# Start WebDriver session
driver, wait = start_driver()

for category, category_url in category_links.items():
    print(f"\n🔗 Scraping Category: {category}")
    
    data = []  # Reset data for each category
    scraped_urls = set()
    page_no = 1
    product_links = set()
    prev_page_links = set()

    while True:
        try:
            parsed_url = urlparse(category_url)
            query_params = parse_qs(parsed_url.query)
            query_params["page_no"] = [str(page_no)]
            new_query_string = urlencode(query_params, doseq=True)
            new_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, new_query_string, parsed_url.fragment))

            driver.get(new_url)
            time.sleep(2)

            print(f"\n📄 Scraping Page {page_no} of {category}")
            
            current_page_links = {p.get_attribute("href") for p in driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")}
            
            if not current_page_links or current_page_links == prev_page_links:
                print("🚀 No more products found or duplicate page detected. Stopping pagination.")
                break
            
            product_links.update(current_page_links)
            print(f"✅ Found {len(current_page_links)} products on page {page_no}. Total: {len(product_links)}")

            prev_page_links = current_page_links
            page_no += 1
        
        except WebDriverException as e:
            print(f"⚠️ Network error on page {page_no}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same page

    for product_url in product_links:
        if product_url in scraped_urls:
            continue  # Skip already scraped products
        
        try:
            driver.get(product_url)
            time.sleep(2)

            product_name = wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))).text.strip()
            
            # Handle popups
            try:
                close_popup = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), '×')]")))
                close_popup.click()
                time.sleep(1)
            except:
                pass

            try:
                product_price = driver.find_element(By.XPATH, "//span[contains(@class, 'css') and contains(text(), '₹')]").text.strip()
            except:
                product_price = "Price not available"
            
            try:
                overall_rating = driver.find_element(By.CSS_SELECTOR, ".css-wskh5y").text.strip()
            except:
                overall_rating = "No overall rating"
            
            try:
                star_rating = driver.find_element(By.CSS_SELECTOR, "div.css-m6n3ou").text.strip()
            except:
                star_rating = "No star rating"
            
            # Extract Skin Type Suitability
            skin_types = ["Combination", "Dry", "Normal", "Oily", "Sensitive"]
            skin_suitability = {st: 0 for st in skin_types}  
            
            page_text = driver.page_source.lower()
            for st in skin_types:
                if re.search(rf"\b{st.lower()}\b", page_text):
                    skin_suitability[st] = 1
            
            try:
                ingredients = "Not available"
                # Click the "Ingredients" tab if available
                try:
                    ingredients_tab = driver.find_element(By.XPATH, "//h3[contains(text(),'Ingredients')]")
                    driver.execute_script("arguments[0].click();", ingredients_tab)
                    time.sleep(2)
                except:
                    pass
                # Click "Read More" if present
                try:
                    read_more_button = driver.find_element(By.CSS_SELECTOR, "div.css-1eymbsg a")
                    driver.execute_script("arguments[0].click();", read_more_button)
                    time.sleep(2)
                except:
                    pass
                # Extract ingredients
                ingredients_elements = driver.find_elements(By.CSS_SELECTOR, "div#content-details p")
                ingredients = " | ".join([elem.text.strip() for elem in ingredients_elements if elem.text.strip()])
            except Exception as e:
                ingredients = f"Error extracting: {e}"
            
            data.append([
                category, product_name, product_price, star_rating, overall_rating,
                skin_suitability["Combination"], skin_suitability["Dry"],
                skin_suitability["Normal"], skin_suitability["Oily"], skin_suitability["Sensitive"], 
                ingredients, product_url
            ])
            
            scraped_urls.add(product_url)
            print(f" Category:{category} |📦 Scraped: {product_name} - {product_price} | ⭐ {star_rating} | Verified: {overall_rating} | URL: {product_url} | Skin Types: {skin_suitability} | Ingredients: {ingredients}")
        
        except WebDriverException as e:
            print(f"⚠️ Network error on product {product_url}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same product
        
        except Exception as e:
            print(f"❌ Error scraping {product_url}: {e}")

    # Save data per category
    df = pd.DataFrame(data, columns=["Category", "Product Name", "Price", "Star Rating", "Overall Rating",
                                     "Combination", "Dry", "Normal", "Oily", "Sensitive", "Ingredients", "Product URL"])
    filename = f"nykaa_{category.replace(' ', '_').lower()}.csv"
    df.to_csv(filename, index=False, encoding="utf-8")
    print(f"\n✅ Data for '{category}' saved to {filename}")

# Close WebDriver
driver.quit()
print("\n✅ Scraping completed!")



🔗 Scraping Category: SERUM

📄 Scraping Page 1 of SERUM
✅ Found 21 products on page 1. Total: 21

📄 Scraping Page 2 of SERUM
✅ Found 21 products on page 2. Total: 41

📄 Scraping Page 3 of SERUM
✅ Found 21 products on page 3. Total: 61

📄 Scraping Page 4 of SERUM
✅ Found 21 products on page 4. Total: 81

📄 Scraping Page 5 of SERUM
✅ Found 21 products on page 5. Total: 101

📄 Scraping Page 6 of SERUM
✅ Found 21 products on page 6. Total: 121

📄 Scraping Page 7 of SERUM
✅ Found 21 products on page 7. Total: 141

📄 Scraping Page 8 of SERUM
✅ Found 21 products on page 8. Total: 161

📄 Scraping Page 9 of SERUM
✅ Found 21 products on page 9. Total: 180

📄 Scraping Page 10 of SERUM
✅ Found 21 products on page 10. Total: 200

📄 Scraping Page 11 of SERUM
✅ Found 21 products on page 11. Total: 220

📄 Scraping Page 12 of SERUM
✅ Found 21 products on page 12. Total: 240

📄 Scraping Page 13 of SERUM
✅ Found 21 products on page 13. Total: 260

📄 Scraping Page 14 of SERUM
✅ Found 21 products on page 1

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=64681): Read timed out. (read timeout=120)

In [5]:
category_links = {

"TONERS": "https://www.nykaa.com/skin/moisturizers/face-moisturizer-day-cream/c/8394?transaction_id=05d46f9acc7f15fa39709dc25663f59b",
"UNDER_EYE_CREAM" : "https://www.nykaa.com/skin/eye-care/under-eye-cream-serums/c/8403?transaction_id=147658577dce55491e3e43f4c458a7e5",
"FACE_OIL":"https://www.nykaa.com/skin/moisturizers/face-oils/c/8396",
"MOISTURIZER":"https://www.nykaa.com/skin/moisturizers/c/8393",
"SUNSCREEN" : "https://www.nykaa.com/skin/sun-care/c/8428?transaction_id=e80fa585841e81ffb26274f509d1b28e"}



In [6]:
category_links

{'TONERS': 'https://www.nykaa.com/skin/moisturizers/face-moisturizer-day-cream/c/8394?transaction_id=05d46f9acc7f15fa39709dc25663f59b',
 'UNDER_EYE_CREAM': 'https://www.nykaa.com/skin/eye-care/under-eye-cream-serums/c/8403?transaction_id=147658577dce55491e3e43f4c458a7e5',
 'FACE_OIL': 'https://www.nykaa.com/skin/moisturizers/face-oils/c/8396',
 'MOISTURIZER': 'https://www.nykaa.com/skin/moisturizers/c/8393',
 'SUNSCREEN': 'https://www.nykaa.com/skin/sun-care/c/8428?transaction_id=e80fa585841e81ffb26274f509d1b28e'}

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import re
from selenium.common.exceptions import WebDriverException, TimeoutException

# Set up WebDriver options
options = webdriver.ChromeOptions()

def start_driver():
    """Starts a new WebDriver session with error handling."""
    try:
        driver = webdriver.Chrome(options=options)
        wait = WebDriverWait(driver, 15)
        return driver, wait
    except WebDriverException as e:
        print(f"❌ WebDriver failed to start: {e}")
        time.sleep(5)
        return start_driver()

# Start WebDriver session
driver, wait = start_driver()

for category, category_url in category_links.items():
    print(f"\n🔗 Scraping Category: {category}")
    
    data = []  # Reset data for each category
    scraped_urls = set()
    page_no = 1
    product_links = set()
    prev_page_links = set()

    while True:
        try:
            parsed_url = urlparse(category_url)
            query_params = parse_qs(parsed_url.query)
            query_params["page_no"] = [str(page_no)]
            new_query_string = urlencode(query_params, doseq=True)
            new_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, new_query_string, parsed_url.fragment))

            driver.get(new_url)
            time.sleep(2)

            print(f"\n📄 Scraping Page {page_no} of {category}")
            
            current_page_links = {p.get_attribute("href") for p in driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")}
            
            if not current_page_links or current_page_links == prev_page_links:
                print("🚀 No more products found or duplicate page detected. Stopping pagination.")
                break
            
            product_links.update(current_page_links)
            print(f"✅ Found {len(current_page_links)} products on page {page_no}. Total: {len(product_links)}")

            prev_page_links = current_page_links
            page_no += 1
        
        except WebDriverException as e:
            print(f"⚠️ Network error on page {page_no}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same page

    for product_url in product_links:
        if product_url in scraped_urls:
            continue  # Skip already scraped products
        
        try:
            driver.get(product_url)
            time.sleep(2)

            product_name = wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))).text.strip()
            
            # Handle popups
            try:
                close_popup = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), '×')]")))
                close_popup.click()
                time.sleep(1)
            except:
                pass

            try:
                product_price = driver.find_element(By.XPATH, "//span[contains(@class, 'css') and contains(text(), '₹')]").text.strip()
            except:
                product_price = "Price not available"
            
            try:
                overall_rating = driver.find_element(By.CSS_SELECTOR, ".css-wskh5y").text.strip()
            except:
                overall_rating = "No overall rating"
            
            try:
                star_rating = driver.find_element(By.CSS_SELECTOR, "div.css-m6n3ou").text.strip()
            except:
                star_rating = "No star rating"
            
            # Extract Skin Type Suitability
            skin_types = ["Combination", "Dry", "Normal", "Oily", "Sensitive"]
            skin_suitability = {st: 0 for st in skin_types}  
            
            page_text = driver.page_source.lower()
            for st in skin_types:
                if re.search(rf"\b{st.lower()}\b", page_text):
                    skin_suitability[st] = 1
            
            try:
                ingredients = "Not available"
                # Click the "Ingredients" tab if available
                try:
                    ingredients_tab = driver.find_element(By.XPATH, "//h3[contains(text(),'Ingredients')]")
                    driver.execute_script("arguments[0].click();", ingredients_tab)
                    time.sleep(2)
                except:
                    pass
                # Click "Read More" if present
                try:
                    read_more_button = driver.find_element(By.CSS_SELECTOR, "div.css-1eymbsg a")
                    driver.execute_script("arguments[0].click();", read_more_button)
                    time.sleep(2)
                except:
                    pass
                # Extract ingredients
                ingredients_elements = driver.find_elements(By.CSS_SELECTOR, "div#content-details p")
                ingredients = " | ".join([elem.text.strip() for elem in ingredients_elements if elem.text.strip()])
            except Exception as e:
                ingredients = f"Error extracting: {e}"
            
            data.append([
                category, product_name, product_price, star_rating, overall_rating,
                skin_suitability["Combination"], skin_suitability["Dry"],
                skin_suitability["Normal"], skin_suitability["Oily"], skin_suitability["Sensitive"], 
                ingredients, product_url
            ])
            
            scraped_urls.add(product_url)
            print(f" Category:{category} |📦 Scraped: {product_name} - {product_price} | ⭐ {star_rating} | Verified: {overall_rating} | URL: {product_url} | Skin Types: {skin_suitability} | Ingredients: {ingredients}")
        
        except WebDriverException as e:
            print(f"⚠️ Network error on product {product_url}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same product
        
        except Exception as e:
            print(f"❌ Error scraping {product_url}: {e}")

    # Save data per category
    df = pd.DataFrame(data, columns=["Category", "Product Name", "Price", "Star Rating", "Overall Rating",
                                     "Combination", "Dry", "Normal", "Oily", "Sensitive", "Ingredients", "Product URL"])
    filename = f"nykaa_{category.replace(' ', '_').lower()}.csv"
    df.to_csv(filename, index=False, encoding="utf-8")
    print(f"\n✅ Data for '{category}' saved to {filename}")

# Close WebDriver
driver.quit()
print("\n✅ Scraping completed!")



🔗 Scraping Category: TONERS

📄 Scraping Page 1 of TONERS
✅ Found 20 products on page 1. Total: 20

📄 Scraping Page 2 of TONERS
✅ Found 20 products on page 2. Total: 40

📄 Scraping Page 3 of TONERS
✅ Found 20 products on page 3. Total: 60

📄 Scraping Page 4 of TONERS
✅ Found 20 products on page 4. Total: 80

📄 Scraping Page 5 of TONERS
✅ Found 20 products on page 5. Total: 100

📄 Scraping Page 6 of TONERS
✅ Found 20 products on page 6. Total: 120

📄 Scraping Page 7 of TONERS
✅ Found 20 products on page 7. Total: 140

📄 Scraping Page 8 of TONERS
✅ Found 20 products on page 8. Total: 160

📄 Scraping Page 9 of TONERS
✅ Found 20 products on page 9. Total: 180

📄 Scraping Page 10 of TONERS
✅ Found 20 products on page 10. Total: 200

📄 Scraping Page 11 of TONERS
✅ Found 20 products on page 11. Total: 220

📄 Scraping Page 12 of TONERS
✅ Found 20 products on page 12. Total: 240

📄 Scraping Page 13 of TONERS
✅ Found 20 products on page 13. Total: 260

📄 Scraping Page 14 of TONERS
✅ Found 20 pro

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


⚠️ Network error on product https://www.nykaa.com/skinfood-carrot-carotene-moist-effector/p/12886673?productId=12886673&pps=9, retrying... Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=134.0.6998.89)
Stacktrace:
	GetHandleVerifier [0x00007FF6FF5CFE65+26629]
	(No symbol) [0x00007FF6FF536030]
	(No symbol) [0x00007FF6FF3C931A]
	(No symbol) [0x00007FF6FF3C5D10]
	(No symbol) [0x00007FF6FF3B68C9]
	(No symbol) [0x00007FF6FF3B8638]
	(No symbol) [0x00007FF6FF3B6BD6]
	(No symbol) [0x00007FF6FF3B6656]
	(No symbol) [0x00007FF6FF3B631A]
	(No symbol) [0x00007FF6FF3B3E9F]
	(No symbol) [0x00007FF6FF3B478C]
	(No symbol) [0x00007FF6FF3CD16A]
	(No symbol) [0x00007FF6FF47102E]
	(No symbol) [0x00007FF6FF447AAA]
	(No symbol) [0x00007FF6FF470169]
	(No symbol) [0x00007FF6FF447883]
	(No symbol) [0x00007FF6FF410550]
	(No symbol) [0x00007FF6FF411803]
	GetHandleVerifier [0x00007FF6FF9272DD+3529853]
	GetHandleVerifier [0x00007FF6FF93DA42+3621858]
	GetHandleVerifier [0x00007FF6FF932

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache


⚠️ Network error on product https://www.nykaa.com/mamaearth-milky-soft-face-cream-for-babies-with-milk-protein-murumuru-butter-50gm/p/290032?productId=290032&pps=4, retrying... Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=134.0.6998.89)
Stacktrace:
	GetHandleVerifier [0x00007FF6FF5CFE65+26629]
	(No symbol) [0x00007FF6FF536030]
	(No symbol) [0x00007FF6FF3C931A]
	(No symbol) [0x00007FF6FF3C5D10]
	(No symbol) [0x00007FF6FF3B68C9]
	(No symbol) [0x00007FF6FF3B8638]
	(No symbol) [0x00007FF6FF3B6BD6]
	(No symbol) [0x00007FF6FF3B6656]
	(No symbol) [0x00007FF6FF3B631A]
	(No symbol) [0x00007FF6FF3B3E9F]
	(No symbol) [0x00007FF6FF3B478C]
	(No symbol) [0x00007FF6FF3CD16A]
	(No symbol) [0x00007FF6FF47102E]
	(No symbol) [0x00007FF6FF447AAA]
	(No symbol) [0x00007FF6FF470169]
	(No symbol) [0x00007FF6FF447883]
	(No symbol) [0x00007FF6FF410550]
	(No symbol) [0x00007FF6FF411803]
	GetHandleVerifier [0x00007FF6FF9272DD+3529853]
	GetHandleVerifier [0x00007FF6FF93DA42+362185

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


⚠️ Network error on product https://www.nykaa.com/lotus-herbals-whiteglow-skin-whitening-brightening-deep-moisturising-cream-spf-20-pa-40-g/p/11110689?productId=11110689&pps=18, retrying... Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=134.0.6998.89)
Stacktrace:
	GetHandleVerifier [0x00007FF6FF5CFE65+26629]
	(No symbol) [0x00007FF6FF536030]
	(No symbol) [0x00007FF6FF3C931A]
	(No symbol) [0x00007FF6FF3C5D10]
	(No symbol) [0x00007FF6FF3B68C9]
	(No symbol) [0x00007FF6FF3B8638]
	(No symbol) [0x00007FF6FF3B6BD6]
	(No symbol) [0x00007FF6FF3B6656]
	(No symbol) [0x00007FF6FF3B631A]
	(No symbol) [0x00007FF6FF3B3E9F]
	(No symbol) [0x00007FF6FF3B478C]
	(No symbol) [0x00007FF6FF3CD16A]
	(No symbol) [0x00007FF6FF47102E]
	(No symbol) [0x00007FF6FF447AAA]
	(No symbol) [0x00007FF6FF470169]
	(No symbol) [0x00007FF6FF447883]
	(No symbol) [0x00007FF6FF410550]
	(No symbol) [0x00007FF6FF411803]
	GetHandleVerifier [0x00007FF6FF9272DD+3529853]
	GetHandleVerifier [0x00007FF6FF93DA

In [1]:
category_links = {

"SUNSCREEN" : "https://www.nykaa.com/skin/sun-care/c/8428?transaction_id=e80fa585841e81ffb26274f509d1b28e"}



In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import re
from selenium.common.exceptions import WebDriverException, TimeoutException

# Set up WebDriver options
options = webdriver.ChromeOptions()

def start_driver():
    """Starts a new WebDriver session with error handling."""
    try:
        driver = webdriver.Chrome(options=options)
        wait = WebDriverWait(driver, 15)
        return driver, wait
    except WebDriverException as e:
        print(f"❌ WebDriver failed to start: {e}")
        time.sleep(5)
        return start_driver()

# Start WebDriver session
driver, wait = start_driver()

for category, category_url in category_links.items():
    print(f"\n🔗 Scraping Category: {category}")
    
    data = []  # Reset data for each category
    scraped_urls = set()
    page_no = 1
    product_links = set()
    prev_page_links = set()

    while True:
        try:
            parsed_url = urlparse(category_url)
            query_params = parse_qs(parsed_url.query)
            query_params["page_no"] = [str(page_no)]
            new_query_string = urlencode(query_params, doseq=True)
            new_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, new_query_string, parsed_url.fragment))

            driver.get(new_url)
            time.sleep(2)

            print(f"\n📄 Scraping Page {page_no} of {category}")
            
            current_page_links = {p.get_attribute("href") for p in driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")}
            
            if not current_page_links or current_page_links == prev_page_links:
                print("🚀 No more products found or duplicate page detected. Stopping pagination.")
                break
            
            product_links.update(current_page_links)
            print(f"✅ Found {len(current_page_links)} products on page {page_no}. Total: {len(product_links)}")

            prev_page_links = current_page_links
            page_no += 1
        
        except WebDriverException as e:
            print(f"⚠️ Network error on page {page_no}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same page

    for product_url in product_links:
        if product_url in scraped_urls:
            continue  # Skip already scraped products
        
        try:
            driver.get(product_url)
            time.sleep(2)

            product_name = wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))).text.strip()
            
            # Handle popups
            try:
                close_popup = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), '×')]")))
                close_popup.click()
                time.sleep(1)
            except:
                pass

            try:
                product_price = driver.find_element(By.XPATH, "//span[contains(@class, 'css') and contains(text(), '₹')]").text.strip()
            except:
                product_price = "Price not available"
            
            try:
                overall_rating = driver.find_element(By.CSS_SELECTOR, ".css-wskh5y").text.strip()
            except:
                overall_rating = "No overall rating"
            
            try:
                star_rating = driver.find_element(By.CSS_SELECTOR, "div.css-m6n3ou").text.strip()
            except:
                star_rating = "No star rating"
            
            # Extract Skin Type Suitability
            skin_types = ["Combination", "Dry", "Normal", "Oily", "Sensitive"]
            skin_suitability = {st: 0 for st in skin_types}  
            
            page_text = driver.page_source.lower()
            for st in skin_types:
                if re.search(rf"\b{st.lower()}\b", page_text):
                    skin_suitability[st] = 1
            
            try:
                ingredients = "Not available"
                # Click the "Ingredients" tab if available
                try:
                    ingredients_tab = driver.find_element(By.XPATH, "//h3[contains(text(),'Ingredients')]")
                    driver.execute_script("arguments[0].click();", ingredients_tab)
                    time.sleep(2)
                except:
                    pass
                # Click "Read More" if present
                try:
                    read_more_button = driver.find_element(By.CSS_SELECTOR, "div.css-1eymbsg a")
                    driver.execute_script("arguments[0].click();", read_more_button)
                    time.sleep(2)
                except:
                    pass
                # Extract ingredients
                ingredients_elements = driver.find_elements(By.CSS_SELECTOR, "div#content-details p")
                ingredients = " | ".join([elem.text.strip() for elem in ingredients_elements if elem.text.strip()])
            except Exception as e:
                ingredients = f"Error extracting: {e}"
            
            data.append([
                category, product_name, product_price, star_rating, overall_rating,
                skin_suitability["Combination"], skin_suitability["Dry"],
                skin_suitability["Normal"], skin_suitability["Oily"], skin_suitability["Sensitive"], 
                ingredients, product_url
            ])
            
            scraped_urls.add(product_url)
            print(f" Category:{category} |📦 Scraped: {product_name} - {product_price} | ⭐ {star_rating} | Verified: {overall_rating} | URL: {product_url} | Skin Types: {skin_suitability} | Ingredients: {ingredients}")
        
        except WebDriverException as e:
            print(f"⚠️ Network error on product {product_url}, retrying... {e}")
            driver.quit()
            time.sleep(5)
            driver, wait = start_driver()
            continue  # Retry the same product
        
        except Exception as e:
            print(f"❌ Error scraping {product_url}: {e}")

    # Save data per category
    df = pd.DataFrame(data, columns=["Category", "Product Name", "Price", "Star Rating", "Overall Rating",
                                     "Combination", "Dry", "Normal", "Oily", "Sensitive", "Ingredients", "Product URL"])
    filename = f"nykaa_{category.replace(' ', '_').lower()}.csv"
    df.to_csv(filename, index=False, encoding="utf-8")
    print(f"\n✅ Data for '{category}' saved to {filename}")

# Close WebDriver
driver.quit()
print("\n✅ Scraping completed!")



🔗 Scraping Category: SUNSCREEN

📄 Scraping Page 1 of SUNSCREEN
✅ Found 22 products on page 1. Total: 22

📄 Scraping Page 2 of SUNSCREEN
✅ Found 22 products on page 2. Total: 42

📄 Scraping Page 3 of SUNSCREEN
✅ Found 22 products on page 3. Total: 62

📄 Scraping Page 4 of SUNSCREEN
✅ Found 21 products on page 4. Total: 82

📄 Scraping Page 5 of SUNSCREEN
✅ Found 21 products on page 5. Total: 102

📄 Scraping Page 6 of SUNSCREEN
✅ Found 21 products on page 6. Total: 122

📄 Scraping Page 7 of SUNSCREEN
✅ Found 21 products on page 7. Total: 142

📄 Scraping Page 8 of SUNSCREEN
✅ Found 21 products on page 8. Total: 162

📄 Scraping Page 9 of SUNSCREEN
✅ Found 21 products on page 9. Total: 182

📄 Scraping Page 10 of SUNSCREEN
✅ Found 21 products on page 10. Total: 202

📄 Scraping Page 11 of SUNSCREEN
✅ Found 21 products on page 11. Total: 222

📄 Scraping Page 12 of SUNSCREEN
✅ Found 21 products on page 12. Total: 242

📄 Scraping Page 13 of SUNSCREEN
✅ Found 21 products on page 13. Total: 262

📄

In [3]:
df

Unnamed: 0,Category,Product Name,Price,Star Rating,Overall Rating,Combination,Dry,Normal,Oily,Sensitive,Ingredients,Product URL
0,SUNSCREEN,Indulgeo Essentials U-We Protect Hydrating Sun...,₹842,4.8/5,No overall rating,1,1,1,1,0,,https://www.nykaa.com/indulgeo-essentials-u-we...
1,SUNSCREEN,VI-JOHN Sunscreen SPF 50 PA+++ Brightening Ski...,₹319,5/5,( 1 ),1,1,1,1,0,,https://www.nykaa.com/vi-john-sunscreen-spf-50...
2,SUNSCREEN,Aroma Magic Cucumber Sunscreen Lotion SPF 30 U...,₹355,4.3/5,No overall rating,1,1,1,1,1,,https://www.nykaa.com/aroma-magic-cucumber-sun...
3,SUNSCREEN,Missha Glow Cushion Light No.21P Fair SPF 37 P...,₹2117,3.7/5,( 20 ),1,1,1,1,0,,https://www.nykaa.com/missha-glow-cushion-ligh...
4,SUNSCREEN,Skin1004 Madagascar Centella Hyalu-Cica Silky-...,₹2039,No star rating,No overall rating,1,1,1,1,0,,https://www.nykaa.com/skin1004-madagascar-cent...
...,...,...,...,...,...,...,...,...,...,...,...,...
1159,SUNSCREEN,Sotrue SPF 50+ Glow Stick Sunscreen For Invisi...,₹479,4.3/5,( 27 ),1,1,1,1,0,,https://www.nykaa.com/sotrue-spf-50-glow-stick...
1160,SUNSCREEN,FCL Light Weight NonGreasy Broad Spectrum SPF ...,₹1075,4.1/5,( 82 ),1,1,1,1,1,,https://www.nykaa.com/fixderma-light-weight-no...
1161,SUNSCREEN,Lacto Calamine Skin & Body Lotion Moisturizer ...,₹272,4.3/5,( 289 ),1,1,1,1,0,,https://www.nykaa.com/lacto-calamine-face-loti...
1162,SUNSCREEN,Plum Bright Boost Duo Hydrate & Protect - Suit...,₹848,4.3/5,( 29225 ),1,1,1,1,0,,https://www.nykaa.com/plum-bright-boost-duo-hy...


In [6]:
import pandas as pd

# Define column names
columns = [
    "Category", "Product Name", "Product Price", "Star Rating", "Overall Rating",
    "Combination Skin", "Dry Skin", "Normal Skin", "Oily Skin", "Sensitive Skin",
    "Ingredients", "Product URL"
]

# Create a DataFrame
df = pd.DataFrame(data, columns=columns)

# Save to Excel file
df.to_excel("scraped_nykaa_sunscreen.xlsx", index=False, engine="openpyxl")

print("✅ Data saved to 'scraped_nykaa_sunscreen.xlsx'")


✅ Data saved to 'scraped_nykaa_sunscreen.xlsx'
