In [48]:
import string
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Configuration -----------------------------------------------------------
# Location of the chromedriver binary; adjust this for the local machine.
CHROMEDRIVER_PATH = "C:/BITBRIJESHSIR/upgrade/AI/Selenium/chromedriver-win64/chromedriver.exe"
# One collection page is requested per letter; {letter} is filled with 'a'..'z'.
COLLECTION_URL = "https://www.thespicehouse.com/collections/letter-{letter}"
# Seconds to wait for the product list to appear before giving up on a page.
PAGE_LOAD_TIMEOUT_S = 10


def _extract_spice_name(product):
    """Return the text of the product's <h3> heading, or None if it is absent.

    The heading is looked up as: first <a> inside the row -> its second <div>
    -> that div's first <h3>. Each step is guarded so that a row whose markup
    lacks one of these elements yields None instead of raising IndexError.
    """
    links = product.find_elements(By.TAG_NAME, "a")
    if not links:
        return None
    divs = links[0].find_elements(By.TAG_NAME, "div")
    if len(divs) < 2:
        return None
    headings = divs[1].find_elements(By.TAG_NAME, "h3")
    return headings[0].text if headings else None


def _parse_variant(option):
    """Build a dict describing one <option> of a product's variant selector.

    "price" is always set from the ``data-price`` attribute.
    "packaging_type" and "weight" are set only when ``data-variant-title``
    contains at least one comma (text before the first comma / stripped text
    between the first and second comma).
    """
    variant_data = {}
    variant_title = option.get_attribute("data-variant-title")
    if variant_title:
        parts = variant_title.split(",")
        if len(parts) > 1:
            variant_data["packaging_type"] = parts[0]
            variant_data["weight"] = parts[1].strip()
    variant_data["price"] = option.get_attribute("data-price")
    return variant_data


def _parse_product(product):
    """Convert one product row element into a spice record (dict).

    "spice_name" is included only when a heading was found; "variants" is
    included only when the row has a ``product__variants`` element.
    """
    spice = {}

    spice_name = _extract_spice_name(product)
    if spice_name is not None:
        spice["spice_name"] = spice_name

    variants = product.find_elements(By.CLASS_NAME, "product__variants")
    if variants:
        spice["variants"] = [
            _parse_variant(option)
            for option in variants[0].find_elements(By.TAG_NAME, "option")
        ]

    return spice


# --- Browser setup -----------------------------------------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")  # Comment this line if you want to see the browser

service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# --- Scrape ------------------------------------------------------------------
spices = []

# try/finally guarantees the browser process is shut down even if a page
# fails to load or parsing raises part-way through the alphabet.
try:
    for letter in string.ascii_lowercase:  # 'a' to 'z'
        driver.get(COLLECTION_URL.format(letter=letter))

        # Wait for the product list to load; a page without one (for example a
        # letter with no products) is skipped rather than aborting the run.
        try:
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "products-list"))
            )
        except TimeoutException:
            print(f"No product list found for letter '{letter}'; skipping.")
            continue

        for product in driver.find_elements(By.CLASS_NAME, "product-row"):
            spices.append(_parse_product(product))
finally:
    driver.quit()

# --- Persist and show --------------------------------------------------------
df = pd.DataFrame(spices)

df.to_excel("spices.xlsx", index=False, engine='openpyxl')  # Save as Excel

print(df)


                     spice_name  \
0    Adobo, Salt-Free Seasoning   
1                  Ajowan Seeds   
2      Allspice Berries, Ground   
3       Allspice Berries, Whole   
4                Amchoor Powder   
..                          ...   
280           Virtual Gift Card   
281          Vulcan's Fire Salt   
282        World's Fair BBQ Rub   
283           Yuzu Kosho Powder   
284                     Za'atar   

                                              variants  
0    [{'packaging_type': 'Jar', 'weight': '2.6oz.',...  
1    [{'packaging_type': 'Flatpack', 'weight': '1.9...  
2    [{'packaging_type': 'Jar', 'weight': '2.2oz.',...  
3    [{'packaging_type': 'Flatpack', 'weight': '1.5...  
4    [{'packaging_type': 'Flatpack', 'weight': '2.5...  
..                                                 ...  
280  [{'price': '$25.00'}, {'price': '$50.00'}, {'p...  
281  [{'packaging_type': 'Jar', 'weight': '2.5oz.',...  
282  [{'packaging_type': 'Jar', 'weight': '2.7oz.',...  
283  [{'p

In [52]:
import json
# Persist the scraped records as pretty-printed JSON, keeping non-ASCII
# characters as-is (the file itself is written as UTF-8).
with open("spices.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(spices, indent=4, ensure_ascii=False))

print("Scraping complete. Data saved to spices.json.")

Scraping complete. Data saved to spices.json.
