In [19]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [20]:
# Site root; used as the base for the product links found on listing pages.
url_unique = "https://www.superga.com"
# Root of the collection listings, derived from the site root.
url_base = f"{url_unique}/collections/"
# Listing path plus filter query string; the page number is appended at request time.
url_page = (
    "womens-low-cut"
    "?filter.p.m.custom.seasons=Spring%2FSummer"
    "&page="
)
# HTTP headers sent with the listing-page requests.
header = {
    "User-Agent": "Chrome/39.0",
}

In [21]:
# Accumulator for the scraped records (one dict per row of the final table).
rows = list()

# Headless Chrome session: no browser window is shown while pages are loaded.
# webdriver_manager supplies the path of a chromedriver binary for the Service.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(
    options=options,
    service=Service(ChromeDriverManager().install()),
)

In [22]:
# Crawl the paginated listing, open every product page in the headless browser
# and append one dict per (colour, size) pair to `rows`, then build `df`.
#
# Reads url_unique, url_base, url_page, header, rows and driver from the
# earlier cells. The browser is closed when the crawl ends (even on error or
# interruption), so re-run the driver setup cell before running this cell again.
LAST_PAGE = 6            # listing pages 1..LAST_PAGE are requested
REQUEST_TIMEOUT_S = 30   # abort a stalled listing request instead of hanging forever
RENDER_WAIT_S = 5        # fixed pause that lets the product page's JavaScript render


def _text_or_empty(tag):
    """Return the stripped text of a BeautifulSoup tag, or "" when the tag is missing."""
    return tag.text.strip() if tag else ""


try:
    for page_num in range(1, LAST_PAGE + 1):
        # Construct the full URL for the current listing page
        url = url_base + url_page + str(page_num)
        print(f"Scraping page: {url}")

        # Request the listing page; a network error or HTTP error status skips
        # this page instead of hanging or parsing an error page as "0 products".
        try:
            page = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT_S)
            page.raise_for_status()
        except requests.RequestException as e:
            print(f"Skipping page {page_num}. Error: {e}")
            continue
        soup = BeautifulSoup(page.text, 'html.parser')

        # Find all product links on the current page
        scarpebasse = soup.find_all('a', class_="grid-product__link")
        print(f"Found {len(scarpebasse)} product links on page {page_num}")

        for product in scarpebasse:
            # Best-effort per product: one broken page must not stop the crawl.
            try:
                url_scarpebasse = product.get('href')
                if not url_scarpebasse:
                    print(f"Skipping product without href: {product}")
                    continue

                # urljoin copes with relative hrefs (with or without a leading
                # slash) as well as absolute ones.
                url1 = urljoin(url_unique, url_scarpebasse)
                driver.get(url1)

                # Wait for JavaScript to render the page
                time.sleep(RENDER_WAIT_S)

                # Parse the rendered HTML
                soup_scarpebasse = BeautifulSoup(driver.page_source, 'html.parser')

                # Locate the product details
                name_scarpebasse = soup_scarpebasse.find('h1', class_='h2 product-single__title')
                colore_scarpebasse = soup_scarpebasse.find('span', class_='cb-italic cb-block cb-font-500')
                prezzo_originale_scarpebasse = soup_scarpebasse.find('span', class_='related__price mr')
                colors_divs = soup_scarpebasse.find_all('div', class_='related__color-name skeleton-effect')
                sizes = soup_scarpebasse.find('ul', class_='related__select-optgroup-options')

                # Missing elements become empty strings
                nomi = _text_or_empty(name_scarpebasse)
                colore = _text_or_empty(colore_scarpebasse)
                prezzo_originale = _text_or_empty(prezzo_originale_scarpebasse)

                # Remove the colour from the name and drop the whitespace it leaves behind
                if colore and colore in nomi:
                    nomi = nomi.replace(colore, "").strip()

                # All non-empty colour options
                all_colors = [
                    text for text in (div.text.strip() for div in colors_divs) if text
                ]

                # All non-empty size options
                size_items = sizes.find_all('li') if sizes else []
                all_size = [
                    text for text in (item.text.strip() for item in size_items) if text
                ]

                print(f"Name: {nomi}, Color: {colore}, Price: {prezzo_originale}, Colors: {all_colors}, Sizes: {all_size}")

                # Drop the first character (the currency symbol); computed once per product
                price = prezzo_originale[1:]

                # One row per colour/size combination
                rows_before = len(rows)
                for color in all_colors:
                    for size in all_size:
                        rows.append({
                            'name': nomi,  # Product name
                            'color': color,  # Color option
                            'size': size[:2],  # Size option (only first 2 characters for size)
                            'price_sale': "",  # Sale price (if any)
                            'price': price  # Original price without the currency symbol
                        })
                print(f"Added {len(rows) - rows_before} rows for {nomi}")

            except Exception as e:
                print(f"Error processing product: {product}. Error: {e}")
finally:
    # Always release the headless browser so no Chrome process is left running.
    driver.quit()

# Build the DataFrame once from the accumulated rows
df = pd.DataFrame(rows)

# Report whether anything was collected
if df.empty:
    print("No data was collected.")
else:
    print("Data successfully collected!")
    print(df)

Scraping page: https://www.superga.com/collections/womens-low-cut?filter.p.m.custom.seasons=Spring%2FSummer&page=1
Found 28 product links on page 1
Name: 2750-COTU CLASSIC, Color: WHITE, Price: €65.00, Colors: ['AZURE ICE-FAVORIO', 'AZURE LT-F AVORIO', 'AZURE TURQUOISE', 'AZURE TURQUOISE-FAVORIO', 'BEIGE', 'BEIGE GESSO', 'BEIGE LT-F AVORIO', 'BEIGE NATURAL-FAVORIO', 'BLACK', 'BLACK-FWHITE', 'BLUE BALLAD', 'BLUE COLD', 'BLUE LT CYANEUS-FAVORIO', 'BLUE LT DUSTY-FAVORIO', 'BLUE LT GREY-FAVORIO', 'BLUE NAVY', 'BLUE SPECTRUM-FAVORIO', 'BROWN DUSTY', 'BROWN LT CURRY-FAVORIO', 'BROWN REDDISH-FAVORIO', 'COTTON CANDY', 'DK BORDEAUX', 'FULL BLACK', 'GRAY DK SAGE', 'GREEN ICEBERG-F AVORIO', 'GREEN LT-FAVORIO', 'GREEN OLIVE-F AVORIO', 'GREEN PRIMROSE-F AVORIO', 'GREEN SAFARI-F AVORIO', 'GREEN SHERWOOD', 'GREEN WATER-FAVORIO', 'GREY ASH', 'GREY MD-FAVORIO', 'GREY SILVER-FAVORIO', 'ISLAND GREEN', 'LT GREY', 'LT GREY-ROSE GOLD', 'NAVY-FWHITE', 'ORANGE', 'ORANGE APRICOT-F AVORIO', 'ORANGE TOMATO', 'OR

In [23]:
# Persist the scraped table as CSV; the DataFrame index is not written.
df.to_csv(path_or_buf='db_scarpebasse.csv', index=False)