In [13]:
from bs4 import BeautifulSoup
import pandas as pd 
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

In [14]:
url_unique = 'https://www.superga.com'
url_base = 'https://www.superga.com/collections/'
url_page = 'womens-platform?filter.p.m.custom.seasons=Spring%2FSummer' 
header = {"User-Agent": "Chrome/39.0"}

In [15]:
# Create a list to store rows
rows = []

# Set up the Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode (no UI)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [None]:
# Loop through all pages (e.g., 6 pages)
for page_num in range(1, 4):  # Loop for pages 1 to 6
    # Construct the full URL for the current page
    url = url_base + url_page + "&page"+ str(page_num)
    print(f"Scraping page: {url}")
    
    # Request the page content
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find all product links on the current page
    platform = soup.find_all('a', class_="grid-product__link")

    # Check if products were found
    print(f"Found {len(platform)} product links on page {page_num}")

    # Loop through each product link
    for product in platform:
        try:
            # Get the product URL
            url_platform = product.get('href')  # Get the link from the product
            url1 = url_unique + url_platform  # Construct the full product URL
            driver.get(url1)  # Load the product page

            # Wait for JavaScript to render the page
            time.sleep(5)

            # Extract the rendered HTML
            html_content = driver.page_source
            soup_platform = BeautifulSoup(html_content, 'html.parser')

            # Extract product details
            name_platform = soup_platform.find('h1', class_='h2 product-single__title')
            colore_platform = soup_platform.find('span', class_='cb-italic cb-block cb-font-500')
            prezzo_originale_platform = soup_platform.find('span', class_='related__price mr')
            colors_divs = soup_platform.find_all('div', class_='related__color-name skeleton-effect')
            sizes = soup_platform.find('ul', class_='related__select-optgroup-options')

            # Check if product details exist
            nomi = name_platform.text.strip() if name_platform else ""
            colore = colore_platform.text.strip() if colore_platform else ""
            prezzo_originale = prezzo_originale_platform.text.strip() if prezzo_originale_platform else ""

            # Remove color info from the name
            if colore in nomi:
                nomi = nomi.replace(colore, "")

            # Get all color options
            all_colors = [color_div.text.strip() for color_div in colors_divs if color_div.text.strip()]

            # Get all size options
            all_size = [size.text.strip() for size in sizes.find_all('li') if size.text.strip()] if sizes else []

            # Debugging: Ensure data is extracted
            print(f"Name: {nomi}, Color: {colore}, Price: {prezzo_originale}, Colors: {all_colors}, Sizes: {all_size}")

            # Loop through all color and size combinations and add them to rows
            for color in all_colors:
                for size in all_size:
                    # Debugging: Ensure data is being appended correctly
                    print(f"Adding: {nomi}, {color}, {size}, {prezzo_originale}")
                    rows.append({
                        'name': nomi,  # Product name
                        'color': color,  # Color option
                        'size': size[:2],  # Size option (only first 2 characters for size)
                        'price_sale': "",  # Sale price (if any)
                        'price': prezzo_originale[1:]  # Original price (stripping currency symbol)
                    })

        except Exception as e:
            print(f"Error processing product: {product}. Error: {e}")

# Create DataFrame from the rows
df = pd.DataFrame(rows)

# Check if rows is empty before creating DataFrame
if df.empty:
    print("No data was collected.")
else:
    print("Data successfully collected!")
    print(df)

Scraping page: https://www.superga.com/collections/womens-platform?filter.p.m.custom.seasons=Spring%2FSummer&page1
Found 28 product links on page 1
Name: 2790 PLATFORM, Color: WHITE, Price: €75.00, Colors: ['AZURE ICE-FAVORIO', 'AZURE TURQUOISE-FAVORIO', 'BEIGE', 'BEIGE GESSO', 'BEIGE LT EGGSHELL-F AVORIO', 'BEIGE LT-FAVORIO', 'BEIGE NATURAL-FAVORIO', 'BLACK', 'BLACK-FWHITE', 'BLUE NAVY', 'BLUE SPECTRUM-FAVORIO', 'FULL BLACK', 'FULL WHITE AVORIO', 'GRAY DK SAGE', 'GREEN PRIMROSE-F AVORIO', 'GREEN WATER-FAVORIO', 'GREY ASH', 'GREY COLOMBA-F AVORIO', 'GREY FOSSIL-FAVORIO', 'GREY MD-FAVORIO', 'GREY SILVER-FAVORIO', 'NAVY-FAVORIO', 'NAVY-FWHITE', 'ORANGE APRICOT-F AVORIO', 'ORANGE LT CORAL-FAVORIO', 'PINK BLUSH-F AVORIO', 'PINK DUSTY-F AVORIO', 'PINK ISH-F AVORIO', 'PINK SKIN-F AVORIO', 'PINK SMOKE', 'PINK-FAVORIO', 'RED FLAME', 'TOTAL BEIGE RAW', 'VIOLET LILLA-FAVORIO', 'VIOLET LT ASH', 'VIOLET PURPLE-FAVORIO', 'WHITE'], Sizes: ['35', '35.5Available in other colors', '36', '37', '37.5', '

In [17]:
# Save the DataFrame to a CSV file
df.to_csv('db_platform.csv', index=False)