In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import random

In [2]:
# Extend the module search path so the local helper module below can be imported.
# NOTE: '__file__' is a quoted string literal here, not the __file__ variable, so
# os.path.dirname() returns '' and the appended entry is the relative path
# '..'/'DB_and_Azure', which Python resolves against the current working directory.
import sys, os
sys.path.append(os.path.join(os.path.dirname('__file__'), '..', 'DB_and_Azure'))
# presumably provided by the DB_and_Azure directory added above -- TODO confirm
import sql_db_functions as SQLf

In [3]:
import re

def get_price(text):
    """Parse a human-formatted price string into a float.

    Currency symbols, whitespace and any other non-numeric characters are
    ignored. Both European ("1.234,56") and US ("1,234.56") conventions are
    supported:

    * If both ',' and '.' appear, the right-most one is the decimal mark and
      the other one is a thousands separator.
    * A single ',' is treated as a decimal mark ("12,99" -> 12.99); several
      commas can only be thousands separators ("1,234,567" -> 1234567.0).
    * A single '.' followed by exactly three digits is treated as a thousands
      separator ("1.234" -> 1234.0); otherwise it is a decimal mark
      ("12.99" -> 12.99). Several periods are thousands separators.

    Args:
        text: String containing a price, e.g. "€ 1.234,56".

    Returns:
        The price as a float.

    Raises:
        ValueError: If ``text`` contains no digits or cannot be interpreted
            as a single number.
    """
    # Keep only digits and the two possible separators, then drop separators
    # dangling at either end (e.g. the '.' left over from "Rs. 50").
    cleaned_text = re.sub(r'[^\d,\.]', '', text).strip(',.')
    if not cleaned_text:
        raise ValueError(f"No numeric price found in {text!r}")

    last_comma = cleaned_text.rfind(',')
    last_period = cleaned_text.rfind('.')

    if last_comma != -1 and last_period != -1:
        # Both separators present: whichever comes last is the decimal mark.
        if last_comma > last_period:
            cleaned_text = cleaned_text.replace('.', '').replace(',', '.')
        else:
            cleaned_text = cleaned_text.replace(',', '')
    elif last_comma != -1:
        if cleaned_text.count(',') > 1:
            # A number has at most one decimal mark, so these are groupings.
            cleaned_text = cleaned_text.replace(',', '')
        else:
            cleaned_text = cleaned_text.replace(',', '.')
    elif last_period != -1:
        digits_after_period = len(cleaned_text) - last_period - 1
        if cleaned_text.count('.') > 1 or digits_after_period == 3:
            # "1.234" / "1.234.567": periods are thousands separators.
            cleaned_text = cleaned_text.replace('.', '')
        # Otherwise the single period is already a valid decimal point.

    return float(cleaned_text)

In [4]:
def scrape_gap_product(url, timeout=10):
    """Fetch a single product page and extract its main attributes.

    Args:
        url: Absolute URL of the product detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        ``None`` if the page could not be retrieved (network error or a
        non-200 status code). Otherwise a dict that always contains
        ``'sizes'`` (values of the non-disabled size inputs) and ``'images'``
        (up to three absolute image URLs), and contains ``'name'``,
        ``'current_price'``, ``'original_price'``, ``'color'``,
        ``'description'`` and ``'details'`` only when the corresponding
        element was found on the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code so the
        # caller's retry logic can handle both uniformly.
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    product = {}

    # Extract product name
    title_element = soup.find('h1', class_='pdp-mfe-1x22u9v')
    if title_element:
        product['name'] = title_element.text.strip()

    # Extract product price.
    # NOTE: a class_ string containing spaces only matches elements whose
    # class attribute is exactly that string (same classes, same order).
    price_section = soup.find('div', class_='pdp-pricing pdp-mfe-1jiw3bl')
    if price_section:
        current_price_element = price_section.find('span', class_='pdp-pricing--highlight pdp-pricing__selected pdp-mfe-1jiw3bl')
        original_price_element = price_section.find('span', class_='product-price--pdp__regular')
        if current_price_element:
            product['current_price'] = current_price_element.text.strip()
        if original_price_element:
            product['original_price'] = original_price_element.text.strip()

    # Extract product color
    color_element = soup.find('span', class_='swatch-label__value')
    if color_element:
        product['color'] = color_element.text.strip()

    # Extract product sizes: only inputs that are not disabled, and skip any
    # input that has no value attribute instead of raising KeyError.
    sizes = []
    for size_input in soup.select('.pdp-variant_radio input'):
        if size_input.has_attr('disabled'):
            continue
        size_value = size_input.get('value')
        if size_value is not None:
            sizes.append(size_value)
    product['sizes'] = sizes

    # Extract the first three product images that actually carry a src
    # attribute; relative URLs are resolved against the page URL.
    image_elements = soup.select('img.pdp-images__image[src]')[:3]
    product['images'] = [urljoin(url, img['src']) for img in image_elements]

    # Extract product description
    description_element = soup.find('p', class_='product-information-item__overview')
    if description_element:
        product['description'] = description_element.text.strip()

    # Extract product details (one entry per list item)
    details_element = soup.find('ul', class_='product-information-item__list')
    if details_element:
        product['details'] = [li.text.strip() for li in details_element.find_all('li')]

    return product

In [5]:
def scrape_gap_catalog(catalog_url, max_products=None, timeout=10):
    """Scrape every product linked from a catalog (listing) page.

    Each product page is fetched with ``scrape_gap_product``; a product is
    attempted up to three times, and a random 2-5 second pause is inserted
    between products to reduce the chance of being blocked.

    Args:
        catalog_url: URL of the catalog page listing the products.
        max_products: Optional non-negative cap on how many product pages are
            scraped (useful for quick test runs). ``None`` scrapes them all.
        timeout: Seconds to wait for the catalog page request.

    Returns:
        ``None`` if the catalog page could not be retrieved, otherwise a list
        with one product dict per successfully scraped product (possibly
        empty when no product links are found or every product fails).
    """
    max_attempts = 3
    retry_delay_seconds = 5

    try:
        response = requests.get(catalog_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the catalog page. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve the catalog page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract product page URLs. Only anchors that actually have an href are
    # selected, and dict.fromkeys drops duplicates while preserving order so
    # the same product page is not fetched more than once.
    product_links = soup.select('a.product-card__link[href]')
    product_urls = list(dict.fromkeys(
        urljoin(catalog_url, link['href']) for link in product_links
    ))
    if max_products is not None:
        product_urls = product_urls[:max_products]

    all_products = []

    for index, product_url in enumerate(product_urls):
        if index > 0:
            # Delay between products to avoid being blocked
            time.sleep(random.uniform(2, 5))

        for attempt in range(1, max_attempts + 1):
            try:
                product_data = scrape_gap_product(product_url)
            except requests.RequestException as exc:
                # Treat a network error like any other failed attempt.
                print(f"Failed to retrieve the webpage. Error: {exc}")
                product_data = None

            if product_data:
                all_products.append(product_data)
                break
            if attempt < max_attempts:
                time.sleep(retry_delay_seconds)  # wait before retrying

    return all_products

In [8]:
# Example usage
# Both storefront URLs are kept under their own names; previously the first
# assignment to catalog_url was immediately overwritten by the second.
US_CATALOG_URL = 'https://www.gap.com/browse/category.do?cid=34608&nav=meganav%3AWomen%3ACategories%3AShirts%20%26%20Tops#pageId=0&department=136'
IT_CATALOG_URL = 'https://www.gap-italia.it/it/c/donna/bluse-e-camicie/'

# The Italian catalog is the one actually scraped.
catalog_url = IT_CATALOG_URL
catalog_data = scrape_gap_catalog(catalog_url)

if catalog_data:
    print(catalog_data)
else:
    # Make an empty or failed scrape visible instead of printing nothing.
    print(f"No products scraped from {catalog_url}")

In [9]:
# Display the scraped catalog; the recorded run below produced an empty list.
catalog_data

[]