In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3
import datetime
import csv

# HTTP request headers passed to requests.get() by get_all_products.
# The User-Agent is a desktop Chrome (Windows) browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

def get_all_products(url, timeout=30):
    """
    Fetch one listing page and extract every product shown on it.

    Args:
        url (str): URL of the category or search page to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list[dict] or None: One dict per product with the keys "Name",
        "Special Price", "Old Price" and "Discount". Missing fields are
        filled with placeholder strings. None is returned when the page
        could not be fetched (network error or non-200 status).
    """
    def _text_or(tag, fallback):
        # Stripped text of the tag, or the placeholder when the selector matched nothing.
        return tag.get_text(strip=True) if tag else fallback

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code.
        print(f"Failed to fetch {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    product_list = []

    # Each product lives in its own "product-item-details" container
    for product in soup.find_all("div", class_="product-item-details"):
        product_list.append({
            "Name": _text_or(
                product.select_one("h3.product-item-name a"), "No name found"
            ),
            "Special Price": _text_or(
                product.select_one('span[id^="product-price-"] .price'), "No price found"
            ),
            # Old price and discount are only present on discounted items
            "Old Price": _text_or(
                product.select_one('span[id^="old-price-"] .price'), "N/A"
            ),
            "Discount": _text_or(
                product.select_one(".price-save-amount"), "No discount"
            ),
        })

    return product_list

# Example category or search page URL (replace with actual URL)
courts_url = "https://www.courts.com.sg/clearance?product_list_limit=40"

# get_all_products returns None when the fetch fails; fall back to an empty
# list so the loop below does not raise TypeError on a failed request.
products = get_all_products(courts_url) or []

# Print results
for product in products:
    print(product)


{'Name': 'LOGITECH 910-006765 LOGITECH M100R USB WIRED MOUSE BLK CARTON BOX 3Y', 'Special Price': 'S$9.00', 'Old Price': 'S$12.00', 'Discount': '(save 25%)'}
{'Name': 'CANON GI-790 PBK BLACK INK BOTTLE', 'Special Price': 'S$14.70', 'Old Price': 'S$15.00', 'Discount': '(save 2%)'}
{'Name': 'CANON GI-790 C CYAN INK BOTTLE', 'Special Price': 'S$14.70', 'Old Price': 'S$15.00', 'Discount': '(save 2%)'}
{'Name': 'LOGITECH 910-005934 PICO UNIFYING RECEIVER [CARTON PACK] (1Y)', 'Special Price': 'S$14.00', 'Old Price': 'S$16.00', 'Discount': '(save 12%)'}
{'Name': 'MORRIES MS-1228S DRY IRON', 'Special Price': 'S$18.00', 'Old Price': 'S$20.30', 'Discount': '(save 11%)'}
{'Name': 'ASUS TUF GAMING P1 GAMING MOUSEMAT TUF GAMING P1 GAMING MOUSEMAT', 'Special Price': 'S$16.00', 'Old Price': 'S$19.00', 'Discount': '(save 15%)'}
{'Name': 'CORNELL CJKS10L CORDLESS KETTLE (1L)', 'Special Price': 'S$16.00', 'Old Price': 'S$33.90', 'Discount': '(save 52%)'}
{'Name': 'MASTERPLUG SWC62N-MPA 13A 6 GANG INDIVI

In [1]:
import requests
from bs4 import BeautifulSoup

# Listing URL template; the `{}` placeholder is filled with the page number
# via BASE_URL.format(page) in the scraping loop.
BASE_URL = "https://www.courts.com.sg/clearance?p={}&product_list_limit=40"

# Number of listing pages to request (hardcoded; pages 1..TOTAL_PAGES are fetched).
# NOTE(review): presumably this must be updated by hand when the listing grows or shrinks.
TOTAL_PAGES = 23

# Headers to simulate a real browser request
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one [name, price, old_price, discount] row per product;
# extract_products appends to it and the CSV writer consumes it.
products_list = []

# Function to extract product data from a page
def extract_products(page_soup):
    """
    Append one row per product found in ``page_soup`` to ``products_list``.

    Each row is ``[name, price, old_price, discount]``. Fields that cannot be
    located are stored as "N/A". The discount is a percentage string computed
    from the old and current price, "0%" when the old price is not higher,
    and "N/A" when either price is missing. A product that raises during
    extraction is reported and skipped.

    Args:
        page_soup (BeautifulSoup): Parsed HTML of one listing page.
    """
    def _text(node):
        # Stripped text of the node, or the "N/A" placeholder when it is absent.
        return node.get_text(strip=True) if node else "N/A"

    def _amount(label):
        # "S$1,234.50" -> 1234.5
        return float(label.replace("S$", "").replace(",", ""))

    for card in page_soup.find_all("div", class_="product-item-details"):
        try:
            name = _text(card.find("h3", class_="product-item-name"))
            price = _text(card.find("span", {"data-price-type": "finalPrice"}))
            old_price = _text(card.find("span", {"data-price-type": "oldPrice"}))

            if "N/A" in (old_price, price):
                # Without both prices there is nothing to compare
                discount = "N/A"
            else:
                was, now = _amount(old_price), _amount(price)
                if was > now:
                    discount = f"{round(((was - now) / was) * 100, 2)}%"
                else:
                    discount = "0%"

            products_list.append([name, price, old_price, discount])

        except Exception as e:
            print(f"Error extracting product: {e}")

# Loop through every listing page (page numbers are 1-based)
for page in range(1, TOTAL_PAGES + 1):
    print(f"Scraping page {page}/{TOTAL_PAGES}...")
    url = BASE_URL.format(page)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # Treat a network failure like a bad status: report it and move on,
        # so the pages already scraped are still written to the CSV.
        print(f"Failed to retrieve page {page}: {exc}")
        continue

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        extract_products(soup)
    else:
        print(f"Failed to retrieve page {page}, status code: {response.status_code}")

# Save data to CSV
# This cell's import block does not import csv, so import it here to keep
# the cell runnable on a fresh kernel.
import csv

csv_filename = "courts_products.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # Header row followed by one row per scraped product
    writer.writerow(["Product Name", "Price", "Old Price", "Discount"])
    writer.writerows(products_list)

print(f"✅ Data saved to {csv_filename}")


Scraping page 1/23...
Scraping page 2/23...
Scraping page 3/23...
Scraping page 4/23...
Scraping page 5/23...
Scraping page 6/23...
Scraping page 7/23...
Scraping page 8/23...
Scraping page 9/23...
Scraping page 10/23...
Scraping page 11/23...
Scraping page 12/23...
Scraping page 13/23...
Scraping page 14/23...
Scraping page 15/23...
Scraping page 16/23...
Scraping page 17/23...
Scraping page 18/23...
Scraping page 19/23...
Scraping page 20/23...
Scraping page 21/23...
Scraping page 22/23...
Scraping page 23/23...
✅ Data saved to courts_products.csv


#####################-- This is the latest code for Harvey Norman

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re
from urllib.parse import urljoin

def get_product_info(product_container):
    """
    Build a product record from a single product container element.

    Args:
        product_container (BeautifulSoup): HTML element wrapping one product

    Returns:
        dict: Whichever of 'product_name', 'product_url', 'image_url',
        'current_price', 'original_price' and 'discount_percentage' could be
        extracted; fields that are not found are simply omitted.
    """
    def _with_scheme(link):
        # Protocol-relative links ("//host/path") get an explicit https scheme.
        return 'https:' + link if link.startswith('//') else link

    def _id_starts_with(prefix):
        # Attribute filter: truthy only for a non-empty id beginning with `prefix`.
        return lambda value: value and value.startswith(prefix)

    def _to_number(price_label):
        # "S$1,234.50" -> 1234.5
        return float(price_label.replace('S$', '').replace(',', ''))

    record = {}

    # Title and product link
    info_block = product_container.find('div', class_='product-info')
    title_link = info_block.find('a', class_='product-title') if info_block else None
    if title_link:
        record['product_name'] = title_link.text.strip()
        record['product_url'] = _with_scheme(title_link.get('href', ''))

    # Image link (stored only when the src attribute is non-empty)
    image_block = product_container.find('div', class_='product-image')
    image_tag = image_block.find('img') if image_block else None
    if image_tag:
        source = image_tag.get('src', '')
        if source:
            record['image_url'] = _with_scheme(source)

    # Current and original ("was") price, both read from the footer
    footer = product_container.find('div', class_='product-footer')
    if footer:
        price_fields = (
            ('current_price', 'price', 'sec_discounted_price_'),
            ('original_price', 'price-old', 'sec_list_price_'),
        )
        for key, wrapper_class, id_prefix in price_fields:
            wrapper = footer.find('span', class_=wrapper_class)
            if not wrapper:
                continue
            amount_node = wrapper.find('span', id=_id_starts_with(id_prefix))
            if amount_node:
                record[key] = f"S${amount_node.text.strip()}"

    # A discount can only be derived when both prices were found
    if not {'current_price', 'original_price'} <= record.keys():
        return record

    try:
        current = _to_number(record['current_price'])
        original = _to_number(record['original_price'])
        if original > 0:
            saving = ((original - current) / original) * 100
            record['discount_percentage'] = f"{saving:.2f}%"
    except (ValueError, TypeError):
        # Unparseable price text: leave the discount out of the record
        pass

    return record

def get_next_page_url(soup, current_url):
    """
    Find the URL of the next page

    Three strategies are tried in order:
      1. The href of the "next" link inside the pagination block.
      2. Incrementing the number in an existing ``/page-N`` URL segment.
      3. Appending ``page-2/`` to a URL that has no page segment yet.

    Args:
        soup (BeautifulSoup): Parsed HTML of the current page
        current_url (str): URL of the current page

    Returns:
        str or None: URL of the next page. None is returned when the URL
        contains ``/page-`` without a page number after it. Strategies 2 and 3
        guess the URL without checking that the page exists.
    """
    # Method 1: Look for pagination links
    pagination = soup.find('div', class_='ty-pagination')
    if pagination:
        next_link = pagination.find('a', class_='ty-pagination__next')
        if next_link and 'href' in next_link.attrs:
            next_page_url = next_link['href']
            if next_page_url.startswith('//'):
                return 'https:' + next_page_url
            if next_page_url:
                # Resolve site-relative hrefs against the page they came from;
                # absolute URLs pass through urljoin unchanged.
                return urljoin(current_url, next_page_url)
            # Empty href: hand back the falsy value so callers stop paginating
            return next_page_url

    # Method 2: Increment the page number already present in the URL
    if '/page-' in current_url:
        match = re.search(r'/page-(\d+)', current_url)
        if match:
            next_page = int(match.group(1)) + 1
            # Splice by match position so only the matched number changes,
            # even when it is zero-padded or its text recurs elsewhere in the URL.
            return f"{current_url[:match.start(1)]}{next_page}{current_url[match.end(1):]}"
        return None

    # Method 3: First page - insert page-2 at the end of the path, keeping any
    # query string or fragment after it
    markers = [pos for pos in (current_url.find('?'), current_url.find('#')) if pos != -1]
    cut = min(markers) if markers else len(current_url)
    base, suffix = current_url[:cut], current_url[cut:]
    separator = '' if base.endswith('/') else '/'
    return f"{base}{separator}page-2/{suffix}"

def check_if_page_has_products(soup):
    """
    Check if the page has any products

    A page counts as having products when it contains at least one
    ``product-info`` div or one ``ty-grid-list__item`` div.

    Args:
        soup (BeautifulSoup): Parsed HTML of the page

    Returns:
        bool: True if the page has products, False otherwise
    """
    # One match is enough, so use find() rather than collecting every match
    # with find_all(). No separate check for an "empty category" message is
    # needed: a page with neither marker is reported as empty either way.
    product_markers = ('product-info', 'ty-grid-list__item')
    return any(
        soup.find('div', class_=marker) is not None
        for marker in product_markers
    )

def scrape_page(url, headers, timeout=30):
    """
    Scrape a single page of the Harvey Norman website

    Args:
        url (str): URL of the page to scrape
        headers (dict): HTTP headers for the request
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        tuple: (list of product dictionaries, next page URL or None, has_products).
        Any error while fetching or parsing is printed and reported as
        ([], None, False).
    """
    try:
        print(f"Scraping: {url}")
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Stop early when the page shows no products at all
        if not check_if_page_has_products(soup):
            print("No products found on this page")
            return [], None, False

        # Find all product containers - look for the grid items first
        product_containers = soup.find_all('div', class_='ty-grid-list__item')

        # Fallback: any div that contains product-image, bx and product-footer divs.
        # NOTE(review): a div wrapping several products also satisfies this test,
        # so the fallback may yield overlapping containers - confirm against the
        # live markup. scrape_all_pages later drops rows with a repeated product_url.
        if not product_containers:
            product_containers = [
                div for div in soup.find_all('div')
                if (div.find('div', class_='product-image') and
                    div.find('div', class_='bx') and
                    div.find('div', class_='product-footer'))
            ]

        print(f"Found {len(product_containers)} product containers")

        # Keep only the containers that yielded at least a product name
        products = []
        for container in product_containers:
            product_data = get_product_info(container)
            if product_data and 'product_name' in product_data:
                products.append(product_data)

        # Get the URL for the next page
        next_page_url = get_next_page_url(soup, url)

        return products, next_page_url, True

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return [], None, False

def scrape_all_pages(start_url, max_pages=100):
    """
    Scrape multiple pages from Harvey Norman

    Pages are followed from ``start_url`` until a page has no products, no
    next-page URL is available, the next page was already visited, or
    ``max_pages`` pages have been scraped.

    Args:
        start_url (str): URL to start scraping from
        max_pages (int): Maximum number of pages to scrape

    Returns:
        pd.DataFrame: DataFrame containing all scraped product data, with
        rows sharing a 'product_url' reduced to their first occurrence.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'https://www.harveynorman.com.sg/'
    }

    all_products = []
    visited_urls = set()
    current_url = start_url
    page_count = 1

    while current_url and page_count <= max_pages:
        visited_urls.add(current_url)
        products, next_page_url, has_products = scrape_page(current_url, headers)

        if products:
            all_products.extend(products)
            print(f"Page {page_count}: Scraped {len(products)} products. Total so far: {len(all_products)}")

        if not has_products or not next_page_url:
            print("No more products or next page found. Scraping complete.")
            break

        # The next URL comes from scraped markup; never fetch the same page twice.
        if next_page_url in visited_urls:
            print("Next page was already scraped. Stopping to avoid a loop.")
            break

        # Page limit reached: leave now instead of sleeping before an exit.
        if page_count >= max_pages:
            break

        # Update for next iteration
        current_url = next_page_url
        page_count += 1

        # Add a delay to avoid overwhelming the server
        time.sleep(random.uniform(1.5, 3))

    # Convert to DataFrame
    df = pd.DataFrame(all_products)

    # Remove duplicates if any
    if not df.empty and 'product_url' in df.columns:
        df = df.drop_duplicates(subset=['product_url'])

    return df

def main():
    """
    Run the Harvey Norman promotions scrape and save the results.

    Creates the "harvey_norman_data" directory if needed, scrapes the
    promotion listing, and writes the products to a timestamped CSV file in
    that directory. Nothing is written when no products were found.
    """
    start_url = "https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/"
    page_limit = 100

    print(f"Starting to scrape Harvey Norman Singapore promotions up to {page_limit} pages")

    # The output directory is created up front, before any page is requested
    os.makedirs("harvey_norman_data", exist_ok=True)

    catalogue = scrape_all_pages(start_url, page_limit)

    if not catalogue.empty:
        # Timestamped file name so repeated runs do not overwrite each other
        stamp = time.strftime("%Y%m%d-%H%M%S")
        csv_path = f"harvey_norman_data/harvey_norman_products_{stamp}.csv"
        catalogue.to_csv(csv_path, index=False)

        print(f"Scraping complete! Found {len(catalogue)} unique products.")
        print(f"Data saved to {csv_path}")

        # Show the first few rows as a quick sanity check
        print("\nSample of scraped data:")
        print(catalogue.head())
    else:
        print("No products were found.")

# Entry point: start the scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting to scrape Harvey Norman Singapore promotions up to 100 pages
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/
Found 54 product containers
Page 1: Scraped 54 products. Total so far: 54
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/page-2/
Found 54 product containers
Page 2: Scraped 54 products. Total so far: 108
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/page-3/
Found 54 product containers
Page 3: Scraped 54 products. Total so far: 162
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/page-4/
Found 54 product containers
Page 4: Scraped 54 products. Total so far: 216
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/page-5/
Found 54 product containers
Page 5: Scraped 54 products. Total so far: 270
Scraping: https://www.harveynorman.com.sg/promotions-en/harvey-raya-sale-2025-en/page-6/
Found 54 product containers
P