In [152]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import regex as re
import concurrent.futures
import time
import sys
from pathlib import Path

In [153]:
# File paths relative to .exe location
WEBPATH_FILE ="webpath.txt"
FREQUENCY_FILE ="frequency.txt"
RESULT_FILE = "scraperesult.csv"
FAULTY_FILE = "faultypath.txt"

with open(WEBPATH_FILE, "r") as file:
    webpath = file.readlines()

# Some global constants for scraping
# such as headers, name and price patterns, 
# and return values for not found cases
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.6998.166 Safari/537.36"}
PATTERNS = {
    "bunnings": [
        re.compile(r"MuiTypography-root sc-500f213-2 .* MuiTypography-h1"), 
        "sc-bbcf7fe4-3 kAMCuk"
    ],
    "jbhifi": [
        re.compile(r"_12mtftw9"),  # Name pattern
        "PriceTag_actualWrapperDefault__1eb7mu915"  # Updated price pattern
    ]
}
NOTFOUND = "N/A"

In [154]:
# Function to get the website name from the URL
def get_website_name(url):
    """Extract website name from URL using regex."""
    pattern = r'https?://(?:www\.)?([^.]+)'
    match = re.search(pattern, url)
    return match.group(1) if match else NOTFOUND

In [155]:
for line in webpath:
    print(get_website_name(line.strip()))

bunnings
bunnings
bunnings
bunnings
jbhifi
jbhifi
jbhifi


In [156]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def scrape_with_selenium(url, patterns):
    """Scrape JB Hi-Fi using Selenium to handle JavaScript"""
    options = Options()
    options.add_argument('--headless')  # Run in background
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    NAMEPATTERN, PRICEPATTERN = patterns
    
    try:
        driver.get(url)
        
        # Wait for price element to load
        price_element = driver.find_element(By.CLASS_NAME, PRICEPATTERN)
        
        # Get the price text
        price_text = "".join(price_element.text.split())
        
        # Get product name
        try:
            name_element = driver.find_element(By.TAG_NAME, 'h1')
            name_text = name_element.text
        except:
            name_text = NOTFOUND
            
        return (name_text, price_text)
        
    except Exception as e:
        return (NOTFOUND, NOTFOUND)
    finally:
        driver.quit()

In [157]:
def scrape_requests(url, patterns):
    
    NAMEPATTERN, PRICEPATTERN = patterns
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)    

        if response.status_code == 200:
            soup = bs(response.content, "html.parser")
            name = soup.find("h1", class_=NAMEPATTERN) # Get the product name
            price = soup.find("p", class_=PRICEPATTERN) # Get the price tag

            name = name.text.strip() if name else NOTFOUND
            price = price.text.strip() if price else NOTFOUND
            
        else:
            name = price = NOTFOUND
        
    except requests.RequestException as e:
        name = price = NOTFOUND

    return (name, price)

In [158]:
def scrape_single_url(url):
    url = url.strip()
    website_name = get_website_name(url)
    patterns = PATTERNS.get(website_name, (NOTFOUND, NOTFOUND))

    try:
        name, price = scrape_requests(url, patterns)
        if name == NOTFOUND or price == NOTFOUND:
            name, price = scrape_with_selenium(url, patterns)
        
        return (url, name, price)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return (url, NOTFOUND, NOTFOUND)

In [159]:
# Parallel scraping 
def paralle_scrape(webpath, max_workers=5):
    start_time = time.time()

    # I don't really understand how parallel process works
    # so this is just a copy past from the internet
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(scrape_single_url, webpath))

    end_time = time.time()
    print(f"Scraping completed in {end_time - start_time:.2f} seconds")

    return results

In [160]:
new_scrape = paralle_scrape(webpath, max_workers=8)
print(new_scrape)

Scraping completed in 36.54 seconds
[('https://www.bunnings.com.au/dulux-4l-interior-paint-wash-wear-plus-kitchen-bathroom-low-sheen-vivid-white-4l_p1370128', 'Dulux 4L Interior Paint Wash&Wear +PLUS Kitchen & Bathroom Low Sheen Vivid White - 4L', '$125.50'), ('https://www.bunnings.com.au/200-x-50mm-2-4m-treated-pine-sleeper-h4_p8032702', '200 x 50mm 2.4m Treated Pine Sleeper H4', '$18'), ('https://www.bunnings.com.au/ozito-1800w-2030psi-high-pressure-washer_p0254158', 'Ozito 1800W 2030PSI High Pressure Washer', '$99'), ('https://www.bunnings.com.au/estilo-chrome-3-function-hand-shower-connector-set-wels-3-star-rated-9l-min_p5002610', 'Estilo Chrome 3 Function Hand Shower & Connector Set WELS 3 Star Rated 9L/min', '$31.97'), ('https://www.jbhifi.com.au/products/philips-hue-lightstrip-1m-extension', 'Philips Hue Lightstrip 1m Extension', '$25'), ('https://www.jbhifi.com.au/products/apple-macbook-pro-14-inch-with-m4-pro-chip-512gb-24gb-space-black2024', 'Apple MacBook Pro 14-inch with M4

In [161]:
def read_from_csv(file):
    with open(file, "r") as csv_file:
        csv_file = csv.reader(csv_file)
        headers = next(csv_file)  # Read the header line
        data = {}

        print(list(csv_file))
        for line in list(csv_file)[1:]:
            data[line[0]] = {key: value for 
                             key, value in zip(headers[1:], 
                                               line[1:])}
    return headers, data

In [162]:
headers, old_scrape = read_from_csv("scraperesult.csv")
print(headers)
print(old_scrape)

[[], ['Dulux 4L Interior Paint Wash&Wear +PLUS Kitchen & Bathroom Low Sheen Vivid White - 4L', 'https://www.bunnings.com.au/dulux-4l-interior-paint-wash-wear-plus-kitchen-bathroom-low-sheen-vivid-white-4l_p1370128', '$125.50', '30-06-2025', '30-06-2025', '$125.50'], [], ['200 x 50mm 2.4m Treated Pine Sleeper H4', 'https://www.bunnings.com.au/200-x-50mm-2-4m-treated-pine-sleeper-h4_p8032702', '$18', '30-06-2025', '30-06-2025', '$18'], [], ['Ozito 1800W 2030PSI High Pressure Washer', 'https://www.bunnings.com.au/ozito-1800w-2030psi-high-pressure-washer_p0254158', '$99', '30-06-2025', '30-06-2025', '$99'], [], ['Estilo Chrome 3 Function Hand Shower & Connector Set WELS 3 Star Rated 9L/min', 'https://www.bunnings.com.au/estilo-chrome-3-function-hand-shower-connector-set-wels-3-star-rated-9l-min_p5002610', '$31.97', '30-06-2025', '30-06-2025', '$31.97'], [], ['Philips Hue Lightstrip 1m Extension', 'https://www.jbhifi.com.au/products/philips-hue-lightstrip-1m-extension', '$25', '30-06-2025',

In [163]:
def create_new_item_entry(name, path, price, today_date):
    """Create entry for a new item not in old data."""
    return (name, path, price, today_date, today_date, price)

def create_lower_price_entry(name, path, new_price, today_date):
    """Create entry when new price is lower than old price."""
    return (name, path, new_price, today_date, today_date, new_price)

def create_same_price_entry(name, path, price, old_item, today_date):
    """Create entry when price hasn't changed - extend the date range."""
    return (name, path, price, 
            old_item.get("Start Date", today_date), 
            today_date, 
            price)

def create_higher_price_entry(name, path, old_item, new_price):
    """Create entry when new price is higher - keep old lowest price."""
    old_price = old_item["price"]
    return (name, path, old_price, 
            old_item.get("Start Date", ""), 
            old_item.get("End Date", ""), 
            new_price)

def process_single_item(path, name, price, old_scrape, today_date):
    """Process a single scraped item and return the appropriate entry."""
    if name == NOTFOUND or price == NOTFOUND:
        return path
        
    curr_price = float(price.replace("$", ""))
    
    # New item - not in old data
    if name not in old_scrape:
        return create_new_item_entry(name, path, price, today_date)
    
    # Existing item - compare prices
    old_item = old_scrape[name]
    old_price = float(old_item["Lowest Price"].replace("$", ""))
    
    if curr_price < old_price:
        return create_lower_price_entry(name, path, price, today_date)
    elif curr_price == old_price:
        return create_same_price_entry(name, path, price, old_item, today_date)
    else:
        return create_higher_price_entry(name, path, old_item, price)

def compareScrape_new_old(new_scrape, old_scrape):
    """Compare new scrape results with old data and create updated entries.
    
    Args:
        new_scrape: List of (path, name, price) tuples from current scrape
        old_scrape: Dictionary of existing item data
        
    Returns:
        List of processed entries ready for CSV writing
    """
    data = []
    faulty_links = []
    today_date = time.strftime("%d-%m-%Y")
    
    for path, name, price in new_scrape:
        entry = process_single_item(path, name, price, old_scrape, today_date)
        if entry == path:
            faulty_links.append(path)
              # Only add valid entries
        elif entry:
            data.append(entry)
            
    return data, faulty_links

In [164]:
new_data, faulty_links = compareScrape_new_old(new_scrape, old_scrape)
print(new_data)

[('Dulux 4L Interior Paint Wash&Wear +PLUS Kitchen & Bathroom Low Sheen Vivid White - 4L', 'https://www.bunnings.com.au/dulux-4l-interior-paint-wash-wear-plus-kitchen-bathroom-low-sheen-vivid-white-4l_p1370128', '$125.50', '30-06-2025', '30-06-2025', '$125.50'), ('200 x 50mm 2.4m Treated Pine Sleeper H4', 'https://www.bunnings.com.au/200-x-50mm-2-4m-treated-pine-sleeper-h4_p8032702', '$18', '30-06-2025', '30-06-2025', '$18'), ('Ozito 1800W 2030PSI High Pressure Washer', 'https://www.bunnings.com.au/ozito-1800w-2030psi-high-pressure-washer_p0254158', '$99', '30-06-2025', '30-06-2025', '$99'), ('Estilo Chrome 3 Function Hand Shower & Connector Set WELS 3 Star Rated 9L/min', 'https://www.bunnings.com.au/estilo-chrome-3-function-hand-shower-connector-set-wels-3-star-rated-9l-min_p5002610', '$31.97', '30-06-2025', '30-06-2025', '$31.97'), ('Philips Hue Lightstrip 1m Extension', 'https://www.jbhifi.com.au/products/philips-hue-lightstrip-1m-extension', '$25', '30-06-2025', '30-06-2025', '$25'

In [165]:
def write_to_csv(file, headers, data):
    # Use newline='' to prevent blank lines on Windows
    with open(file, "w") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        for line in data:
            writer.writerow(line)

In [166]:
write_to_csv("scraperesult.csv", headers, new_data)