In [31]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import regex as re
import concurrent.futures
import time
import sys
import logging
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


# File paths. These are plain relative names, so open() resolves them
# against the current working directory of the running process.
WEBPATH_FILE =  "webpath2.txt"
FREQUENCY_FILE =  "frequency.txt"
RESULT_FILE =  "scraperesult.csv"
FAULTY_FILE =  "faultypath.txt"

# Load the URLs to scrape, one per line. Each line is stripped and blank
# lines are dropped, so a stray empty line in the file is not treated as
# an (invalid) empty URL later on.
try:
    with open(WEBPATH_FILE, "r") as file:
        webpath = [line.strip() for line in file if line.strip()]
except FileNotFoundError:
    # First run: create an empty placeholder so the user has a file to fill in.
    print(f"File {WEBPATH_FILE} not found. Created an empty file. Please add URLs to scrape." ) 
    with open(WEBPATH_FILE, "w") as file:
        file.write("")

    webpath = []

# Scraping-wide constants: the HTTP request headers, the per-site
# name/price patterns, and the placeholder returned when a value
# cannot be found.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 11.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/134.0.6998.166 Safari/537.36"
    ),
}

# Each site name maps to [name pattern, price pattern].
PATTERNS = dict(
    bunnings=[
        re.compile(r"MuiTypography-root sc-500f213-2 .* MuiTypography-h1"),
        "sc-bbcf7fe4-3 kAMCuk",
    ],
    jbhifi=[
        re.compile(r"_12mtftw9"),
        "PriceTag_actualWrapperDefault__1eb7mu915",
    ],
)

NOTFOUND = "N/A"


In [32]:
# Function to get the website name from the URL
def get_website_name(url):
    """Return the first host label of ``url``, ignoring a leading ``www.``.

    Args:
        url: Address containing an ``http://`` or ``https://`` prefix.

    Returns:
        The text that follows the scheme (and optional ``www.``) up to the
        next dot or the end of the string, or ``NOTFOUND`` when no such
        prefix occurs in ``url``.
    """
    found = re.search(r'https?://(?:www\.)?([^.]+)', url)
    if found:
        return found.group(1)
    return NOTFOUND

In [33]:
# Sanity check: print the site name extracted from every URL in webpath
# (each line is stripped of surrounding whitespace first).
for line in webpath:
    print(get_website_name(line.strip()))

bunnings
bunnings
bunnings
bunnings
bunnings
jbhifi
jbhifi
jbhifi
jbhifi
jbhifi


In [51]:
def scrape_with_selenium(url, patterns):
    """Scrape a product name and price by rendering the page in headless Chrome.

    Args:
        url: Product page address to load.
        patterns: ``(name_pattern, price_pattern)`` pair. Only the price
            pattern is used here; it is a whitespace-separated list of CSS
            class names that the price element must carry. The name is
            read from the first ``<h1>`` on the page.

    Returns:
        ``(name, price)``. ``price`` is the element text with all
        whitespace removed. ``name`` is ``NOTFOUND`` when no ``<h1>`` can
        be read. ``(NOTFOUND, NOTFOUND)`` is returned when the page fails
        to load or the price element does not appear within 20 seconds.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    # Intended to skip image loading.
    # NOTE(review): confirm Chrome actually honors this switch.
    options.add_argument('--disable-images')

    driver = webdriver.Chrome(options=options)
    _, price_classes = patterns

    try:
        # The timeout has to be configured BEFORE navigating; set after
        # driver.get() it would not bound this page load at all.
        driver.set_page_load_timeout(20)
        driver.get(url)

        wait = WebDriverWait(driver, 20)  # wait up to 20 seconds

        # By.CLASS_NAME accepts a single class only. Build a CSS selector
        # instead so a compound value such as "a b" matches an element
        # carrying both classes (".a.b").
        price_selector = "".join(f".{cls}" for cls in price_classes.split())
        price_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, price_selector))
        )

        price_text = "".join(price_element.text.split())

        # A missing heading should not discard a price that was found.
        try:
            name_text = driver.find_element(By.TAG_NAME, 'h1').text
        except Exception:
            name_text = NOTFOUND

        return (name_text, price_text)

    except Exception as e:
        # Best effort: report the reason instead of failing silently.
        logging.warning("Selenium scrape failed for %s: %s", url, e)
        return (NOTFOUND, NOTFOUND)
    finally:
        # Always release the browser process, even on failure.
        driver.quit()


In [47]:
def scrape_requests(url, patterns):
    """Download ``url`` over plain HTTP and extract the name and price.

    Args:
        url: Page address to fetch (sent with ``HEADERS``, 10 s timeout).
        patterns: ``(name_pattern, price_pattern)`` pair, used as the
            ``class_`` filters for the ``<h1>`` and ``<p>`` lookups.

    Returns:
        ``(name, price)``. Each value is the stripped text of the matching
        element, or ``NOTFOUND`` when that element is absent. Both are
        ``NOTFOUND`` when the status code is not 200 or the request raises
        ``requests.RequestException``.
    """
    name_class, price_class = patterns
    missing = (NOTFOUND, NOTFOUND)

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return missing

        soup = bs(response.content, "html.parser")
        name_tag = soup.find("h1", class_=name_class)    # product name
        price_tag = soup.find("p", class_=price_class)   # price tag
    except requests.RequestException:
        return missing

    if name_tag:
        name = name_tag.text.strip()
    else:
        name = NOTFOUND

    if price_tag:
        price = price_tag.text.strip()
    else:
        price = NOTFOUND

    return (name, price)


In [48]:
def scrape_single_url(url):
    """Scrape one URL, trying plain HTTP first and Selenium as a fallback.

    Args:
        url: Product page address; surrounding whitespace is stripped.

    Returns:
        ``(url, name, price)``. ``name`` and ``price`` are ``NOTFOUND``
        when neither scraper yields a value or an exception is raised.
    """
    url = url.strip()
    site = get_website_name(url)
    # Sites without a registered pattern get placeholder patterns.
    patterns = PATTERNS.get(site, (NOTFOUND, NOTFOUND))

    try:
        name, price = scrape_requests(url, patterns)
        # Fall back to the browser when either value is missing.
        if NOTFOUND in (name, price):
            name, price = scrape_with_selenium(url, patterns)

        result = (url, name, price)
        print(*result)
        return result
    except Exception as err:
        print(f"Error scraping {url}: {err}")
        return (url, NOTFOUND, NOTFOUND)


In [49]:
def paralle_scrape(webpath, max_workers=5):
    """Scrape every URL in ``webpath`` concurrently with a thread pool.

    Args:
        webpath: Iterable of URL strings, each passed to ``scrape_single_url``.
        max_workers: Maximum number of worker threads in the pool.

    Returns:
        List of ``scrape_single_url`` results, in the same order as the
        URLs in ``webpath``.
    """
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # map() yields results in input order; list() collects them all
        # before the pool is shut down on leaving the with-block.
        return list(pool.map(scrape_single_url, webpath))

In [52]:
# Scrape every URL in webpath using up to 5 worker threads, then print
# each (url, name, price) result tuple on its own line.
results = paralle_scrape(webpath, max_workers=5)
for line in results:
    print(line)

https://www.bunnings.com.au/200-x-50mm-2-4m-treated-pine-sleeper-h4_p8032702 200 x 50mm 2.4m Treated Pine Sleeper H4 $18
https://www.bunnings.com.au/estilo-chrome-3-function-hand-shower-connector-set-wels-3-star-rated-9l-min_p5002610 Estilo Chrome 3 Function Hand Shower & Connector Set WELS 3 Star Rated 9L/min $31.97
https://www.bunnings.com.au/ozito-1800w-2030psi-high-pressure-washer_p0254158 Ozito 1800W 2030PSI High Pressure Washer $99
https://www.bunnings.com.au/dulux-4l-interior-paint-wash-wear-plus-kitchen-bathroom-low-sheen-vivid-white-4l_p1370128 Dulux 4L Interior Paint Wash&Wear +PLUS Kitchen & Bathroom Low Sheen Vivid White - 4L $125.50
https://www.jbhifi.com.au/products/alogic-ultra-mini-usb-c-to-usb-a-adapter-space-grey ALOGIC Ultra Mini USB-C to USB-A Adapter (Space Grey) $15
https://www.jbhifi.com.au/products/apple-macbook-pro-14-inch-with-m4-pro-chip-512gb-24gb-space-black2024 Apple MacBook Pro 14-inch with M4 Pro Chip, 512GB/24GB (Space Black)[2024] $3127
https://www.jbh

In [39]:
def read_from_csv(file):
    """Load previously saved scrape results from a CSV file.

    The first row is taken as the header. Every following row that has
    exactly six fields is stored under its first field, as a dict mapping
    the remaining header names to the remaining fields. Rows of any other
    length are skipped.

    Args:
        file: Path of the CSV file to read.

    Returns:
        ``(headers, data)`` where ``headers`` is the header row as a list
        and ``data`` maps each row's first field to its remaining fields.
        If the file is empty, the default headers and an empty dict are
        returned. If the file does not exist, it is created containing
        only the default header row, and the same pair is returned.
    """
    default_headers = ["Name", "Link", "Lowest Price", "Start Date", "End Date", "Today Price"]

    try:
        # newline='' lets the csv module handle line endings itself.
        with open(file, "r", newline='') as csv_file:
            reader = csv.reader(csv_file)

            headers = next(reader, None)  # header line, None if file is empty
            if headers is None:
                return default_headers, {}

            data = {}
            # next() already consumed the header, so iterate over ALL
            # remaining rows; slicing off one more row here would silently
            # drop the first saved item.
            for row in reader:
                if len(row) == 6:
                    data[row[0]] = dict(zip(headers[1:], row[1:]))
        return headers, data

    except FileNotFoundError:
        # Create an empty CSV holding only the header row.
        with open(file, "w", newline='') as csv_file:
            csv.writer(csv_file).writerow(default_headers)
        return default_headers, {}

In [40]:
def create_new_item_entry(name, path, price, today_date):
    """Build the row for an item that has no previous record.

    The scraped price serves as both the lowest and today's price, and
    ``today_date`` as both the start and end date.
    """
    lowest_price = todays_price = price
    start_date = end_date = today_date
    return (name, path, lowest_price, start_date, end_date, todays_price)

def create_lower_price_entry(name, path, new_price, today_date):
    """Build the row for an item whose price dropped below the stored lowest.

    The new price becomes both the lowest and today's price, and the date
    range restarts at ``today_date``.
    """
    date_range = (today_date, today_date)
    return (name, path, new_price, *date_range, new_price)

def create_same_price_entry(name, path, price, old_item, today_date):
    """Build the row for an item whose price equals the stored lowest.

    The stored start date is kept (``today_date`` if it is missing) and
    the end date moves forward to ``today_date``.
    """
    start_date = old_item.get("Start Date", today_date)
    end_date = today_date
    return (name, path, price, start_date, end_date, price)

def create_higher_price_entry(name, path, old_item, new_price):
    """Build the row for an item whose price rose above the stored lowest.

    The stored lowest price and its date range are kept; only today's
    price is replaced by ``new_price``.

    Args:
        name: Item name.
        path: Item URL.
        old_item: Stored record for the item. The lowest price is read
            from its ``"Lowest Price"`` key, the same key
            ``single_item_comparison`` reads; the legacy ``"price"`` key
            is used only when ``"Lowest Price"`` is absent.
        new_price: Price scraped today.

    Returns:
        ``(name, path, lowest_price, start_date, end_date, new_price)``.
        Missing dates default to an empty string.

    Raises:
        KeyError: If ``old_item`` has neither price key.
    """
    if "Lowest Price" in old_item:
        lowest_price = old_item["Lowest Price"]
    else:
        # Legacy key; raises KeyError when absent, as before.
        lowest_price = old_item["price"]

    return (name, path, lowest_price,
            old_item.get("Start Date", ""),
            old_item.get("End Date", ""),
            new_price)

def _parse_price(text):
    """Convert a price string such as "$1,299.50" to a float."""
    return float(text.replace("$", "").replace(",", "").strip())


def single_item_comparison(path, name, price, old_scrape, today_date):
    """Process a single scraped item and return the appropriate entry.

    Args:
        path: URL the item was scraped from.
        name: Scraped item name, or ``NOTFOUND``.
        price: Scraped price string (e.g. ``"$31.97"``), or ``NOTFOUND``.
        old_scrape: Dict mapping item names to their stored records.
        today_date: Date string used for new or updated date ranges.

    Returns:
        ``path`` itself when the scrape failed or the scraped price cannot
        be parsed as a number; the caller treats that as a faulty link.
        Otherwise a 6-tuple row built by one of the ``create_*_entry``
        helpers, depending on how the price compares to the stored lowest.
    """
    if name == NOTFOUND or price == NOTFOUND:
        return path

    # An unparsable price is a faulty scrape; do not abort the whole run.
    try:
        curr_price = _parse_price(price)
    except ValueError:
        return path

    # New item - not in old data
    if name not in old_scrape:
        return create_new_item_entry(name, path, price, today_date)

    # Existing item - compare prices
    old_item = old_scrape[name]
    try:
        old_price = _parse_price(old_item["Lowest Price"])
    except ValueError:
        # The stored lowest price is unreadable, so it cannot be compared;
        # restart tracking for this item from today's price.
        return create_new_item_entry(name, path, price, today_date)

    if curr_price < old_price:
        return create_lower_price_entry(name, path, price, today_date)
    elif curr_price == old_price:
        return create_same_price_entry(name, path, price, old_item, today_date)
    else:
        return create_higher_price_entry(name, path, old_item, price)

def compareScrape_new_old(new_scrape, old_scrape):
    """Merge today's scrape results with the stored records.

    Args:
        new_scrape: Iterable of (path, name, price) tuples from the
            current scrape.
        old_scrape: Dictionary of existing item data, keyed by item name.

    Returns:
        ``(data, faulty_links)``: the list of entries ready for CSV
        writing, and the list of paths whose scrape could not be used.
    """
    today_date = time.strftime("%d-%m-%Y")
    data, faulty_links = [], []

    for path, name, price in new_scrape:
        entry = single_item_comparison(path, name, price, old_scrape, today_date)

        # single_item_comparison hands back the path itself for a faulty link.
        if entry == path:
            faulty_links.append(path)
            continue

        # Only add valid (non-empty) entries.
        if entry:
            data.append(entry)

    return data, faulty_links

In [41]:
def write_to_csv(file, headers, data):
    """Overwrite ``file`` with a header row followed by the data rows.

    Args:
        file: Path of the CSV file to (re)create.
        headers: Sequence of column names, written as the first row.
        data: Iterable of rows, each a sequence of field values.
    """
    # newline='' stops the platform from translating the csv module's own
    # "\r\n" terminators, which would otherwise put a blank line between
    # rows on Windows.
    with open(file, "w", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        writer.writerows(data)

In [None]:

# Read the old scrape result from CSV
headers, data = read_from_csv(RESULT_FILE)

# Compare today's results with the stored rows. URLs whose scrape could
# not be used come back separately in faulty_links.
new_data, faulty_links = compareScrape_new_old(results, data)

# Overwrite the result file with the new rows. Only items present in
# today's results are written; stored rows for other items are not
# carried over.
# NOTE(review): faulty_links is not written anywhere in this cell and
# FAULTY_FILE is unused here; confirm whether it should be persisted.
write_to_csv(RESULT_FILE, headers, new_data)

https://www.bunnings.com.au/dulux-1step-prep-primer-sealer-undercoat-4l-4l_ N/A N/A
https://www.bunnings.com.au/dulux-4l-interior-paint-wash-wear-plus-kitchen-bathroom-low-sheen-vivid-white-4l_p1370128 Dulux 4L Interior Paint Wash&Wear +PLUS Kitchen & Bathroom Low Sheen Vivid White - 4L $125.50
https://www.bunnings.com.au/200-x-50mm-2-4m-treated-pine-sleeper-h4_p8032702 200 x 50mm 2.4m Treated Pine Sleeper H4 $18
https://www.bunnings.com.au/ozito-1800w-2030psi-high-pressure-washer_p0254158 Ozito 1800W 2030PSI High Pressure Washer $99
https://www.bunnings.com.au/estilo-chrome-3-function-hand-shower-connector-set-wels-3-star-rated-9l-min_p5002610 Estilo Chrome 3 Function Hand Shower & Connector Set WELS 3 Star Rated 9L/min $31.97


KeyboardInterrupt: 