In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException,TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
from selenium.webdriver.remote.webelement import WebElement
from pathlib import Path
from datetime import datetime
import random
from typing import List, Optional, Dict
from bs4 import BeautifulSoup, Tag

In [2]:
class driver_to_crawl_page:
    """Open a listing page in Chrome, expand it fully and save the HTML to disk.

    Two expansion strategies are available:

    * ``click_button`` keeps clicking a button until it can no longer be found.
    * ``infinite_scroll`` scrolls down step by step until the bottom is reached
      or the scroll position stops changing.

    Both write the final page source to
    ``output/html/<website_name>/<YYYYMMDD>_<website_name>_<car_type>.html``
    and return that path. The browser is always closed before returning.
    """

    def __init__(self,
                 url: str,
                 headless: bool = True,
                 element_by: str = 'xpath'):
        """
        Args:
            url: Page to open.
            headless: When true, Chrome runs without a visible window. Pass
                ``False`` if a site serves different content to headless Chrome.
            element_by: Selenium locator strategy used by ``click_button``
                (forwarded as ``by=`` to ``find_element``).
        """
        self.url = url
        self.headless = headless
        self.element_by = element_by

    def build_driver(self) -> "webdriver.Chrome":
        """Create a Chrome driver, downloading a matching chromedriver if needed."""
        chrome_options = Options()
        if self.headless:
            # Without this flag the browser window is still shown.
            chrome_options.add_argument("--headless=new")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--disable-dev-shm-usage")
        # Built outside the ``if`` so that headless=False also gets a driver.
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

    @staticmethod
    def _save_html(html: str, website_name: str, car_type: str) -> Path:
        """Write ``html`` to the dated output file and return its path."""
        if html is None:
            raise ValueError('failed to save: no page source was captured')

        output = Path('output') / 'html' / f'{website_name}'
        output.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime('%Y%m%d')
        filename = f'{ts}_{website_name}_{car_type}.html'
        file_path = output / filename

        file_path.write_text(html, encoding='utf-8')
        print(f'saved {filename} to {file_path}')
        return file_path

    def click_button(self, website_name: str, car_type: str, button_path: str, timeout: float = 0.5):
        """Click the element at ``button_path`` until it disappears, then save the page.

        Args:
            website_name: Used for the output folder and file name.
            car_type: Used for the output file name.
            button_path: Locator value for the button (interpreted with
                ``self.element_by``).
            timeout: Seconds to sleep before every lookup of the button.

        Returns:
            Path of the saved HTML file.

        The loop ends only when the button can no longer be located; any other
        Selenium error propagates to the caller after the browser is closed.
        """
        driver = self.build_driver()
        try:
            driver.get(self.url)
            while True:
                time.sleep(timeout)
                try:
                    button = driver.find_element(by=self.element_by, value=button_path)
                except NoSuchElementException:
                    print('There is no button to click, function end')
                    break
                button.click()
                print('clicked button')
            html = driver.page_source
        finally:
            # Always release the browser, even when a click fails.
            driver.quit()

        return self._save_html(html, website_name, car_type)

    def infinite_scroll(self, website_name: str, car_type: str, timeout: float = 2.5, scroll_step: int = 500):
        """Scroll down in steps until the page stops growing, then save the page.

        Args:
            website_name: Used for the output folder and file name.
            car_type: Used for the output file name.
            timeout: Seconds to sleep after every scroll step.
            scroll_step: Maximum number of pixels to scroll per step.

        Returns:
            Path of the saved HTML file.

        Scrolling stops when the scroll position reaches the maximum scroll
        offset, or after three consecutive steps that did not move the page.
        The page source is saved in both cases.
        """
        max_stalled_rounds = 3
        stalled_rounds = 0

        driver = self.build_driver()
        try:
            driver.get(self.url)
            while True:
                # Re-measured every round because the page may have grown.
                max_scroll = driver.execute_script("""
        return Math.max(
            document.body.scrollHeight,
            document.documentElement.scrollHeight
        ) - window.innerHeight;
    """)
                current_position = driver.execute_script("return window.scrollY;")
                if current_position >= max_scroll:
                    print('Reached bottom, function end')
                    break

                # Never overshoot the bottom; scroll at least one pixel so a
                # fractional remainder cannot stall the loop.
                remaining = max_scroll - current_position
                delta = max(1, int(min(scroll_step, remaining)))

                driver.execute_script(f'window.scrollBy(0,{delta});')
                time.sleep(timeout)
                new_position = driver.execute_script('return window.scrollY;')
                print(f'{new_position}/{max_scroll}')

                if new_position == current_position:
                    stalled_rounds += 1
                    if stalled_rounds >= max_stalled_rounds:
                        print('Scrolled done, please check again')
                        break
                else:
                    stalled_rounds = 0
            # Captured after the loop so both exit paths have HTML to save.
            html = driver.page_source
        finally:
            driver.quit()

        return self._save_html(html, website_name, car_type)



In [4]:
class html_parser:
    """Read a saved HTML file and extract text or attributes from repeated elements."""

    def __init__(self, path: Path):
        """
        Args:
            path: Location of the HTML file to parse.
        """
        self.path = path

    def soup(self) -> "BeautifulSoup":
        """Read the file as UTF-8 and parse it with the built-in ``html.parser``."""
        html_text = Path(self.path).read_text(encoding='utf-8')
        return BeautifulSoup(html_text, 'html.parser')

    def select_cards_css(self, css_parents: str) -> "list[Tag]":
        """Return every element in the file that matches ``css_parents``."""
        return self.soup().select(css_parents)

    @staticmethod
    def _select_one(card, css: str):
        """Return the first match of ``css`` inside ``card``, or ``None``.

        A failing lookup (for example a malformed selector) is reported with a
        warning instead of being hidden, and is treated as "no match" so one
        bad selector does not abort the whole extraction.
        """
        import warnings

        try:
            return card.select_one(css)
        except Exception as exc:
            warnings.warn(f'could not apply selector {css!r}: {exc}', stacklevel=3)
            return None

    def get_text(self, parent: "list[Tag]", css_elements: str) -> "list[Optional[str]]":
        """Collect the stripped text of the first ``css_elements`` match in each parent.

        Returns:
            One entry per parent, in order; ``None`` where nothing matched.
        """
        texts = []
        for card in parent:
            element = self._select_one(card, css_elements)
            texts.append(None if element is None else element.get_text(strip=True))
        return texts

    def get_attribute(self, parent: "list[Tag]", css_element: str, attribute: str) -> "list[Optional[str]]":
        """Collect ``attribute`` of the first ``css_element`` match in each parent.

        Returns:
            One entry per parent, in order; ``None`` where nothing matched or
            the matched element does not carry the attribute.
        """
        values = []
        for card in parent:
            element = self._select_one(card, css_element)
            values.append(None if element is None else element.get(attribute))
        return values

In [None]:
def get_data(
        path: str,
        car_type: str,
        website_name: str,
        parent_css: str,
        name_css: str,
        link_prodcut_css: str,
        link_prodcut_attr: str,
        price_css: str,
        odo_css: str,
        trans_css: str,
        fuel_css: str,
        ) -> Path:
    """Extract one row per listing from a saved HTML file and write them to CSV.

    Args:
        path: Saved HTML file to parse.
        car_type: Label stored in the ``car_type`` column and in the file name.
        website_name: Used for the output folder and file name.
        parent_css: Selector matching one element per listing.
        name_css: Selector for the listing name, relative to each listing.
        link_prodcut_css: Selector for the element that carries the link.
        link_prodcut_attr: Attribute of that element to read (e.g. ``href``).
        price_css: Selector for the price.
        odo_css: Selector for the odometer reading.
        trans_css: Selector for the transmission.
        fuel_css: Selector for the fuel type.

    Returns:
        Path of the CSV written to
        ``output/csv/<website_name>/<YYYYMMDD>_<website_name>_<car_type>.csv``,
        where the date is the day this function runs.
    """
    # One parser is enough: only select_cards_css reads the file.
    parser = html_parser(path)
    cards = parser.select_cards_css(parent_css)
    if not cards:
        print(f'no elements matched {parent_css!r} in {path}; the CSV will have no rows')

    update_date = datetime.now().strftime('%Y%m%d')

    data = {
        'name': parser.get_text(parent=cards, css_elements=name_css),
        'link_product': parser.get_attribute(cards, link_prodcut_css, link_prodcut_attr),
        'price': parser.get_text(cards, price_css),
        'odometer': parser.get_text(cards, odo_css),
        'transmissions': parser.get_text(cards, trans_css),
        'fuel_type': parser.get_text(cards, fuel_css),
        # Scalars are repeated on every row by the DataFrame constructor.
        'car_type': car_type,
        'update_date': update_date,
    }

    df = pd.DataFrame(data)

    output = Path('output') / 'csv' / f'{website_name}'
    output.mkdir(parents=True, exist_ok=True)
    filename = f'{update_date}_{website_name}_{car_type}.csv'
    file_path = output / filename

    df.to_csv(file_path, index=False, encoding="utf-8-sig")
    return file_path

## Cars24 (infinite-scroll website)

In [None]:
# Cars24 listing URL; only the body-type filter differs between crawls.
# Every crawl must hit www.cars24.com.au, since the output is filed under 'cars24'.
cars24_url_template = (
    'https://www.cars24.com.au/buy-used-cars-australia'
    '?sf=bodyType%3A{body_type}&sf=cityName%3ABrisbane&sf=sellerType%3ACARS24'
    '&sf=gaId:GA1.3.410291439.1761286374&sf=city:BNE100&entireMakeSelected=false'
)
cars24_body_types = ['Cab Chassis', 'Coupe', 'Hatchback', 'SUV', 'Sedan', 'Ute', 'Van', 'Wagon']

cars24_html_paths = []
for body_type in cars24_body_types:
    # The body type goes into the query string, so spaces must be percent-encoded.
    cars24_url = cars24_url_template.format(body_type=body_type.replace(' ', '%20'))
    cars24_html_paths.append(
        driver_to_crawl_page(url=cars24_url).infinite_scroll('cars24', body_type)
    )

cars24_html_paths


In [55]:
# Selectors for one Cars24 listing card and the fields inside it.
parent_css = 'div.col-md-4'
name_css = 'div > h2'
link_prodcut_css = 'div > a'
link_prodcut_attr = 'href'
price_css = 'div > div > div > strong'
odo_css = 'ul > li:nth-of-type(1)'
trans_css = 'ul > li:nth-of-type(2)'
fuel_css = 'ul > li:nth-of-type(3)'

cars24_html_dir = Path('output') / 'html' / 'cars24'

for car_type in ['Cab Chassis', 'Coupe', 'Hatchback', 'Sedan', 'SUV', 'Ute', 'Van', 'Wagon']:
    # Saved pages are named <YYYYMMDD>_cars24_<car_type>.html, so the last
    # name in sorted order is the most recent crawl for this car type.
    snapshots = sorted(cars24_html_dir.glob(f'*_cars24_{car_type}.html'))
    if not snapshots:
        raise FileNotFoundError(f'no saved HTML for cars24 {car_type} in {cars24_html_dir}')
    get_data(str(snapshots[-1]), car_type, 'cars24', parent_css, name_css,
             link_prodcut_css, link_prodcut_attr, price_css, odo_css, trans_css, fuel_css)

## Cheapcarco (click the 'Load more' button)

In [None]:
# Absolute XPath of the button that is clicked repeatedly on every category page.
cheapcarco_button_xpath = '/html/body/div[1]/div/main/div/div/div/div/div[2]/button'

# (label used in the output file name, value of the site's category filter)
cheapcarco_categories = [
    ('Coupe', 'Coupes'),
    ('Hatchback', 'Hatchbacks'),
    ('SUV', 'AWDs,SUVs'),
    ('Sedan', 'Sedans'),
    ('Ute', 'Utes'),
    ('Van', 'Vans'),
    ('Wagon', 'Wagons'),
]

cheapcarco_html_paths = []
for car_type, category in cheapcarco_categories:
    cheapcarco_url = f'https://www.cheapcarco.com.au/used-vehicles-rocklea/list?q=category:{category}'
    cheapcarco_html_paths.append(
        driver_to_crawl_page(url=cheapcarco_url).click_button('cheapcarco', car_type, cheapcarco_button_xpath)
    )

cheapcarco_html_paths

In [12]:
# Selectors for one Cheapcarco listing card and the fields inside it.
parent_css = 'main div[class*="flex"][class*="flex-col"][class*="h-full"]'
name_css = 'div > h5'
link_prodcut_css = 'a'
link_prodcut_attr = 'href'
price_css = 'div:nth-of-type(3) > h3'
# A pseudo-class must follow its colon directly: 'div: nth-of-type(1)' is not
# a valid selector and matches nothing.
odo_css = 'div:nth-of-type(2) > div:nth-of-type(1) > span:nth-of-type(2)'
trans_css = 'div:nth-of-type(2) > div:nth-of-type(3) > span:nth-of-type(2)'
fuel_css = 'div:nth-of-type(4) > span:nth-of-type(2)'

cheapcarco_html_dir = Path('output') / 'html' / 'cheapcarco'

for car_type in ['Coupe', 'Hatchback', 'Sedan', 'SUV', 'Ute', 'Van', 'Wagon']:
    # Saved pages are named <YYYYMMDD>_cheapcarco_<car_type>.html, so the last
    # name in sorted order is the most recent crawl for this car type.
    snapshots = sorted(cheapcarco_html_dir.glob(f'*_cheapcarco_{car_type}.html'))
    if not snapshots:
        raise FileNotFoundError(f'no saved HTML for cheapcarco {car_type} in {cheapcarco_html_dir}')
    get_data(str(snapshots[-1]), car_type, 'cheapcarco', parent_css, name_css,
             link_prodcut_css, link_prodcut_attr, price_css, odo_css, trans_css, fuel_css)