In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException,TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
from pathlib import Path
from datetime import date

### Build the driver and load the full HTML page by clicking a button or by infinite scrolling

In [None]:
class driver_to_crawl_page:
    """Open a listing page in Chrome, expand it fully, and return its product elements.

    Two expansion strategies are offered: repeatedly clicking a button
    (``click_button``) or scrolling until the page stops growing
    (``infinite_scroll``). Both return Selenium elements that are only usable
    while the browser that produced them is open, so that browser is kept on
    ``self.driver``; call ``close()`` once the elements have been read.

    Args:
        headless: Run Chrome without a visible window when True.
        url: Page to load.
        element_by: Selenium locator strategy used for every lookup.
        elements_value: Locator that matches one element per product.
    """

    def __init__(self,
                 headless: bool = True,
                 url: str ='https://www.cheapcarco.com.au/used-vehicles-rocklea/list',
                 element_by: str = 'xpath',
                 elements_value: str = "//main/*/*/*/*/*/*/*/*/*/div[contains(@class, 'flex flex-col h-full')]"
                 ):
        self.url = url
        self.headless = headless
        self.element_by = element_by
        self.elements_value = elements_value
        # Browser opened by the most recent click_button()/infinite_scroll()
        # call; None until one of them runs or after close().
        self.driver = None

    def build_driver(self) -> "webdriver.Chrome":
        """Create a new Chrome driver, headless or windowed according to ``self.headless``.

        Returns:
            A freshly started Chrome WebDriver. The caller owns it and is
            responsible for quitting it.
        """
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument("--headless=new")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--disable-dev-shm-usage")
        # The driver must be created for both modes; previously this only
        # happened inside the headless branch, so headless=False failed with
        # an unbound local variable.
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

    def close(self) -> None:
        """Quit the browser held on ``self.driver``, if any, and forget it."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def _open_page(self) -> "webdriver.Chrome":
        """Start a browser, remember it on ``self.driver`` and navigate to ``self.url``."""
        self.driver = self.build_driver()
        try:
            self.driver.get(self.url)
        except BaseException:
            # Nothing useful can be returned, so do not leave the browser running.
            self.close()
            raise
        return self.driver

    def click_button(self, timeout: float = 0.5, button_path: str = '/html/body/div[1]/div/main/div/div/div/div/div[2]/button') -> list:
        """Click the button at ``button_path`` until it disappears, then collect products.

        Args:
            timeout: Seconds to wait before each lookup of the button.
            button_path: Locator of the button, resolved with ``self.element_by``.

        Returns:
            The elements matching ``self.elements_value`` once the button can
            no longer be found. They belong to ``self.driver``.

        Raises:
            Any Selenium error other than the button being absent; the browser
            is closed before the error propagates.
        """
        driver = self._open_page()
        try:
            while True:
                time.sleep(timeout)
                try:
                    driver.find_element(by=self.element_by, value=button_path).click()
                except NoSuchElementException:
                    # No button left means the page has no more items to load.
                    print('Crawling new products list')
                    break
            return driver.find_elements(by=self.element_by, value=self.elements_value)
        except BaseException:
            self.close()
            raise

    def infinite_scroll(self, timeout: float = 0.5) -> list:
        """Scroll down until the document height stops changing, then collect products.

        Args:
            timeout: Seconds to wait after each jump to the bottom of the page.

        Returns:
            The elements matching ``self.elements_value`` after scrolling
            finishes. They belong to ``self.driver``.

        Raises:
            Any Selenium error raised while scrolling; the browser is closed
            before the error propagates.
        """
        driver = self._open_page()
        try:
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(timeout)
                driver.execute_script('window.scrollBy(0,1000)')
                new_height = driver.execute_script('return document.body.scrollHeight')
                print (f'{new_height} - {last_height}')

                if new_height == last_height:
                    # The page did not grow after the last scroll: nothing more to load.
                    print ('scroll done')
                    break
                last_height = new_height
            return driver.find_elements(by=self.element_by, value=self.elements_value)
        except BaseException:
            self.close()
            raise
        

## Cheapcarco

In [None]:
# One default-configured loader serves both calls below.
page_loader = driver_to_crawl_page()

# `driver` is a separate browser that is handed to WebsiteCrawling later on.
driver = page_loader.build_driver()

# click_button() starts its own browser, expands the listing, and returns the
# product elements found on that page.
products = page_loader.click_button()


### Crawl the website by finding elements

In [None]:
class WebsiteCrawling:
    """Read one field per product from a list of already-located product elements.

    Args:
        product: Iterable of Selenium elements, one per product.
        driver: WebDriver stored on ``self.driver``; it is only used for the
            optional navigation in the constructor.
        url: Page the driver is sent to when ``load_url`` is True.
        elements_by: Selenium locator strategy used for lookups inside each product.
        load_url: When True (the default, matching the previous behavior) the
            constructor calls ``driver.get(url)``. Pass False when ``driver``
            is the browser that produced ``product``: reloading that page
            would invalidate the element references.
    """

    def __init__(self,
                 product: list,
                 driver: "webdriver.Chrome",
                 url: str ='https://www.cheapcarco.com.au/used-vehicles-rocklea/list',
                 elements_by: str = 'xpath',
                 load_url: bool = True
                 ):
        self.url = url
        self.driver = driver
        self.elements_by = elements_by
        self.products = product
        if load_url:
            self.driver.get(self.url)

    def _collect(self, path: str, extract) -> list:
        """Apply ``extract`` to the element at ``path`` inside every product.

        Products without a matching element contribute ``None`` so the result
        always has exactly one entry per product.
        """
        values = []
        for product in self.products:
            try:
                element = product.find_element(by=self.elements_by, value=path)
                values.append(extract(element))
            except NoSuchElementException:
                values.append(None)
        return values

    def get_link (self, path: str, get_attribute_part: str) -> list:
        """Return the ``get_attribute_part`` attribute of the element at ``path`` for each product.

        Args:
            path: Locator evaluated relative to each product element.
            get_attribute_part: Name of the attribute to read (for example ``href``).

        Returns:
            One value per product; ``None`` where the element is missing.
        """
        return self._collect(path, lambda element: element.get_attribute(get_attribute_part))

    def get_text (self, path: str) -> list:
        """Return the visible text of the element at ``path`` for each product.

        Args:
            path: Locator evaluated relative to each product element.

        Returns:
            One string per product; ``None`` where the element is missing.
        """
        return self._collect(path, lambda element: element.text)

In [None]:
# Build the extractor once. Its constructor sends `driver` to the listing URL
# on every call, so creating a new instance per field reloaded the page eight
# times without changing any result.
extractor = WebsiteCrawling(products, driver)

# Each call returns one entry per product (None where the element is missing),
# so all lists stay aligned row by row.
name = extractor.get_text('.//h5')
link_product = extractor.get_link('.//a','href')
link_image = extractor.get_link('.//a/div/img','src')
price = extractor.get_text('.//div[3]/h3')
odo = extractor.get_text('.//div[2]/div[1]/span[2]')
cyl = extractor.get_text('.//div[2]/div[2]/span[2]')
trans = extractor.get_text('.//div[2]/div[3]/span[2]')
fuel = extractor.get_text('.//div[4]/span[2]')


In [None]:
# Date of this crawl; as a single value it is repeated on every row.
now = date.today()

# Assemble the table straight from the per-field lists (one entry per product)
# instead of first binding the same name to a plain dict.
cheapcarco = pd.DataFrame(
    {
        'name': name,
        'link_product': link_product,
        'link_image': link_image,
        'price': price,
        'odometer': odo,
        'cylinder': cyl,
        'transmission': trans,
        'fuel_type': fuel,
        'last_update': now,
    }
)

# Persist the result next to the notebook, without the positional index column.
cheapcarco.to_csv('cheapcarco.csv', index=False)