In [7]:
from enum import Enum
from typing import Dict, List
from patchright.async_api import async_playwright
from lxml import html
import asyncio
from abc import ABC, abstractmethod

In [8]:
async def scroll_to_bottom(page, scroll_delay=0.25, max_scrolls=10, scroll_step=1000,
                           start_offset=500, stop_when_stable=False):
    """
    Scroll down the page in fixed steps, pausing after each one so dynamic content can load.

    With the default arguments the page is scrolled to the absolute offsets
    500, 1500, ..., 9500 px (ten steps). The function does not verify that the
    real bottom of the document was reached unless ``stop_when_stable`` is set.

    Args:
        page: The Playwright page instance.
        scroll_delay: Delay in seconds between scrolls to allow content loading.
        max_scrolls: Maximum number of scroll steps to perform.
        scroll_step: Pixels added to the scroll target after each step.
        start_offset: Absolute vertical offset (pixels) of the first scroll.
        stop_when_stable: When True, read ``document.body.scrollHeight`` after
            each step and stop early once the height is unchanged since the
            previous step and the scroll target has reached it. Off by default,
            so every one of the ``max_scrolls`` steps is executed.
    """
    previous_height = None
    target = start_offset

    for _ in range(max_scrolls):
        # Jump to the next absolute offset.
        await page.evaluate(f"window.scrollTo(0, {target})")

        # Wait for content to load
        await asyncio.sleep(scroll_delay)

        if stop_when_stable:
            # Only pay for the extra round-trip when early stopping was requested.
            current_height = await page.evaluate("document.body.scrollHeight")

            # An unchanged height alone is not enough: the target must also have
            # reached the document height, otherwise content further down may
            # simply not have been triggered yet.
            if current_height == previous_height and target >= current_height:
                break
            previous_height = current_height

        target += scroll_step

In [None]:
class WebsiteInterface(ABC):
    """
    Contract shared by the site-specific scrapers.

    A concrete subclass describes its filters to an LLM, accepts the LLM's
    answer to configure the search, and crawls the resulting page.
    """

    def __init__(self) -> None:
        # Empty placeholder; subclasses are expected to overwrite it.
        self.base_url = ""

    @abstractmethod
    async def crawl(self) -> List[Dict[str, str]]:
        """
        Crawl the website and return the extracted listings.

        Subclasses must provide the implementation.
        """

    @abstractmethod
    def get_filters_info(self) -> str:
        """
        Build the LLM prompt that describes the available filters and the expected output format.

        Subclasses must provide the implementation.
        """

    @abstractmethod
    def set_filters_from_llm_response(self, llm_response: str) -> None:
        """
        Consume the LLM's response and configure the URL with the matching filters.

        Subclasses must provide the implementation.
        """

class AutotraderInterface(WebsiteInterface):
    """
    Scraper for Autotrader search-result pages.

    Filters are expressed as query parameters on the search URL. The URL to
    crawl is stored in ``self.url``; it starts out as ``base_url`` and is
    normally replaced through ``set_filters_from_llm_response``.
    """

    class ListingType(Enum):
        """Values offered for the ``listingType`` filter."""
        NEW = "NEW"
        USED = "USED"
        CERTIFIED = "CERTIFIED"
        TP_CERT = "3P_CERT" # Third-party certified

    # Upper bound (milliseconds) on waiting for the optional consent button.
    # Without it the click waits for Playwright's default action timeout on
    # every page that shows no consent dialog.
    CONSENT_TIMEOUT_MS = 5000

    # XPath selecting each car listing container.
    _LISTING_XPATH = '//div[@data-cmp="inventoryListing"]'

    # Output key -> XPath evaluated relative to one listing container.
    _FIELD_XPATHS = {
        'title': './/h2[@data-cmp="subheading"]/text()',
        'url': './/a[@data-cmp="link"]/@href',
        'mileage': './/div[@data-cmp="mileageSpecification"]/text()',
        'price': './/div[@data-cmp="firstPrice"]/text()',
        'dealer': './/div[@class="text-subdued"]/text()',
        'phone': './/span[@data-cmp="phoneNumber"]/text()',
        'image': './/img[@data-cmp="inventoryImage"]/@src',
    }

    def __init__(self):
        super().__init__()
        self.base_url = "https://www.autotrader.com/cars-for-sale/all-cars"
        # https://www.autotrader.com/cars-for-sale/all-cars/floral-park-ny?endYear=2022&makeCode=BMW&makeCode=FORD&newSearch=true&startYear=2012&zip=11001

        # Default to the unfiltered search so crawl() works even before any
        # filters were set (previously this raised AttributeError).
        self.url = self.base_url

        # All the filters are search parameters in the URL
        self.filters = {
            "zip": int,
            "startYear": int,
            "endYear": int,
            "makeCode": str,  # Can appear multiple times
            "listingType": AutotraderInterface.ListingType, # Can appear multiple times
            "mileage": int,
        }

    async def crawl(self) -> List[Dict[str, str]]:
        """
        Load ``self.url`` in headless Chromium and extract the car listings.

        Returns:
            One dict per listing with the keys ``title``, ``url``, ``mileage``,
            ``price``, ``dealer``, ``phone`` and ``image``. Each value is the
            stripped first XPath match, or ``None`` when nothing matched.
        """
        async with async_playwright() as p:
            # Launch browser in headless mode
            # NOTE(review): "--enable-automation" sits next to the flag that
            # disables AutomationControlled — confirm it is intended.
            browser = await p.chromium.launch(headless=True,
                                                args=[
                                                    "--disable-blink-features=AutomationControlled",
                                                    "--disable-extensions",
                                                    "--disable-infobars",
                                                    "--enable-automation",
                                                    "--no-first-run",
                                                    "--enable-webgl",
                                                ])
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    viewport={"width": 1920, "height": 1080},
                    locale="en-US",
                    timezone_id="Europe/Paris",
                    java_script_enabled=True,
                )

                page = await context.new_page()

                await page.goto(self.url)

                # If a consent dialog is shown, accept it. Best-effort: the
                # dialog is often absent, so a failed click is expected and
                # ignored. Catching Exception (not a bare except) lets
                # cancellation and KeyboardInterrupt propagate.
                try:
                    await page.click('text="Consent"', timeout=self.CONSENT_TIMEOUT_MS)
                except Exception:
                    pass

                # Scroll down the page so lazily loaded listings are rendered
                await scroll_to_bottom(page)

                page_content = await page.content()
            finally:
                # Close the browser even when navigation or scraping failed.
                await browser.close()

        listings = self._parse_listings(page_content)

        # Echo the results only when running as the main module (e.g. a notebook cell).
        if __name__ == "__main__":
            print("Found the following car listings:")
            # Display the extracted data
            for car in listings:
                print(car)

        return listings

    @classmethod
    def _parse_listings(cls, page_content: str) -> List[Dict[str, str]]:
        """Extract one dict per listing container from the rendered page HTML."""
        # Parse HTML using lxml
        tree = html.fromstring(page_content)

        listings = []
        for listing in tree.xpath(cls._LISTING_XPATH):
            car_data = {}
            for field, xpath in cls._FIELD_XPATHS.items():
                matches = listing.xpath(xpath)
                # Keep only the first match, trimmed; None marks a missing field.
                car_data[field] = matches[0].strip() if matches else None
            listings.append(car_data)
        return listings

    def get_filters_info(self) -> str:
        """
        Return a prompt for the LLM describing the filters and expected output format.
        """
        return f"""
        You are a helpful assistant that translates user requirements into a URL with query parameters.

        The base URL is: {self.base_url}
        Filters:
        - zip: User's zip code (integer, required).
        - startYear: Minimum year of the car (integer).
        - endYear: Maximum year of the car (integer).
        - makeCode: Car manufacturer code (string, can appear multiple times).
        - listingType: Type of listing (one of {', '.join([e.value for e in self.ListingType])}).
        - mileage: Maximum mileage of the car (integer).

        Example Output:
        A complete URL with query parameters, e.g.,:
        "{self.base_url}?zip=10001&startYear=2010&endYear=2020&makeCode=BMW&makeCode=FORD&listingType=USED&mileage=50000"

        Based on the user needs, format the response as only the complete URL (no extra explanations).
        """

    def set_filters_from_llm_response(self, llm_response: str):
        """
        Process the LLM's response and set the URL with the provided parameters.

        Surrounding whitespace, quotes and backticks are removed before the
        response is validated, because the prompt's own example shows the URL
        wrapped in double quotes.

        Args:
            llm_response: Text returned by the LLM; expected to be a URL that
                starts with ``base_url``.

        Raises:
            ValueError: If the cleaned response does not start with ``base_url``.
        """
        # Normalise first: validating the raw text rejected otherwise valid
        # answers that merely had leading whitespace or quoting.
        candidate = llm_response.strip().strip("\"'`").strip()

        # Validate and set the URL from LLM's response
        if not candidate.startswith(self.base_url):
            raise ValueError("Invalid URL format provided by LLM response: " + llm_response)
        self.url = candidate

In [16]:

# Demo cell: crawl one hard-coded Autotrader search and keep the result in `listings`.
if __name__ == "__main__":
    # asyncio.run() is called below from inside the notebook; nest_asyncio.apply()
    # presumably patches asyncio so that works while a loop is already running — confirm.
    import nest_asyncio
    nest_asyncio.apply()

    # The URL is assigned directly instead of going through
    # set_filters_from_llm_response(): it does not start with base_url
    # (".../cars-for-sale/all-cars"), so that validator would reject it.
    interface = AutotraderInterface()
    interface.url = "https://www.autotrader.com/cars-for-sale/ford/new-york-ny?mileage=4000&zip=10001"

    # crawl() itself prints every listing when __name__ == "__main__".
    listings = asyncio.run(interface.crawl())

Found the following car listings:
{'title': 'Used 2023 Ford Bronco Raptor', 'url': '/cars-for-sale/vehicle/722672626?city=New%20York&listingType=USED&makeCode=FORD&mileage=4000&newSearch=false&referrer=%2Fford%2Fnew-york-ny%3Fmileage%3D4000%26zip%3D10001&state=NY&zip=10001&clickType=listing', 'mileage': '280 miles', 'price': '75,000', 'dealer': 'Land Rover Parsippany', 'phone': '(862) 420-3351', 'image': 'https://images.autotrader.com/scaler/408/306/hn/c/ecf342d8606a4b6b862893ac32f28a10.jpg'}
{'title': 'Used 2023 Ford E-Transit 148" High Roof Extended', 'url': '/cars-for-sale/vehicle/722051013?city=New%20York&listingType=USED&makeCode=FORD&mileage=4000&newSearch=false&referrer=%2Fford%2Fnew-york-ny%3Fmileage%3D4000%26zip%3D10001&state=NY&zip=10001&clickType=listing', 'mileage': '39 miles', 'price': '39,995', 'dealer': 'Tasca Ford Mazda Subaru', 'phone': '(914) 515-2082', 'image': 'https://images.autotrader.com/scaler/304/228/hn/c/261df47c8e424ea994f67e2f1c7c01f4.jpg'}
{'title': 'Used 2