In [None]:
# Notebook setup cell: installs the crawler's dependencies when the notebook is
# run directly. The `%pip` / `!` lines are IPython magics, so this cell is only
# valid inside Jupyter/IPython, not as plain Python.
if __name__ == '__main__':
    # Python packages imported by the cells below.
    %pip install patchright lxml nest_asyncio
    # NOTE(review): the code below imports patchright, not playwright, so this
    # line is presumably redundant (and assumes a `playwright` CLI exists) - confirm.
    !playwright install
    # Download the Chromium build that the crawler launches.
    !patchright install chromium

In [None]:
import nest_asyncio
nest_asyncio.apply()

from enum import Enum
from typing import Dict, List
from patchright.async_api import async_playwright
from lxml import html
import asyncio
from abc import ABC, abstractmethod
import re

In [None]:
async def scroll_to_bottom(page, scroll_delay=0.1, scroll_size=2160, max_scrolls=3):
    """
    Scroll down the page in fixed steps, pausing after each step so lazily loaded content can render.

    Note that, despite the name, the page is scrolled a fixed number of times
    (``max_scrolls`` steps of ``scroll_size`` pixels); the real bottom of a very
    long page may not be reached.

    Args:
        page: The Playwright page instance (anything exposing an awaitable ``evaluate``).
        scroll_delay: Delay in seconds after each scroll to allow content loading.
        scroll_size: Number of pixels the scroll target advances per step.
        max_scrolls: Number of scroll steps to perform.
    """

    print("Scrolling through the page...")

    for step in range(1, max_scrolls + 1):
        # scrollTo takes an absolute offset, so the target grows with each step.
        await page.evaluate(f"window.scrollTo(0, {step * scroll_size})")

        # Wait for content to load
        await asyncio.sleep(scroll_delay)

    print("Finished scrolling through the page.")



In [None]:
async def block_unnecessary_resources(route):
    """
    Route handler that drops image requests and lets every other request through.

    Args:
        route: The intercepted route; its ``request.resource_type`` decides the outcome.
    """
    is_blocked = route.request.resource_type in ["image"]
    if not is_blocked:
        await route.continue_()
        return

    await route.abort()

In [None]:
class WebsiteInterface(ABC):
    """
    Abstract base for website crawlers whose search URL is configured from an LLM response.

    Subclasses must implement all three abstract methods below.
    """

    def __init__(self):
        # Root URL of the target website; empty here, subclasses set their own.
        self.base_url: str = ""

    @abstractmethod
    async def crawl(self) -> List[Dict[str, str]]:
        """
        Crawl the website and extract its listings.

        Abstract: every subclass has to provide its own implementation.
        """

    @abstractmethod
    def get_filters_info(self) -> str:
        """
        Return a prompt for the LLM that describes the filters and the expected output format.

        Abstract: every subclass has to provide its own implementation.
        """

    @abstractmethod
    def set_filters_from_llm_response(self, llm_response: str):
        """
        Process the LLM's response and set the URL with the appropriate filters.

        Abstract: every subclass has to provide its own implementation.
        """

class AutotraderInterface(WebsiteInterface):
    """
    Crawler for Autotrader search-result pages and individual listing pages.

    The search URL is normally produced by an LLM from the prompt returned by
    ``get_filters_info`` and installed with ``set_filters_from_llm_response``.
    """

    def __init__(self):
        super().__init__()
        self.base_url = "https://www.autotrader.com/cars-for-sale/all-cars"
        # Example of a filtered search URL:
        # https://www.autotrader.com/cars-for-sale/all-cars/floral-park-ny?endYear=2022&makeCode=BMW&makeCode=FORD&newSearch=true&startYear=2012&zip=11001

        # Default to the unfiltered search so crawl() does not fail with an
        # AttributeError when no filters have been set yet.
        self.url = self.base_url

    async def _fetch_page_content(self, url: str) -> str:
        """
        Load ``url`` in a headless Chromium, scroll through it and return the resulting HTML.

        The Playwright driver and the browser are always released, even when
        navigation or scrolling raises.
        """
        # The context manager stops the Playwright driver on exit.
        async with async_playwright() as playwright:
            # Launch browser in headless mode
            browser = await playwright.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-extensions",
                    "--disable-gpu",
                ],
            )
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
                    viewport={"width": 1920, "height": 1080},
                    locale="en-US",
                    timezone_id="America/New_York",
                )

                print("Opening browser page")

                page = await context.new_page()

                # Skip image downloads; only the HTML is parsed afterwards.
                await page.route("**/*", block_unnecessary_resources)

                print("Loading page")

                await page.goto(url, wait_until="domcontentloaded")

                print("Page partially loaded. Starting to scroll.")

                # Scroll down so lazily loaded content ends up in the DOM
                await scroll_to_bottom(page)

                return await page.content()
            finally:
                await browser.close()

    def _parse_listing(self, listing, domain):
        """
        Turn one listing element into a dict, or return None when it has no link.

        Args:
            listing: lxml element of a single search-result card.
            domain: Scheme and host that is prepended to the listing's href.
        """
        raw = {
            'title': listing.xpath('.//h2[@data-cmp="subheading"]/text()'),
            'mileage': listing.xpath('.//div[@data-cmp="mileageSpecification"]/text()'),
            'price': listing.xpath('.//div[@data-cmp="firstPrice"]/text()'),
            'dealer': listing.xpath('.//div[@class="text-subdued"]/text()'),
            'phone': listing.xpath('.//span[@data-cmp="phoneNumber"]/text()'),
            'url': listing.xpath('.//a[@data-cmp="link"]/@href'),
            'image': listing.xpath('.//img[@data-cmp="inventoryImage"]/@src'),
        }

        # Keep the first match of every field, stripped; missing fields become None
        car_data = {key: (val[0].strip() if val else None) for key, val in raw.items()}

        # Without a link there is neither a usable URL nor an ID for the listing.
        if not car_data['url']:
            return None

        # Drop the query string and add the domain in front of the href
        car_data['url'] = domain + car_data['url'].split('?')[0]

        # The ID is the class name plus the last path segment of the listing URL
        return {"id": f"{self.__class__.__name__}_{car_data['url'].split('/')[-1]}"} | car_data

    async def crawl(self) -> List[Dict[str, str]]:
        """
        Crawl the search page at ``self.url`` and extract its car listings.

        Returns:
            One dict per listing with the keys ``id``, ``title``, ``mileage``,
            ``price``, ``dealer``, ``phone``, ``url`` and ``image``. Fields that
            are missing on the page are None; listings without a link are skipped.
        """
        page_content = await self._fetch_page_content(self.url)

        # Parse HTML using lxml
        tree = html.fromstring(page_content)

        # XPath to select each car listing container
        listings_elements = tree.xpath('//div[@data-cmp="inventoryListing"]')

        # Extract domain from the base URL without the path (same for every listing)
        domain = re.sub(r'^(https?://[^/]+).*$', r'\1', self.base_url)

        listings = []
        for listing in listings_elements:
            car_data = self._parse_listing(listing, domain)
            if car_data is not None:
                listings.append(car_data)

        if __name__ == "__main__":
            print("Found the following car listings:")
            # Display the extracted data
            for car in listings:
                print(car)

        print("Found", len(listings), "listings")

        return listings

    async def crawl_listing(self, listing_url) -> str:
        """
        Crawl a single listing page and return its text content.

        Args:
            listing_url: URL of the listing page to load.

        Returns:
            All text nodes of the listing's main container, joined with tabs.
        """
        page_content = await self._fetch_page_content(listing_url)

        # Parse HTML using lxml to extract all the text
        tree = html.fromstring(page_content)
        text_nodes = tree.xpath("//div[contains(@class, 'container') and contains(@class, 'margin-top-5')]/div[contains(@class, 'row')]//text()")
        listing_info = "\t".join(text_nodes).strip()

        if __name__ == "__main__":
            print("Found the following information:")
            # Print the extracted text
            print(listing_info)

        return listing_info

    def get_filters_info(self) -> str:
        """
        Return a prompt for the LLM describing the filters and expected output format.
        """
        return f"""
        You are a helpful assistant that translates user requirements into a URL with query parameters.

        The base URL is: {self.base_url}
        Filters:
        - zip: User's zip code (integer).
        - searchRadius: Search radius in miles (integer, e.g., 75, 100, 200).
        - startYear: Minimum year of the car (integer).
        - endYear: Maximum year of the car (integer).
        - makeCode: Car manufacturer code (string, can appear multiple times, e.g., "BMW", "FORD").
        - listingType: Type of listing (one of "NEW", "USED", "CERTIFIED", "3P_CERT").
        - mileage: Maximum mileage of the car (integer).
        - driveGroup: Type of drive (one of "AWD4WD", "FWD", "RWD").
        - extColorSimple: External color of the car (e.g., "BLACK", "WHITE", "RED", "GRAY").
        - intColorSimple: Internal color of the car (e.g., "BEIGE", "BLACK", "BLUE").
        - mpgRange: Fuel efficiency in miles per gallon (e.g., "30-MPG").
        - fuelTypeGroup: Type of fuel (one of "GSL", "DSL", "HYB", "ELE", "PIH").
        - bodyStyleSubtypeCode: Type of body style (e.g., "FULLSIZE_CREW", "COMPACT_EXTEND").
        - truckBedLength: Truck bed length (e.g., "SHORT", "EXTRA SHORT", "UNSPECIFIED").
        - vehicleStyleCode: Vehicle style (e.g., "CONVERT", "WAGON", "HATCH", "SUVCROSS").
        - dealType: Type of deal (e.g., "goodprice", "greatprice").
        - doorCode: Number of doors (e.g., "2", "3", "4").
        - engineDisplacement: Engine size range in liters (e.g., "1.0-1.9", "2.0-2.9").
        - featureCode: Specific features of the car (e.g., "1062" for heated seats, "1327" for navigation).
        - transmissionCode: Transmission type (e.g., "AUT" for automatic, "MAN" for manual).
        - vehicleHistoryType: Vehicle history (e.g., "NO_ACCIDENTS", "ONE_OWNER", "CLEAN_TITLE").
        - newSearch: Boolean to indicate a new search (e.g., "true").
        - sortBy: Sorting option for the results (optional). 
            Options:
            - "relevance" (default): Sort by relevance.
            - "derivedpriceASC": Sort by price, lowest to highest.
            - "derivedpriceDESC": Sort by price, highest to lowest.
            - "distanceASC": Sort by distance, closest to farthest.
            - "datelistedASC": Sort by date, oldest first.
            - "datelistedDESC": Sort by date, newest first.
            - "mileageASC": Sort by mileage, lowest to highest.
            - "mileageDESC": Sort by mileage, highest to lowest.
            - "yearASC": Sort by year, oldest to newest.
            - "yearDESC": Sort by year, newest to oldest.
        
        Special filters:
        - price: Price is embedded in the path of the URL, e.g., "/cars-over-45000" or "/cars-between-10000-and-20000".

        Example Output:
        A complete URL with query parameters, e.g.,:
        "{self.base_url}/cars-between-10000-and-20000?zip=10001&startYear=2010&endYear=2020&makeCode=BMW&makeCode=FORD&listingType=USED&mileage=50000&fuelTypeGroup=GSL&intColorSimple=BLACK&vehicleHistoryType=NO_ACCIDENTS"

        Based on the user's needs, format the response as only the complete URL (no extra explanations). The URL is an example, don't include filters if they are not needed by the user.
        """

    def set_filters_from_llm_response(self, llm_response: str):
        """
        Process the LLM's response and set the URL with the provided parameters.

        Surrounding whitespace and quote characters are removed before the
        response is validated, because the prompt's example shows the URL in
        double quotes and models tend to echo that.

        Raises:
            ValueError: If the cleaned response does not start with ``self.base_url``.
        """
        # Clean first, validate second: otherwise a leading space or quote
        # would make an otherwise valid URL fail the prefix check.
        candidate = llm_response.strip().strip('"\'`').strip()

        if candidate.startswith(self.base_url):
            self.url = candidate
        else:
            raise ValueError("Invalid URL format provided by LLM response: " + llm_response)

In [None]:
# if __name__ == "__main__":
#     interface = AutotraderInterface()
#     interface.url = "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000?newSearch=true&bodyStyleSubtypeCode=COMPACT&mpgRange=30-MPG&vehicleHistoryType=CLEAN_TITLE"

#     listings = await interface.crawl()

In [None]:
# if __name__ == "__main__":
#     interface = AutotraderInterface()
#     listing_info = await interface.crawl_listing("https://www.autotrader.com/cars-for-sale/vehicle?listingId=726641083&clickType=elot")