## Importing necessary libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import time


from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

## Getting individual car links from all pages

In [2]:
# Set up the Chrome browser (runs with a visible window; headless mode is not enabled)
# Chrome options: a 1920x1080 window with the "AutomationControlled" Blink feature disabled.
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("window-size=1920,1080")

driver = webdriver.Chrome(options=options)

url = "https://www.carvana.com/cars"
wait = WebDriverWait(driver, 10)

# Vehicle detail-page URLs collected across all visited listing pages.
result = []

# Number of listing pages to walk through, starting from `url`.
NUM_PAGES = 1

try:
    driver.get(url)
    time.sleep(5)  # fixed pause so the first listing page can finish rendering

    for i in range(NUM_PAGES):
        print(f"--- Page {i+1} ---")

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Vehicles are read from the page's <script type="application/ld+json"> elements.
        json_scripts = soup.find_all("script", type="application/ld+json")
        car_links = []

        for script in json_scripts:
            try:
                data = json.loads(script.string)
            except (TypeError, ValueError):
                # Empty script body (None) or malformed JSON: nothing to extract.
                continue
            if not isinstance(data, dict) or data.get("@type") != "Vehicle":
                continue
            offers = data.get("offers")
            link = offers.get("url") if isinstance(offers, dict) else None
            if link:
                # Entries without an offer URL are skipped rather than stored as "".
                car_links.append(link)
        result.extend(car_links)

        # The last requested page has been parsed; clicking "next" again would only waste time.
        if i == NUM_PAGES - 1:
            break

        try:
            next_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'li[data-qa="next-page"]'))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            # Click through JavaScript instead of a native WebDriver click.
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)
        except Exception as e:
            print("Next button not found or not clickable:", e)
            break
finally:
    # Always release the browser, even when navigation or parsing raises.
    driver.quit()


The chromedriver version (134.0.6998.165) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (135.0.7049.96); currently, chromedriver 135.0.7049.95 is recommended for chrome 135.*, so it is advised to delete the driver in PATH and retry


--- Page 1 ---


In [6]:
# Show the JSON-LD <script> elements of the most recently parsed listing page.
soup.find_all("script", attrs={"type": "application/ld+json"})

[<script data-qa="vehicle-ld" type="application/ld+json">{"@context":"https://schema.org","@type":"Vehicle","itemCondition":"Used","name":"2023 Dodge Charger","modelDate":2023,"manufacturer":"Dodge","model":"Charger","color":"Black","image":"https://cdnblob.fastly.carvana.io/2003578509/post-large/normalized/zoomcrop/2003578509-edc-02.jpg?v=2025.4.18_21.37.21","brand":"Dodge","description":"Used 2023 Dodge Charger SXT with 9918 miles - $30,990","mileageFromOdometer":9918,"sku":2003578509,"vehicleIdentificationNumber":"2C3CDXJG4PH657674","offers":{"@type":"Offer","price":30990,"priceCurrency":"USD","availability":"http://schema.org/InStock","priceValidUntil":"January 1, 2030","url":"https://www.carvana.com/vehicle/3548013"}}</script>,
 <script data-qa="vehicle-ld" type="application/ld+json">{"@context":"https://schema.org","@type":"Vehicle","itemCondition":"Used","name":"2023 Hyundai IONIQ 6","modelDate":2023,"manufacturer":"Hyundai","model":"IONIQ 6","color":"White","image":"https://cdn

## Scraping individual car data

In [7]:
import random 
import pandas as pd
from tqdm import tqdm

In [8]:
# Load the column names from columns.txt: one name per line, blank lines ignored.
existing_cols = []
with open("columns.txt", "r") as columns_file:
    for raw_line in columns_file:
        column_name = raw_line.strip()
        if column_name:
            existing_cols.append(column_name)

In [9]:
# Load every candidate car URL from unique_urls.txt (one per line, blank lines ignored).
unique_urls = []
with open("unique_urls.txt", "r") as url_file:
    for raw_line in url_file:
        stripped = raw_line.strip()
        if stripped:
            unique_urls.append(stripped)

# Report how many candidate URLs were loaded.
print(f'Unique urls : {len(unique_urls)}')

# A row of the existing CSV counts as "scraped" only when its bodyType is filled in.
prev_result = pd.read_csv('car_data.csv', low_memory=False)
has_body_type = prev_result['bodyType'].notna()
scraped_urls = list(prev_result.loc[has_body_type, 'url'])

print(f'Scraped urls : {len(scraped_urls)}')

Unique urls : 19396
Scraped urls : 19397


In [None]:
# Scrape the detail page and the AutoCheck report of every URL in `unique_urls` that is
# not already in `scraped_urls`. Each car appends one row to car_data.csv and one row
# per carousel image to images.csv.
auto_check = "https://apik.carvana.io/merch/vdp/api/autocheck/v1/get?vehicleId="
failed_url = []  # URLs whose scrape raised or yielded no vehicle details


def _new_chrome_options():
    """Return Chrome options: 1920x1080 window, AutomationControlled Blink feature disabled."""
    chrome_options = Options()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("window-size=1920,1080")
    return chrome_options


def _fetch_soups(car_url, driver_path):
    """Load the car page and its AutoCheck report in a fresh browser.

    Returns a ``(car_soup, auto_check_soup)`` tuple. The browser is always quit,
    even when a page load raises, so failed URLs do not leave Chrome processes behind.
    """
    driver = webdriver.Chrome(service=Service(driver_path), options=_new_chrome_options())
    try:
        driver.get(car_url)
        time.sleep(3)  # fixed pause before reading the page source
        car_soup = BeautifulSoup(driver.page_source, "html.parser")

        # The vehicle id is the last path segment of the URL, whatever its length.
        vehicle_id = car_url.rstrip("/").rsplit("/", 1)[-1]
        driver.get(auto_check + vehicle_id)
        time.sleep(3)
        auto_check_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()
    return car_soup, auto_check_soup


def _find_vehicle_details(car_soup):
    """Return the first non-empty ``vehicleDetails`` dict found in the page's JSON scripts.

    Returns ``None`` when no ``<script type="application/json">`` element carries one.
    """
    key_path = ("props", "pageProps", "forProviders", "forVehicleContext", "vehicleDetails")
    for script in car_soup.find_all("script", type="application/json"):
        try:
            node = json.loads(script.string or "")
        except json.JSONDecodeError:
            continue
        for key in key_path:
            # Tolerate payloads that are not dicts (lists, nulls) anywhere along the path.
            node = node.get(key) if isinstance(node, dict) else None
        if isinstance(node, dict) and node:
            return node
    return None


# A set keeps the "already scraped?" check cheap for long URL lists.
already_scraped = set(scraped_urls)
# The chromedriver path is resolved once, on the first URL that actually needs a browser.
driver_path = None

for car_url in tqdm(unique_urls, total=len(unique_urls), desc="Scraping car details"):
    if car_url in already_scraped:
        continue

    try:
        if driver_path is None:
            driver_path = ChromeDriverManager().install()

        car_soup, auto_check_soup = _fetch_soups(car_url, driver_path)

        vehicle_details = _find_vehicle_details(car_soup)
        if vehicle_details is None:
            # Record the URL as failed instead of appending an empty row to the CSV.
            raise ValueError("no vehicleDetails found in the page's JSON scripts")

        vehicle_details['url'] = car_url
        try:
            owner = auto_check_soup.find("span", class_="box-title-owners").find("span").text
        except AttributeError:
            # NOTE(review): a missing owners element is recorded as "1" - confirm this default.
            owner = "1"
        vehicle_details['owners'] = owner

        # Align the row with the column layout of the existing CSV before appending.
        df = pd.DataFrame([vehicle_details]).reindex(columns=existing_cols, fill_value="")
        with open('car_data.csv', mode='a', newline='', encoding='utf-8') as f:
            df.to_csv(f, index=False, header=False)

        #### scraping the image urls ####
        image_tags = car_soup.find_all("img", {"data-qa": "carousel-item"})
        image_urls = [img['src'] for img in image_tags if 'src' in img.attrs]
        if image_urls:
            image_df = pd.DataFrame([
                {
                    'make': vehicle_details.get('make', ""),
                    'model': vehicle_details.get('model', ""),
                    'image_url': image_url,
                }
                for image_url in image_urls
            ])
            with open('images.csv', mode='a', newline='', encoding='utf-8') as f:
                image_df.to_csv(f, index=False, header=False)

    except Exception as e:
        failed_url.append(car_url)
        # Report the failure without breaking the progress bar.
        tqdm.write(f"Failed to scrape {car_url}: {e!r}")

    # Random pause between cars, once per attempted URL.
    time.sleep(random.uniform(1, 3))

Scraping car details: 100%|██████████| 19396/19396 [5:53:59<00:00,  1.10s/it]  


In [None]:
# new testing

In [10]:
# Small hand-picked sample of vehicle pages used for the trial scrape.
urls_tobe_scraped = [
    'https://www.carvana.com/vehicle/3474709',
    'https://www.carvana.com/vehicle/3557345',
    'https://www.carvana.com/vehicle/3456879',
]

In [37]:
# Trial run of the detail scraper over the short `urls_tobe_scraped` list, with Chrome
# content-setting prefs applied for images, stylesheets and fonts.
auto_check = "https://apik.carvana.io/merch/vdp/api/autocheck/v1/get?vehicleId="
failed_url = []  # URLs whose scrape raised or yielded no vehicle details

# Keys leading from the page's JSON payload down to the vehicle details.
details_path = ("props", "pageProps", "forProviders", "forVehicleContext", "vehicleDetails")
# The chromedriver path is resolved once, on the first URL, instead of once per URL.
driver_path = None

for car_url in tqdm(urls_tobe_scraped, total=len(urls_tobe_scraped), desc="Scraping car details"):
    try:
        options = Options()
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("window-size=1920,1080")
        # NOTE(review): value 2 presumably means "block" for each content type - confirm.
        prefs = {
            "profile.managed_default_content_settings.images": 2,
            "profile.managed_default_content_settings.stylesheets": 2,
            "profile.managed_default_content_settings.fonts": 2,
        }
        options.add_experimental_option("prefs", prefs)

        if driver_path is None:
            driver_path = ChromeDriverManager().install()
        driver = webdriver.Chrome(service=Service(driver_path), options=options)
        try:
            driver.get(car_url)
            time.sleep(3)  # fixed pause before reading the page source
            car_soup = BeautifulSoup(driver.page_source, "html.parser")

            # The vehicle id is the last path segment of the URL, whatever its length.
            vehicle_id = car_url.rstrip("/").rsplit("/", 1)[-1]
            driver.get(auto_check + vehicle_id)
            time.sleep(3)
            auto_check_soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # Always release the browser, even when a page load raises.
            driver.quit()

        # Take the first JSON script that carries a non-empty vehicleDetails dict.
        vehicle_details = None
        for script in car_soup.find_all("script", type="application/json"):
            try:
                node = json.loads(script.string or "")
            except json.JSONDecodeError:
                continue
            for key in details_path:
                # Tolerate payloads that are not dicts (lists, nulls) along the path.
                node = node.get(key) if isinstance(node, dict) else None
            if isinstance(node, dict) and node:
                vehicle_details = node
                break
        if vehicle_details is None:
            # Record the URL as failed instead of appending an empty row to the CSV.
            raise ValueError("no vehicleDetails found in the page's JSON scripts")

        vehicle_details['url'] = car_url
        try:
            owner = auto_check_soup.find("span", class_="box-title-owners").find("span").text
        except AttributeError:
            # NOTE(review): a missing owners element is recorded as "1" - confirm this default.
            owner = "1"
        vehicle_details['owners'] = owner

        # Align the row with the column layout of the existing CSV before appending.
        df = pd.DataFrame([vehicle_details]).reindex(columns=existing_cols, fill_value="")
        with open('car_data.csv', mode='a', newline='', encoding='utf-8') as f:
            df.to_csv(f, index=False, header=False)

        #### scraping the image urls ####
        image_tags = car_soup.find_all("img", {"data-qa": "carousel-item"})
        image_urls = [img['src'] for img in image_tags if 'src' in img.attrs]
        if image_urls:
            image_df = pd.DataFrame([
                {
                    'make': vehicle_details.get('make', ""),
                    'model': vehicle_details.get('model', ""),
                    'image_url': image_url,
                }
                for image_url in image_urls
            ])
            with open('images.csv', mode='a', newline='', encoding='utf-8') as f:
                image_df.to_csv(f, index=False, header=False)

    except Exception as e:
        failed_url.append(car_url)
        # Report the failure without breaking the progress bar.
        tqdm.write(f"Failed to scrape {car_url}: {e!r}")

    # Random pause between cars, once per attempted URL.
    time.sleep(random.uniform(1, 3))

Scraping car details:   0%|          | 0/3 [00:00<?, ?it/s]

Scraping car details:  67%|██████▋   | 2/3 [00:31<00:15, 15.54s/it]


KeyboardInterrupt: 

In [40]:
! conda install lxml


  deprecated.topic(

  deprecated.topic(
Channels:
 - defaults
 - conda-forge
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/miniconda3/envs/carvana

  added / updated specs:
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libxml2-2.13.7             |       h0b34f26_0         614 KB
    libxslt-1.1.41             |       hf4d3faa_0         240 KB
    lxml-5.3.0                 |  py311h1d4350b_1         1.3 MB
    ------------------------------------------------------------
                                           Total:         2.1 MB

The following NEW packages will be INSTALLED:

  icu                pkgs/main/osx-arm64::icu-73.1-h313beb8_0 
  libiconv           pkgs/main/osx-arm64::libiconv-1.16-h80987f9_3 
  libxml2            pkgs/main/osx-arm64::libxml2-2.13.7-h0b34f2

In [34]:
# Number of entries currently held in the module-level `result` list
# (the list the Selenium listing-page cell extends with vehicle URLs).
len(result)

210

In [17]:
import asyncio
import json
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import nest_asyncio

# Apply nest_asyncio's patch before the coroutine below is awaited.
# NOTE(review): presumably needed so the async Playwright API can run inside the
# notebook's already-running event loop - confirm it is still required.
nest_asyncio.apply()

async def scrape_carvana(num_pages=10, base_url="https://www.carvana.com/cars"):
    """Collect vehicle detail-page URLs from consecutive listing pages with Playwright.

    Args:
        num_pages: Maximum number of listing pages to parse, starting at ``base_url``.
        base_url: URL of the first listing page.

    Returns:
        A list of offer URLs taken from the ``Vehicle`` JSON-LD entries of every parsed
        page, in page order. Entries without an offer URL are skipped. Duplicates are
        not removed.

    Raises:
        Any Playwright error raised while launching the browser or loading the first
        page. The browser is closed before the error propagates.
    """
    result = []

    async def block_junk(route):
        # Abort requests for images, stylesheets and fonts; let everything else through.
        if route.request.resource_type in ["image", "stylesheet", "font"]:
            await route.abort()
        else:
            await route.continue_()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                viewport={"width": 100, "height": 50},
                java_script_enabled=True,
            )
            page = await context.new_page()
            await page.route("**/*", block_junk)
            await page.goto(base_url, timeout=6000)

            for i in range(num_pages):
                print(f"--- Page {i+1} ---")

                # Keep jumping to the bottom until the document height stops changing.
                previous_height = -1
                while True:
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(10)  # FAST scroll
                    current_height = await page.evaluate("document.body.scrollHeight")
                    if current_height == previous_height:
                        break
                    previous_height = current_height

                # Vehicles are read from the page's <script type="application/ld+json"> elements.
                html = await page.content()
                soup = BeautifulSoup(html, "html.parser")
                for script in soup.find_all("script", type="application/ld+json"):
                    try:
                        data = json.loads(script.string)
                    except (TypeError, ValueError):
                        # Empty script body (None) or malformed JSON: nothing to extract.
                        continue
                    if not isinstance(data, dict) or data.get("@type") != "Vehicle":
                        continue
                    offers = data.get("offers")
                    link = offers.get("url") if isinstance(offers, dict) else None
                    if link:
                        result.append(link)

                # The last requested page has been parsed; no need to navigate further.
                if i == num_pages - 1:
                    break

                # Click Next
                try:
                    next_btn = await page.wait_for_selector('li[data-qa="next-page"]', timeout=3000)
                    await next_btn.scroll_into_view_if_needed()
                    await next_btn.click()
                    await page.wait_for_timeout(200)  # Allow for next page to load
                except Exception as e:
                    print("Next button not found or not clickable:", e)
                    break
        finally:
            # Close the browser even when navigation or parsing raises.
            await browser.close()

    print(f"Total car links scraped: {len(result)}")
    return result

# Run the scraper with top-level `await` and keep the returned list of URLs in `results`.
results = await scrape_carvana()


--- Page 1 ---
--- Page 2 ---
--- Page 3 ---
--- Page 4 ---
--- Page 5 ---
--- Page 6 ---
--- Page 7 ---
--- Page 8 ---
--- Page 9 ---
--- Page 10 ---
Total car links scraped: 210
