## Importing necessary libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

## Getting individual car links from all pages

In [None]:
# Set up the Chrome browser (headless mode is currently disabled)
options = Options()
# Headless mode is left off on purpose; uncomment the next line to run without a window.
#options.add_argument("--headless=new")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("window-size=1920,1080")

driver = webdriver.Chrome(options=options)

url = "https://www.carvana.com/cars"
wait = WebDriverWait(driver, 10)

# Listing URLs collected across all visited result pages.
result = []

# Upper bound on result pages to visit; the loop stops earlier when no
# clickable "next page" control is found.
NUM_PAGES = 2014

# try/finally guarantees the browser is closed even if a page load or a
# parsing step raises part-way through the crawl.
try:
    driver.get(url)
    time.sleep(5)  # fixed pause to let the first page render

    for i in range(NUM_PAGES):
        print(f"--- Page {i+1} ---")

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Each listing is described by a <script type="application/ld+json"> tag.
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                # script.string is None for empty/multi-node tags; "" makes that
                # case fail as a normal JSON decode error instead of a TypeError.
                data = json.loads(script.string or "")
            except json.JSONDecodeError:
                continue

            # ld+json may also hold lists or non-vehicle objects; skip those.
            if not isinstance(data, dict) or data.get("@type") != "Vehicle":
                continue

            offers = data.get("offers")
            link = offers.get("url") if isinstance(offers, dict) else None
            # Only keep real links; a vehicle without an offer URL is skipped
            # rather than recorded as an empty string.
            if link:
                result.append(link)

        try:
            next_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'li[data-qa="next-page"]'))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            # Click through JavaScript rather than next_button.click().
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)  # NOTE(review): fixed pause; assumes the next page renders within 2s
        except Exception as e:
            # Best-effort pagination: any failure to find/click "next" ends the crawl.
            print("Next button not found or not clickable:", e)
            break
finally:
    driver.quit()


The chromedriver version (134.0.6998.165) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (135.0.7049.96); currently, chromedriver 135.0.7049.95 is recommended for chrome 135.*, so it is advised to delete the driver in PATH and retry


--- Page 1 ---


## Scraping individual car data

In [7]:
import random 
import pandas as pd
from tqdm import tqdm

In [8]:
# Load the expected column names: one name per non-blank line of columns.txt.
with open("columns.txt", "r") as file:
    existing_cols = [name for name in map(str.strip, file) if name]

In [9]:
# Load the listing URLs (one per line), ignoring blank lines.
with open("unique_urls.txt", "r") as file:
    unique_urls = [stripped for stripped in map(str.strip, file) if stripped]

# Report how many URLs were loaded.
print(f'Unique urls : {len(unique_urls)}')

# Rows already written to car_data.csv count as scraped only when their
# bodyType field is populated.
prev_result = pd.read_csv('car_data.csv', low_memory=False)
has_body_type = prev_result['bodyType'].notna()
scraped_urls = prev_result.loc[has_body_type, 'url'].tolist()

print(f'Scraped urls : {len(scraped_urls)}')

Unique urls : 19396
Scraped urls : 19397


In [None]:
# Scrape the detail page and the AutoCheck report for every listing URL that
# does not yet have a populated row in car_data.csv. Each URL gets its own
# (non-headless) Chrome session, which is always closed again.
auto_check = "https://apik.carvana.io/merch/vdp/api/autocheck/v1/get?vehicleId="
failed_url = []

# Set membership is O(1); testing against the list re-scans it for every URL.
already_scraped = set(scraped_urls)

# Resolved on first use and then reused, so the driver manager is not
# invoked once per URL (and not at all when there is nothing to scrape).
chromedriver_path = None


def _extract_vehicle_details(soup):
    """Return the first non-empty ``vehicleDetails`` dict found in the page's
    ``<script type="application/json">`` tags, or ``None`` if there is none."""
    key_path = ("props", "pageProps", "forProviders", "forVehicleContext", "vehicleDetails")
    for script in soup.find_all("script", type="application/json"):
        try:
            # script.string may be None; "" turns that into a JSON decode error.
            node = json.loads(script.string or "")
        except json.JSONDecodeError:
            continue
        for key in key_path:
            # Tolerate lists / nulls anywhere along the path.
            node = node.get(key) if isinstance(node, dict) else None
        if isinstance(node, dict) and node:
            return node
    return None


for car_url in tqdm(unique_urls, desc="Scraping car details"):
    if car_url in already_scraped:
        continue

    try:
        if chromedriver_path is None:
            chromedriver_path = ChromeDriverManager().install()

        options = Options()
        # Headless mode is left off on purpose; uncomment to run without a window.
        #options.add_argument("--headless=new")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("window-size=1920,1080")

        driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
        try:
            driver.get(car_url)
            time.sleep(3)
            car_soup = BeautifulSoup(driver.page_source, "html.parser")

            # NOTE(review): assumes the vehicle id is exactly the last 7
            # characters of the listing URL - confirm against the URL format.
            auto_check_url = auto_check + car_url[-7:]
            driver.get(auto_check_url)
            time.sleep(3)
            auto_check_soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # Always release the browser, even when a page load fails.
            driver.quit()

        vehicle_details = _extract_vehicle_details(car_soup)
        if vehicle_details is None:
            # Do not write a near-empty row; record the URL as failed instead.
            raise ValueError("vehicleDetails not found in page JSON")

        image_tags = car_soup.find_all("img", {"data-qa": "carousel-item"})
        image_urls = [img['src'] for img in image_tags if 'src' in img.attrs]

        # Owner count from the AutoCheck page; falls back to "1" when the
        # element is missing.
        # NOTE(review): the fallback also applies when the report failed to load.
        owner_box = auto_check_soup.find("span", class_="box-title-owners")
        owner_span = owner_box.find("span") if owner_box is not None else None
        owner = owner_span.text if owner_span is not None else "1"

        vehicle_details['url'] = car_url
        vehicle_details['owners'] = owner

        # Align to the CSV's column layout before appending without a header.
        df = pd.DataFrame([vehicle_details]).reindex(columns=existing_cols, fill_value="")
        with open('car_data.csv', mode='a', newline='', encoding='utf-8') as f:
            df.to_csv(f, index=False, header=False)

        #### scraping the image urls ####
        if image_urls:
            image_df = pd.DataFrame(
                [
                    {'make': df.loc[0, 'make'], 'model': df.loc[0, 'model'], 'image_url': image_url}
                    for image_url in image_urls
                ]
            )
            with open('images.csv', mode='a', newline='', encoding='utf-8') as f:
                image_df.to_csv(f, index=False, header=False)

        # Guard against the same URL appearing twice in unique_urls.
        already_scraped.add(car_url)

    except Exception as e:
        # Best-effort crawl: remember the URL and why it failed, then move on.
        failed_url.append(car_url)
        tqdm.write(f"Failed to scrape {car_url}: {e!r}")

    # One randomized pause per attempted URL to pace requests.
    time.sleep(random.uniform(1, 3))

Scraping car details: 100%|██████████| 19396/19396 [5:53:59<00:00,  1.10s/it]  
