In [1]:
# importing necessary packages
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from webdriver_manager.chrome import ChromeDriverManager

import time
import random

In [2]:
# specific url I want to scrape from
# all make and models of used cars within a 250 mile radius of Iowa City (52242)
url = "https://www.cars.com/shopping/results/?stock_type=used&makes%5B%5D=&models%5B%5D=&maximum_distance=250&zip=52242"

In [3]:
# opening bot operated chrome tab
browser = webdriver.Chrome()
browser.maximize_window()
browser.get(url)

In [4]:
# starting a blank list for just the urls
# we'll loop through and gather the urls first, then visit each url individually to scrape the data from the page
urls = []

In [5]:
# building a random scroll function, as to avoid detection
def random_scroll(browser, total_scroll_time):
    total_height = browser.execute_script("return document.body.scrollHeight")
    scroll_steps = random.randint(3, 10)
    scroll_position = 0
    for step in range(scroll_steps):
        scroll_position += total_height // scroll_steps
        browser.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(random.uniform(1, 4))
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height > total_height:
            total_height = new_height

In [6]:
WebDriverWait(browser, 10).until(
    lambda driver: browser.execute_script("return document.readyState") == "complete"
)

True

In [7]:
# verifying the location of the link on the page
link_element = browser.find_element(By.CLASS_NAME, 'vehicle-card-link')
href = link_element.get_attribute('href')
print("Link:", href)

Link: https://www.cars.com/vehicledetail/cba89f30-b255-4763-aadb-eb4c6b48f44a/?attribution_type=premier


In [8]:
# looping for urls on the individal page. This will be done for multiple pages. For now, I'm just scraping the first page of data
browser.get(url)
link_elements = browser.find_elements(By.CLASS_NAME, 'vehicle-card-link')

# using the [2:] slicer to skip the sponsored listings, as they appear on every page. will lead to duplicates if not 
for link in link_elements[2:]:
    urls.append(link.get_attribute('href'))
    
# testing to make sure it's scraping
for index, link in enumerate(urls):
    print(f"Link {index + 1}: {link}")

Link 1: https://www.cars.com/vehicledetail/f6253a0e-292d-4e65-bd01-c56889b461c0/
Link 2: https://www.cars.com/vehicledetail/f6a1787b-a7a3-4ae9-aa50-0b14264d5c19/
Link 3: https://www.cars.com/vehicledetail/3bf68441-871f-4057-b681-5fd292c813be/
Link 4: https://www.cars.com/vehicledetail/f7b0037b-9500-4e1b-b74e-eef35783b3e5/
Link 5: https://www.cars.com/vehicledetail/03543369-5849-4d75-b877-c06915ba797f/
Link 6: https://www.cars.com/vehicledetail/b5f45fa9-919a-4b88-b140-dd9d5d6349bc/
Link 7: https://www.cars.com/vehicledetail/15ca0c01-ea97-4056-98bb-f29d9ad5bb52/
Link 8: https://www.cars.com/vehicledetail/134151f9-1fde-4496-8794-94db0758d8e3/
Link 9: https://www.cars.com/vehicledetail/f5d5ab4e-fd6f-4d6b-bf14-8c42cdd68b72/
Link 10: https://www.cars.com/vehicledetail/43ee8e9d-766f-47e1-a23e-c3a5702bb12a/
Link 11: https://www.cars.com/vehicledetail/1a0a3961-e276-48a2-99ed-43b3deee2855/
Link 12: https://www.cars.com/vehicledetail/4c802d2b-ba4d-4777-8880-7406eb78b138/
Link 13: https://www.cars

In [9]:
# verifying the links are being added to the list
urls

['https://www.cars.com/vehicledetail/f6253a0e-292d-4e65-bd01-c56889b461c0/',
 'https://www.cars.com/vehicledetail/f6a1787b-a7a3-4ae9-aa50-0b14264d5c19/',
 'https://www.cars.com/vehicledetail/3bf68441-871f-4057-b681-5fd292c813be/',
 'https://www.cars.com/vehicledetail/f7b0037b-9500-4e1b-b74e-eef35783b3e5/',
 'https://www.cars.com/vehicledetail/03543369-5849-4d75-b877-c06915ba797f/',
 'https://www.cars.com/vehicledetail/b5f45fa9-919a-4b88-b140-dd9d5d6349bc/',
 'https://www.cars.com/vehicledetail/15ca0c01-ea97-4056-98bb-f29d9ad5bb52/',
 'https://www.cars.com/vehicledetail/134151f9-1fde-4496-8794-94db0758d8e3/',
 'https://www.cars.com/vehicledetail/f5d5ab4e-fd6f-4d6b-bf14-8c42cdd68b72/',
 'https://www.cars.com/vehicledetail/43ee8e9d-766f-47e1-a23e-c3a5702bb12a/',
 'https://www.cars.com/vehicledetail/1a0a3961-e276-48a2-99ed-43b3deee2855/',
 'https://www.cars.com/vehicledetail/4c802d2b-ba4d-4777-8880-7406eb78b138/',
 'https://www.cars.com/vehicledetail/2800ab3e-8bab-4325-afeb-27e771175785/',

In [10]:
# creating blank lists to fill with scraped data

# the data frame I plan to vertically merge with has year, make, and model as its own columns. I will seperate them mannually later, as the listing 
# title is one entry, and I cannot scrape them seperately
other_lists = {
    'name': [],
    'price': []
}

In [11]:
    # Define blank lists for each type of data
data_lists = {
    'Exterior color': [],
    'Interior color': [],
    'Drivetrain': [],
    'MPG': [],
    'Fuel type': [],
    'Transmission': [],
    'Engine': [],
    'VIN': [],
    'Stock #': [],
    'Mileage': []
}

In [12]:
# for now, I'm only scraping three of the links, many more later in the project
entry_count = 0
for url in urls:
    if entry_count >= 3:
        break 
    else:
        entry_count += 1

    print('-'*70)
    print(f"Visiting {url}")
    print('-'*70)
    browser.get(url)
    time.sleep(5)
    random_scroll(browser, 10)

    # Scraping for car name 
    title_element = browser.find_element(By.XPATH, '//h1[@class="listing-title"]')
    car_name = title_element.text
    print(f"{car_name}")
    other_lists['name'].append(car_name)

    # Scraping price
    price_element = browser.find_element(By.XPATH, '//span[contains(@class, "primary-price")]')
    prices = price_element.text
    print(f"{prices}")
    other_lists['price'].append(prices)
    
    # Scrape other data to data_lists (might filter this further later, as not all the data is necessary)
    data = {}
    section = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.XPATH, '//section[contains(@class, "sds-page-section basics-section")]'))
    )
    dts = section.find_elements(By.TAG_NAME, 'dt')
    dds = section.find_elements(By.TAG_NAME, 'dd')

    for dt, dd in zip(dts, dds):
        data[dt.text.strip()] = dd.text.strip()  # Populate the dictionary
    print("Scraped Data:", data)

    # Populate the lists
    for dt, dd in data.items():
        if dt in data_lists:
            if dd == "-" or not dd.strip():
                dd = np.nan
            data_lists[dt].append(dd)
            print(f"Added {dd} to {dt}")  # Debugging (remove)

    # Confirm populated lists
    print("Data Lists:")
    for key, values in data_lists.items():
        print(f"{key}: {values}")

----------------------------------------------------------------------
Visiting https://www.cars.com/vehicledetail/f6253a0e-292d-4e65-bd01-c56889b461c0/
----------------------------------------------------------------------
2024 Mazda CX-50 2.5 Turbo Premium Plus Package
$37,430
Scraped Data: {'Exterior color': 'Polymetal Gray Metallic', 'Interior color': 'Black w/Brown', 'Drivetrain': 'All-wheel Drive', 'MPG': '23–29', 'Fuel type': 'Gasoline', 'Transmission': 'Automatic', 'Engine': 'SKYACTIV-G 2.5L I-4 gasoline direct injection, DOHC, variable va', 'VIN': '7MMVABEY6RN221468', 'Stock #': '4500911', 'Mileage': '11,626 mi.'}
Added Polymetal Gray Metallic to Exterior color
Added Black w/Brown to Interior color
Added All-wheel Drive to Drivetrain
Added 23–29 to MPG
Added Gasoline to Fuel type
Added Automatic to Transmission
Added SKYACTIV-G 2.5L I-4 gasoline direct injection, DOHC, variable va to Engine
Added 7MMVABEY6RN221468 to VIN
Added 4500911 to Stock #
Added 11,626 mi. to Mileage
Dat

In [13]:
print(data_lists)

{'Exterior color': ['Polymetal Gray Metallic', 'Black', 'Coastal Cream'], 'Interior color': ['Black w/Brown', 'Magma Grey', 'Black'], 'Drivetrain': ['All-wheel Drive', 'All-wheel Drive', 'All-wheel Drive'], 'MPG': ['23–29', '17–24', '26–27'], 'Fuel type': ['Gasoline', 'Gasoline', 'Hybrid'], 'Transmission': ['Automatic', 'Automatic', 'Automatic'], 'Engine': ['SKYACTIV-G 2.5L I-4 gasoline direct injection, DOHC, variable va', '3L V-6 gasoline direct injection, DOHC, variable valve control,', 'Intercooled Turbo Gas/Electric I-4 2.4 L/146'], 'VIN': ['7MMVABEY6RN221468', 'W1N0J6EB3MG003264', '5TDADAB59RS004999'], 'Stock #': ['4500911', '9504500', 'T2U004999'], 'Mileage': ['11,626 mi.', '21,310 mi.', '16,753 mi.']}


In [14]:
print(other_lists)

{'name': ['2024 Mazda CX-50 2.5 Turbo Premium Plus Package', '2021 Mercedes-Benz AMG GLC 43 4MATIC Coupe', '2024 Toyota Grand Highlander Hybrid Limited MAX'], 'price': ['$37,430', '$47,830', '$57,163']}


In [15]:
# merging my two dictionaries together
merged = other_lists | data_lists

In [16]:
merged

{'name': ['2024 Mazda CX-50 2.5 Turbo Premium Plus Package',
  '2021 Mercedes-Benz AMG GLC 43 4MATIC Coupe',
  '2024 Toyota Grand Highlander Hybrid Limited MAX'],
 'price': ['$37,430', '$47,830', '$57,163'],
 'Exterior color': ['Polymetal Gray Metallic', 'Black', 'Coastal Cream'],
 'Interior color': ['Black w/Brown', 'Magma Grey', 'Black'],
 'Drivetrain': ['All-wheel Drive', 'All-wheel Drive', 'All-wheel Drive'],
 'MPG': ['23–29', '17–24', '26–27'],
 'Fuel type': ['Gasoline', 'Gasoline', 'Hybrid'],
 'Transmission': ['Automatic', 'Automatic', 'Automatic'],
 'Engine': ['SKYACTIV-G 2.5L I-4 gasoline direct injection, DOHC, variable va',
  '3L V-6 gasoline direct injection, DOHC, variable valve control,',
  'Intercooled Turbo Gas/Electric I-4 2.4 L/146'],
 'VIN': ['7MMVABEY6RN221468', 'W1N0J6EB3MG003264', '5TDADAB59RS004999'],
 'Stock #': ['4500911', '9504500', 'T2U004999'],
 'Mileage': ['11,626 mi.', '21,310 mi.', '16,753 mi.']}

In [17]:
# converting the dictionaries into a dataframe
cars_df = pd.DataFrame(merged)

In [18]:
cars_df

Unnamed: 0,name,price,Exterior color,Interior color,Drivetrain,MPG,Fuel type,Transmission,Engine,VIN,Stock #,Mileage
0,2024 Mazda CX-50 2.5 Turbo Premium Plus Package,"$37,430",Polymetal Gray Metallic,Black w/Brown,All-wheel Drive,23–29,Gasoline,Automatic,"SKYACTIV-G 2.5L I-4 gasoline direct injection,...",7MMVABEY6RN221468,4500911,"11,626 mi."
1,2021 Mercedes-Benz AMG GLC 43 4MATIC Coupe,"$47,830",Black,Magma Grey,All-wheel Drive,17–24,Gasoline,Automatic,"3L V-6 gasoline direct injection, DOHC, variab...",W1N0J6EB3MG003264,9504500,"21,310 mi."
2,2024 Toyota Grand Highlander Hybrid Limited MAX,"$57,163",Coastal Cream,Black,All-wheel Drive,26–27,Hybrid,Automatic,Intercooled Turbo Gas/Electric I-4 2.4 L/146,5TDADAB59RS004999,T2U004999,"16,753 mi."


In [22]:
# saving to a csv so if I need to redo anything, I don't have to scrape again
cars_df.to_csv("cars_df2.csv")

In [62]:
cars_df2 = pd.read_csv("cars_df2.csv")

In [63]:
# Splitting name into three categories
cars_df2[['model_year', 'brand', 'model']] = cars_df2['name'].str.split(' ',n=2, expand=True)

In [64]:
cars_df2

Unnamed: 0.1,Unnamed: 0,name,price,Exterior color,Interior color,Drivetrain,MPG,Fuel type,Transmission,Engine,VIN,Stock #,Mileage,model_year,brand,model
0,0,2024 Mazda CX-50 2.5 Turbo Premium Plus Package,"$37,430",Polymetal Gray Metallic,Black w/Brown,All-wheel Drive,23–29,Gasoline,Automatic,"SKYACTIV-G 2.5L I-4 gasoline direct injection,...",7MMVABEY6RN221468,4500911,"11,626 mi.",2024,Mazda,CX-50 2.5 Turbo Premium Plus Package
1,1,2021 Mercedes-Benz AMG GLC 43 4MATIC Coupe,"$47,830",Black,Magma Grey,All-wheel Drive,17–24,Gasoline,Automatic,"3L V-6 gasoline direct injection, DOHC, variab...",W1N0J6EB3MG003264,9504500,"21,310 mi.",2021,Mercedes-Benz,AMG GLC 43 4MATIC Coupe
2,2,2024 Toyota Grand Highlander Hybrid Limited MAX,"$57,163",Coastal Cream,Black,All-wheel Drive,26–27,Hybrid,Automatic,Intercooled Turbo Gas/Electric I-4 2.4 L/146,5TDADAB59RS004999,T2U004999,"16,753 mi.",2024,Toyota,Grand Highlander Hybrid Limited MAX


In [65]:
# renaming columns to match the case and rules of the other dataframe
cars_df2 = cars_df2.rename(columns={
    'Exterior color': 'ext_col',
    'Interior color': 'int_col',
    'Fuel type': 'fuel_type',
    'Transmission': 'transmission', 
    'Mileage': 'mileage',
    'Engine': 'engine'
})

In [55]:
cars_df2

Unnamed: 0.1,Unnamed: 0,name,price,ext_col,int_col,Drivetrain,MPG,fuel_type,transmission,engine,VIN,Stock #,mileage,model_year,brand,model
0,0,2024 Mazda CX-50 2.5 Turbo Premium Plus Package,"$37,430",Polymetal Gray Metallic,Black w/Brown,All-wheel Drive,23–29,Gasoline,Automatic,"SKYACTIV-G 2.5L I-4 gasoline direct injection,...",7MMVABEY6RN221468,4500911,"11,626 mi.",2024,Mazda,CX-50 2.5 Turbo Premium Plus Package
1,1,2021 Mercedes-Benz AMG GLC 43 4MATIC Coupe,"$47,830",Black,Magma Grey,All-wheel Drive,17–24,Gasoline,Automatic,"3L V-6 gasoline direct injection, DOHC, variab...",W1N0J6EB3MG003264,9504500,"21,310 mi.",2021,Mercedes-Benz,AMG GLC 43 4MATIC Coupe
2,2,2024 Toyota Grand Highlander Hybrid Limited MAX,"$57,163",Coastal Cream,Black,All-wheel Drive,26–27,Hybrid,Automatic,Intercooled Turbo Gas/Electric I-4 2.4 L/146,5TDADAB59RS004999,T2U004999,"16,753 mi.",2024,Toyota,Grand Highlander Hybrid Limited MAX


### Changing datatypes to be able to do calculations later

In [66]:
cars_df2['price'] = cars_df2['price'].str.replace("$","").str.replace(",","")
cars_df2['price'] = cars_df2['price'].astype("int")

In [67]:
cars_df2['mileage'] = cars_df2['mileage'].str.replace("mi.","").str.replace(",","")
cars_df2['mileage'] = cars_df2['mileage'].astype("int")

In [69]:
cars_df2.drop(['name', 'Stock #'], axis=1, inplace=True)

In [70]:
cars_df2

Unnamed: 0.1,Unnamed: 0,price,ext_col,int_col,Drivetrain,MPG,fuel_type,transmission,engine,VIN,mileage,model_year,brand,model
0,0,37430,Polymetal Gray Metallic,Black w/Brown,All-wheel Drive,23–29,Gasoline,Automatic,"SKYACTIV-G 2.5L I-4 gasoline direct injection,...",7MMVABEY6RN221468,11626,2024,Mazda,CX-50 2.5 Turbo Premium Plus Package
1,1,47830,Black,Magma Grey,All-wheel Drive,17–24,Gasoline,Automatic,"3L V-6 gasoline direct injection, DOHC, variab...",W1N0J6EB3MG003264,21310,2021,Mercedes-Benz,AMG GLC 43 4MATIC Coupe
2,2,57163,Coastal Cream,Black,All-wheel Drive,26–27,Hybrid,Automatic,Intercooled Turbo Gas/Electric I-4 2.4 L/146,5TDADAB59RS004999,16753,2024,Toyota,Grand Highlander Hybrid Limited MAX


In [71]:
cars_df2.to_csv("cars_scraped_cleaned.csv")