## To Do
1. Create a load-from-CSV function
2. Create an algorithm that pulls any data newer than the latest entry in the CSV
3. Deal with errored data
4. Figure out how to pull new data either daily or weekly - using Flask perhaps?
5. Use Plotly Dash to create web app to analyze data

In [1]:
'''
Program to pull auction data from BringaTrailer.com

Created by Tropskee on 10/29/2022
Est. Completion Time: 8 hours
'''

import requests
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager

# base_url = "https://bringatrailer.com/"
# subaru_models = ["subaru/wrx-sti/"]

def get_make_and_model(make_and_model):
    '''
    Split a URL subdirectory (e.g. "subaru/wrx-sti/") into vehicle make and model

    :input make_and_model str: URL subdirectory of the form "<make>/<model>/..."

    :return tuple: (make, model) - the first two "/"-separated segments
    '''
    segments = make_and_model.split('/')
    make = segments[0]
    model = segments[1]  # IndexError if the input contains no "/"
    return make, model
    
def get_url_of_all_models():
    '''
    Get the URL of every vehicle model overview page listed on the models index

    :return models_url_list list: href of every "previous-listing-image-link"
        anchor found on https://bringatrailer.com/models/

    :raises requests.exceptions.RequestException: if the index page cannot be downloaded
    '''
    url_all_models = 'https://bringatrailer.com/models/'

    try:
        html = requests.get(url_all_models)
    except requests.exceptions.RequestException as e:
        print(e)
        print('Models html data not acquired')
        # Without a response there is nothing to parse. Re-raise the real
        # network error instead of falling through to a NameError on `html`.
        raise

    # Anchors of interest look like:
    # <a class="previous-listing-image-link" href="https://bringatrailer.com/acoma/">
    soup = BeautifulSoup(html.content, 'html.parser')
    found_model_urls = soup.find_all("a", href=True, class_="previous-listing-image-link")
    models_url_list = [url["href"] for url in found_model_urls]
    return models_url_list

    
def get_model_html(model_subd_url):
    '''
    Get html of overlying vehicle model webpage

    Opens the page in a Selenium-driven Safari session and keeps clicking the
    "show more" button until it can no longer be found or clicked, so that the
    returned page contains every previously auctioned vehicle that was loaded.

    :input model_subd_url str: Url of vehicle model overview page

    :return model_html BeautifulSoup: Parsed html of vehicle model overview page

    :raises RuntimeError: if the Safari webdriver cannot be started
    '''
    try:
        driver = webdriver.Safari()
    except Exception as err:
        # Previously swallowed, which only surfaced later as a NameError on
        # `driver`. To use another browser, replace the line above with e.g.
        # webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
        # (Edge and IE have equivalent Service / webdriver_manager classes).
        raise RuntimeError('Could not start the Safari webdriver') from err

    # Selector used for the main auctions page instead:
    # 'body > main > div:nth-child(6) > div > div > div > div.overlayable > div.auctions-footer.auctions-footer-previous > button'
    button_selector = 'body > main > div.container > div > div > div.filter-group > div.overlayable > div.auctions-footer.auctions-footer-previous > button'

    try:
        driver.get(model_subd_url)

        # Click 'show more' repeatedly to get all previously auctioned vehicle URL's.
        # The loop ends through an exception once the button is gone.
        try:
            while driver.find_element(By.CSS_SELECTOR, button_selector):
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, button_selector))).click()
                time.sleep(1)  # Needed for page to load and "show more" button clicked
        except Exception as e:
            print(e)
            print('No more vehicles to show')

        html = driver.page_source
    finally:
        # Always release the browser, even if loading the page failed
        driver.quit()

    # Name the parser explicitly, matching the other html helpers in this file
    model_html = BeautifulSoup(html, 'html.parser')

    return model_html

def get_vehicle_html(model_url):
    '''
    Get page html of (1) vehicle's auction data

    :input model_url str: Url of a single vehicle auction listing

    :return soup BeautifulSoup: Parsed html of the listing page

    :raises requests.exceptions.RequestException: if the page cannot be downloaded
    '''
    try:
        html = requests.get(model_url)
    except requests.exceptions.RequestException as e:
        print(e)
        print('Model HTML data not acquired')
        # Re-raise the real network error; falling through would only fail
        # later with a NameError on `html`.
        raise

    # Pull html data
    soup = BeautifulSoup(html.content, 'html.parser')

    return soup

def get_model_urls(model_html):
    '''
    Get all urls of previously auctioned vehicles pertaining to a specific model

    :input model_html BeautifulSoup: Parsed html of a vehicle model overview page

    :return urls list: Unique hrefs, in first-seen order, of every link inside
        the first <div class="blocks"> of the page

    :raises IndexError: if the page contains no <div class="blocks">
    '''
    get_models = model_html.find_all("div", class_="blocks")
    anchors = get_models[0].find_all("a", href=True)

    # dict.fromkeys drops duplicates in one pass while keeping first-seen order
    urls = list(dict.fromkeys(a["href"] for a in anchors))

    print(f"Found {len(urls)} vehicle auction URL's.")
    return urls


# def get_sale_date(soup):
#     '''
#     Get vehicle auction date
#     '''
#     # date_html = soup.find_all('span', class_ = "data-value")
#     date_html = soup.find_all('span', class_ = "date")
#     # print(date_html[0].text.split()[1])
#     sold_date = date_html[0].text.split()[1]
    
#     return sold_date

def get_auction_result(soup):
    '''
    Return the text of the first <span class="info-value"> on the listing page,
    ex. "Sold for $22,250 on 10/27/22"
    '''
    return soup.find("span", class_='info-value').text
    

# def get_price(soup, sold_bool):
#     '''
#     Get vehicle sale price or max bid if not sold
#     '''
#     if sold_bool:
#         for price in soup.find(class_ = "data-value price"):
#             car_price_str = re.findall('[0-9]+,[0-9]+', price)
#             return int(car_price_str[0].replace(",",""))
#     else:
#         for price in soup.find(class_ = "data-value"):
#             car_price_str = re.findall('[0-9]+,[0-9]+', price)
#             return int(car_price_str[0].replace(",",""))


def get_listing_details(soup):
    '''
    Return every <div class="item"> on the listing page; the caller picks the
    one holding the vehicle listing details (vin, miles, etc.)
    '''
    return soup.find_all("div", class_="item")

def get_model_year(soup):
    '''
    Get vehicle model year candidates from the listing title

    :input soup BeautifulSoup: Parsed html of a vehicle listing page

    :return model_year list: Every run of four digits found in the text of
        <h1 class="post-title">, in order of appearance (empty if none)
    '''
    model_year_text = soup.find("h1", class_='post-title').text
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    model_year = re.findall(r"(\d{4})", model_year_text)
    return model_year
    

# def get_vin(soup):
#     '''
#     Get vehicle VIN
#     '''
#     # Get VIN
#     for vin in soup.find("ul", class_="listing-essentials-items").find_all("li")[3]:
#         return vin
     
    
# def get_miles(soup):
#     '''
#     Get vehicle miles
#     '''
#     for miles in soup.find("ul",class_="listing-essentials-items").find_all("li")[4]:
#         return miles
            

# def main():
#     soup = get_html(models[0])
#     model_urls = get_model_urls(soup)
#     print(model_urls)
    
# main()

In [41]:
# Get URLs directing to every model's page
model_urls = get_url_of_all_models()
print(f'{len(model_urls)} models found')

# Bookkeeping lists filled in by the per-model crawl:
# model pages finished, model pages that failed, and every vehicle URL found
completed_historic_auction_vehicle_urls = []
uncollected_errored_urls = []
all_historic_auction_vehicle_urls = []

894 models found


In [42]:
# Visit every model overview page and gather the vehicle auction URLs on it
for url in model_urls:
    # Skip pages already handled so this cell can be re-run after an interruption
    if url in completed_historic_auction_vehicle_urls or url in uncollected_errored_urls:
        continue

    try:
        soup = get_model_html(url)
        indv_url_lst = get_model_urls(soup)
    except Exception as e:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the crawl instead of being logged as a failed page
        uncollected_errored_urls.append(url)
        print(f'{url} not collected due to error')
        print(f'{type(e).__name__}: {e}')
        continue

    all_historic_auction_vehicle_urls.extend(indv_url_lst)
    completed_historic_auction_vehicle_urls.append(url)

print(all_historic_auction_vehicle_urls[:5])

Message: 

No more vehicles to show
Found 3 vehicle auction URL's.
Message: 

No more vehicles to show
Found 79 vehicle auction URL's.
Message: 

No more vehicles to show
Found 116 vehicle auction URL's.
Message: 

No more vehicles to show
Found 27 vehicle auction URL's.
Message: 

No more vehicles to show
Found 64 vehicle auction URL's.
Message: 

No more vehicles to show
Found 18 vehicle auction URL's.
Message: 

No more vehicles to show
Found 26 vehicle auction URL's.
Message: 

No more vehicles to show
Found 89 vehicle auction URL's.
Message: 

No more vehicles to show
Found 380 vehicle auction URL's.
Message: 

No more vehicles to show
Found 32 vehicle auction URL's.
Message: 

No more vehicles to show
Found 25 vehicle auction URL's.
Message: 

No more vehicles to show
Found 62 vehicle auction URL's.
Message: 

No more vehicles to show
Found 32 vehicle auction URL's.
Message: 

No more vehicles to show
Found 183 vehicle auction URL's.
Message: 

No more vehicles to show
Found 4 ve

## SAVE ALL MODEL & VEHICLE URLS

In [57]:
# auction_results_url = "auctions/results/"
# soup = get_model_html(auction_results_url)
# model_urls = get_model_urls(soup)

import os
from datetime import datetime

import pandas as pd  # imported here because this cell precedes the later pandas import

now = datetime.now()
# Month abbreviation, day, year, hour, minute and second
day = now.strftime("%b_%d_%Y_%H_%M_%S")

# to_csv does not create missing folders, so make sure both targets exist
os.makedirs('./src/urls/all_vehicle_urls', exist_ok=True)
os.makedirs('./src/urls/model_overview_urls', exist_ok=True)

# Capture all vehicle urls
pd_all_urls = pd.DataFrame({"URL": all_historic_auction_vehicle_urls})
filepath_vehicle_urls = f'./src/urls/all_vehicle_urls/all_vehicle_urls_{day}.csv' # Save version for backup
pd_all_urls.to_csv(filepath_vehicle_urls, index=True)
pd_all_urls.to_csv('./src/urls/all_vehicle_urls/all_vehicle_urls.csv', index=True)


# Capture all models urls
pd_model_urls = pd.DataFrame({"URL": model_urls})
filepath_model_urls = f'./src/urls/model_overview_urls/model_vehicle_urls_{day}.csv' # Save version for backup
pd_model_urls.to_csv(filepath_model_urls, index=True)
pd_model_urls.to_csv('./src/urls/model_overview_urls/model_vehicle_urls.csv', index=True)


## START COLLECTING INDIVIDUAL VEHICLE DATA FROM URLS

In [2]:
import pandas as pd

# Pandas DF column names
COLUMNS = [
    'Make',
    'Model',
    'Year',
    'Kilometers',
    'Miles',
    'Mileage Notes',
    'Sale_Status',
    'Final Bid Price',
    'Auction Date',
    'VIN',
    'Details',
    'URL',
]

# Start from an empty DataFrame
pd_db = pd.DataFrame()

## READ URL DATA FROM CSV

In [3]:
# Load the vehicle and model URL tables from their CSV files
vehicle_urls_csv = './src/urls/all_vehicle_urls/all_vehicle_urls.csv'
model_urls_csv = './src/urls/model_overview_urls/model_vehicle_urls.csv'

pd_all_vehicle_urls = pd.read_csv(vehicle_urls_csv)
pd_all_model_urls = pd.read_csv(model_urls_csv)

print(pd_all_vehicle_urls["URL"])

0        https://bringatrailer.com/listing/1975-acoma-m...
1        https://bringatrailer.com/listing/1975-acoma-m...
2        https://bringatrailer.com/listing/1975-acoma-m...
3        https://bringatrailer.com/listing/1997-acura-i...
4        https://bringatrailer.com/listing/1995-acura-i...
                               ...                        
79979    https://bringatrailer.com/listing/2011-zimmer-...
79980    https://bringatrailer.com/listing/1986-zimmer-...
79981    https://bringatrailer.com/listing/1987-zimmer-...
79982    https://bringatrailer.com/listing/1982-ford-zi...
79983    https://bringatrailer.com/listing/1987-zimmer-...
Name: URL, Length: 79984, dtype: object


## GET INDIVIDUAL VEHICLE DATA FROM URLS

In [4]:
import os.path

import pandas as pd

COLUMNS = ['Make', 'Model', 'Year', 'Kilometers', 'Miles', 'Mileage Notes', 'Sale_Status', 'Final Bid Price', 'Auction Date', 'VIN', 'Details', 'URL']

# Filepaths to the csv files this cell reads from / appends to
filepath = r'./src/vehicle_data/BAT_auction_data.csv'
error_filepath = r'./src/vehicle_data/BAT_auction_data_error.csv'

# Keywords that mark a listing detail as the odometer entry and give its unit
MILEAGE_WORDS = ['Miles', 'miles', 'Mile', 'mile']
KILOMETER_WORDS = ['Kilometers', 'kilometers', 'Kilometer', 'kilometer', 'KM', 'km']


def _load_logged_urls(csv_path):
    '''Return the set of URLs already recorded in csv_path (empty set if the file is missing)'''
    if not os.path.exists(csv_path):
        return set()
    return set(pd.read_csv(csv_path)["URL"].dropna())


def _append_to_csv(frame, csv_path):
    '''Append frame to csv_path, writing the header row only when the file is first created'''
    frame.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path))


def _find_listing_details(item_divs):
    '''Return the first item whose <strong> heading contains "Listing Details"'''
    for item in item_divs:
        heading = item.find("strong")
        if heading and "Listing Details" in heading:
            return item
    raise ValueError('"Listing Details" section not found')


def _parse_mileage(results):
    '''
    Find the odometer entry among the listing details and convert it

    :return tuple: (kilometers, miles, mileage_notes); all None when no entry
        mentions miles/kilometers, figures None when the entry has no number
    '''
    notes = None
    is_mileage_units = True
    for result in results:
        result = result.replace("-", " ")
        if any(word in result for word in KILOMETER_WORDS):
            notes, is_mileage_units = result, False
            break
        if any(word in result for word in MILEAGE_WORDS):
            notes = result
            break
    if notes is None:
        return None, None, None

    # Try the formats from most to least specific: "47,000", then "47k", then "470"
    matches = re.findall('[0-9]+,[0-9]+', notes)
    if not matches:
        matches = [m.strip('k').strip('K') + ',000' for m in re.findall('[0-9]+[kK]', notes)]
    if not matches:
        matches = re.findall('[0-9]+', notes)
    if not matches:
        return None, None, notes

    figure = int(matches[0].replace(',', ''))
    if is_mileage_units:  # figure is in miles
        return int(1.60934 * figure), figure, notes
    # figure is in kilometers
    return figure, int(0.621371 * figure), notes


def _parse_price(auction_result_str):
    '''
    Extract the bid/sale price from text such as "Sold for $22,250 on 10/27/22"

    Reads the whole number after the dollar sign, so prices below $1,000 and
    of $1,000,000 or more are captured in full (a single "digits,digits" match
    would turn "$1,250,000" into 1250).
    '''
    match = re.search(r'\$\s*([0-9][0-9,]*)', auction_result_str)
    if match is None:
        # No dollar sign: fall back to the first comma-grouped number
        match = re.search(r'([0-9]{1,3}(?:,[0-9]{3})+)', auction_result_str)
    if match is None:
        raise ValueError(f'No price found in {auction_result_str!r}')
    return int(match.group(1).replace(',', ''))


def _scrape_vehicle(model_url):
    '''Download one vehicle listing and return its values in COLUMNS order'''
    # Get vehicle html
    vehicle_data_soup = get_vehicle_html(model_url)

    # Parse html and get auction data
    auction_result_str = get_auction_result(vehicle_data_soup)

    # Get Make & Model from the first two "group-title" buttons
    group_titles = [s.text for s in vehicle_data_soup.find_all("button", class_='group-title')]
    vehicle_make, vehicle_model = group_titles[0:2]
    vehicle_make = vehicle_make.replace('Make', '').strip(' ')
    vehicle_model = vehicle_model.replace('Model', '').replace(vehicle_make, '').strip(' ')

    # Get listing details - vin, miles, etc. - one string per html "li"
    listing_details = _find_listing_details(get_listing_details(vehicle_data_soup))
    results = [detail.text for detail in listing_details.find_all('li')]

    vehicle_kilometers, vehicle_mileage, vehicle_mileage_notes = _parse_mileage(results)

    # Get vehicle model year
    model_year = get_model_year(vehicle_data_soup)

    # Check sale status - i.e., sold or not
    is_sold = "Sold" in auction_result_str or "sold" in auction_result_str
    sale_status = "Sold" if is_sold else "Not Sold"

    # Sale date is the last token of the auction result text
    sale_date = auction_result_str.split()[-1]
    sale_price = _parse_price(auction_result_str)

    # VIN is taken as the last token of the first listing detail
    vehicle_vin = results[0].split()[-1]

    # Combine results into 1 string
    joined_results = " ,".join(results)

    return [vehicle_make, vehicle_model, model_year[0], vehicle_kilometers, vehicle_mileage, vehicle_mileage_notes, sale_status, sale_price, sale_date, vehicle_vin, joined_results, model_url]


# Read both logs once and keep them in sets: exact-match lookups, and no
# re-reading of the growing csv files on every iteration
completed_urls = _load_logged_urls(filepath)
error_urls = _load_logged_urls(error_filepath)
total_vehicles = len(pd_all_vehicle_urls["URL"])

# Loop through all vehicle urls, get listing data, and append it to the csv
for idx, model_url in enumerate(pd_all_vehicle_urls["URL"]):
    # Skip vehicles already collected or already logged as errored
    if model_url in completed_urls or model_url in error_urls:
        continue

    try:
        row = _scrape_vehicle(model_url)

        # One-row frame indexed by auction date (DataFrame.append no longer exists in pandas 2.x)
        pd_db = pd.DataFrame([row], columns=COLUMNS).set_index('Auction Date')
        _append_to_csv(pd_db, filepath)
        completed_urls.add(model_url)

        # Print out every 100 completed vehicles
        if idx % 100 == 0:
            print(f'---Completed {idx} out of {total_vehicles} vehicles---')

    except Exception as e:
        # Log the url so it is not retried on the next run
        error_db = pd.DataFrame({'URL': [model_url]}).set_index('URL')
        _append_to_csv(error_db, error_filepath)
        error_urls.add(model_url)
        print(f'URL data not collected for {model_url}')
        print(f'{type(e).__name__}: {e}')

        
# pd_db.reset_index()
# pd_db.columns = COLUMNS
# pd_db.set_index('Auction Date', inplace=True)
# pd_db.head()



In [5]:
# Print how many rows pd_db holds, then show its last 20 rows
row_count = len(pd_db)
print(row_count)
pd_db.tail(20)

0


In [120]:
from datetime import datetime

# Timestamp: month abbreviation, day, year, hour, minute and second
now = datetime.now()
day = f"{now:%b_%d_%Y_%H_%M_%S}"

# Write pd_db to a timestamped csv file
# NOTE(review): this path is rooted at ./Desktop/BAT_Analytics/ while the other
# cells use ./src/... - confirm the intended working directory
filepath = f'./Desktop/BAT_Analytics/src/vehicle_data/BAT_auction_data_{day}.csv'
pd_db.to_csv(filepath, index=True)