In [2]:
from bs4 import BeautifulSoup
import re
import numpy as np
import datetime
import requests
import json
from bs4 import BeautifulSoup, SoupStrainer
from sklearn.pipeline import Pipeline
import pandas as pd
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import requests
import time
import lxml
import cchardet

# Sample listing page used as the benchmark input by the cells below.
url = "https://bringatrailer.com/listing/2016-ford-mustang-shelby-gt350-18/"
# NOTE: despite its name, `html` holds the object returned by requests.get();
# later cells read the page bytes from it via `html.content`.
html = requests.get(url)
# Keys of the scraped vehicle dict that are copied, in this order, into the
# DataFrame handed to model_pipeline().
VEHICLE_COLUMNS = ['make', 'model', 'year', 'miles', 'color', 'auction_year', 'engine_size', 'cylinders']

def model_pipeline(df):
    '''
    Run incoming vehicle data through the saved preprocessor and XGBoost model

    :input df: Data passed straight to the pipeline's predict()

    :return: Whatever Pipeline.predict(df) returns for df
    '''
    # Restore the regressor from its saved model file
    regressor = xgb.XGBRegressor()
    regressor.load_model('./api/models/xgb_model.h5')

    # Restore the preprocessing step that was persisted with joblib
    fitted_preprocessor = joblib.load('./api/models/preprocessor.joblib')

    # Chain preprocessing and model so predict() applies both in sequence
    steps = [
        ('preprocessor', fitted_preprocessor),
        ('model', regressor),
    ]
    prediction_pipeline = Pipeline(steps=steps)

    return prediction_pipeline.predict(df)

### Auction year and price ~ 0.01s optimization

In [3]:
def _parse_dollar_amount(text):
    '''
    Pull the first price out of a string ie. "Sold for USD $53,500 on 10/28/22" -> 53500

    :input text str: Text that contains a price

    :return amount int: Price in whole dollars
    :raises ValueError: If the text holds no recognisable price
    '''
    # Comma-grouped figure with any number of groups: 53,500 or 1,250,000
    grouped = re.search(r'[0-9]+(?:,[0-9]+)+', text)
    if grouped:
        return int(grouped.group(0).replace(",", ""))

    # Amounts below 1,000 carry no comma; anchor on "$" so the sale date
    # (10/28/22) can never be mistaken for the price
    small = re.search(r'\$\s*([0-9]+)', text)
    if small:
        return int(small.group(1))

    raise ValueError("no price found in %r" % (text,))


def get_auction_year_and_price(vehicle_data_soup):
    '''
    Get auction year and price from vehicle html data

    :input vehicle_data_soup str: Vehicle html data loaded by BeautifulSoup

    :return auction_year datetime.year: Auction year ie. 2022
    :return sale_price int: Sale price in $ (1 when it could not be read)
    :return live bool: Whether the auction is currently live or finished
    '''
    current_year = datetime.datetime.now().year
    try:
        # The countdown element is only looked for to decide live vs. finished
        countdown = vehicle_data_soup.find("span", class_='listing-available-countdown')

        if countdown:
            # Live auction: it is happening now, price is the current bid
            auction_year = current_year
            current_bid_text = vehicle_data_soup.find("strong", class_='info-value').text
            sale_price = _parse_dollar_amount(current_bid_text)
            live = True
        else:
            # Finished auction: the result text ends with the sale date
            auction_result_string = vehicle_data_soup.find("span", class_='info-value').text
            sale_date = auction_result_string.split()[-1]
            auction_year = datetime.datetime.strptime(sale_date, '%m/%d/%y').year
            sale_price = _parse_dollar_amount(auction_result_string)
            live = False
    except Exception:
        # Best effort: an unreadable page yields placeholder values.
        # `live` must be the bool False here - the string "False" is truthy.
        auction_year = current_year
        sale_price = 1
        live = False

    return auction_year, sale_price, live

In [4]:
# a = 0
# for i in range(10):
#     start = time.time()
#     # strainer1 = SoupStrainer(class_='listing-available-countdown')
#     strainer1 = SoupStrainer(["span", "strong"])


#     auction_year, sale_price, live = get_auction_year_and_price(BeautifulSoup(html.content, 'html.parser', parse_only=strainer1))
#     # auction_year, sale_price, live = get_auction_year_and_price(BeautifulSoup(html.content, 'html.parser'))


#     print(auction_year, sale_price, live)
#     a += time.time()-start
#     print('It took', time.time()-start, 'seconds.')
# print("\nAverage time taken:", a/10)

### Make and Model ~ 0.135s optimization

In [5]:
def get_make_and_model(vehicle_data_soup):
    '''
    Read the vehicle make and model out of the listing's group buttons

    :input vehicle_data_soup str: Vehicle html data loaded by BeautifulSoup

    :return vehicle_make str: Lower-cased vehicle make ie. honda
    :return vehicle_model str: Lower-cased vehicle model ie. accord
    '''
    try:
        group_buttons = vehicle_data_soup.find_all("button", class_='group-title')
        button_texts = [button.text for button in group_buttons]

        # The first two buttons are the make and the model, in that order
        make_text, model_text = button_texts[0:2]

        # Drop the "Make" label and surrounding spaces
        vehicle_make = make_text.replace('Make', '').strip(' ')

        # Drop the "Model" label, then the repeated make, then surrounding spaces
        model_without_label = model_text.replace('Model', '')
        vehicle_model = model_without_label.replace(vehicle_make, '').strip(' ')
    except:
        # Placeholder pair returned when the page has no usable buttons
        vehicle_make, vehicle_model = "BAD", "URL"

    return vehicle_make.lower(), vehicle_model.lower()

In [7]:
# a = 0
# for i in range(10):
#     start = time.time()
#     # strainer1 = SoupStrainer(class_='listing-available-countdown')
#     strainer = SoupStrainer("button")

    
#     vehicle_make, vehicle_model = get_make_and_model(BeautifulSoup(html.content, 'html.parser', parse_only=strainer))
#     # vehicle_make, vehicle_model = get_make_and_model(BeautifulSoup(html.content, 'html.parser'))


#     a += time.time()-start
#     print('It took', time.time()-start, 'seconds.')

# print(vehicle_make, vehicle_model)
# print("\nAverage time taken:", a/10)

### Listing Details ~ 0.13s optimization

In [8]:
def get_listing_details(vehicle_data_soup):
    '''
    Get list of auction details from html data

    :input vehicle_data_soup str: Vehicle html data loaded by BeautifulSoup

    :return results list: List of vehicle details ie, paint color, miles, etc.
                          Empty when the "Listing Details" section is missing
                          or the page cannot be read.
    '''
    try:
        # Locate the "item" div whose <strong> heading is "Listing Details"
        listing_details = None
        for item in vehicle_data_soup.find_all("div", class_="item"):
            heading = item.find("strong")
            if heading and "Listing Details" in heading:
                listing_details = item
                break

        # No such section on this page: nothing to extract
        if listing_details is None:
            return []

        # Each <li> inside the section is one detail line
        results = [entry.text for entry in listing_details.find_all('li')]
    except Exception:
        # Best effort: an unreadable page yields no details
        results = []

    return results

In [9]:
# a = 0
# for i in range(10):
#     start = time.time()
#     # strainer1 = SoupStrainer(class_='listing-available-countdown')
#     strainer = SoupStrainer("div", class_="item")

    
#     # results = get_listing_details(BeautifulSoup(html.content, 'html.parser', parse_only=strainer))
#     results = get_listing_details(BeautifulSoup(html.content, 'html.parser'))


#     a += time.time()-start
#     print('It took', time.time()-start, 'seconds.')

# print(results)
# print("\nAverage time taken:", a/10)

In [10]:
def get_mileage(listing_details):
    '''
    Extract vehicle mileage from listing details

    :input listing_details list: List of auction details from html data

    :return vehicle_mileage int: Mileage of vehicle in miles. 75000 when no
                                 detail mentions miles/kilometers (or on any
                                 error); None when the matching detail holds
                                 no number at all.
    '''
    default_mileage = 75000
    mileage_words = ['Miles', 'miles', 'Mile', 'mile']
    kilometer_words = ['Kilometers', 'kilometers', 'Kilometer', 'kilometer', 'KM', 'km']

    try:
        # Find the first detail that mentions a distance unit.
        # Kilometers are checked first so a line quoting both units is read as km.
        vehicle_mileage_notes = None
        units_are_miles = True
        for result in listing_details:
            result = result.replace("-", " ")  # "8k-Mile" -> "8k Mile"
            if any(word in result for word in kilometer_words):
                vehicle_mileage_notes = result
                units_are_miles = False
                break
            if any(word in result for word in mileage_words):
                vehicle_mileage_notes = result
                break

        # No distance line at all: fall back to the default figure
        if vehicle_mileage_notes is None:
            return default_mileage

        # Try the number formats from most to least specific
        grouped = re.findall('[0-9]+,[0-9]+', vehicle_mileage_notes)
        if grouped:
            # Comma-grouped figure ie. 53,000
            vehicle_mileage = int(grouped[0].replace(',', ''))
        else:
            # "k" shorthand ie. 47k. The trailing \b keeps "100km" from
            # being read as "100k" (100,000).
            shorthand = re.findall(r'[0-9]+[kK]\b', vehicle_mileage_notes)
            if shorthand:
                vehicle_mileage = int(shorthand[0][:-1]) * 1000
            else:
                # Plain figure below 1,000 ie. 350
                plain = re.findall('[0-9]+', vehicle_mileage_notes)
                if not plain:
                    return None
                vehicle_mileage = int(plain[0])

        # Convert kilometers to miles when needed
        if not units_are_miles:
            vehicle_mileage = int(0.621371 * vehicle_mileage)
    except Exception:
        vehicle_mileage = default_mileage

    return vehicle_mileage

In [11]:
def get_paint_color(listing_details):
    '''
    Extract paint color from listing details

    :input listing_details list: List of auction details from html data

    :return paint_color str: Color word of vehicle ie. gray. "silver" when no
                             detail mentions paint or a color (or on any
                             error); None when a "paint" detail is found but
                             it names none of the known colors.
    '''
    colors = ["white", "black", "gray", "silver", "blue", "red", "brown", "green", "orange", "beige", "purple", "gold", "yellow"]
    default_color = "silver"

    try:
        # Find the first comma-separated segment that talks about paint or a color
        paint_string = None
        for result in listing_details:
            # Normalise each segment individually (the segment itself, not the whole line)
            segments = [segment.strip().lower() for segment in result.split(",")]
            for segment in segments:
                if "paint" in segment or any(color in segment for color in colors):
                    paint_string = segment
                    break
            if paint_string:
                break

        # Nothing color-related anywhere in the details
        if paint_string is None:
            return default_color

        # Keep the word that contains a known color.
        # NOTE(review): when several words match, the last one wins - confirm
        # that is intended (ie. "gray w/white" yields "w/white").
        paint_color = None
        for word in paint_string.split():
            if any(color in word for color in colors):
                paint_color = word
    except Exception:
        paint_color = default_color

    return paint_color

### Model Year ~0.13s optimization

In [12]:
def get_model_year(vehicle_data_soup, default_year="2022"):
    '''
    Extract model year from vehicle html

    :input vehicle_data_soup str: Vehicle html data loaded by BeautifulSoup
    :input default_year str: Value returned when the title holds no year

    :return model_year str: Model year of vehicle - the first run of four
                            digits in the post title, else default_year
    '''
    try:
        title_text = vehicle_data_soup.find("h1", class_='post-title').text
        # Raw string so \d reaches the regex engine without an escape warning
        years_found = re.findall(r"(\d{4})", title_text)
    except Exception:
        # Missing title element or unreadable page
        years_found = []

    return years_found[0] if years_found else default_year

In [13]:
# a = 0
# for i in range(10):
#     start = time.time()
    
#     strainer = SoupStrainer("h1")#, class_ = 'post-title')
    
#     results = get_model_year(BeautifulSoup(html.content, 'html.parser', parse_only=strainer))
#     # results = get_model_year(BeautifulSoup(html.content, 'html.parser'))


#     a += time.time()-start
#     print('It took', time.time()-start, 'seconds.')

# print(results)
# print("\nAverage time taken:", a/10)

In [14]:
def _number_before_unit(text, units):
    '''
    Read the number written directly in front of a unit ie. "1600cc" -> 1600.0

    :input text str: Lower-cased engine description word
    :input units tuple: Unit spellings to look for

    :return value float: The number preceding the unit
    :raises ValueError: If no number sits directly in front of any unit
    '''
    unit_alternatives = "|".join(re.escape(unit) for unit in units)
    found = re.search(r'([0-9][0-9,]*(?:\.[0-9]+)?)[\s-]*(?:' + unit_alternatives + r')', text)
    if found is None:
        raise ValueError("no number before unit in %r" % (text,))
    return float(found.group(1).replace(",", ""))


def get_engine_size(listing_details):
    '''
    Extract engine size from listing details

    :input listing_details list: List of auction details from html data

    :return engine_size float: Engine size of vehicle in liters. 2.0 when no
                               size can be read or the liter figure is
                               implausible (> 12).
    '''
    default_engine_size = 2.0
    eng_keywords = ["liter", "v6", "v8", "engine", "inline", "three", "four", "five", "six", "eight", "ci", "cc", "flathead", "cylinder", "dohc", "sohc", "ohc", "turbocharged"]
    eng_size_keywords = ["liter", "ci", "cc"]
    # NOTE(review): "." is unescaped, so this also accepts plain 3+ digit runs
    # such as "302" in "302ci"; kept as-is because word selection depends on it.
    size_like = re.compile("[0-9]+.[0-9]+")

    try:
        ########### Extract engine size string from details ###########
        # Only the first detail line containing an engine keyword is examined
        match = False
        engine_size_string = None

        for details in listing_details:
            if match:
                break
            for detail in details.split():
                detail = detail.lower()
                if any(word in detail for word in eng_keywords):
                    match = True
                    if size_like.search(detail):
                        engine_size_string = detail
                    elif any(w in detail for w in eng_size_keywords):
                        engine_size_string = detail
                        break

        # No engine description found at all
        if engine_size_string is None:
            return default_engine_size

        ########### Extract actual engine size from engine size string ###########
        if "cc" in engine_size_string or "cubic centimeters" in engine_size_string:
            # Cubic centimeters -> liters
            cubic_centimeters = _number_before_unit(engine_size_string, ("cubic centimeters", "cc"))
            engine_size = cubic_centimeters / 1000

        elif "ci" in engine_size_string or "c.i." in engine_size_string:
            # Cubic inches -> liters
            cubic_inches = _number_before_unit(engine_size_string, ("c.i.", "ci"))
            engine_size = cubic_inches * 0.0163871

        else:
            # General-purpose number pattern (sign, decimals, exponent)
            numeric_const_pattern = r'[-+]? (?: (?: \d* \. \d+ ) | (?: \d+ \.? ) )(?: [Ee] [+-]? \d+ ) ?'
            rx = re.compile(numeric_const_pattern, re.VERBOSE)
            liters = float(rx.findall(engine_size_string)[0])
            # if liter size > 12, it has errored so default to 2.0
            engine_size = default_engine_size if liters > 12 else liters
    except Exception:
        engine_size = default_engine_size

    return engine_size

In [15]:
def get_num_cylinders(listing_details):
    '''
    Work out the cylinder count from the listing details

    :input listing_details list: List of auction details from html data

    :return num_cylinders int: Vehicle cylinder count; 4 when no detail
                               mentions cylinders (or on any error), None
                               when the matching word carries no count
    '''
    try:
        cylinder_keywords = ["inline", "cylinder", "cyl","two", "three", "four", "five", "six", "eight", "v4", "v6", "v8", "v10", "v12", "v-4", "v-6", "vr6", "v-8", "v-10", "v-12", "w12", "w-12", "flat4", "flat-4", "flat 4", "flat6", "flat 6", "flat-6"]

        # (digit, spelling) pairs tried in this order; the first hit wins.
        # "10" and "12" sit before "1" and "2" so v10/v12 are not read as 1 or 2.
        count_markers = [
            ("3", "three"),
            ("4", "four"),
            ("5", "five"),
            ("6", "six"),
            ("8", "eight"),
            ("10", "ten"),
            ("12", "twelve"),
            ("1", "one"),
            ("2", "two"),
            ("1", "single"),
            ("2", "twin"),
            ("3", "triple"),
        ]

        # First word, in the first sentence that has one, containing a cylinder keyword
        num_cyl_string = None
        for sentence in listing_details:
            words = sentence.lower().split(" ")
            hit = next(
                (word for word in words if any(w in word for w in cylinder_keywords)),
                None,
            )
            if hit is not None:
                num_cyl_string = hit
                break

        # Translate that word into a number; with no word found the membership
        # test below raises and the fallback of 4 applies
        num_cylinders = None
        for digit, spelling in count_markers:
            if digit in num_cyl_string or spelling in num_cyl_string:
                num_cylinders = int(digit)
                break
    except:
        num_cylinders = 4

    return num_cylinders

In [16]:
def main(vehicle_data_soup):
    '''
    Scrape every vehicle attribute from the listing and bundle it into one dict

    :input vehicle_data_soup str: Vehicle html data loaded by BeautifulSoup

    :return vehicle_dict dict: All extracted vehicle data
    '''
    # Lead image URL; any failure to find it just means "no image"
    try:
        post_images = vehicle_data_soup.find_all('img', class_='post-image')
        image = post_images[0]['src']
    except:
        image = None

    # Attributes read straight from the page
    auction_year, sale_price, live = get_auction_year_and_price(vehicle_data_soup)
    vehicle_make, vehicle_model = get_make_and_model(vehicle_data_soup)

    # Attributes derived from the "Listing Details" lines
    listing_details = get_listing_details(vehicle_data_soup)
    vehicle_mileage = get_mileage(listing_details)
    paint_color = get_paint_color(listing_details)
    model_year = get_model_year(vehicle_data_soup)
    engine_size = get_engine_size(listing_details)
    num_cylinders = get_num_cylinders(listing_details)

    # Assemble in a fixed key order
    vehicle_dict = dict([
        ("make", vehicle_make),
        ("model", vehicle_model),
        ("year", model_year),
        ("miles", vehicle_mileage),
        ("color", paint_color),
        ("auction_year", auction_year),
        ("engine_size", engine_size),
        ("cylinders", num_cylinders),
        ("bid_price", sale_price),
        ("image", image),
        ("live", live),
    ])

    return vehicle_dict

## Solution

A quick search indicates that html.parser is written in pure Python and is slow.

The internet is unanimous, one must install and use lxml alongside BeautifulSoup. lxml is a C parser that should be much much faster!

BUT.. Install lxml…. Run… Get the same result.

Well, didn’t help.

Deep down in the Google results is a link to the official documentation. At the bottom of that page is a small section on performance (which also advises using lxml), with a hidden gem in its last sentence.

https://beautiful-soup-4.readthedocs.io/en/latest/#improving-performance
[…] You can speed up encoding detection significantly by installing the cchardet library.

Boom, with lxml and cchardet we can speed up by > 2x!

Hence the problem was the character detection.

SOURCE: https://thehftguy.com/2020/07/28/making-beautifulsoup-parsing-10-times-faster/

### With Optimization

In [19]:
# End-to-end timing: fetch page -> strained lxml parse -> scrape -> predict
N_RUNS = 10
url = "https://bringatrailer.com/listing/2016-ford-mustang-shelby-gt350-18/"

# parse_only limits the soup to what this strainer matches; built once,
# outside the timed loop, since it does not change between runs
strainerAll = SoupStrainer(["h1", "span", "strong", "button", "div"])

a = 0
for i in range(N_RUNS):
    start = time.time()

    html = requests.get(url)
    vehicle_data_soup = BeautifulSoup(html.content, 'lxml', parse_only=strainerAll)

    # Same extraction steps as main(), called instead of pasted
    vehicle_dict = main(vehicle_data_soup)

    # One-row frame holding only the model's feature columns
    df = pd.DataFrame()
    for col in VEHICLE_COLUMNS:
        df[col] = pd.Series(vehicle_dict.get(col))

    print(str(model_pipeline(df)[0]))

    # Sample the clock once so the printed and accumulated figures agree
    elapsed = time.time() - start
    a += elapsed
    print('It took', elapsed, 'seconds.')

print(vehicle_dict)
print("\nAverage time taken:", a/N_RUNS)

52019.41
It took 0.4820108413696289 seconds.
52019.41
It took 0.43700695037841797 seconds.
52019.41
It took 0.4424400329589844 seconds.
52019.41
It took 0.37610721588134766 seconds.
52019.41
It took 0.4471302032470703 seconds.
52019.41
It took 0.3542752265930176 seconds.
52019.41
It took 0.44507694244384766 seconds.
52019.41
It took 0.43358397483825684 seconds.
52019.41
It took 0.39540696144104004 seconds.
52019.41
It took 0.44266581535339355 seconds.
{'make': 'ford', 'model': 'mustang s550', 'year': '2016', 'miles': 8000, 'color': 'w/white', 'auction_year': 2022, 'engine_size': 5.2, 'cylinders': 8, 'bid_price': 53500, 'image': 'https://bringatrailer.com/wp-content/uploads/2022/10/2016_ford_mustang-shelby-gt350_ext3-83811.jpg?fit=940%2C626', 'live': False}

Average time taken: 0.42556960582733155


### Without Optimization

In [20]:
# End-to-end timing: fetch page -> full lxml parse (no strainer) -> scrape -> predict
N_RUNS = 10
url = "https://bringatrailer.com/listing/2016-ford-mustang-shelby-gt350-18/"

a = 0
for i in range(N_RUNS):
    start = time.time()

    html = requests.get(url)
    vehicle_data_soup = BeautifulSoup(html.content, 'lxml')

    # Same extraction steps as main(), called instead of pasted
    vehicle_dict = main(vehicle_data_soup)

    # One-row frame holding only the model's feature columns
    df = pd.DataFrame()
    for col in VEHICLE_COLUMNS:
        df[col] = pd.Series(vehicle_dict.get(col))

    print(str(model_pipeline(df)[0]))

    # Sample the clock once so the printed and accumulated figures agree
    elapsed = time.time() - start
    a += elapsed
    print('It took', elapsed, 'seconds.')

print(vehicle_dict)
print("\nAverage time taken:", a/N_RUNS)

52019.41
It took 0.41950178146362305 seconds.
52019.41
It took 0.42633914947509766 seconds.
52019.41
It took 0.42108702659606934 seconds.
52019.41
It took 0.3792998790740967 seconds.
52019.41
It took 0.4238548278808594 seconds.
52019.41
It took 0.55702805519104 seconds.
52019.41
It took 0.3780968189239502 seconds.
52019.41
It took 0.43133091926574707 seconds.
52019.41
It took 0.44681406021118164 seconds.
52019.41
It took 0.3826878070831299 seconds.
{'make': 'ford', 'model': 'mustang s550', 'year': '2016', 'miles': 8000, 'color': 'w/white', 'auction_year': 2022, 'engine_size': 5.2, 'cylinders': 8, 'bid_price': 53500, 'image': 'https://bringatrailer.com/wp-content/uploads/2022/10/2016_ford_mustang-shelby-gt350_ext3-83811.jpg?fit=940%2C626', 'live': False}

Average time taken: 0.4266030788421631


In [60]:
## 1.8 seconds