In [None]:
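# TL;DR - the most efficient way to parse is to execute JavaScript inside the browser (driver.execute_script), NOT to call the driver's per-element get/find methods (get_attribute, find_elements, .text) from Python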

In [42]:
# import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService

from concurrent.futures import ThreadPoolExecutor

import time
import asyncio

In [12]:
# Open the search-results page and wait until the job-card <article> elements exist.
driver = None
# Bound up front so a failed load leaves an empty list rather than a stale or missing name.
articles = []
url ='https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=2'
try:
    options = webdriver.ChromeOptions()
    # Do not block driver.get() on the full page load; the explicit wait below
    # decides when enough of the page is available.
    options.page_load_strategy = 'none'
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    # Wait up to 22 seconds for at least one <article> to be present in the DOM.
    articles = WebDriverWait(driver, 22).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'article'))
    )
    # Report the articles just located (previously this read an unrelated `elements` name).
    print(f"Found {len(articles)} elements.")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Stop the page from loading. Guarded because the browser may never have
    # started, in which case `driver` is still None.
    if driver is not None:
        driver.execute_script("window.stop();")
    # The driver is intentionally left open: later cells keep using `driver`
    # and the located `articles`. Call driver.quit() once the notebook is done.

Found 4 elements.


In [16]:
# Accumulator for the sequential parse: job id -> {field name: text}
job_ads = dict()

In [23]:
# Baseline: parse every article one after another (no threads, no processes)
all_articles_startTime = time.perf_counter()
for article in articles:
    job_id = article.get_attribute("data-job-id")
    # Start a fresh record for this job and keep a direct handle on it.
    fields = job_ads[job_id] = {}
    elements = article.find_elements(By.CSS_SELECTOR, '[data-automation]')
    for el in elements:
        # Read the text first, then the attribute, then store under the
        # underscore-normalised data-automation name.
        text = el.text
        key = el.get_attribute("data-automation").replace('-', '_')
        fields[key] = text
all_articles_elapsed = time.perf_counter() - all_articles_startTime
print(f'all articles parsed run time: {all_articles_elapsed}')

all articles parsed run time: 8.018609104000006


In [34]:
# article-level concurrency: each article is parsed in a worker thread (asyncio's default executor), not in separate processes
def parse_article(article):
    """Extract the ``data-automation`` fields of one job-card element.

    Args:
        article: A Selenium WebElement for a job card (any object exposing
            ``get_attribute`` and ``find_elements`` works).

    Returns:
        dict: ``{job_id: {field_name: text}}`` where ``job_id`` is the
        element's ``data-job-id`` attribute and ``field_name`` is each
        descendant's ``data-automation`` value with ``-`` replaced by ``_``.
        If a lookup fails part-way, the error is printed and whatever was
        collected so far is returned (an empty dict when even the job id
        could not be read).
    """
    # Bound before the try so the except and return paths never see an unbound name.
    job_ad = {}
    try:
        job_id = article.get_attribute("data-job-id")
        fields = job_ad[job_id] = {}
        elements = article.find_elements(By.CSS_SELECTOR, '[data-automation]')
        for el in elements:
            fields[el.get_attribute("data-automation").replace('-', '_')] = el.text
    except Exception as e:
        # Best-effort parse: report what went wrong and keep the partial result.
        print(f'Error: {e!r} (parsed so far: {job_ad})')
    return job_ad


# Define the main() coroutine
async def main():
    """Parse all ``articles`` on the event loop's default executor and print the results and timing."""
    event_loop = asyncio.get_running_loop()
    started_at = event_loop.time()

    # Schedule one parse_article call per article on the default executor.
    pending = []
    for article in articles:
        pending.append(event_loop.run_in_executor(None, parse_article, article))

    # Collect every result; only ValueError is reported here.
    try:
        results = await asyncio.gather(*pending)
        print(f"The articles {results}")
    except ValueError as e:
        print(f"An exception occurred: {e}")

    finished_at = event_loop.time()
    print(f"Total execution time: {finished_at - started_at} second(s)")


# Top-level await, as supported in IPython/Jupyter cells; in a plain script this would be asyncio.run(main())
await main()

The articles [{'84991909': {'job_list_view_job_link': '', 'job_list_item_link_overlay': '', 'premiumAdBadge': 'Featured', 'company_logo_container': '', 'company_logo': '', 'jobTitle': 'Software Engineer (Hybrid)', 'jobCompany': 'SEEK Limited', 'signed_out_save_job': '', 'jobCardLocation': ', Melbourne VIC', 'jobLocation': 'Melbourne VIC', 'jobSubClassification': 'Developers/Programmers', 'jobClassification': '(Information & Communication Technology)', 'jobShortDescription': 'Join SEEK as a Software Engineer in the Candidate Quality squad, focusing on data engineering and machine learning solutions.'}}, {'85062575': {'job_list_view_job_link': '', 'job_list_item_link_overlay': '', 'premiumAdBadge': 'Featured', 'jobTitle': 'FastAPI/Python Engineer', 'jobCompany': 'Mark My Words', 'signed_out_save_job': '', 'jobCardLocation': 'Melbourne VIC', 'jobLocation': 'Melbourne VIC', 'jobSubClassification': 'Developers/Programmers', 'jobClassification': '(Information & Communication Technology)', 'j

In [48]:
# Time a single parse_article call (the second article) to gauge the per-article cost.
# Loop over `articles` instead to time the whole page sequentially.
start_time = time.perf_counter()
parse_article(articles[1])
single_parse_elapsed = time.perf_counter() - start_time
print(f'total run time = {single_parse_elapsed}')

total run time = 0.2796069430000898


In [45]:
# using an explicit ThreadPoolExecutor (note: ProcessPoolExecutor is imported below but never used)
from concurrent.futures import ProcessPoolExecutor

def parse_article(article):
    """Extract the ``data-automation`` fields of one job-card element.

    NOTE: this redefines the ``parse_article`` from the earlier cell with the
    same behaviour; the later definition is the one in effect after this cell runs.

    Args:
        article: A Selenium WebElement for a job card (any object exposing
            ``get_attribute`` and ``find_elements`` works).

    Returns:
        dict: ``{job_id: {field_name: text}}`` where ``job_id`` is the
        element's ``data-job-id`` attribute and ``field_name`` is each
        descendant's ``data-automation`` value with ``-`` replaced by ``_``.
        If a lookup fails part-way, the error is printed and whatever was
        collected so far is returned (an empty dict when even the job id
        could not be read).
    """
    # Bound before the try so the except and return paths never see an unbound name.
    job_ad = {}
    try:
        job_id = article.get_attribute("data-job-id")
        fields = job_ad[job_id] = {}
        elements = article.find_elements(By.CSS_SELECTOR, '[data-automation]')
        for el in elements:
            fields[el.get_attribute("data-automation").replace('-', '_')] = el.text
    except Exception as e:
        # Best-effort parse: report what went wrong and keep the partial result.
        print(f'Error: {e!r} (parsed so far: {job_ad})')
    return job_ad

async def extract_texts_concurrently(max_workers=5):
    """Parse every element of the notebook-level ``articles`` list in a thread pool.

    Args:
        max_workers: Number of worker threads in the pool. Defaults to 5,
            the value that used to be hard-coded here.

    Returns:
        list: One ``parse_article`` result per article, in the same order
        as ``articles``.
    """
    loop = asyncio.get_running_loop()
    # The ``with`` block shuts the pool down after all parses have been gathered.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [loop.run_in_executor(executor, parse_article, article) for article in articles]
        return await asyncio.gather(*tasks)

start_time = time.perf_counter()
await extract_texts_concurrently()
print(f'total run time = {time.perf_counter() - start_time}')

total run time = 5.784008115000006


In [63]:
# Using in-browser JS: a single execute_script call gathers everything.
# Result: one list per <article>, each entry a "<data-automation value>: <innerText>" string.
# (The unused, undeclared `jobs = {}` was dropped so the script no longer
# leaks an implicit global into the page.)
elements_html = driver.execute_script("""
    return Array.from(document.getElementsByTagName('article'))
        .map(article => Array.from(article.querySelectorAll('[data-automation]')))
        .map(els => els.map(el => `${el.dataset.automation}: ${el.innerText}`));
""")

In [65]:
# Build {job id: {data-automation value: innerText}} entirely in the browser and
# return it in a single call. Keys keep their original hyphens here (they are
# not normalised to underscores as in parse_article).
elements_html_v2 = driver.execute_script("""
    const jobs = {};
    for (const article of document.getElementsByTagName('article')) {
        // One record per article, keyed by its data-job-id attribute.
        const fields = {};
        for (const el of article.querySelectorAll('[data-automation]')) {
            fields[el.dataset.automation] = el.innerText;
        }
        jobs[article.dataset.jobId] = fields;
    }
    return jobs;
""")

In [66]:
# Display the mapping returned by the in-browser script (job id -> {data-automation value: text})
elements_html_v2

{'84618675': {'job-list-item-link-overlay': '',
  'job-list-view-job-link': '',
  'jobCardLocation': 'Melbourne VIC',
  'jobClassification': '(Information & Communication Technology)',
  'jobCompany': 'Private Advertiser',
  'jobListingDate': '29d ago',
  'jobLocation': 'Melbourne VIC',
  'jobShortDescription': 'We are seeking an experienced Data Engineer to join our team.',
  'jobSubClassification': 'Database Development & Administration',
  'jobTitle': 'Data Engineer',
  'signed-out-save-job': ''},
 '84717234': {'company-logo': '',
  'company-logo-container': '',
  'job-list-item-link-overlay': '',
  'job-list-view-job-link': '',
  'jobCardLocation': 'Melbourne VIC',
  'jobClassification': '(Information & Communication Technology)',
  'jobCompany': 'GRANITE CONSULTING',
  'jobListingDate': '25d ago',
  'jobLocation': 'Melbourne VIC',
  'jobShortDescription': 'Exciting opportunity for Data Engineer ( Pathology) with our Healthcare client in Melbourne.',
  'jobSubClassification': 'Engi