In [4]:
# TODO: read up on nest_asyncio (nesting asyncio event loops): https://medium.com/@sakthiveltvt.thangaraj/introduction-to-nest-asyncio-for-python-developers-afd7bed44768
# TODO: further reading on nesting asyncio loops: https://www.bing.com/search?pglt=161&q=asynchio+nesting

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# ANSI escape sequences used to colorize log output; looked up by index.
c = (
    "\x1b[0m",   # c[0]: reset / end of color
    "\x1b[36m",  # c[1]: cyan
    "\x1b[91m",  # c[2]: red
    "\x1b[35m",  # c[3]: magenta
)

# Shared result store: maps a job id to a dict of the fields scraped for that ad.
# The scraping functions in this cell write into it; re-running the cell resets it.
job_ads = {}


def parse_article(el, job_ad_id):
    """Record one element's text in ``job_ads`` under its ``data-automation`` name.

    The attribute value is used as the field name with ``-`` replaced by ``_``.

    Args:
        el: Element-like object exposing ``get_attribute(name)`` and ``.text``.
        job_ad_id: Key of the job ad in ``job_ads`` that the field belongs to.

    Returns:
        None. The global ``job_ads`` dict is updated in place.
    """
    field_name = el.get_attribute("data-automation")
    if field_name is None:
        # No attribute means no field name to store the text under; skip the
        # element instead of failing on ``None.replace``.
        return
    # setdefault creates the per-ad dict on first use, so this helper no longer
    # raises KeyError when the caller has not initialised the entry yet.
    job_ads.setdefault(job_ad_id, {})[field_name.replace('-', '_')] = el.text

def parse_articles(article):
    """Collect every ``[data-automation]`` field of one article into ``job_ads``.

    The article's ``data-job-id`` attribute becomes the key in ``job_ads``; each
    matching child element contributes one field named after its
    ``data-automation`` value (``-`` replaced by ``_``) holding the element text.
    Timing information is printed along the way. Any exception is printed and
    swallowed, so the function always returns ``None``.

    Args:
        article: Element-like object exposing ``get_attribute`` and
            ``find_elements``.
    """
    began_at = time.perf_counter()
    try:
        print('parsing articles')
        job_id = article.get_attribute("data-job-id")
        print(f'jid: {c[2]}{job_id}{c[0]}')

        # Look the elements up first; the job entry is only created once the
        # lookup has succeeded.
        lookup_began_at = time.perf_counter()
        data_elements = article.find_elements(By.CSS_SELECTOR, '[data-automation]')
        fields = job_ads[job_id] = {}
        print(f'{c[1]}find data elements time:{time.perf_counter() - lookup_began_at}s | startime:{lookup_began_at}{c[0]}')

        for element in data_elements:  # data-automation data-testid
            text = element.text
            field_name = element.get_attribute("data-automation").replace('-', '_')
            fields[field_name] = text
    except Exception as err:
        print('Error parse_articles(): ', err)
    finally:
        print(f'{c[3]}parse article time = {time.perf_counter() - began_at}, startime: {began_at}{c[0]}')

# Function to scrape one page using Selenium
def scrape_with_selenium(url, timeout=22):
    """Scrape one results page in its own Chrome session and merge it into ``job_ads``.

    Steps: start Chrome with page load strategy ``'none'``, request ``url``,
    wait until at least one ``<article>`` element is present, stop the page
    load, then run a JavaScript snippet in the page that builds
    ``{job id: {data-automation name: innerText}}`` for every article. That
    mapping is merged into the global ``job_ads`` dict.

    Errors are printed rather than raised. The browser is always closed, also
    when the navigation or the wait fails.

    Args:
        url: Address of the page to scrape.
        timeout: Maximum number of seconds to wait for the ``<article>``
            elements to appear.

    Returns:
        The list of article elements found by the wait, or an empty list when
        scraping failed before any were found. The driver has already been
        quit when this returns, so the elements are presumably only useful
        for counting or truthiness checks.
    """
    # Bound up front so the return statement and the cleanup below are safe no
    # matter where a failure happens.
    articles = []
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.page_load_strategy = 'none'
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        print(f"Scraped {url}")
        start_time = time.perf_counter()
        articles = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, 'article'))
        )
        print(f'articles collected... {c[1]}collection time: {time.perf_counter() - start_time}s | start time: {start_time}{c[0]}')
        start_time = time.perf_counter()
        print('stopping page load...')
        driver.execute_script("window.stop();")
        try:
            # Extract everything in a single in-page script call instead of
            # querying each element from Python.
            jobs = driver.execute_script("""
                let articles = Array.from(document.getElementsByTagName('article'))
                let jobs = {}
                articles.map(article => {
                	let jobId = article.dataset.jobId
                	jobs[jobId] = {}
                	let elements = Array.from(article.querySelectorAll('[data-automation]'))
                	return elements.map(el => {
                		let dataKey = el.dataset.automation
                		jobs[jobId][dataKey] = el.innerText
                	})
                })
                return jobs
            """)
            # Guard against a script that returned nothing.
            job_ads.update(jobs or {})
        except Exception as e:
            print('Error:', e)
        print('parse end time = ', time.perf_counter())
        print('run time (parsing article) = ', time.perf_counter() - start_time)
    except Exception as e:
        print('Error: scrape_with_selenium()', e)
    finally:
        # Close the browser on every path; previously a failed navigation or
        # wait left the Chrome process running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print('Error: scrape_with_selenium()', e)
    # Returning after the try statement (not from inside ``finally``) keeps
    # KeyboardInterrupt/SystemExit from being silently discarded.
    return articles

# Async wrapper to run the blocking Selenium code
async def async_scrape(url, executor):
    """Run the blocking ``scrape_with_selenium(url)`` on ``executor`` and await it.

    Args:
        url: Page address handed to ``scrape_with_selenium``.
        executor: Executor passed to the running loop's ``run_in_executor``.

    Returns:
        Whatever ``scrape_with_selenium`` returns, or ``None`` when an
        exception was raised (the exception is printed, not propagated).
    """
    try:
        running_loop = asyncio.get_running_loop()
        pending_scrape = running_loop.run_in_executor(executor, scrape_with_selenium, url)
        return await pending_scrape
    except Exception as err:
        print('Error:', err)
    
async def populate_results(url, executor):
    """Await ``async_scrape(url, executor)`` and discard its return value.

    Any exception is printed instead of propagated, so the coroutine always
    resolves to ``None``.

    Args:
        url: Page address forwarded to ``async_scrape``.
        executor: Executor forwarded to ``async_scrape``.
    """
    try:
        # The scraped articles are not kept here.
        await async_scrape(url, executor)
    except Exception as err:
        print('Error:', err)

# Main async function to scrape multiple URLs
async def main(urls, max_workers=128):
    """Scrape all ``urls`` concurrently on a thread pool and print a page count.

    One ``populate_results`` coroutine is created per URL and all of them are
    awaited together. Any exception raised here is printed, not propagated.

    Args:
        urls: Iterable of page addresses to scrape.
        max_workers: Upper bound on worker threads in the pool. Each running
            scrape drives its own browser, so lower this to limit how many
            are open at once.

    Returns:
        None. The printed count is the number of URLs processed; it does not
        distinguish pages that failed, because ``populate_results`` reports
        its own errors and returns ``None`` either way.
    """
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = await asyncio.gather(
                *(populate_results(url, executor) for url in urls)
            )
        print(f"\nScraped {len(results)} pages.")
    except Exception as e:
        print('Error:', e)


# Example usage
if __name__ == "__main__":
    try:
        start_time = time.perf_counter()
        urls = [
        f'https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page={x+1}' for x in range(10)
        ]
        await main(urls)
        print('run time = ', time.perf_counter() - start_time)
    except Exception as e:
        print('Error:', e)

Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=10
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=9
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=3
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=8
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=5
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=4
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=1
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=6
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=2
Scraped https://www.seek.com.au/Data-Engineer-jobs/in-Melbourne-VIC-3000?page=7
articles collected... [36mcollection time: 1.0073391690020799s | start time: 17174.4679624[0m
stopping page load...
articles collected... [36mcollection time: 1.113316417002352s | start time: 1717

In [3]:
job_ads

{}