In [24]:
import pandas as pd
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from tqdm.auto import tqdm

tqdm.pandas()

In [25]:
options = webdriver.EdgeOptions()
options.use_chromium = True
# options.add_argument("--headless")        # remove this line if you want to see the browser
driver = webdriver.Edge(options=options)


wait = WebDriverWait(driver, 2)

In [26]:
df = pd.read_csv('workday_domains_cleaned.csv')

# create new pandas DF called postings
postings = pd.DataFrame(columns=['job_title', 'job_location', 'job_link', 'time_posted', 'company_name'])
problems = []

In [27]:
def get_job_postings(url, base_url, postings, problems, pbar, index, row):
    while(True):
            
            # wait for section element with data-automation-id="jobResults" to exist
            wait.until(EC.presence_of_element_located((By.XPATH, '//*[@data-automation-id="jobResults"]')))

            search_output = driver.find_element(By.XPATH, '//*[@data-automation-id="jobResults"]')

            # get direct children of search_output with tag name ul
            ul = search_output.find_element(By.XPATH, './/ul')

            # get all direct descendants of ul with tag name li that contains an a tag with data-automation-id="jobTitle"
            for li in ul.find_elements(By.XPATH, './/li[.//a[@data-automation-id="jobTitle"]]'):
                try:

                    # find an a tag element in li with data-automation-id="jobTitle"
                    a = li.find_element(By.XPATH, './/a[@data-automation-id="jobTitle"]')

                    # get the text and href of a
                    job_title = a.text
                    pbar.set_description(f"Processing {job_title}")
                    job_url = a.get_attribute('href')
                    
                    # add base url to job url
                    if not job_url.startswith('http'):
                        job_url = base_url + job_url

                    # find a div wtih data-automation-id="locations"
                    location_div = li.find_element(By.XPATH, './/div[@data-automation-id="locations"]')
                    # find dd tag within location_div
                    location_dd = location_div.find_element(By.XPATH, './/dd')
                    # get the text of dd
                    job_location = location_dd.text

                    # find a div with data-automation-id="postedOn"
                    posted_on_div = li.find_element(By.XPATH, './/div[@data-automation-id="postedOn"]')
                    # find dd tag within posted_on_div
                    posted_on_dd = posted_on_div.find_element(By.XPATH, './/dd')
                    # get the text of dd
                    time_posted = posted_on_dd.text

                    company_name = row['customer_name']

                    # see if exists in postings
                    if (postings['job_link'] == job_url).any():

                        problems.append(job_url)
                        return

                    postings.loc[len(postings)] = [job_title, job_location, job_url, time_posted, company_name]
                except Exception as e:
                    # print(f"Error: {e}")
                    continue

            # find a button element with type="button" and aria-label="next"
            # if exists, click it
            # if does not exist, break
            try:
                next_button = search_output.find_element(By.XPATH, './/button[@type="button" and @aria-label="next"]')
                next_button.click()
                time.sleep(1)
            except Exception as e:
                print(f"No more pages: {e}")
                break

In [28]:
# for index, row in df.iterrows():
# use tqdm
for index, row in (pbar := tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows", unit="row")):

    url = row['workday_domain_fr']

    # get base url from url with urlparse
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"


    driver.get(url)

    
    try:
        # wait for //*[contains(@data-automation-id, "keywordSearchInput") or @placeholder="Search for jobs or keywords"] to exist
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[contains(@data-automation-id, "keywordSearchInput") or @placeholder="Search for jobs or keywords"]')))

        # find element with data-automation-id="keywordSearchSection"
        search_section = driver.find_element(By.XPATH, '//*[@data-automation-id="keywordSearchSection"]')

        # in input element selected by //*[contains(@data-automation-id, "keywordSearchInput") or @placeholder="Search for jobs or keywords"] search for "software"
        search_input = driver.find_element(By.XPATH, '//*[contains(@data-automation-id, "keywordSearchInput") or @placeholder="Search for jobs or keywords"]')
        search_input.send_keys("software")

        # from search_section find element in button tag with inner text "Search"
        search_button = search_section.find_element(By.XPATH, './/button[normalize-space(text())="Search"]')
        # click the search button
        search_button.click()

        # wait 0.5 seconds
        time.sleep(1)

        # call get_job_postings
        get_job_postings(url, base_url, postings, problems, pbar, index, row)

        



    except Exception as e:
        print(f"Error: {e}")
        print(f"URL: {url}")
        continue

        

Processing rows:   0%|          | 0/2589 [00:00<?, ?row/s]

No more pages: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//button[@type="button" and @aria-label="next"]"}
  (Session info: MicrosoftEdge=136.0.3240.64); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF772C0FCA5+25029]
	(No symbol) [0x00007FF772B64CB0]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF772E6ADAA+1947706]
	(No symbol) [0x00007FF772977D88]
	(No symbol) [0x00007FF77297804B]
	(No symbol) [0x00007FF77296E1AC]
	(No symbol) [0x00007FF7729990CF]
	(No symbol) [0x00007FF77296E15D]
	(No symbol) [0x00007FF77296E01D]
	(No symbol) [0x00007FF772999350]
	(No symbol) [0x00007FF77296E15D]
	(No symbol) [0x00007FF7729B6698]
	(No symbol) [0x00007FF772998DF3]
	(No symbol) [0x00007FF77296D6A6]
	(No symbol) [0x00007FF77296CBB3]
	(No symbol) [0x00007FF77296D4D3]
	(No symbol) [0x00007FF772A75D0D]
	(No

In [29]:
postings

Unnamed: 0,job_title,job_location,job_link,time_posted,company_name
0,Senior Software Engineer,Fuze Portugal - Aveiro Office,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,Posted 30+ Days Ago,"8x8, Inc."
1,Software Development Engineer,Singapore-8x8 Asia,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,Posted 13 Days Ago,"8x8, Inc."
2,Staff Software Engineer,2 Locations,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,Posted 30+ Days Ago,"8x8, Inc."
3,Software Development Engineer,2 Locations,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,Posted 30+ Days Ago,"8x8, Inc."
4,Staff Software Engineer,Romania-Cluj Office,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,Posted 30+ Days Ago,"8x8, Inc."
...,...,...,...,...,...
69374,Expert Data Engineer,2 Locations,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...,Posted 30+ Days Ago,Zuhlke Engineering AG
69375,DevOps Engineer,2 Locations,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...,Posted 30+ Days Ago,Zuhlke Engineering AG
69376,Business Development Director with focus on th...,2 Locations,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...,Posted 30+ Days Ago,Zuhlke Engineering AG
69377,Client Principal or Director - Industry & Cons...,Munich,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...,Posted 30+ Days Ago,Zuhlke Engineering AG


In [32]:
# save postings to csv
postings.to_csv('workday_postings.csv', index=False)

In [31]:
#lenght of problems
print(f"Problems: {len(problems)}")

Problems: 482
