In [61]:
import pandas as pd
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, urlunparse
from tqdm.auto import tqdm

# Enable tqdm's pandas integration (progress_apply and friends).
# NOTE(review): no visible cell calls progress_apply — confirm this is still needed.
tqdm.pandas()

In [62]:
# Load the customer list: one row per customer with, where known, a Workday
# careers URL in the workday_domain column (blank when unknown).
df = pd.read_csv('workday_domains.csv')
df

Unnamed: 0,customer_name,workday_domain
0,"8x8, Inc.",https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...
1,"AT&T, Services Inc.",https://att.wd1.myworkdayjobs.com/ATTGeneral/j...
2,Hong Kong Broadband Network,
3,Orange Belgium SA.,https://orange.wd3.myworkdayjobs.com/Orange_Ca...
4,TalkTalk,https://talktalk.wd3.myworkdayjobs.com/TalkTal...
...,...,...
3495,YOUView,https://gici.wd5.myworkdayjobs.com/en-US/Caree...
3496,"ZAGG, Inc.",
3497,Zoopla (ZPG Limited),
3498,Zuhlke Engineering AG,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...


In [63]:
options = Options()
# Uncomment the next two lines to run Chrome without a visible window.
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# One shared browser session; the helper functions in this notebook use it as a global.
driver = webdriver.Chrome(options=options)

# Shared explicit wait with a 2-second timeout. Raise the value if pages load
# slowly and elements are being reported as missing.
wait = WebDriverWait(driver, 2)

In [64]:
def resolve_workday_domain(start_url: str) -> str:
    """
    Follow a Workday page's logo link and report the host it leads to.

    Opens ``start_url`` in the shared ``driver``, clicks the element carrying
    ``data-automation-id="logoLink"`` and then waits (with the shared ``wait``)
    until the browser is on a different host than ``start_url``.

    Args:
        start_url: The page to open first.

    Returns:
        The netloc (host) of the URL the browser ended up on, or an empty
        string if anything failed (logo missing, timeout, or any other error).
    """
    try:
        driver.get(start_url)
        # Block until the logo is present and clickable, then follow it.
        logo_locator = (By.CSS_SELECTOR, '[data-automation-id="logoLink"]')
        wait.until(EC.element_to_be_clickable(logo_locator)).click()
        # The redirect counts as done once the host differs from the start host.
        origin_host = urlparse(start_url).netloc
        wait.until(lambda browser: urlparse(browser.current_url).netloc != origin_host)
        return urlparse(driver.current_url).netloc
    except Exception:
        # Best-effort lookup: every failure, expected or not, collapses to "".
        return ""

In [65]:
def get_ancestor_urls(url: str) -> list[str]:
    """
    List a URL and all of its path ancestors, longest path first.

    The last path segment is stripped repeatedly until only scheme and host
    remain; that root URL is always the final entry. Query string, fragment
    and any trailing slash are dropped from every entry.

    Args:
        url: An absolute URL, e.g. "https://host/en-US/Careers/job/123".

    Returns:
        Ancestor URLs ordered from the full path down to "scheme://host".
        A bare host (with or without a trailing slash) yields just the root.
        Returns an empty list when ``url`` has no scheme or no host, since
        such a string cannot be navigated to.
    """
    parsed = urlparse(url)
    scheme, netloc = parsed.scheme, parsed.netloc
    if not scheme or not netloc:
        return []

    # An absolute path splits into ['', seg1, seg2, ...]; the leading '' is what
    # produces the root URL at depth 1. An empty path splits into [''] as well,
    # so a bare host yields the root too instead of nothing.
    segments = parsed.path.rstrip('/').split('/')

    return [
        urlunparse((scheme, netloc, '/'.join(segments[:depth]), '', '', ''))
        for depth in range(len(segments), 0, -1)
    ]

In [66]:
def find_workday_domain(full_url: str) -> str | None:
    """
    Find the shortest-stripped ancestor of a Workday URL that shows a job search box.

    Each URL from ``get_ancestor_urls(full_url)`` is opened in turn in the
    shared ``driver``. A page qualifies when, within the shared ``wait``
    timeout, it contains an element whose ``data-automation-id`` contains
    "keywordSearchInput" or whose placeholder is "Search for jobs or keywords".

    Args:
        full_url: The Workday job URL to start from.

    Returns:
        The first ancestor URL on which the search box was found, or None if
        no ancestor qualified.
    """
    search_box = (
        By.XPATH,
        '//*[contains(@data-automation-id, "keywordSearchInput") or @placeholder="Search for jobs or keywords"]',
    )
    for ancestor_url in get_ancestor_urls(full_url):
        try:
            driver.get(ancestor_url)
            wait.until(EC.presence_of_element_located(search_box))
        except (TimeoutException, WebDriverException):
            # Search box never appeared, or the page failed to load: move up one level.
            continue
        else:
            return ancestor_url
    return None

In [67]:
def process_dataframe(
    df: pd.DataFrame,
    input_col: str = 'workday_domain',
    output_col: str = 'workday_domain_fr'
) -> pd.DataFrame:
    """
    Resolve every URL in ``input_col`` and store the result in ``output_col``.

    The frame is modified in place: ``output_col`` is (re)set to None for all
    rows, then each row whose ``input_col`` value is a non-blank string gets
    the value returned by ``find_workday_domain``. Rows with missing or blank
    input keep None.

    Args:
        df: Frame to process; mutated in place.
        input_col: Column holding the source URLs.
        output_col: Column to (over)write with the resolved URLs.

    Returns:
        The same ``df`` object, for convenience.
    """
    # Start from a clean slate so skipped rows end up as None.
    df[output_col] = None

    progress = tqdm(df[input_col].items(), total=len(df), desc="Processing URLs")
    for row_label, raw_url in progress:
        # Skip NaN / non-string cells and whitespace-only strings.
        if not (isinstance(raw_url, str) and raw_url.strip()):
            continue
        resolved = find_workday_domain(raw_url)
        progress.set_description(f"Processed {row_label}: {raw_url} -> {resolved}")
        df.at[row_label, output_col] = resolved
    return df

In [68]:
# Crawl every URL with the shared browser. The browser is closed afterwards,
# even if the crawl is interrupted or fails, so no Chrome/chromedriver process
# is left running. Re-run the driver setup cell before running this cell again.
try:
    df = process_dataframe(df, input_col='workday_domain', output_col='workday_domain_fr')
finally:
    driver.quit()

Processing URLs:   0%|          | 0/3500 [00:00<?, ?it/s]

In [72]:
# Show the frame to compare each original URL with the resolved one.
df

Unnamed: 0,customer_name,workday_domain,workday_domain_fr
0,"8x8, Inc.",https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...,https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...
1,"AT&T, Services Inc.",https://att.wd1.myworkdayjobs.com/ATTGeneral/j...,https://att.wd1.myworkdayjobs.com/ATTGeneral
2,Hong Kong Broadband Network,,
3,Orange Belgium SA.,https://orange.wd3.myworkdayjobs.com/Orange_Ca...,https://orange.wd3.myworkdayjobs.com/Orange_Ca...
4,TalkTalk,https://talktalk.wd3.myworkdayjobs.com/TalkTal...,https://talktalk.wd3.myworkdayjobs.com/TalkTal...
...,...,...,...
3495,YOUView,https://gici.wd5.myworkdayjobs.com/en-US/Caree...,https://gici.wd5.myworkdayjobs.com/en-US/Careers
3496,"ZAGG, Inc.",,
3497,Zoopla (ZPG Limited),,
3498,Zuhlke Engineering AG,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...,https://zuehlke.wd3.myworkdayjobs.com/en-US/Zu...


In [69]:

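# Earlier approach, superseded by process_dataframe and kept for reference only:
# resolve each URL by clicking through the page's logo link.
# domains = []
#for url in df["workday_domain"]:
# for url in tqdm(df["workday_domain"], desc="Resolving domains"):
#     if pd.isna(url) or not url.startswith("http"):
#         domains.append("")  # skip invalid URLs
#     else:
#         domains.append(resolve_workday_domain(url))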


In [70]:
# Second half of the superseded approach: store the resolved domains and close the browser.
# df["workday_domain_fr"] = domains
# driver.quit()
# df

In [74]:
# Save the results. NOTE: this overwrites the file the notebook loaded its
# input from; the saved copy now also contains the workday_domain_fr column.
df.to_csv("workday_domains.csv", index=False)

In [75]:
# Keep only the rows for which a Workday URL was resolved
df = df.dropna(subset=['workday_domain_fr'])
# Write the remaining rows to a separate file
df.to_csv("workday_domains_cleaned.csv", index=False)

In [106]:

# Reload the saved results. newDF and df are two names for the SAME DataFrame
# object, so an in-place change made through one is visible through the other.
df = pd.read_csv('workday_domains.csv')
newDF = df

In [110]:
# Industry labels in the order their sections appear in the file. The
# "overflowN" entries are padding so that an unexpected extra increment cannot
# index past the end of the list.
industries = [
    "Communications", "Education", "Energy and Resources", "Financial Services",
    "Healthcare", "Hospitality", "Life Sciences", "Manufacturing",
    "Media and Entertainment", "Misc", "Nonprofit", "Other",
    "Professional and Business Services", "Public Sector", "Retail",
    "Technology", "Transportation",
    "overflow1", "overflow2", "overflow3", "overflow4", "overflow5",
    "overflow6", "overflow7", "overflow8", "overflow9",
]

# breakpoint[k] is the customer_name of the FIRST row of industries[k + 1].
# NOTE(review): this name shadows the built-in breakpoint(); it is kept so that
# anything else referring to the list keeps working — consider renaming.
breakpoint = [
    "California College of the Arts", "Bruce Power L.P.", "Bank of America",
    "Bon Secours Mercy Health", "Chick-fil-A, Inc.", "Abbott", "avago-broadcom",
    "Comcast Cable Communications Management LLC", "Gammon",
    "ASPCA (American Society for the Prevention of Cruelty to Animals)",
    "AirAsia Berhad", "Accenture (Proquire)", "Chickasaw Nation Industries",
    "CarMax", "Adobe Systems Incorporated", "DB Cargo logo",
]

# Walk the rows in file order and move on to the next industry only when the
# row matches the NEXT expected marker. Matching against the whole list would
# advance on every repeated occurrence of a marker name and shift all labels
# that follow it.
industryNum = 0
industry_labels = []
for customer_name in newDF["customer_name"]:
    if industryNum < len(breakpoint) and customer_name == breakpoint[industryNum]:
        industryNum += 1
    industry_labels.append(industries[industryNum])

# Assign the whole column at once instead of cell by cell.
newDF["industry"] = industry_labels

In [111]:

# Print every row whose industry differs from the row directly before it.
# Rows are compared by position (via shift), so the check stays correct even
# when the index is not the default 0..n-1 range.
industry_changed = newDF['industry'] != newDF['industry'].shift()
industry_changed.iloc[:1] = False  # the first row has no predecessor to compare with
for index, row in newDF[industry_changed].iterrows():
    print(f"Row {index}: {row['customer_name']} - {row['industry']}")


Row 88: California College of the Arts - Education
Row 356: Bruce Power L.P. - Energy and Resources
Row 454: Bank of America - Financial Services
Row 910: Bon Secours Mercy Health - Healthcare
Row 1290: Chick-fil-A, Inc. - Hospitality
Row 1423: Abbott - Life Sciences
Row 1499: avago-broadcom - Manufacturing
Row 1860: Comcast Cable Communications Management LLC - Media and Entertainment
Row 1921: Gammon - Misc
Row 1929: ASPCA (American Society for the Prevention of Cruelty to Animals) - Nonprofit
Row 2038: AirAsia Berhad - Other
Row 2206: Accenture (Proquire) - Professional and Business Services
Row 2726: Chickasaw Nation Industries - Public Sector
Row 2913: CarMax - Retail
Row 3169: Adobe Systems Incorporated - Technology
Row 3499: DB Cargo logo - Transportation


In [115]:
# Save the frame, now including the industry column, back over workday_domains.csv.
newDF.to_csv("workday_domains.csv", index=False)

In [116]:
# Keep only the rows that have a resolved Workday URL
newDF = newDF.dropna(subset=['workday_domain_fr'])
# Write them to workday_domains_cleaned.csv (replaces any earlier version of that file)
newDF.to_csv("workday_domains_cleaned.csv", index=False)