In [29]:
import pandas as pd
from googlesearch import search
from dotenv import load_dotenv
import os
import time
import requests
from tqdm import tqdm

In [30]:
# Load variables from the local .env file into os.environ.
load_dotenv()

# Credentials for the Google Custom Search request; each is None if unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CX = os.environ.get("GOOGLE_CSE_ID")

NUM_RESULTS = 5      # default number of results requested per query
PAUSE_SECONDS = 1.0  # delay, in seconds, between consecutive lookups

In [31]:
# Load the customer list; the bare `df` on the last line renders it as the cell output.
df = pd.read_csv(filepath_or_buffer='workday_customers.csv')
df

Unnamed: 0,customer_name
0,"8x8, Inc."
1,"AT&T, Services Inc."
2,Hong Kong Broadband Network
3,Orange Belgium SA.
4,TalkTalk
...,...
3495,YOUView
3496,"ZAGG, Inc."
3497,Zoopla (ZPG Limited)
3498,Zuhlke Engineering AG


In [None]:
def find_workday_domain_old(company_name, num_results=5):
    """
    Legacy lookup that goes through the `googlesearch` package.

    Runs the query '<company_name> site:myworkdayjobs.com' (region "ca") and
    hands back the first result URL that contains 'myworkdayjobs.com'.

    Returns None when no result matches. Any exception raised while
    searching is printed and also yields None.
    """
    needle = "myworkdayjobs.com"
    query = f"{company_name} site:myworkdayjobs.com"
    try:
        hits = search(query, num_results=num_results, region="ca")
        # Results are consumed lazily, so errors raised mid-iteration
        # are still caught by the handler below.
        return next((hit for hit in hits if needle in hit), None)
    except Exception as e:
        print(f"Error searching for {company_name!r}: {e}")
        return None

In [32]:
def find_workday_domain(company_name, api_key=API_KEY, cx=CX,
                        num_results=NUM_RESULTS, timeout=10.0):
    """
    Query Google Custom Search for 'company_name site:myworkdayjobs.com'
    and return the first URL containing 'myworkdayjobs.com', or None.

    Args:
        company_name: Name to search for. None, NaN and blank names return
            None without calling the API.
        api_key: Custom Search API key. The default is bound when this cell
            runs, so re-run the cell after changing API_KEY.
        cx: Custom Search engine ID (same binding caveat as api_key).
        num_results: Value sent as the API's `num` parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The first matching result link, or None when nothing matches.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status (via `raise_for_status`).
    """
    # A missing name (None, or the float NaN pandas uses for empty CSV cells)
    # would otherwise be searched literally as "None"/"nan", and a blank one
    # as a bare 'site:' query -- both return some unrelated company's URL.
    if company_name is None or (
        isinstance(company_name, float) and company_name != company_name
    ):
        return None
    name = str(company_name).strip()
    if not name:
        return None

    endpoint = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": api_key,
        "cx": cx,
        "q": f'{name} site:myworkdayjobs.com',
        "num": num_results,
    }
    # requests has no default timeout; without one a stalled connection
    # blocks the caller indefinitely.
    resp = requests.get(endpoint, params=params, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # "items" is absent from the response when the search has no results.
    for item in data.get("items", []):
        link = item.get("link", "")
        if "myworkdayjobs.com" in link:
            return link
    return None

In [None]:
# first20 = df["customer_name"].head(20)
# df.loc[first20.index, "workday_domain"] = first20.apply(find_workday_domain)
# df.loc[first20.index, ["customer_name", "workday_domain"]]


Unnamed: 0,customer_name,workday_domain
0,"8x8, Inc.",https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...
1,"AT&T, Services Inc.",https://att.wd1.myworkdayjobs.com/ATTGeneral/j...
2,Hong Kong Broadband Network,
3,Orange Belgium SA.,https://orange.wd3.myworkdayjobs.com/Orange_Ca...
4,TalkTalk,https://talktalk.wd3.myworkdayjobs.com/TalkTal...
5,TELUS Communications,https://lifeworks.wd3.myworkdayjobs.com/en-US/...
6,42dot Inc. logo,https://workday.wd5.myworkdayjobs.com/Workday
7,A1 (Telekom Austria),https://a1group.wd3.myworkdayjobs.com/en-US/A1...
8,Alma Media Corporation logo,https://ferring.wd3.myworkdayjobs.com/de-CH/Fe...
9,"AuditBoard, Inc.",https://amat.wd1.myworkdayjobs.com/External/jo...


In [33]:
# Look up every customer. A failed request is logged and recorded as None
# instead of aborting the run and discarding the results gathered so far.
results = []
failed_names = []  # lookups that raised, kept apart from genuine "no match" rows
for name in tqdm(df["customer_name"], desc="Searching Workday domains", unit="company"):
    try:
        url = find_workday_domain(name)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/HTTP-status errors; ValueError covers
        # a response body that is not valid JSON.
        # tqdm.write prints the message without breaking the progress bar.
        tqdm.write(f"Error searching for {name!r}: {e}")
        failed_names.append(name)
        url = None
    results.append(url)
    time.sleep(PAUSE_SECONDS)  # pause between consecutive requests

df["workday_domain"] = results

print(df)
if failed_names:
    print(f"{len(failed_names)} lookups failed; their workday_domain is None and they should be retried.")

# Save the updated DataFrame to a new CSV file
df.to_csv('workday_domains.csv', index=False)

Searching Workday domains: 100%|██████████| 3500/3500 [1:23:35<00:00,  1.43s/company]

                    customer_name  \
0                       8x8, Inc.   
1             AT&T, Services Inc.   
2     Hong Kong Broadband Network   
3              Orange Belgium SA.   
4                        TalkTalk   
...                           ...   
3495                      YOUView   
3496                   ZAGG, Inc.   
3497         Zoopla (ZPG Limited)   
3498        Zuhlke Engineering AG   
3499                DB Cargo logo   

                                         workday_domain  
0     https://8x8inc.wd5.myworkdayjobs.com/en-US/8x8...  
1     https://att.wd1.myworkdayjobs.com/ATTGeneral/j...  
2                                                  None  
3     https://orange.wd3.myworkdayjobs.com/Orange_Ca...  
4     https://talktalk.wd3.myworkdayjobs.com/TalkTal...  
...                                                 ...  
3495  https://gici.wd5.myworkdayjobs.com/en-US/Caree...  
3496                                               None  
3497                             


