# Indeed.com daily job scraping script

In [1]:
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import re
import time
import pandas as pd
import numpy as np

### Set up driver

In [2]:
DRIVER_PATH = '/home/nessa/Downloads/chromedriver'
driver      = webdriver.Chrome(executable_path = DRIVER_PATH)
#driver.minimize_window()
wait=WebDriverWait(driver, 3)

In [3]:
# implicit wait time
waittime=20

### What are we searching for and where?

In [4]:

job_key_words = ['data science', 'machine learning', 'quantitative researcher']
loc_key_words = ['Seattle', 'Portland', 'San Francisco', 'Oakland']

In [5]:
# sub select by 
ids=['filter-radius','filter-jobtype','filter-dateposted']
options=["within 10 miles","Full-time","Last 24 hours"]

In [6]:
def handle_exception():
    print('handling exception...')
    driver.minimize_window()
    driver.maximize_window()
    driver.refresh()
    #driver.execute_script("window.scrollBy(0, 50)") 
    time.sleep(3)
    return True

In [7]:

def selections(id_,option):
    """
    function that executes a filter selection
    id_ : id of filter element
    option: name (string) specifying criterion 
    """
    print('{}: {}'.format(id_, option))
    try:
        drop = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID, id_)))
        time.sleep(1)
        drop.click()
    except:
        handle_exception()
        drop = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID, id_)))
        time.sleep(1)
        drop.click()
    

    XPATH = '//a[contains(text(),"{}")]'.format(option)
    try:
        drop = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.XPATH, XPATH)))
        time.sleep(1)
        drop.click()
    except:
        handle_exception()
        drop = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.XPATH, XPATH)))
        time.sleep(1)
        drop.click()


In [8]:


titles=[]
companies=[]
locations=[]
cities=[]
descriptions=[]
min_salaries=[]
max_salaries=[]

def fill_entries():
    """
    just a wrapper for scraping info for every job in a search
    yeah, technically we should probably pass the lists above...
    """
    
    driver.implicitly_wait(3) 
    
    try:
        job_cards = driver.find_elements_by_xpath('//div[contains(@class,"job_seen_beacon")]')
    except:
        handle_exception()
        job_cards = driver.find_elements_by_xpath('//div[contains(@class,"job_seen_beacon")]')
    
    print('{} jobs found'.format(len(job_cards)))
    for job in job_cards:
 
        driver.execute_script("window.scrollBy(0, 300)") 
        try:
            job.click()
        except:
            driver.execute_script("window.scrollBy(0, 100)")
            job.click()
        title = driver.find_elements_by_xpath('//div[contains(@class,"jobsearch-JobInfoHeader-title")]')[0].text
        titles.append(re.split('\n', title)[0])

        info_list = driver.find_elements_by_xpath('//div[contains(@class,"jobsearch-CompanyInfoWithoutHeaderImage")]')
        text=info_list[0].text
        split_text = re.split('\n', text)
        company = split_text[0]
        loc     = split_text[-1]
        try:
            city, loc = re.split('•',loc)
        except:
            city = loc
            loc  ='onsite' 

        # get salary information from job ad; if not available, use indeed's estimate; for no info, make it a 'nan'
        text = job.find_element_by_xpath('//*[@id="jobDetailsSection"]').text
        if ('salary' in text) or ('Salary' in text):
            salary = re.split('\n',text)[2]
        else:
            try:
                salary=driver.find_element_by_xpath('//*[contains(text(),"{}")]'.format("Indeed's estimated salary")).text

            except:
                salary='nan'
        #do some regular expression stuff to get them all into the same format
        try:
            min_salary, max_salary=re.split(' - ', salary)
            max_salary=re.split(' ', max_salary)[0]
            min_salary=re.sub('K', ',000', min_salary)
            max_salary=re.sub('K', ',000', min_salary)
        except:
            min_salary, max_salary = salary, salary

        min_salaries.append(min_salary)
        max_salaries.append(max_salary)



        locations.append(loc)
        cities.append(city)

        companies.append(company)
        
        text = job.find_element_by_xpath('//*[@id="jobDescriptionText"]').text
        descriptions.append(text)

### Run the job search

In [9]:

for job_key_word in job_key_words:
    for loc_key_word in loc_key_words:
        print('searching for {} in {}...'.format(job_key_word,loc_key_word))
        
        driver.get('https://indeed.com')
        try:
            search_job = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID, "text-input-what")))
        except:
            handle_exception()
            search_job = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID, "text-input-what")))
            
        time.sleep(1)
        
        search_job.send_keys([job_key_word])
        search_job.send_keys(Keys.ENTER)
        
        try:
            WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID,"text-input-where")))
        except:
            handle_exception()
            WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID,"text-input-where")))
            
        time.sleep(1)
        
        driver.find_element_by_id("text-input-where").click()
        driver.find_element_by_id("text-input-where").send_keys(Keys.CONTROL + "a")
        driver.find_element_by_id("text-input-where").send_keys(Keys.DELETE)

        try:
            search_loc = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID,"text-input-where")))
        except:
            handle_exception()
            search_loc = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.ID,"text-input-where")))
        
        time.sleep(1)
        
        search_loc.send_keys(Keys.CONTROL + "a")
        search_loc.send_keys(Keys.DELETE)
        search_loc.send_keys([loc_key_word])
        search_loc.send_keys(Keys.ENTER)
        
        try:
            search =WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="jobsearch"]/button')))
            time.sleep(1)
            search.click()
        except:
            handle_exception()
            search =WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="jobsearch"]/button')))
            time.sleep(1)
            search.click()
        
        print('selecting ...')
        for id_, option in zip(ids,options):
            selections(id_,option)
        fill_entries()
        time.sleep(5)


searching for data science in Seattle...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
14 jobs found
searching for data science in Portland...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
4 jobs found
searching for data science in San Francisco...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
11 jobs found
searching for data science in Oakland...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
10 jobs found
searching for machine learning in Seattle...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
15 jobs found
searching for machine learning in Portland...
selecting ...
filter-radius: within 10 miles
filter-jobtype: Full-time
filter-dateposted: Last 24 hours
5 jobs found
searching for machine learning i

In [10]:
#filter-radius, #filter-radius

### Put results in a dataframe

In [11]:
df_da=pd.DataFrame()
df_da['Title']=titles
df_da['Company']=companies
df_da['City']=cities
df_da['Location']=locations
df_da['Description']=descriptions
df_da['Minimum_salary']=min_salaries
df_da['Maximum_salary']=max_salaries

### Add some more constraints 

In [12]:
drop_pairs = [['Title','Engineer'], ['Title','Product'], ['Company', 'Amazon'], ['Title','Postdoc'],
              ['Company', 'Tesla'], ['Company', 'Meta'], ['Title','NLP'], ['Company','University']]

df_da['selections'] = np.zeros(len(df_da)).astype(bool)

for drop_pair in drop_pairs:
    new_selec = df_da[drop_pair[0]].apply(lambda x: True if drop_pair[1] in x else False)
    df_da['selections']=new_selec|df_da['selections']
df_da = df_da[~df_da['selections']].reset_index(drop=True)

### Do some processing, I forgot to do earlier and add salary constraint

In [13]:
df_da.Minimum_salary = df_da.Minimum_salary.apply(lambda x: re.findall(r'\d+', x)[0] if len(re.findall(r'\d+', x))>0 else np.nan)

In [14]:
df_da.Maximum_salary = df_da.Maximum_salary.apply(lambda x: re.findall(r'\d+', x)[0] if len(re.findall(r'\d+', x))>0 else np.nan)

In [15]:
def count(x):
    try:
        result = np.isnan(x)
    except:
        result = int(x)>140
    return result

In [16]:
salary_cap = df_da.Maximum_salary.apply(lambda x: count(x))

In [19]:
df_da.drop_duplicates(inplace=True)
print(len(df_da[salary_cap]))

18


  print(len(df_da[salary_cap]))


### Print descriptions for remaining job ads

In [24]:
for ii, (title, company, text) in enumerate(zip(df_da[salary_cap].Title, df_da[salary_cap].Company, df_da[salary_cap].Description)):
    print('-----------------------')
    print('job ad', ii+1, ': ',title,', ', company)
    print('\n')
    print(text)
    print('-----------------------')
    print('\n')
    print('\n')

-----------------------
job ad 1 :  Data Scientist, Senior ,  Seattle Childrens Hospital - Seattle, WA


Responsibilities
As a Data Scientist, work with developers, analysts, scientists, clinicians, and business owners in a team-driven, agile environment to develop and scale data science results and products. Work alongside clinicians to develop algorithms and models to solve critical health care needs, and provide input into driving business and clinical operations and quality improvement with advanced analytics. Dive deep into vast repositories of structured, unstructured, and streaming data to find the input for models. Create tools and libraries that can leverage products for efficient use across the enterprise and beyond.
Requirements
- Bachelor's degree or higher in a STEM or relevant analytical field that demonstrates analytical and technical competency and 5+ years as a Data Scientist or Senior Data Analyst OR a Ph. D. in a STEM or relevant analytical field that demonstrates an

  for ii, (title, company, text) in enumerate(zip(df_da[salary_cap].Title, df_da[salary_cap].Company, df_da[salary_cap].Description)):
