# Indeed.com daily job scraping script

In [None]:
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import re
import time
import pandas as pd
import numpy as np

### Set up driver

In [None]:
# Start Chrome through the local chromedriver binary and minimize its window
DRIVER_PATH = '/home/nessa/Downloads/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.minimize_window()
# explicit-wait helper bound to this driver, created with a timeout of 3
wait = WebDriverWait(driver, 3)

In [None]:
# timeout passed to the explicit WebDriverWait calls that wait for clickable elements
waittime = 60

### What are we searching for and where?

In [None]:

# every job keyword is searched in every location
job_key_words = [
    'data science',
    'machine learning',
    'quantitative researcher',
]
loc_key_words = [
    'Seattle',
    'Portland',
    'San Francisco',
    'Oakland',
]

In [None]:
# filters applied to each search: element id of the filter dropdown, paired by
# position with the text of the entry to choose in it
ids = [
    'filter-jobtype',
    'filter-radius',
    'filter-dateposted',
]
options = [
    "Full-time",
    "within 10 miles",
    "Last 24 hours",
]

In [None]:

def selections(id_,option):
    """
    apply one search filter on the current page
    id_ : element id of the filter dropdown to open
    option: text (string) contained in the dropdown link to pick
    """
    print('{}: {}'.format(id_, option))

    def click_when_ready(locator):
        # wait until the element is clickable, pause one second, then click it
        element = WebDriverWait(driver, waittime).until(EC.element_to_be_clickable(locator))
        time.sleep(1)
        element.click()

    # open the dropdown first, then choose the link whose text contains `option`
    click_when_ready((By.ID, id_))
    click_when_ready((By.XPATH, '//a[contains(text(),"{}")]'.format(option)))


In [None]:


# one list per scraped field; fill_entries() appends one item to each per job ad
(
    titles,
    companies,
    locations,
    cities,
    descriptions,
    min_salaries,
    max_salaries,
) = ([] for _ in range(7))

def _split_city_and_location(loc_line):
    """Split 'city•location' text into (city, location); location is 'onsite' without exactly one bullet."""
    try:
        city, loc = re.split('•', loc_line)
    except ValueError:
        # zero or several bullets: keep the whole line as the city
        return loc_line, 'onsite'
    return city, loc


def _salary_bounds(salary):
    """Return (min_salary, max_salary) strings from a 'low - high ...' text, or (salary, salary) otherwise."""
    try:
        min_salary, max_salary = re.split(' - ', salary)
    except ValueError:
        # not a single 'low - high' range (e.g. one figure or 'nan')
        return salary, salary
    # drop anything after the upper figure, then expand the 'K' shorthand on both bounds
    max_salary = re.split(' ', max_salary)[0]
    return re.sub('K', ',000', min_salary), re.sub('K', ',000', max_salary)


def fill_entries():
    """
    scrape every job card found on the page currently loaded in `driver`

    For each card the title, company, city, location, salary bounds and
    description are read and appended to the module-level lists
    (titles, companies, cities, locations, min_salaries, max_salaries,
    descriptions). Returns nothing.

    All fields of a card are collected before anything is appended, so an
    exception raised while scraping a card cannot leave the lists with
    different lengths.
    """
    # imported here so the block does not depend on the file's import cell
    from selenium.common.exceptions import NoSuchElementException

    driver.implicitly_wait(3)

    job_cards = driver.find_elements_by_xpath('//div[contains(@class,"job_seen_beacon")]')
    print('{} jobs found'.format(len(job_cards)))
    for job in job_cards:

        driver.execute_script("window.scrollTo(0, 100)")
        job.click()

        # first line of the header block is used as the title
        header = driver.find_elements_by_xpath('//div[contains(@class,"jobsearch-JobInfoHeader-title")]')[0].text
        title = re.split('\n', header)[0]

        # company is the first line of the info block, location text the last
        info_list = driver.find_elements_by_xpath('//div[contains(@class,"jobsearch-CompanyInfoWithoutHeaderImage")]')
        split_text = re.split('\n', info_list[0].text)
        company = split_text[0]
        city, loc = _split_city_and_location(split_text[-1])

        # get salary information from job ad; if not available, use indeed's estimate; for no info, make it a 'nan'
        # NOTE(review): these XPaths start with '//' so they are presumably not scoped to `job` - confirm
        text = job.find_element_by_xpath('//*[@id="jobDetailsSection"]').text
        if ('salary' in text) or ('Salary' in text):
            salary = re.split('\n', text)[2]
        else:
            try:
                salary = driver.find_element_by_xpath('//*[contains(text(),"{}")]'.format("Indeed's estimated salary")).text
            except NoSuchElementException:
                salary = 'nan'

        # bring the salary into (min, max) form; the upper bound is built from the
        # upper figure (it was previously derived from the lower one by mistake)
        min_salary, max_salary = _salary_bounds(salary)

        description = job.find_element_by_xpath('//*[@id="jobDescriptionText"]').text

        # append only once the whole card was scraped successfully
        titles.append(title)
        min_salaries.append(min_salary)
        max_salaries.append(max_salary)
        locations.append(loc)
        cities.append(city)
        companies.append(company)
        descriptions.append(description)

### Run the job search

In [None]:

def _clickable(by, locator):
    """Wait until the element found via (by, locator) is clickable and return it."""
    return WebDriverWait(driver, waittime).until(EC.element_to_be_clickable((by, locator)))


# one search per (job keyword, location) combination
for job_key_word in job_key_words:
    for loc_key_word in loc_key_words:
        print('searching for {} in {}...'.format(job_key_word,loc_key_word))

        driver.get('https://indeed.com')

        # type the job keyword into the "what" box and submit it
        search_job = _clickable(By.ID, "text-input-what")
        time.sleep(1)
        for keys in ([job_key_word], Keys.ENTER):
            search_job.send_keys(keys)

        # once the "where" box is clickable: focus it, select all, delete
        # (the element is looked up again for every action)
        _clickable(By.ID, "text-input-where")
        time.sleep(1)
        driver.find_element_by_id("text-input-where").click()
        for keys in (Keys.CONTROL + "a", Keys.DELETE):
            driver.find_element_by_id("text-input-where").send_keys(keys)

        # empty the box once more, then type the location and submit it
        search_loc = _clickable(By.ID, "text-input-where")
        time.sleep(1)
        for keys in (Keys.CONTROL + "a", Keys.DELETE, [loc_key_word], Keys.ENTER):
            search_loc.send_keys(keys)

        # press the search button of the form
        search = _clickable(By.XPATH, '//*[@id="jobsearch"]/button')
        time.sleep(1)
        search.click()

        # apply each filter, then scrape the resulting job cards
        print('selecting ...')
        for id_, option in zip(ids,options):
            selections(id_,option)
        fill_entries()
        time.sleep(5)

### Put results in a dataframe

In [None]:
# collect the scraped lists in one table, one column per list
df_da = pd.DataFrame()
for column, values in (
    ('Title', titles),
    ('Company', companies),
    ('City', cities),
    ('Location', locations),
    ('Description', descriptions),
    ('Minimum_salary', min_salaries),
    ('Maximum_salary', max_salaries),
):
    df_da[column] = values

### Add some more constraints 

In [None]:
# [column, substring] pairs: a row is dropped when the text in that column
# contains the substring
drop_pairs = [
    ['Title', 'Engineer'],
    ['Title', 'Product'],
    ['Company', 'Amazon'],
    ['Title', 'Postdoc'],
    ['Company', 'Tesla'],
    ['Company', 'Meta'],
    ['Title', 'NLP'],
    ['Company', 'University'],
]

# nothing is flagged at first; every pair then ORs its matches into the flag column
df_da['selections'] = np.zeros(len(df_da), dtype=bool)

for drop_pair in drop_pairs:
    column, needle = drop_pair
    new_selec = df_da[column].apply(lambda x: needle in x)
    df_da['selections'] = new_selec | df_da['selections']

# keep the rows that were never flagged
df_da = df_da[~df_da['selections']].reset_index(drop=True)

### Do some processing we forgot to do earlier and add salary constraint

In [None]:
# reduce each minimum salary to its first run of digits; NaN when it has none
df_da.Minimum_salary = df_da.Minimum_salary.apply(
    lambda x: (re.findall(r'\d+', x) or [np.nan])[0]
)

In [None]:
# reduce each maximum salary to its first run of digits; NaN when it has none
df_da.Maximum_salary = df_da.Maximum_salary.apply(
    lambda x: (re.findall(r'\d+', x) or [np.nan])[0]
)

In [None]:
def count(x, threshold=140):
    """
    decide whether a salary value passes the salary constraint
    x : NaN (no salary known) or something int() accepts, e.g. a digit string
    threshold: value that int(x) has to exceed (default 140)

    returns np.isnan(x) when numpy can evaluate it, so NaN gives True;
    otherwise returns int(x) > threshold
    raises whatever int(x) raises when x is neither numeric nor convertible
    """
    try:
        return np.isnan(x)
    except TypeError:
        # np.isnan rejects non-numeric input such as str: compare the number instead
        return int(x) > threshold

In [None]:
# boolean mask built by calling count() on every maximum-salary value
salary_cap = df_da.Maximum_salary.apply(count)

In [None]:
# number of job ads selected by the salary mask
remaining = df_da[salary_cap]
print(len(remaining))

### Print descriptions for remaining job ads

In [None]:
# print title, company and full description of every ad selected by the salary mask
shortlist = df_da[salary_cap]
divider = '-----------------------'
for title, company, text in zip(shortlist.Title, shortlist.Company, shortlist.Description):
    print(divider)
    print(title,', ', company)
    print('\n')
    print(text)
    print(divider)
    for _ in range(2):
        print('\n')