In [1]:
# Import splinter
from splinter import Browser
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

In [3]:
t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
print(current_time)

2023-07-10 14:57:09


In [38]:
# Create browser object
browser = Browser('chrome')

In [39]:
# Open Indeed
url = "https://www.indeed.com/"
browser.visit(url)

# Give the browser time to respond
time.sleep(10)

In [40]:
# Removes the / from the end of the url
url[:-1]

'https://www.indeed.com'

In [41]:
# Array to store results in
data = []

In [42]:
# Function to scrape all the posts on a single page
def scrape_one_page(data, browser):
    # Job Title
    posts_in_page = browser.find_by_css(".jobsearch-ResultsList").find_by_css(".job_seen_beacon")
        
    for post in posts_in_page:
        # Initialize 1D array to store this post's data
        post_data = []
            
        # Grab job id
        try:
            job_id = post.find_by_css("a").first["data-jk"]
            post_data.append(job_id)
        except:
            post_data.append(None)
            
        # Grab job title
        try:
            title = post.find_by_css("span").first.text
            post_data.append(title)
        except:
            post_data.append(None)
            
        # Grab company
        try:
            company = post.find_by_css(".companyName").first.text
            post_data.append(company)
        except:
            post_data.append(None)
            
        # Grab location
        try:
            location = post.find_by_css(".companyLocation").first.text
            post_data.append(location)
        except:
            post_data.append(None)
            
        # Grab job type (if available)
        try:
            job_type = post.find_by_css("[aria-label='Job type']").first.find_by_xpath("..").first.text
            post_data.append(job_type)
        except:
            post_data.append(None)
                
        # Grab salary (if available)
        try:
            salary = post.find_by_css("[aria-label='Salary']").first.find_by_xpath("..").first.text
            post_data.append(salary)
        except:
            try:
                salary = post.find_by_css(".estimated-salary").first.find_by_css("span").first.text
                post_data.append(salary)
            except:
                post_data.append(None)
                
        # Record the time
        t = time.localtime()
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
        post_data.append(current_time)
            
        # Grab job url
        try:
            job_url = post.find_by_css("a").first["href"]
            post_data.append(job_url)
        except:
            post_data.append(None)
        
        data.append(post_data)

In [43]:
# Function to scrape all the posts on all pages
def indeed_scraper(data, search_params, browser):
    base_url = url[:-1]
    # Find the what box & search the job title
    what = browser.find_by_id("text-input-what").first
    what.fill(search_params["what"])
    
    # Find the where box & search for the location
    where = browser.find_by_id("text-input-where").first
    where.type(Keys.CONTROL + "a")
    where.type(Keys.BACKSPACE)
    where.fill(search_params["where"])
    
    where.type(Keys.RETURN)
    
    time.sleep(1)
    
    date_posted = browser.find_by_text("Date posted").first
    date_posted.click()
    
    time.sleep(1)
    
    # This can be set to one of these options = ["Last 24 Hours", "Last 3 days", "Last 7 days", "Last 14 days"]
    last_24_hours = browser.find_by_text("Last 7 days").first
    last_24_hours.click()
    
    last_page = False
    page_count = 0
    
    while not last_page:
        try:  # This code runs for every page except the last page
            next_page_button = browser.find_by_css("[aria-label='Next Page']")
            
            page_count += 1
            print(f"Scraping page {page_count}...")
        
            post_data = scrape_one_page(data, browser)
                            
            print("Clicking to next page.")
            next_page_button.click()
                            
        except:  # This code runs only for the last page
            
            page_count += 1
            print(f"Scraping final page of {page_count} total pages...")
        
            post_data = scrape_one_page(data, browser)
                            
            print("Final page has been scraped.")               
            last_page = True
                       
    return data

In [44]:
# Set "what" for the words to search, and "where" for the location search
search_params = {"what": "Data Scientist", "where": "United States"}

# Run the webscraper to collect all listings from all pages for the last 24 hours

t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)

print(f"Beginning scraping process...{current_time}")
try:
    indeed_scraper(data, search_params, browser)
    t = time.localtime()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
    print(f"Scraping completed successfully! {current_time}")
    
    browser.quit()
    print("Browser has been closed.")
    
except:
    t = time.localtime()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
    print(f"Something went wrong with the scraper.. :(\n{current_time}")
    
    browser.quit()
    print("Browser has been closed.")

Beginning scraping process...
Scraping page 1...
Clicking to next page.
Scraping page 2...
Clicking to next page.
Scraping page 3...
Clicking to next page.
Scraping page 4...
Clicking to next page.
Scraping page 5...
Clicking to next page.
Scraping page 6...
Clicking to next page.
Scraping page 7...
Clicking to next page.
Scraping page 8...
Clicking to next page.
Scraping page 9...
Clicking to next page.
Scraping page 10...
Clicking to next page.
Scraping page 11...
Clicking to next page.
Scraping page 12...
Clicking to next page.
Scraping page 13...
Clicking to next page.
Scraping page 14...
Clicking to next page.
Scraping page 15...
Clicking to next page.
Scraping page 16...
Clicking to next page.
Scraping page 17...
Clicking to next page.
Scraping page 18...
Clicking to next page.
Scraping page 19...
Clicking to next page.
Scraping page 20...
Clicking to next page.
Scraping page 21...
Clicking to next page.
Scraping page 22...
Clicking to next page.
Scraping page 23...
Clicking to n

In [46]:
# Name columns for putting the results into a DataFrame
columns = ["id", "title", "company", "location", "job_type", "salary", "time_recorded", "url"]

# Create a Pandas DataFrame
listings_new_df = pd.DataFrame(data, columns = columns)
listings_new_df

Unnamed: 0,id,title,company,location,job_type,salary,url
0,f1a7f4fc726420f4,"Sr. Data Scientist- Risk Modeler, AVP - Hybrid",Citi,"Elk Grove Village, IL 60007",Full-time,"$93,200 - $139,800 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,79f1c9abf1954819,Principal Data Scientist (Algorithm),WALGREENS,"Deerfield, IL 60015",,,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,a2c2b68ca8d6a565,Machine Learning Supervisor,Old Republic Title,"Hybrid remote in Minnetonka, MN",Full-time,Estimated $75.9K - $96.1K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3,46883a8ae338078a,Data Scientist,Robert Half,"Portland, ME 04101 \n(West Bayside area)",,Estimated $91.9K - $116K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4,ca739b233c19204c,Data Scientist,Pacer Staffing,"Durham, NC 27709",Full-time,$39 an hour,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
...,...,...,...,...,...,...,...
620,efb7eae14cd641bb,Strategic Customers - Data Scientist,Oracle,United States,,Estimated $102K - $129K a year,https://www.indeed.com/rc/clk?jk=efb7eae14cd64...
621,b1e05778e5fce266,Senior Data Scientist,PowerSchool Group LLC,Remote in United States,Full-time,"$110,000 - $140,000 a year",https://www.indeed.com/rc/clk?jk=b1e05778e5fce...
622,55ce5fa397772ed2,Machine Learning Engineer,"Suzy, Inc.",Remote in United States,Full-time,"$128,000 - $150,000 a year",https://www.indeed.com/rc/clk?jk=55ce5fa397772...
623,c46648244d8618b1,Principal Data Architect,FICO,United States,Full-time,"$147,000 - $231,000 a year",https://www.indeed.com/rc/clk?jk=c46648244d861...


In [None]:
# Save the updated DataFrame as a .csv file

t = time.localtime()
current_time = time.strftime("%Y-%m-%d %H:%M:%S", t)

file_path = '../data/listings_new.csv'

listings_new_df.to_csv(file_path)
print(f"New results added and saved to file {file_path}\n{current_time}")